* [PATCH 01/16] sched: track the runnable average on a per-task entity basis
  2012-06-28  2:24 [PATCH 00/16] Series short description Paul Turner
                   ` (2 preceding siblings ...)
  2012-06-28  2:24 ` [PATCH 02/16] sched: maintain per-rq runnable averages Paul Turner
@ 2012-06-28  2:24 ` Paul Turner
  2012-06-28  6:06   ` Namhyung Kim
  2012-07-04 15:32   ` Peter Zijlstra
  2012-06-28  2:24 ` [PATCH 04/16] sched: maintain the load contribution of blocked entities Paul Turner
                   ` (11 subsequent siblings)
  15 siblings, 2 replies; 46+ messages in thread
From: Paul Turner @ 2012-06-28  2:24 UTC (permalink / raw)
  To: linux-kernel
  Cc: Venki Pallipadi, Srivatsa Vaddagiri, Vincent Guittot,
	Peter Zijlstra, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

Instead of tracking the average of the load parented by a cfs_rq, we can track
entity load directly, with the load for a given cfs_rq then being the sum of
its children.

To do this we represent the historical contribution to runnable average within each
trailing 1024us of execution as the coefficients of a geometric series.

We can express this for a given task t as:
  runnable_sum(t) = \Sum u_i * y^i ,
  load(t) = weight_t * runnable_sum(t) / (\Sum 1024 * y^i)

Where: u_i is the usage in the i-th most recent 1024us period (approximately 1ms),
and y is chosen such that y^k = 1/2.  We currently choose k to be 32, which
roughly translates to a scheduling period.
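
As a purely illustrative aside (userspace sketch, not part of the patch): the
per-period decay can be done with a single fixed-point multiply, mirroring the
4008/4096 ~= 0.5^(1/32) approximation used by decay_load() in the hunk below;
decaying a fully runnable 1024us segment across 32 periods leaves roughly half:

  #include <stdio.h>
  #include <stdint.h>

  static uint64_t decay(uint64_t val, int periods)
  {
      while (periods-- && val) {
          val *= 4008;    /* 4008/4096 ~= 0.97857 ~= 0.5^(1/32) */
          val >>= 12;
      }
      return val;
  }

  int main(void)
  {
      /* one fully runnable ~1ms segment, aged by one ~32ms sched period */
      printf("%llu\n", (unsigned long long)decay(1024, 32)); /* ~500, about half */
      return 0;
  }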

Signed-off-by: Paul Turner <pjt@google.com>
---
 include/linux/sched.h |    8 +++
 kernel/sched/debug.c  |    4 ++
 kernel/sched/fair.c   |  128 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 140 insertions(+), 0 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9dced2e..5bf5c79 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1136,6 +1136,11 @@ struct load_weight {
 	unsigned long weight, inv_weight;
 };
 
+struct sched_avg {
+	u32 runnable_avg_sum, runnable_avg_period;
+	u64 last_runnable_update;
+};
+
 #ifdef CONFIG_SCHEDSTATS
 struct sched_statistics {
 	u64			wait_start;
@@ -1196,6 +1201,9 @@ struct sched_entity {
 	/* rq "owned" by this entity/group: */
 	struct cfs_rq		*my_q;
 #endif
+#ifdef CONFIG_SMP
+	struct sched_avg	avg;
+#endif
 };
 
 struct sched_rt_entity {
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index c09a4e7..cd5ef23 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -85,6 +85,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 	P(se->statistics.wait_count);
 #endif
 	P(se->load.weight);
+#ifdef CONFIG_SMP
+	P(se->avg.runnable_avg_sum);
+	P(se->avg.runnable_avg_period);
+#endif
 #undef PN
 #undef P
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3704ad3..864a122 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -976,6 +976,125 @@ static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
+#ifdef CONFIG_SMP
+/*
+ * Approximate:
+ *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
+ */
+static __always_inline u64 decay_load(u64 val, int n)
+{
+	for (; n && val; n--) {
+		val *= 4008;
+		val >>= 12;
+	}
+
+	return val;
+}
+
+/* We can represent the historical contribution to runnable average as the
+ * coefficients of a geometric series.  To do this we sub-divide our runnable
+ * history into segments of approximately 1ms (1024us); label the segment that
+ * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
+ *
+ * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
+ *      p0            p1           p2
+ *     (now)       (~1ms ago)  (~2ms ago)
+ *
+ * Let u_i denote the fraction of p_i that the entity was runnable.
+ *
+ * We then designate the fractions u_i as our coefficients, yielding the
+ * following representation of historical load:
+ *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
+ *
+ * We choose y based on the width of a reasonable scheduling period, fixing:
+ *   y^32 = 0.5
+ *
+ * This means that the contribution to load ~32ms ago (u_32) will be weighted
+ * approximately half as much as the contribution to load within the last ms
+ * (u_0).
+ *
+ * When a period "rolls over" and we have new u_0`, multiplying the previous
+ * sum again by y is sufficient to update:
+ *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
+ *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
+ */
+static __always_inline int __update_entity_runnable_avg(u64 now,
+							struct sched_avg *sa,
+							int runnable)
+{
+	u64 delta;
+	int delta_w, decayed = 0;
+
+	delta = now - sa->last_runnable_update;
+	/*
+	 * This should only happen when time goes backwards, which it
+	 * unfortunately does during sched clock init when we swap over to TSC.
+	 */
+	if ((s64)delta < 0) {
+		sa->last_runnable_update = now;
+		return 0;
+	}
+
+	/*
+	 * Use 1024ns as the unit of measurement since it's a reasonable
+	 * approximation of 1us and fast to compute.
+	 */
+	delta >>= 10;
+	if (!delta)
+		return 0;
+	sa->last_runnable_update = now;
+
+	/* delta_w is the amount already accumulated against our next period */
+	delta_w = sa->runnable_avg_period % 1024;
+	if (delta + delta_w >= 1024) {
+		/* period roll-over */
+		decayed = 1;
+
+		/*
+		 * Now that we know we're crossing a period boundary, figure
+		 * out how much from delta we need to complete the current
+		 * period and accrue it.
+		 */
+		delta_w = 1024 - delta_w;
+		BUG_ON(delta_w > delta);
+		do {
+			if (runnable)
+				sa->runnable_avg_sum += delta_w;
+			sa->runnable_avg_period += delta_w;
+
+			/*
+			 * Remainder of delta initiates a new period, roll over
+			 * the previous.
+			 */
+			sa->runnable_avg_sum =
+				decay_load(sa->runnable_avg_sum, 1);
+			sa->runnable_avg_period =
+				decay_load(sa->runnable_avg_period, 1);
+
+			delta -= delta_w;
+			/* New period is empty */
+			delta_w = 1024;
+		} while (delta >= 1024);
+	}
+
+	/* Remainder of delta accrued against u_0` */
+	if (runnable)
+		sa->runnable_avg_sum += delta;
+	sa->runnable_avg_period += delta;
+
+	return decayed;
+}
+
+/* Update a sched_entity's runnable average */
+static inline void update_entity_load_avg(struct sched_entity *se)
+{
+	__update_entity_runnable_avg(rq_of(cfs_rq_of(se))->clock_task, &se->avg,
+				     se->on_rq);
+}
+#else
+static inline void update_entity_load_avg(struct sched_entity *se) {}
+#endif
+
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 #ifdef CONFIG_SCHEDSTATS
@@ -1102,6 +1221,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 */
 	update_curr(cfs_rq);
 	update_cfs_load(cfs_rq, 0);
+	update_entity_load_avg(se);
 	account_entity_enqueue(cfs_rq, se);
 	update_cfs_shares(cfs_rq);
 
@@ -1176,6 +1296,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
+	update_entity_load_avg(se);
 
 	update_stats_dequeue(cfs_rq, se);
 	if (flags & DEQUEUE_SLEEP) {
@@ -1345,6 +1466,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 		update_stats_wait_start(cfs_rq, prev);
 		/* Put 'current' back into the tree. */
 		__enqueue_entity(cfs_rq, prev);
+		/* in !on_rq case, update occurred at dequeue */
+		update_entity_load_avg(prev);
 	}
 	cfs_rq->curr = NULL;
 }
@@ -1358,6 +1481,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	update_curr(cfs_rq);
 
 	/*
+	 * Ensure that runnable average is periodically updated.
+	 */
+	update_entity_load_avg(curr);
+
+	/*
 	 * Update share accounting for long-running entities.
 	 */
 	update_entity_shares_tick(cfs_rq);




* [PATCH 04/16] sched: maintain the load contribution of blocked entities
  2012-06-28  2:24 [PATCH 00/16] Series short description Paul Turner
                   ` (3 preceding siblings ...)
  2012-06-28  2:24 ` [PATCH 01/16] sched: track the runnable average on a per-task entity basis Paul Turner
@ 2012-06-28  2:24 ` Paul Turner
  2012-06-29  1:27   ` Namhyung Kim
  2012-06-28  2:24 ` [PATCH 07/16] sched: aggregate total task_group load Paul Turner
                   ` (10 subsequent siblings)
  15 siblings, 1 reply; 46+ messages in thread
From: Paul Turner @ 2012-06-28  2:24 UTC (permalink / raw)
  To: linux-kernel
  Cc: Venki Pallipadi, Srivatsa Vaddagiri, Vincent Guittot,
	Peter Zijlstra, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

We are currently maintaining:
  runnable_load(cfs_rq) = \Sum task_load(t)

for all runnable children t of cfs_rq.  While this can be naturally updated for
tasks in a runnable state (as they are scheduled), it does not account for the
load contributed by blocked task entities.

This can be solved by introducing a separate accounting for blocked load:
  blocked_load(cfs_rq) = \Sum runnable(b) * weight(b)

Obviously we do not want to iterate over all blocked entities to account for
their decay; we instead observe that:
  runnable_load(t) = \Sum u_i * y^i

and that to account for an additional idle period we only need to compute:
  y*runnable_load(t).

This means that we can compute all blocked entities at once by evaluating:
  blocked_load(cfs_rq)` = y * blocked_load(cfs_rq)

Finally we maintain a decay counter so that when a sleeping entity re-awakens
we can determine how much of its load should be removed from the blocked sum.
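
As a rough, purely illustrative userspace model of how the aggregate decay and
the per-entity decay counter fit together (variable names are simplified and do
not match the kernel structures exactly):

  #include <stdio.h>
  #include <stdint.h>

  static uint64_t decay(uint64_t val, uint64_t n)
  {
      while (n-- && val)
          val = (val * 4008) >> 12;   /* ~= val * 0.5^(1/32) per period */
      return val;
  }

  int main(void)
  {
      uint64_t blocked_load = 0, decay_counter = 1;   /* per-cfs_rq */
      uint64_t se_contrib = 1024, se_decay_count;     /* per-entity */

      /* entity blocks: park its contribution, remember the counter */
      blocked_load += se_contrib;
      se_decay_count = decay_counter;

      /* one aggregate decay covers ~a sched period of idleness (32 periods) */
      blocked_load = decay(blocked_load, 32);
      decay_counter += 32;

      /* entity wakes: catch up on the decays it missed, then remove itself */
      se_contrib = decay(se_contrib, decay_counter - se_decay_count);
      blocked_load -= (se_contrib < blocked_load) ? se_contrib : blocked_load;

      printf("blocked_load=%llu se_contrib=%llu\n",
             (unsigned long long)blocked_load,
             (unsigned long long)se_contrib);  /* blocked_load=0 se_contrib=~500 */
      return 0;
  }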

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Ben Segall <bsegall@google.com>
---
 include/linux/sched.h |    1 
 kernel/sched/core.c   |    3 +
 kernel/sched/debug.c  |    3 +
 kernel/sched/fair.c   |  130 ++++++++++++++++++++++++++++++++++++++++++++-----
 kernel/sched/sched.h  |    4 +-
 5 files changed, 126 insertions(+), 15 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0c54ce0..842c4df 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1139,6 +1139,7 @@ struct load_weight {
 struct sched_avg {
 	u32 runnable_avg_sum, runnable_avg_period;
 	u64 last_runnable_update;
+	s64 decay_count;
 	unsigned long load_avg_contrib;
 };
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9bb7d28..aeb8e56 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1713,6 +1713,9 @@ static void __sched_fork(struct task_struct *p)
 	p->se.vruntime			= 0;
 	INIT_LIST_HEAD(&p->se.group_node);
 
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+	p->se.avg.decay_count = 0;
+#endif
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index aeb74e3..2aa60cf 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -95,6 +95,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 	P(se->avg.runnable_avg_sum);
 	P(se->avg.runnable_avg_period);
 	P(se->avg.load_avg_contrib);
+	P(se->avg.decay_count);
 #endif
 #undef PN
 #undef P
@@ -230,6 +231,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			atomic_read(&cfs_rq->tg->load_weight));
 	SEQ_printf(m, "  .%-30s: %lld\n", "runnable_load_avg",
 			cfs_rq->runnable_load_avg);
+	SEQ_printf(m, "  .%-30s: %lld\n", "blocked_load_avg",
+			cfs_rq->blocked_load_avg);
 #endif
 
 	print_cfs_group_stats(m, cpu, cfs_rq->tg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8229766..6200d20 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1085,6 +1085,20 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
 	return decayed;
 }
 
+/* Synchronize an entity's decay with its parenting cfs_rq. */
+static inline void __synchronize_entity_decay(struct sched_entity *se)
+{
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+	u64 decays = atomic64_read(&cfs_rq->decay_counter);
+
+	decays -= se->avg.decay_count;
+	if (!decays)
+		return;
+
+	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
+	se->avg.decay_count += decays;
+}
+
 /* Compute the current contribution to load_avg by se, return any delta */
 static long __update_entity_load_avg_contrib(struct sched_entity *se)
 {
@@ -1100,8 +1114,18 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se)
 	return se->avg.load_avg_contrib - old_contrib;
 }
 
+static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
+						 long load_contrib)
+{
+	if (likely(load_contrib < cfs_rq->blocked_load_avg))
+		cfs_rq->blocked_load_avg -= load_contrib;
+	else
+		cfs_rq->blocked_load_avg = 0;
+}
+
 /* Update a sched_entity's runnable average */
-static inline void update_entity_load_avg(struct sched_entity *se)
+static inline void update_entity_load_avg(struct sched_entity *se,
+					  int update_cfs_rq)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	long contrib_delta;
@@ -1111,8 +1135,34 @@ static inline void update_entity_load_avg(struct sched_entity *se)
 		return;
 
 	contrib_delta = __update_entity_load_avg_contrib(se);
+
+	if (!update_cfs_rq)
+		return;
+
 	if (se->on_rq)
 		cfs_rq->runnable_load_avg += contrib_delta;
+	else
+		subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+}
+
+/*
+ * Decay the load contributed by all blocked children and account this so that
+ * their contribution may be appropriately discounted when they wake up.
+ */
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq)
+{
+	u64 now = rq_of(cfs_rq)->clock_task >> 20;
+	u64 decays;
+
+	decays = now - cfs_rq->last_decay;
+	if (!decays)
+		return;
+
+	cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
+					      decays);
+	atomic64_add(decays, &cfs_rq->decay_counter);
+
+	cfs_rq->last_decay = now;
 }
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
@@ -1122,26 +1172,56 @@ static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 
 /* Add the load generated by se into cfs_rq's child load-average */
 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
-						  struct sched_entity *se)
-{
-	update_entity_load_avg(se);
+						  struct sched_entity *se,
+						  int wakeup)
+{
+	/* we track migrations using entity decay_count == 0 */
+	if (unlikely(!se->avg.decay_count)) {
+		se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
+		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+		wakeup = 0;
+	} else {
+		__synchronize_entity_decay(se);
+	}
+
+	if (wakeup)
+		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
+
+	update_entity_load_avg(se, 0);
 	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+	update_cfs_rq_blocked_load(cfs_rq);
 }
 
-/* Remove se's load from this cfs_rq child load-average */
+/*
+ * Remove se's load from this cfs_rq child load-average, if the entity is
+ * transitioning to a blocked state we track its projected decay using
+ * blocked_load_avg.
+ */
 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
-						  struct sched_entity *se)
+						  struct sched_entity *se,
+						  int sleep)
 {
-	update_entity_load_avg(se);
+	update_entity_load_avg(se, 1);
+
 	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
+	if (sleep) {
+		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
+		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+	} else {
+		se->avg.decay_count = 0;
+	}
 }
 #else
-static inline void update_entity_load_avg(struct sched_entity *se) {}
+static inline void update_entity_load_avg(struct sched_entity *se,
+					  int update_cfs_rq) {}
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
-						  struct sched_entity *se) {}
+					   struct sched_entity *se,
+					   int wakeup) {}
 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
-						  struct sched_entity *se) {}
+					   struct sched_entity *se,
+					   int sleep) {}
+static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq) {}
 #endif
 
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -1270,7 +1350,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 */
 	update_curr(cfs_rq);
 	update_cfs_load(cfs_rq, 0);
-	enqueue_entity_load_avg(cfs_rq, se);
+	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
 	account_entity_enqueue(cfs_rq, se);
 	update_cfs_shares(cfs_rq);
 
@@ -1345,7 +1425,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
-	dequeue_entity_load_avg(cfs_rq, se);
+	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
 
 	update_stats_dequeue(cfs_rq, se);
 	if (flags & DEQUEUE_SLEEP) {
@@ -1516,7 +1596,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 		/* Put 'current' back into the tree. */
 		__enqueue_entity(cfs_rq, prev);
 		/* in !on_rq case, update occurred at dequeue */
-		update_entity_load_avg(prev);
+		update_entity_load_avg(prev, 1);
 	}
 	cfs_rq->curr = NULL;
 }
@@ -1532,7 +1612,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	/*
 	 * Ensure that runnable average is periodically updated.
 	 */
-	update_entity_load_avg(curr);
+	update_entity_load_avg(curr, 1);
+	update_cfs_rq_blocked_load(cfs_rq);
 
 	/*
 	 * Update share accounting for long-running entities.
@@ -2391,6 +2472,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
+		update_entity_load_avg(se, 1);
 	}
 
 	if (!se) {
@@ -2452,6 +2534,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
+		update_entity_load_avg(se, 1);
 	}
 
 	if (!se) {
@@ -3557,6 +3640,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
 
 	update_rq_clock(rq);
 	update_cfs_load(cfs_rq, 1);
+	update_cfs_rq_blocked_load(cfs_rq);
 
 	/*
 	 * We need to update shares after updating tg->load_weight in
@@ -5379,6 +5463,21 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
 		place_entity(cfs_rq, se, 0);
 		se->vruntime -= cfs_rq->min_vruntime;
 	}
+
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+	/*
+	 * Remove our load from contribution when we leave sched_fair
+	 * and ensure we don't carry in an old decay_count if we
+	 * switch back.
+	 */
+	if (p->se.avg.decay_count) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
+		__synchronize_entity_decay(&p->se);
+		subtract_blocked_load_contrib(cfs_rq,
+				p->se.avg.load_avg_contrib);
+		p->se.avg.decay_count = 0;
+	}
+#endif
 }
 
 /*
@@ -5425,6 +5524,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 #ifndef CONFIG_64BIT
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 #endif
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+	atomic64_set(&cfs_rq->decay_counter, 1);
+#endif
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 26cc36f..a96adf1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -229,7 +229,9 @@ struct cfs_rq {
 	 * This allows for the description of both thread and group usage (in
 	 * the FAIR_GROUP_SCHED case).
 	 */
-	u64 runnable_load_avg;
+	u64 runnable_load_avg, blocked_load_avg;
+	atomic64_t decay_counter;
+	u64 last_decay;
 #endif
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */




* [PATCH 00/16] Series short description
@ 2012-06-28  2:24 Paul Turner
  2012-06-28  2:24 ` [PATCH 09/16] sched: normalize tg load contributions against runnable time Paul Turner
                   ` (15 more replies)
  0 siblings, 16 replies; 46+ messages in thread
From: Paul Turner @ 2012-06-28  2:24 UTC (permalink / raw)
  To: linux-kernel
  Cc: Venki Pallipadi, Srivatsa Vaddagiri, Vincent Guittot,
	Peter Zijlstra, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

Hi all,

Please find attached the latest version for CFS load-tracking.

It implements load-tracking on a per-sched_entity (currently SCHED_NORMAL, but
could be extended to RT as well) basis. This results in a bottom-up
load-computation in which entities contribute to their parents' load, as
opposed to the current top-down where the parent averages its children.  In
particular this allows us to correctly migrate load with their accompanying
entities and provides the necessary inputs for intelligent load-balancing and
power-management.

Modulo review I think this is now close to a mergeable state.

Notable updates:
----------------
- A few stability issues fixed; in particular on systems with tasks having idle
  periods spanning several days.  Minor code cleanups.
- 32 bit performance improved (now reported faster[*] on 32-bit ARM than
  without, presumably due to better load-balance or reduced overhead.  Thanks
  to Vincent Guittot and others for looking at this)
- All configs delinted
- Config dependencies separated so that load-tracking can be used without
  FAIR_GROUP_SCHED, allowing us to generally use it for power governors and
  per-entity load-balance (in progress).

Thanks to Peter, Vincent, Morten, Nikunj, and others for testing and comments.

Rewinding some context since I've been buried with internal work and it's been
a while since my last posting:

Background:
----------
We currently track the load average at the parenting cfs_rq level. We divide
execution into a series of consecutive "windows", w_i.  Within each we track:
  \Sum load_i * t_i, where \Sum t_i = w and each load_i is a disjoint load level.

The load average is then \Sum w_j / 2^n.

This works reasonably well but there are a few problems:
1) Since decay occurs at window boundaries there are 'skews':
At a window boundary the 'foreground' time has a bias against the
time immediately preceding it (as a result of the folding division)
e.g.  __xx_|_yyyy___ vs __xx_yyyy_|___  (where '|' is a window boundary).

The accounting here is 2x + 4y/2 or 2x + 4y, depending on which side of the
window your load lands on.

2) Everything within a window 'w' is accounted equally; we only fold at
the boundaries.  This actually means you can't set 'w' large enough to
really have meaningful coverage of the sched period without throwing
decay out the window.  But then with w/2 << sched_period (currently
w/2=5ms) the average ends up having fairly drastic swings even with
stable loads.

(Note:  Before we even looked at migrating to per-entity tracking we evaluated
not folding the foreground window into the considered average until it was
"complete"; this represented a measurable improvement in stability under
predictable load.)

3) Since the load sum average is per-cfs_rq and not per-entity, when a task
entity migrates we lose its contribution to the load-sum and effectively double
count it while its former sum decays.

New approach:
-------------
Instead of tracking load on a per-cfs_rq basis we do it on a per-sched_entity
basis.  The load sum for a cfs_rq is then just the sum of its children's load
averages.  The immediately obvious nice property is that when we migrate thread
A from cpu1-->cpu2, we carry its load with it; leaving the global load sum
unmodified.

The 'windows' above are replaced with more fine-grained tracking, that is (for
an entity j):

runnable_j = \Sum u_i * y^i ,  load_avg_j = runnable_j * weight_j [*]

Where: u_i is the usage in the i-th most recent ~1ms period and y is chosen such
that y^k = 1/2, with k chosen to approximate the sched_period (we currently use
k=32).  This means that load tracked one sched_period ago contributes about ~50%
as much as current execution.
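
(A purely illustrative model of what this converges to, using floating point
rather than the kernel's fixed-point arithmetic: a task runnable for 25% of
each ~1ms period settles at a runnable average of 0.25.)

  #include <stdio.h>

  int main(void)
  {
      double y = 0.978572;            /* y such that y^32 ~= 0.5 */
      double runnable = 0, period = 0;

      for (int i = 0; i < 256; i++) { /* 256 periods of 25% usage */
          runnable = runnable * y + 0.25 * 1024;
          period   = period   * y + 1024;
      }
      printf("runnable_avg ~= %.3f\n", runnable / period);  /* -> 0.250 */
      return 0;
  }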

Now, the real challenge in tracking at an entity basis is handling blocked
entities.  Obviously runnable entities will be updated naturally as we iterate
through them but there could be O(many) blocked entities so we can't afford to
iterate through them and update their tracking.

That our decay for a unit period is exponential introduces a particularly nice
property here:
We can separate the contributed load on a cfs_rq into blocked and runnable.
Runnable load is just the sum of all load_avg_j above, maintained by the
entities themselves as they run and self update (when they update their
load_avg they update the cumulative sum also).

Blocked load then looks like:
  load_avg_j = weight_j * \Sum u[j]_n * y^n

To account an entirely idle period we then only need to multiply by y.

This ends up being orders of magnitude more accurate than the current
tracking schema, even with the old shares_window massively dilated to
better capture a stable load-state.  It's also obviously stable under
migration.

As referenced above this also allows us to potentially improve decisions within
the load-balancer, for both distribution and power-management.

Example: consider 1x80% task  and 2x40% tasks on a 2-core machine.  It's
currently a bit of a gamble as to whether you get an {AB, B} or {A, BB} split
since they have equal weight (assume 1024).  With per-task tracking we can
actually consider them at their contributed weight and see a stable ~{800,{400,
400}} load-split.  Likewise within balance_tasks we can consider the load
migrated to be that actually contributed.  We have some patches looking at this
and will post them as they mature.

Thanks,

- Paul

---

Ben Segall (1):
      sched: maintain per-rq runnable averages

Paul Turner (15):
      sched: track the runnable average on a per-task entity basis
      sched: aggregate load contributed by task entities on parenting cfs_rq
      sched: maintain the load contribution of blocked entities
      sched: add an rq migration call-back to sched_class
      sched: account for blocked load waking back up
      sched: aggregate total task_group load
      sched: compute load contribution by a group entity
      sched: normalize tg load contributions against runnable time
      sched: maintain runnable averages across throttled periods
      sched: replace update_shares weight distribution with per-entity computation
      sched: refactor update_shares_cpu() -> update_blocked_avgs()
      sched: update_cfs_shares at period edge
      sched: make __update_entity_runnable_avg() fast
      sched: implement usage tracking
      sched: introduce temporary FAIR_GROUP_SCHED dependency for load-tracking


 include/linux/sched.h |   18 +
 kernel/sched/core.c   |    5 
 kernel/sched/debug.c  |   39 ++
 kernel/sched/fair.c   |  793 +++++++++++++++++++++++++++++++++++++++----------
 kernel/sched/sched.h  |   60 ++--
 5 files changed, 720 insertions(+), 195 deletions(-)

-- 
Signature



* [PATCH 02/16] sched: maintain per-rq runnable averages
  2012-06-28  2:24 [PATCH 00/16] Series short description Paul Turner
  2012-06-28  2:24 ` [PATCH 09/16] sched: normalize tg load contributions against runnable time Paul Turner
  2012-06-28  2:24 ` [PATCH 06/16] sched: account for blocked load waking back up Paul Turner
@ 2012-06-28  2:24 ` Paul Turner
  2012-06-28  2:24 ` [PATCH 01/16] sched: track the runnable average on a per-task entity basis Paul Turner
                   ` (12 subsequent siblings)
  15 siblings, 0 replies; 46+ messages in thread
From: Paul Turner @ 2012-06-28  2:24 UTC (permalink / raw)
  To: linux-kernel
  Cc: Venki Pallipadi, Srivatsa Vaddagiri, Vincent Guittot,
	Peter Zijlstra, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

From: Ben Segall <bsegall@google.com>

Since runqueues do not have a corresponding sched_entity we instead embed a
sched_avg structure directly.

Signed-off-by: Ben Segall <bsegall@google.com>
Signed-off-by: Paul Turner <pjt@google.com>
---
 kernel/sched/debug.c |   10 ++++++++--
 kernel/sched/fair.c  |   18 ++++++++++++++++--
 kernel/sched/sched.h |    2 ++
 3 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index cd5ef23..5d4a7dd 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -61,14 +61,20 @@ static unsigned long nsec_low(unsigned long long nsec)
 static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
 {
 	struct sched_entity *se = tg->se[cpu];
-	if (!se)
-		return;
 
 #define P(F) \
 	SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)F)
 #define PN(F) \
 	SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
 
+	if (!se) {
+		struct sched_avg *avg = &cpu_rq(cpu)->avg;
+		P(avg->runnable_avg_sum);
+		P(avg->runnable_avg_period);
+		return;
+	}
+
+
 	PN(se->exec_start);
 	PN(se->vruntime);
 	PN(se->sum_exec_runtime);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 864a122..08bd3e0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1091,8 +1091,14 @@ static inline void update_entity_load_avg(struct sched_entity *se)
 	__update_entity_runnable_avg(rq_of(cfs_rq_of(se))->clock_task, &se->avg,
 				     se->on_rq);
 }
+
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
+{
+	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
+}
 #else
 static inline void update_entity_load_avg(struct sched_entity *se) {}
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
 #endif
 
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -2344,8 +2350,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		update_cfs_shares(cfs_rq);
 	}
 
-	if (!se)
+	if (!se) {
+		update_rq_runnable_avg(rq, rq->nr_running);
 		inc_nr_running(rq);
+	}
 	hrtick_update(rq);
 }
 
@@ -2403,8 +2411,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		update_cfs_shares(cfs_rq);
 	}
 
-	if (!se)
+	if (!se) {
 		dec_nr_running(rq);
+		update_rq_runnable_avg(rq, 1);
+	}
 	hrtick_update(rq);
 }
 
@@ -4732,6 +4742,8 @@ void idle_balance(int this_cpu, struct rq *this_rq)
 	if (this_rq->avg_idle < sysctl_sched_migration_cost)
 		return;
 
+	update_rq_runnable_avg(this_rq, 1);
+
 	/*
 	 * Drop the rq->lock, but keep IRQ/preempt disabled.
 	 */
@@ -5230,6 +5242,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		cfs_rq = cfs_rq_of(se);
 		entity_tick(cfs_rq, se, queued);
 	}
+
+	update_rq_runnable_avg(rq, 1);
 }
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4134d37..bfdb119 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -469,6 +469,8 @@ struct rq {
 #ifdef CONFIG_SMP
 	struct llist_head wake_list;
 #endif
+
+	struct sched_avg avg;
 };
 
 static inline struct list_head *offnode_tasks(struct rq *rq)




* [PATCH 05/16] sched: add an rq migration call-back to sched_class
  2012-06-28  2:24 [PATCH 00/16] Series short description Paul Turner
                   ` (5 preceding siblings ...)
  2012-06-28  2:24 ` [PATCH 07/16] sched: aggregate total task_group load Paul Turner
@ 2012-06-28  2:24 ` Paul Turner
  2012-06-29  1:32   ` Namhyung Kim
  2012-06-28  2:24 ` [PATCH 08/16] sched: compute load contribution by a group entity Paul Turner
                   ` (8 subsequent siblings)
  15 siblings, 1 reply; 46+ messages in thread
From: Paul Turner @ 2012-06-28  2:24 UTC (permalink / raw)
  To: linux-kernel
  Cc: Venki Pallipadi, Srivatsa Vaddagiri, Vincent Guittot,
	Peter Zijlstra, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

Since we are now doing bottom up load accumulation we need explicit
notification when a task has been re-parented so that the old hierarchy can be
updated.

Adds migrate_task_rq(struct task_struct *p, int next_cpu);

(The alternative is to do this out of __set_task_cpu, but it was suggested that
this would be a cleaner encapsulation.)

Signed-off-by: Paul Turner <pjt@google.com>
---
 include/linux/sched.h |    1 +
 kernel/sched/core.c   |    2 ++
 kernel/sched/fair.c   |   12 ++++++++++++
 3 files changed, 15 insertions(+), 0 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 842c4df..fdfdfab 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1102,6 +1102,7 @@ struct sched_class {
 
 #ifdef CONFIG_SMP
 	int  (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
+	void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
 
 	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
 	void (*post_schedule) (struct rq *this_rq);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index aeb8e56..c3686eb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1109,6 +1109,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	trace_sched_migrate_task(p, new_cpu);
 
 	if (task_cpu(p) != new_cpu) {
+		if (p->sched_class->migrate_task_rq)
+			p->sched_class->migrate_task_rq(p, new_cpu);
 		p->se.nr_migrations++;
 		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
 	}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6200d20..33f582a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3089,6 +3089,17 @@ unlock:
 
 	return new_cpu;
 }
+
+/*
+ * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
+ * cfs_rq_of(p) references at time of call are still valid and identify the
+ * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
+ * other assumptions, including rq->lock state, should be made.
+ */
+static void
+migrate_task_rq_fair(struct task_struct *p, int next_cpu) {
+}
 #endif /* CONFIG_SMP */
 
 static unsigned long
@@ -5754,6 +5765,7 @@ const struct sched_class fair_sched_class = {
 
 #ifdef CONFIG_SMP
 	.select_task_rq		= select_task_rq_fair,
+	.migrate_task_rq	= migrate_task_rq_fair,
 
 	.rq_online		= rq_online_fair,
 	.rq_offline		= rq_offline_fair,




* [PATCH 06/16] sched: account for blocked load waking back up
  2012-06-28  2:24 [PATCH 00/16] Series short description Paul Turner
  2012-06-28  2:24 ` [PATCH 09/16] sched: normalize tg load contributions against runnable time Paul Turner
@ 2012-06-28  2:24 ` Paul Turner
  2012-06-28  2:24 ` [PATCH 02/16] sched: maintain per-rq runnable averages Paul Turner
                   ` (13 subsequent siblings)
  15 siblings, 0 replies; 46+ messages in thread
From: Paul Turner @ 2012-06-28  2:24 UTC (permalink / raw)
  To: linux-kernel
  Cc: Venki Pallipadi, Srivatsa Vaddagiri, Vincent Guittot,
	Peter Zijlstra, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

When a running entity blocks we migrate its tracked load to
cfs_rq->blocked_load_avg.  In the sleep case this occurs while holding
rq->lock and so is a natural transition.  Wake-ups however, are potentially
asynchronous in the presence of migration and so special care must be taken.

We use an atomic counter to track such migrated load, taking care to match this
with the previously introduced decay counters so that we don't migrate too much
load.
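
A minimal sketch of the hand-off pattern, using C11 atomics in userspace purely
for illustration (names simplified, not the kernel code): the migrating side
only adds to an atomic accumulator of removed load; the side holding the queue
lock later exchanges it to zero and folds it out of the blocked sum.

  #include <stdatomic.h>
  #include <stdio.h>

  static _Atomic unsigned long long removed_load;
  static unsigned long long blocked_load_avg = 4096;

  /* remote cpu, no rq->lock held: just record what left */
  static void migrate_away(unsigned long long contrib)
  {
      atomic_fetch_add(&removed_load, contrib);
  }

  /* owning cpu, under rq->lock: fold the accumulated removals in */
  static void update_blocked(void)
  {
      unsigned long long removed = atomic_exchange(&removed_load, 0);

      blocked_load_avg -= (removed < blocked_load_avg) ?
                          removed : blocked_load_avg;
  }

  int main(void)
  {
      migrate_away(1024);
      migrate_away(512);
      update_blocked();
      printf("blocked_load_avg=%llu\n", blocked_load_avg);  /* 2560 */
      return 0;
  }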

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Ben Segall <bsegall@google.com>
---
 kernel/sched/fair.c  |   94 ++++++++++++++++++++++++++++++++++++++++----------
 kernel/sched/sched.h |    2 +
 2 files changed, 77 insertions(+), 19 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 33f582a..ea3d4f5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1086,17 +1086,19 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
 }
 
 /* Synchronize an entity's decay with its parenting cfs_rq. */
-static inline void __synchronize_entity_decay(struct sched_entity *se)
+static inline u64 __synchronize_entity_decay(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	u64 decays = atomic64_read(&cfs_rq->decay_counter);
 
 	decays -= se->avg.decay_count;
 	if (!decays)
-		return;
+		return 0;
 
 	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
 	se->avg.decay_count += decays;
+
+	return decays;
 }
 
 /* Compute the current contribution to load_avg by se, return any delta */
@@ -1149,20 +1151,26 @@ static inline void update_entity_load_avg(struct sched_entity *se,
  * Decay the load contributed by all blocked children and account this so that
  * their contribution may be appropriately discounted when they wake up.
  */
-static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq)
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 {
 	u64 now = rq_of(cfs_rq)->clock_task >> 20;
 	u64 decays;
 
 	decays = now - cfs_rq->last_decay;
-	if (!decays)
+	if (!decays && !force_update)
 		return;
 
-	cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
-					      decays);
-	atomic64_add(decays, &cfs_rq->decay_counter);
+	if (atomic64_read(&cfs_rq->removed_load)) {
+		u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
+		subtract_blocked_load_contrib(cfs_rq, removed_load);
+	}
 
-	cfs_rq->last_decay = now;
+	if (decays) {
+		cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
+						      decays);
+		atomic64_add(decays, &cfs_rq->decay_counter);
+		cfs_rq->last_decay = now;
+	}
 }
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
@@ -1175,21 +1183,41 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
 						  struct sched_entity *se,
 						  int wakeup)
 {
-	/* we track migrations using entity decay_count == 0 */
-	if (unlikely(!se->avg.decay_count)) {
+	/*
+	 * We track migrations using entity decay_count <= 0, on a wake-up
+	 * migration we use a negative decay count to track the remote decays
+	 * accumulated while sleeping.
+	 */
+	if (unlikely(se->avg.decay_count <= 0)) {
 		se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
+		if (se->avg.decay_count) {
+			/*
+			 * In a wake-up migration we have to approximate the
+			 * time sleeping.  This is because we can't synchronize
+			 * clock_task between the two cpus, and it is not
+			 * guaranteed to be read-safe.  Instead, we can
+			 * approximate this using our carried decays, which are
+			 * explicitly atomically readable.
+			 */
+			se->avg.last_runnable_update -= (-se->avg.decay_count)
+							<< 20;
+			update_entity_load_avg(se, 0);
+		}
 		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
 		wakeup = 0;
 	} else {
 		__synchronize_entity_decay(se);
 	}
 
-	if (wakeup)
+	/* migrated tasks did not contribute to our blocked load */
+	if (wakeup) {
 		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
+		update_entity_load_avg(se, 0);
+	}
 
-	update_entity_load_avg(se, 0);
 	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
-	update_cfs_rq_blocked_load(cfs_rq);
+	/* we force update consideration on load-balancer moves */
+	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
 }
 
 /*
@@ -1202,6 +1230,8 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
 						  int sleep)
 {
 	update_entity_load_avg(se, 1);
+	/* we force update consideration on load-balancer moves */
+	update_cfs_rq_blocked_load(cfs_rq, !sleep);
 
 	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
 	if (sleep) {
@@ -1221,7 +1251,8 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
 					   struct sched_entity *se,
 					   int sleep) {}
-static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq) {}
+static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
+					      int force_update) {}
 #endif
 
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -1613,7 +1644,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 * Ensure that runnable average is periodically updated.
 	 */
 	update_entity_load_avg(curr, 1);
-	update_cfs_rq_blocked_load(cfs_rq);
+	update_cfs_rq_blocked_load(cfs_rq, 1);
 
 	/*
 	 * Update share accounting for long-running entities.
@@ -3099,7 +3130,21 @@ unlock:
  */
 static void
 migrate_task_rq_fair(struct task_struct *p, int next_cpu) {
+	struct sched_entity *se = &p->se;
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+	/*
+	 * Load tracking: accumulate removed load so that it can be processed
+	 * when we next update owning cfs_rq under rq->lock.  Tasks contribute
+	 * to blocked load iff they have a non-zero decay-count.
+	 */
+	if (se->avg.decay_count) {
+		se->avg.decay_count = -__synchronize_entity_decay(se);
+		atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
+	}
 }
+
+
 #endif /* CONFIG_SMP */
 
 static unsigned long
@@ -3651,7 +3696,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
 
 	update_rq_clock(rq);
 	update_cfs_load(cfs_rq, 1);
-	update_cfs_rq_blocked_load(cfs_rq);
+	update_cfs_rq_blocked_load(cfs_rq, 1);
 
 	/*
 	 * We need to update shares after updating tg->load_weight in
@@ -5537,12 +5582,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 #endif
 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
 	atomic64_set(&cfs_rq->decay_counter, 1);
+	atomic64_set(&cfs_rq->removed_load, 0);
 #endif
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void task_move_group_fair(struct task_struct *p, int on_rq)
 {
+	struct cfs_rq *cfs_rq;
 	/*
 	 * If the task was not on the rq at the time of this cgroup movement
 	 * it must have been asleep, sleeping tasks keep their ->vruntime
@@ -5574,8 +5621,19 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
 	if (!on_rq)
 		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
 	set_task_rq(p, task_cpu(p));
-	if (!on_rq)
-		p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
+	if (!on_rq) {
+		cfs_rq = cfs_rq_of(&p->se);
+		p->se.vruntime += cfs_rq->min_vruntime;
+#ifdef CONFIG_SMP
+		/*
+		 * set_task_rq() will have removed our previous contribution,
+		 * but we must synchronize explicitly against further decay
+		 * here.
+		 */
+		p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+		cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
+#endif
+	}
 }
 
 void free_fair_sched_group(struct task_group *tg)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a96adf1..ca1003d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -230,7 +230,7 @@ struct cfs_rq {
 	 * the FAIR_GROUP_SCHED case).
 	 */
 	u64 runnable_load_avg, blocked_load_avg;
-	atomic64_t decay_counter;
+	atomic64_t decay_counter, removed_load;
 	u64 last_decay;
 #endif
 #ifdef CONFIG_FAIR_GROUP_SCHED




* [PATCH 09/16] sched: normalize tg load contributions against runnable time
  2012-06-28  2:24 [PATCH 00/16] Series short description Paul Turner
@ 2012-06-28  2:24 ` Paul Turner
  2012-06-29  7:26   ` Namhyung Kim
                     ` (2 more replies)
  2012-06-28  2:24 ` [PATCH 06/16] sched: account for blocked load waking back up Paul Turner
                   ` (14 subsequent siblings)
  15 siblings, 3 replies; 46+ messages in thread
From: Paul Turner @ 2012-06-28  2:24 UTC (permalink / raw)
  To: linux-kernel
  Cc: Venki Pallipadi, Srivatsa Vaddagiri, Vincent Guittot,
	Peter Zijlstra, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

Entities of equal weight should receive equitable distribution of cpu time.
This is challenging in the case of a task_group's shares as execution may be
occurring on multiple cpus simultaneously.

To handle this we divide up the shares into weights proportionate with the load
on each cfs_rq.  This does not, however, account for the fact that the sum of
the parts may be less than one cpu and so we need to normalize:
  load(tg) = min(runnable_avg(tg), 1) * tg->shares
Where runnable_avg is the aggregate time in which the task_group had runnable
children.
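
A worked illustration with made-up numbers: with tg->shares = 1024 and the
group runnable only 50% of the time globally (runnable_avg = 2048 as a fraction
of 2^12), the correction scales a would-be contribution of 1024 down to 512,
i.e. what a single 50%-runnable task of equal weight would contribute:

  #include <stdio.h>

  int main(void)
  {
      unsigned long shares = 1024;
      unsigned long contrib = shares;     /* assume this cfs_rq owns all tg load */
      unsigned long runnable_avg = 2048;  /* 50%, as a fraction of 2^12 */

      if (runnable_avg < (1 << 12)) {
          contrib *= runnable_avg;
          contrib /= (1 << 12);
      }
      printf("load contribution = %lu\n", contrib);  /* 512 */
      return 0;
  }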

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Ben Segall <bsegall@google.com>
---
 kernel/sched/debug.c |    4 ++++
 kernel/sched/fair.c  |   39 +++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h |    2 ++
 3 files changed, 45 insertions(+), 0 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 9268fb7..9334c68 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -237,6 +237,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			atomic64_read(&cfs_rq->tg->load_avg));
 	SEQ_printf(m, "  .%-30s: %lld\n", "tg_load_contrib",
 			cfs_rq->tg_load_contrib);
+	SEQ_printf(m, "  .%-30s: %d\n", "tg_runnable_contrib",
+			cfs_rq->tg_runnable_contrib);
+	SEQ_printf(m, "  .%-30s: %d\n", "tg->runnable_avg",
+			atomic_read(&cfs_rq->tg->runnable_avg));
 #endif
 
 	print_cfs_group_stats(m, cpu, cfs_rq->tg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a416296..91d0b21 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1117,19 +1117,56 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
 	}
 }
 
+/*
+ * Aggregate cfs_rq runnable averages into an equivalent task_group
+ * representation for computing load contributions.
+ */
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+						  struct cfs_rq *cfs_rq)
+{
+	struct task_group *tg = cfs_rq->tg;
+	long contrib;
+
+	contrib = div_u64(sa->runnable_avg_sum << 12,
+			  sa->runnable_avg_period + 1);
+	contrib -= cfs_rq->tg_runnable_contrib;
+
+	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
+		atomic_add(contrib, &tg->runnable_avg);
+		cfs_rq->tg_runnable_contrib += contrib;
+	}
+}
+
 static inline void __update_group_entity_contrib(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq = group_cfs_rq(se);
 	struct task_group *tg = cfs_rq->tg;
+	int runnable_avg;
+
 	u64 contrib;
 
 	contrib = cfs_rq->tg_load_contrib * tg->shares;
 	se->avg.load_avg_contrib = div64_u64(contrib,
 					     atomic64_read(&tg->load_avg) + 1);
+
+	/*
+	 * Unlike a task-entity, a group entity may be using >=1 cpu globally.
+	 * However, in the case that it's using <1 cpu we need to form a
+	 * correction term so that we contribute the same load as a task of
+	 * equal weight. (Global runnable time is taken as a fraction over
+	 * 2^12.)
+	 */
+	runnable_avg = atomic_read(&tg->runnable_avg);
+	if (runnable_avg < (1<<12)) {
+		se->avg.load_avg_contrib *= runnable_avg;
+		se->avg.load_avg_contrib /= (1<<12);
+	}
 }
 #else
 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
 						 int force_update) {}
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+						  struct cfs_rq *cfs_rq) {}
 static inline void __update_group_entity_contrib(struct sched_entity *se) {}
 #endif
 
@@ -1151,6 +1188,7 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se)
 	if (entity_is_task(se)) {
 		__update_task_entity_contrib(se);
 	} else {
+		__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
 		__update_group_entity_contrib(se);
 	}
 
@@ -1219,6 +1257,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 {
 	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
+	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
 }
 
 /* Add the load generated by se into cfs_rq's child load-average */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4d3b3ad..b48bbd7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -113,6 +113,7 @@ struct task_group {
 
 	atomic_t load_weight;
 	atomic64_t load_avg;
+	atomic_t runnable_avg;
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -234,6 +235,7 @@ struct cfs_rq {
 	atomic64_t decay_counter, removed_load;
 	u64 last_decay;
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	u32 tg_runnable_contrib;
 	u64 tg_load_contrib;
 #endif
 #endif




* [PATCH 08/16] sched: compute load contribution by a group entity
  2012-06-28  2:24 [PATCH 00/16] Series short description Paul Turner
                   ` (6 preceding siblings ...)
  2012-06-28  2:24 ` [PATCH 05/16] sched: add an rq migration call-back to sched_class Paul Turner
@ 2012-06-28  2:24 ` Paul Turner
  2012-06-28  2:24 ` [PATCH 03/16] sched: aggregate load contributed by task entities on parenting cfs_rq Paul Turner
                   ` (7 subsequent siblings)
  15 siblings, 0 replies; 46+ messages in thread
From: Paul Turner @ 2012-06-28  2:24 UTC (permalink / raw)
  To: linux-kernel
  Cc: Venki Pallipadi, Srivatsa Vaddagiri, Vincent Guittot,
	Peter Zijlstra, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

Unlike task entities, which have a fixed weight, group entities instead own a
fraction of their parenting task_group's shares as their contributed weight.

Compute this fraction so that we can correctly account hierarchies and shared
entity nodes.
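
For illustration with made-up numbers: a group entity whose cfs_rq has
contributed one quarter of the task_group's tracked load ends up with roughly
one quarter of tg->shares as its own contributed weight:

  #include <stdio.h>
  #include <stdint.h>

  int main(void)
  {
      uint64_t tg_shares = 1024;
      uint64_t tg_load_avg = 4096;          /* sum over all cpus */
      uint64_t cfs_rq_load_contrib = 1024;  /* this cpu's portion of it */

      uint64_t contrib = cfs_rq_load_contrib * tg_shares;
      uint64_t load_avg_contrib = contrib / (tg_load_avg + 1);

      printf("group entity contribution = %llu\n",
             (unsigned long long)load_avg_contrib);  /* 255, ~tg_shares/4 */
      return 0;
  }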

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Ben Segall <bsegall@google.com>
---
 kernel/sched/fair.c |   33 +++++++++++++++++++++++++++------
 1 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fbb9686..a416296 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1116,22 +1116,43 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
 		cfs_rq->tg_load_contrib += tg_contrib;
 	}
 }
+
+static inline void __update_group_entity_contrib(struct sched_entity *se)
+{
+	struct cfs_rq *cfs_rq = group_cfs_rq(se);
+	struct task_group *tg = cfs_rq->tg;
+	u64 contrib;
+
+	contrib = cfs_rq->tg_load_contrib * tg->shares;
+	se->avg.load_avg_contrib = div64_u64(contrib,
+					     atomic64_read(&tg->load_avg) + 1);
+}
 #else
 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
 						 int force_update) {}
+static inline void __update_group_entity_contrib(struct sched_entity *se) {}
 #endif
 
+static inline void __update_task_entity_contrib(struct sched_entity *se)
+{
+	u32 contrib;
+
+	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
+	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
+	contrib /= (se->avg.runnable_avg_period + 1);
+	se->avg.load_avg_contrib = scale_load(contrib);
+}
+
 /* Compute the current contribution to load_avg by se, return any delta */
 static long __update_entity_load_avg_contrib(struct sched_entity *se)
 {
 	long old_contrib = se->avg.load_avg_contrib;
 
-	if (!entity_is_task(se))
-		return 0;
-
-	se->avg.load_avg_contrib = div64_u64(se->avg.runnable_avg_sum *
-					     se->load.weight,
-					     se->avg.runnable_avg_period + 1);
+	if (entity_is_task(se)) {
+		__update_task_entity_contrib(se);
+	} else {
+		__update_group_entity_contrib(se);
+	}
 
 	return se->avg.load_avg_contrib - old_contrib;
 }




* [PATCH 03/16] sched: aggregate load contributed by task entities on parenting cfs_rq
  2012-06-28  2:24 [PATCH 00/16] Series short description Paul Turner
                   ` (7 preceding siblings ...)
  2012-06-28  2:24 ` [PATCH 08/16] sched: compute load contribution by a group entity Paul Turner
@ 2012-06-28  2:24 ` Paul Turner
  2012-06-28  6:33   ` Namhyung Kim
  2012-07-04 15:28   ` Peter Zijlstra
  2012-06-28  2:24 ` [PATCH 11/16] sched: replace update_shares weight distribution with per-entity computation Paul Turner
                   ` (6 subsequent siblings)
  15 siblings, 2 replies; 46+ messages in thread
From: Paul Turner @ 2012-06-28  2:24 UTC (permalink / raw)
  To: linux-kernel
  Cc: Venki Pallipadi, Srivatsa Vaddagiri, Vincent Guittot,
	Peter Zijlstra, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

For a given task t, we can compute its contribution to load as:
  task_load(t) = runnable_avg(t) * weight(t)

On a parenting cfs_rq we can then aggregate
  runnable_load(cfs_rq) = \Sum task_load(t), for all runnable children t

Maintain this bottom up, with task entities adding their contributed load to
the parenting cfs_rq sum.  When a task entity's load changes we add the same
delta to the maintained sum.
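
A small illustrative model of the delta bookkeeping (simplified, not the kernel
code): each entity recomputes its own contribution and applies only the
difference to the parent's running sum, so the sum never needs a full re-walk.

  #include <stdio.h>

  static long runnable_load_avg;  /* per-cfs_rq sum */

  /* recompute one entity's contribution; apply only the delta upward */
  static void update_contrib(long *contrib, long avg_sum, long avg_period,
                             long weight)
  {
      long new_contrib = (long)((long long)avg_sum * weight / (avg_period + 1));

      runnable_load_avg += new_contrib - *contrib;
      *contrib = new_contrib;
  }

  int main(void)
  {
      long contrib = 0;

      update_contrib(&contrib, 512, 1023, 1024);  /* entity ~50% runnable */
      update_contrib(&contrib, 768, 1023, 1024);  /* later, ~75% runnable */
      printf("cfs_rq runnable_load_avg = %ld\n", runnable_load_avg);  /* 768 */
      return 0;
  }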

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Ben Segall <bsegall@google.com>
---
 include/linux/sched.h |    1 +
 kernel/sched/debug.c  |    3 +++
 kernel/sched/fair.c   |   51 +++++++++++++++++++++++++++++++++++++++++++++----
 kernel/sched/sched.h  |   10 +++++++++-
 4 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5bf5c79..0c54ce0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1139,6 +1139,7 @@ struct load_weight {
 struct sched_avg {
 	u32 runnable_avg_sum, runnable_avg_period;
 	u64 last_runnable_update;
+	unsigned long load_avg_contrib;
 };
 
 #ifdef CONFIG_SCHEDSTATS
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 5d4a7dd..aeb74e3 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -94,6 +94,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 #ifdef CONFIG_SMP
 	P(se->avg.runnable_avg_sum);
 	P(se->avg.runnable_avg_period);
+	P(se->avg.load_avg_contrib);
 #endif
 #undef PN
 #undef P
@@ -227,6 +228,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->load_contribution);
 	SEQ_printf(m, "  .%-30s: %d\n", "load_tg",
 			atomic_read(&cfs_rq->tg->load_weight));
+	SEQ_printf(m, "  .%-30s: %lld\n", "runnable_load_avg",
+			cfs_rq->runnable_load_avg);
 #endif
 
 	print_cfs_group_stats(m, cpu, cfs_rq->tg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 08bd3e0..8229766 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1085,20 +1085,63 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
 	return decayed;
 }
 
+/* Compute the current contribution to load_avg by se, return any delta */
+static long __update_entity_load_avg_contrib(struct sched_entity *se)
+{
+	long old_contrib = se->avg.load_avg_contrib;
+
+	if (!entity_is_task(se))
+		return 0;
+
+	se->avg.load_avg_contrib = div64_u64(se->avg.runnable_avg_sum *
+					     se->load.weight,
+					     se->avg.runnable_avg_period + 1);
+
+	return se->avg.load_avg_contrib - old_contrib;
+}
+
 /* Update a sched_entity's runnable average */
 static inline void update_entity_load_avg(struct sched_entity *se)
 {
-	__update_entity_runnable_avg(rq_of(cfs_rq_of(se))->clock_task, &se->avg,
-				     se->on_rq);
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+	long contrib_delta;
+
+	if (!__update_entity_runnable_avg(rq_of(cfs_rq)->clock_task, &se->avg,
+					  se->on_rq))
+		return;
+
+	contrib_delta = __update_entity_load_avg_contrib(se);
+	if (se->on_rq)
+		cfs_rq->runnable_load_avg += contrib_delta;
 }
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 {
 	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
 }
+
+/* Add the load generated by se into cfs_rq's child load-average */
+static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
+						  struct sched_entity *se)
+{
+	update_entity_load_avg(se);
+	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+}
+
+/* Remove se's load from this cfs_rq child load-average */
+static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
+						  struct sched_entity *se)
+{
+	update_entity_load_avg(se);
+	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
+}
 #else
 static inline void update_entity_load_avg(struct sched_entity *se) {}
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
+static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
+						  struct sched_entity *se) {}
+static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
+						  struct sched_entity *se) {}
 #endif
 
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -1227,7 +1270,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 */
 	update_curr(cfs_rq);
 	update_cfs_load(cfs_rq, 0);
-	update_entity_load_avg(se);
+	enqueue_entity_load_avg(cfs_rq, se);
 	account_entity_enqueue(cfs_rq, se);
 	update_cfs_shares(cfs_rq);
 
@@ -1302,7 +1345,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
-	update_entity_load_avg(se);
+	dequeue_entity_load_avg(cfs_rq, se);
 
 	update_stats_dequeue(cfs_rq, se);
 	if (flags & DEQUEUE_SLEEP) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bfdb119..26cc36f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -222,6 +222,15 @@ struct cfs_rq {
 	unsigned int nr_spread_over;
 #endif
 
+#ifdef CONFIG_SMP
+	/*
+	 * CFS Load tracking
+	 * Under CFS, load is tracked on a per-entity basis and aggregated up.
+	 * This allows for the description of both thread and group usage (in
+	 * the FAIR_GROUP_SCHED case).
+	 */
+	u64 runnable_load_avg;
+#endif
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
 
@@ -1204,4 +1213,3 @@ static inline void account_numa_dequeue(struct task_struct *p) { }
 static inline void init_sched_numa(void) { }
 
 #endif /* CONFIG_NUMA */
-




* [PATCH 07/16] sched: aggregate total task_group load
  2012-06-28  2:24 [PATCH 00/16] Series short description Paul Turner
                   ` (4 preceding siblings ...)
  2012-06-28  2:24 ` [PATCH 04/16] sched: maintain the load contribution of blocked entities Paul Turner
@ 2012-06-28  2:24 ` Paul Turner
  2012-06-28  2:24 ` [PATCH 05/16] sched: add an rq migration call-back to sched_class Paul Turner
                   ` (9 subsequent siblings)
  15 siblings, 0 replies; 46+ messages in thread
From: Paul Turner @ 2012-06-28  2:24 UTC (permalink / raw)
  To: linux-kernel
  Cc: Venki Pallipadi, Srivatsa Vaddagiri, Vincent Guittot,
	Peter Zijlstra, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

Maintain a global running sum of the average load seen on each cfs_rq belonging
to each task group so that it may be used in calculating an appropriate
shares:weight distribution.
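
As a rough user-space sketch of the delta-posting idea (not the kernel code;
C11 atomics stand in for atomic64_t, and the names and the 1/8 threshold are
only modeled on the hunk below):

  #include <stdatomic.h>
  #include <stdlib.h>

  struct tg_model { atomic_llong load_avg; };     /* shared across CPUs */

  struct cfs_rq_model {
          long long runnable_load_avg, blocked_load_avg;
          long long tg_load_contrib;              /* what we last posted */
          struct tg_model *tg;
  };

  /* Fold only substantive deltas into the shared sum to limit cacheline
   * bouncing on the global aggregate. */
  static void post_tg_load_contrib(struct cfs_rq_model *cfs_rq, int force)
  {
          long long contrib = cfs_rq->runnable_load_avg +
                              cfs_rq->blocked_load_avg;
          long long delta = contrib - cfs_rq->tg_load_contrib;

          if (force || llabs(delta) > cfs_rq->tg_load_contrib / 8) {
                  atomic_fetch_add(&cfs_rq->tg->load_avg, delta);
                  cfs_rq->tg_load_contrib += delta;
          }
  }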

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Ben Segall <bsegall@google.com>
---
 kernel/sched/debug.c |    4 ++++
 kernel/sched/fair.c  |   22 ++++++++++++++++++++++
 kernel/sched/sched.h |    4 ++++
 3 files changed, 30 insertions(+), 0 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2aa60cf..9268fb7 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -233,6 +233,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->runnable_load_avg);
 	SEQ_printf(m, "  .%-30s: %lld\n", "blocked_load_avg",
 			cfs_rq->blocked_load_avg);
+	SEQ_printf(m, "  .%-30s: %ld\n", "tg_load_avg",
+			atomic64_read(&cfs_rq->tg->load_avg));
+	SEQ_printf(m, "  .%-30s: %lld\n", "tg_load_contrib",
+			cfs_rq->tg_load_contrib);
 #endif
 
 	print_cfs_group_stats(m, cpu, cfs_rq->tg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ea3d4f5..fbb9686 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1101,6 +1101,26 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
 	return decays;
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
+						 int force_update)
+{
+	struct task_group *tg = cfs_rq->tg;
+	s64 tg_contrib;
+
+	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
+	tg_contrib -= cfs_rq->tg_load_contrib;
+
+	if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
+		atomic64_add(tg_contrib, &tg->load_avg);
+		cfs_rq->tg_load_contrib += tg_contrib;
+	}
+}
+#else
+static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
+						 int force_update) {}
+#endif
+
 /* Compute the current contribution to load_avg by se, return any delta */
 static long __update_entity_load_avg_contrib(struct sched_entity *se)
 {
@@ -1171,6 +1191,8 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 		atomic64_add(decays, &cfs_rq->decay_counter);
 		cfs_rq->last_decay = now;
 	}
+
+	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
 }
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ca1003d..4d3b3ad 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -112,6 +112,7 @@ struct task_group {
 	unsigned long shares;
 
 	atomic_t load_weight;
+	atomic64_t load_avg;
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -232,6 +233,9 @@ struct cfs_rq {
 	u64 runnable_load_avg, blocked_load_avg;
 	atomic64_t decay_counter, removed_load;
 	u64 last_decay;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	u64 tg_load_contrib;
+#endif
 #endif
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */




* [PATCH 12/16] sched: refactor update_shares_cpu() -> update_blocked_avgs()
  2012-06-28  2:24 [PATCH 00/16] Series short description Paul Turner
                   ` (10 preceding siblings ...)
  2012-06-28  2:24 ` [PATCH 16/16] sched: introduce temporary FAIR_GROUP_SCHED dependency for load-tracking Paul Turner
@ 2012-06-28  2:24 ` Paul Turner
  2012-06-29  7:28   ` Namhyung Kim
  2012-07-05 11:58   ` Peter Zijlstra
  2012-06-28  2:24 ` [PATCH 15/16] sched: implement usage tracking Paul Turner
                   ` (3 subsequent siblings)
  15 siblings, 2 replies; 46+ messages in thread
From: Paul Turner @ 2012-06-28  2:24 UTC (permalink / raw)
  To: linux-kernel
  Cc: Venki Pallipadi, Srivatsa Vaddagiri, Vincent Guittot,
	Peter Zijlstra, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

Now that running entities maintain their own load-averages, the work we must
do in update_shares() is largely restricted to the periodic decay of blocked
entities.  This allows us to be a little less pessimistic regarding our
occupancy of rq->lock and the associated rq->clock updates required.
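
The lock-break pattern used below, as a stand-alone sketch (a pthread mutex
standing in for rq->lock; the batch size of 20 simply mirrors the value the
patch uses):

  #include <pthread.h>

  static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;

  /* process() stands in for __update_blocked_averages_cpu() */
  static void walk_with_lock_breaks(void (*process)(int idx), int nr)
  {
          int i;

          pthread_mutex_lock(&rq_lock);
          for (i = 0; i < nr; i++) {
                  process(i);

                  /* periodically drop the lock so contenders can run */
                  if (i % 20 == 0) {
                          pthread_mutex_unlock(&rq_lock);
                          pthread_mutex_lock(&rq_lock);
                  }
          }
          pthread_mutex_unlock(&rq_lock);
  }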

Signed-off-by: Paul Turner <pjt@google.com>
---
 kernel/sched/fair.c |   59 +++++++++++++++++++++++++++++----------------------
 1 files changed, 33 insertions(+), 26 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4a9a828..dd1ef8a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3678,23 +3678,20 @@ out:
 /*
  * update tg->load_weight by folding this cpu's load_avg
  */
-static int update_shares_cpu(struct task_group *tg, int cpu)
+static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
 {
-	struct sched_entity *se;
-	struct cfs_rq *cfs_rq;
-	unsigned long flags;
-	struct rq *rq;
-
-
-	rq = cpu_rq(cpu);
-	se = tg->se[cpu];
-	cfs_rq = tg->cfs_rq[cpu];
+	struct sched_entity *se = tg->se[cpu];
+	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
 
-	raw_spin_lock_irqsave(&rq->lock, flags);
+	/* throttled entities do not contribute to load */
+	if (throttled_hierarchy(cfs_rq))
+		return;
 
-	update_rq_clock(rq);
 	update_cfs_rq_blocked_load(cfs_rq, 1);
-	update_entity_load_avg(tg->se[cpu], 1);
+	if (se)
+		update_entity_load_avg(se, 1);
+	else
+		update_rq_runnable_avg(rq_of(cfs_rq), 1);
 
 	if (se) {
 		/*
@@ -3707,29 +3704,39 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
 		else
 			list_del_leaf_cfs_rq(cfs_rq);
 	}
-
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
-
-	return 0;
 }
 
-static void update_shares(int cpu)
+static void update_blocked_averages(int cpu)
 {
-	struct cfs_rq *cfs_rq;
 	struct rq *rq = cpu_rq(cpu);
+	struct cfs_rq *cfs_rq;
+
+	unsigned long flags;
+	int num_updates = 0;
 
 	rcu_read_lock();
+	raw_spin_lock_irqsave(&rq->lock, flags);
+	update_rq_clock(rq);
 	/*
 	 * Iterates the task_group tree in a bottom up fashion, see
 	 * list_add_leaf_cfs_rq() for details.
 	 */
 	for_each_leaf_cfs_rq(rq, cfs_rq) {
-		/* throttled entities do not contribute to load */
-		if (throttled_hierarchy(cfs_rq))
-			continue;
+		__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
 
-		update_shares_cpu(cfs_rq->tg, cpu);
+		/*
+		 * Periodically release the lock so that a cfs_rq with many
+		 * children cannot hold it for an arbitrary period of time.
+		 */
+		if (num_updates++ % 20 == 0) {
+			raw_spin_unlock_irqrestore(&rq->lock, flags);
+			cpu_relax();
+			raw_spin_lock_irqsave(&rq->lock, flags);
+			update_rq_clock(rq);
+		}
 	}
+
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
 	rcu_read_unlock();
 }
 
@@ -3774,7 +3781,7 @@ unsigned long task_h_load(struct task_struct *p)
 	return load;
 }
 #else
-static inline void update_shares(int cpu)
+static inline void update_blocked_averages(int cpu)
 {
 }
 
@@ -4936,7 +4943,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
 	 */
 	raw_spin_unlock(&this_rq->lock);
 
-	update_shares(this_cpu);
+	update_blocked_averages(this_cpu);
 	rcu_read_lock();
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
@@ -5196,7 +5203,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 	int update_next_balance = 0;
 	int need_serialize;
 
-	update_shares(cpu);
+	update_blocked_averages(cpu);
 
 	rcu_read_lock();
 	for_each_domain(cpu, sd) {




* [PATCH 15/16] sched: implement usage tracking
  2012-06-28  2:24 [PATCH 00/16] Series short description Paul Turner
                   ` (11 preceding siblings ...)
  2012-06-28  2:24 ` [PATCH 12/16] sched: refactor update_shares_cpu() -> update_blocked_avgs() Paul Turner
@ 2012-06-28  2:24 ` Paul Turner
  2012-06-28  2:24 ` [PATCH 13/16] sched: update_cfs_shares at period edge Paul Turner
                   ` (2 subsequent siblings)
  15 siblings, 0 replies; 46+ messages in thread
From: Paul Turner @ 2012-06-28  2:24 UTC (permalink / raw)
  To: linux-kernel
  Cc: Venki Pallipadi, Srivatsa Vaddagiri, Vincent Guittot,
	Peter Zijlstra, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

With the framework for runnable tracking now fully in place, per-entity usage
tracking is a simple and low-overhead addition.
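
A minimal model of how the usage sum rides alongside the existing runnable
sum (field names mirror the patch, everything else is a made-up user-space
model with no decay; "runnable" means the entity was on the runqueue for the
interval, "running" means it actually occupied the cpu):

  #include <stdint.h>

  struct sched_avg_model {
          uint32_t runnable_avg_sum;      /* time spent runnable */
          uint32_t usage_avg_sum;         /* time spent actually running */
          uint32_t runnable_avg_period;   /* total time tracked */
  };

  static void accumulate(struct sched_avg_model *sa, uint32_t delta_us,
                         int runnable, int running)
  {
          if (runnable)
                  sa->runnable_avg_sum += delta_us;
          if (running)
                  sa->usage_avg_sum += delta_us;
          sa->runnable_avg_period += delta_us;
  }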

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Ben Segall <bsegall@google.com>
---
 include/linux/sched.h |    1 +
 kernel/sched/debug.c  |    3 +++
 kernel/sched/fair.c   |   33 ++++++++++++++++++++++++++++-----
 kernel/sched/sched.h  |    4 ++--
 4 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index fdfdfab..a65c097 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1142,6 +1142,7 @@ struct sched_avg {
 	u64 last_runnable_update;
 	s64 decay_count;
 	unsigned long load_avg_contrib;
+	u32 usage_avg_sum;
 };
 
 #ifdef CONFIG_SCHEDSTATS
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index fbd1517..e8f8d51 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -94,6 +94,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 #ifdef CONFIG_SMP
 	P(se->avg.runnable_avg_sum);
 	P(se->avg.runnable_avg_period);
+	P(se->avg.usage_avg_sum);
 	P(se->avg.load_avg_contrib);
 	P(se->avg.decay_count);
 #endif
@@ -233,6 +234,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->tg_runnable_contrib);
 	SEQ_printf(m, "  .%-30s: %d\n", "tg->runnable_avg",
 			atomic_read(&cfs_rq->tg->runnable_avg));
+	SEQ_printf(m, "  .%-30s: %d\n", "tg->usage_avg",
+			atomic_read(&cfs_rq->tg->usage_avg));
 #endif
 
 	print_cfs_group_stats(m, cpu, cfs_rq->tg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 89c353c..3d1aaa8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -996,7 +996,8 @@ static u32 __compute_runnable_contrib(int n)
  */
 static __always_inline int __update_entity_runnable_avg(u64 now,
 							struct sched_avg *sa,
-							int runnable)
+							int runnable,
+							int running)
 {
 	u64 delta, periods;
 	u32 runnable_contrib;
@@ -1035,6 +1036,8 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
 		delta_w = 1024 - delta_w;
 		if (runnable)
 			sa->runnable_avg_sum += delta_w;
+		if (running)
+			sa->usage_avg_sum += delta_w;
 		sa->runnable_avg_period += delta_w;
 
 		delta -= delta_w;
@@ -1047,17 +1050,22 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
 						  periods + 1);
 		sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
 						     periods + 1);
+		sa->usage_avg_sum = decay_load(sa->usage_avg_sum, periods + 1);
 
 		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
 		runnable_contrib = __compute_runnable_contrib(periods);
 		if (runnable)
 			sa->runnable_avg_sum += runnable_contrib;
+		if (running)
+			sa->usage_avg_sum += runnable_contrib;
 		sa->runnable_avg_period += runnable_contrib;
 	}
 
 	/* Remainder of delta accrued against u_0` */
 	if (runnable)
 		sa->runnable_avg_sum += delta;
+	if (running)
+		sa->usage_avg_sum += delta;
 	sa->runnable_avg_period += delta;
 
 	return decayed;
@@ -1103,15 +1111,27 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,
 						  struct cfs_rq *cfs_rq)
 {
 	struct task_group *tg = cfs_rq->tg;
-	long contrib;
+	long contrib, usage_contrib;
 
 	contrib = div_u64(sa->runnable_avg_sum << 12,
 			  sa->runnable_avg_period + 1);
 	contrib -= cfs_rq->tg_runnable_contrib;
 
-	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
+	usage_contrib = div_u64(sa->usage_avg_sum << 12,
+			        sa->runnable_avg_period + 1);
+	usage_contrib -= cfs_rq->tg_usage_contrib;
+
+	/*
+	 * contrib/usage at this point represent deltas, only update if they
+	 * are substantive.
+	 */
+	if ((abs(contrib) > cfs_rq->tg_runnable_contrib / 64) ||
+	    (abs(usage_contrib) > cfs_rq->tg_usage_contrib / 64)) {
 		atomic_add(contrib, &tg->runnable_avg);
 		cfs_rq->tg_runnable_contrib += contrib;
+
+		atomic_add(usage_contrib, &tg->usage_avg);
+		cfs_rq->tg_usage_contrib += usage_contrib;
 	}
 }
 
@@ -1201,7 +1221,8 @@ static inline void update_entity_load_avg(struct sched_entity *se,
 	else
 		now = cfs_rq_clock_task(group_cfs_rq(se));
 
-	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
+	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq,
+					  cfs_rq->curr == se))
 		return;
 
 	contrib_delta = __update_entity_load_avg_contrib(se);
@@ -1246,7 +1267,8 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 {
-	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
+	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable,
+				     runnable);
 	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
 }
 
@@ -1615,6 +1637,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		 */
 		update_stats_wait_end(cfs_rq, se);
 		__dequeue_entity(cfs_rq, se);
+		update_entity_load_avg(se, 1);
 	}
 
 	update_stats_curr_start(cfs_rq, se);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 67cc5e1..bb76895 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -113,7 +113,7 @@ struct task_group {
 
 	atomic_t load_weight;
 	atomic64_t load_avg;
-	atomic_t runnable_avg;
+	atomic_t runnable_avg, usage_avg;
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -236,7 +236,7 @@ struct cfs_rq {
 	u64 last_decay;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	u32 tg_runnable_contrib;
+	u32 tg_runnable_contrib, tg_usage_contrib;
 	u64 tg_load_contrib;
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 




* [PATCH 10/16] sched: maintain runnable averages across throttled periods
  2012-06-28  2:24 [PATCH 00/16] Series short description Paul Turner
                   ` (13 preceding siblings ...)
  2012-06-28  2:24 ` [PATCH 13/16] sched: update_cfs_shares at period edge Paul Turner
@ 2012-06-28  2:24 ` Paul Turner
  2012-06-28  2:24 ` [PATCH 14/16] sched: make __update_entity_runnable_avg() fast Paul Turner
  15 siblings, 0 replies; 46+ messages in thread
From: Paul Turner @ 2012-06-28  2:24 UTC (permalink / raw)
  To: linux-kernel
  Cc: Venki Pallipadi, Srivatsa Vaddagiri, Vincent Guittot,
	Peter Zijlstra, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

With bandwidth control, tracked entities may cease execution according to
user-specified bandwidth limits.  Charging this time as either throttled or
blocked, however, is incorrect and would falsely skew in either direction.

What we actually want is for any throttled periods to be "invisible" to
load-tracking as they are removed from the system for that interval and
contribute normally otherwise.

Do this by moderating the progression of time to omit any periods in which the
entity belonged to a throttled hierarchy.
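
A user-space sketch of the moderated clock (mirroring cfs_rq_clock_task() in
the hunk below, with uint64_t standing in for u64): while throttled, time is
frozen at the snapshot taken on throttle, otherwise the accumulated throttled
time is subtracted from rq->clock_task.

  #include <stdint.h>

  struct cfs_rq_model {
          int throttle_count;
          uint64_t throttled_clock_task;          /* snapshot at throttle */
          uint64_t throttled_clock_task_time;     /* total time throttled */
  };

  static uint64_t cfs_rq_clock_task_model(const struct cfs_rq_model *cfs_rq,
                                          uint64_t rq_clock_task)
  {
          if (cfs_rq->throttle_count)             /* time stands still */
                  return cfs_rq->throttled_clock_task;

          return rq_clock_task - cfs_rq->throttled_clock_task_time;
  }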

Signed-off-by: Paul Turner <pjt@google.com>
---
 kernel/sched/fair.c  |   50 ++++++++++++++++++++++++++++++++++++++++----------
 kernel/sched/sched.h |    3 ++-
 2 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 91d0b21..b78d03e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1204,15 +1204,26 @@ static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
 		cfs_rq->blocked_load_avg = 0;
 }
 
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+
 /* Update a sched_entity's runnable average */
 static inline void update_entity_load_avg(struct sched_entity *se,
 					  int update_cfs_rq)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	long contrib_delta;
+	u64 now;
 
-	if (!__update_entity_runnable_avg(rq_of(cfs_rq)->clock_task, &se->avg,
-					  se->on_rq))
+	/*
+	 * For a group entity we need to use their owned cfs_rq_clock_task() in
+	 * case they are the parent of a throttled hierarchy.
+	 */
+	if (entity_is_task(se))
+		now = cfs_rq_clock_task(cfs_rq);
+	else
+		now = cfs_rq_clock_task(group_cfs_rq(se));
+
+	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
 		return;
 
 	contrib_delta = __update_entity_load_avg_contrib(se);
@@ -1232,7 +1243,7 @@ static inline void update_entity_load_avg(struct sched_entity *se,
  */
 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 {
-	u64 now = rq_of(cfs_rq)->clock_task >> 20;
+	u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
 	u64 decays;
 
 	decays = now - cfs_rq->last_decay;
@@ -1824,6 +1835,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
 	return &tg->cfs_bandwidth;
 }
 
+/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
+{
+	if (unlikely(cfs_rq->throttle_count))
+		return cfs_rq->throttled_clock_task;
+
+	return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
+}
+
 /* returns 0 on failure to allocate runtime */
 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
@@ -1974,6 +1994,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
 		cfs_rq->load_stamp += delta;
 		cfs_rq->load_last += delta;
 
+		/* adjust cfs_rq_clock_task() */
+		cfs_rq->throttled_clock_task_time += rq->clock_task -
+					     cfs_rq->throttled_clock_task;
+
 		/* update entity weight now that we are on_rq again */
 		update_cfs_shares(cfs_rq);
 	}
@@ -1988,8 +2012,10 @@ static int tg_throttle_down(struct task_group *tg, void *data)
 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
 	/* group is entering throttled state, record last load */
-	if (!cfs_rq->throttle_count)
+	if (!cfs_rq->throttle_count) {
 		update_cfs_load(cfs_rq, 0);
+		cfs_rq->throttled_clock_task = rq->clock_task;
+	}
 	cfs_rq->throttle_count++;
 
 	return 0;
@@ -2004,7 +2030,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 
 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
 
-	/* account load preceding throttle */
+	/* freeze hierarchy runnable averages while throttled */
 	rcu_read_lock();
 	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
 	rcu_read_unlock();
@@ -2028,7 +2054,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 		rq->nr_running -= task_delta;
 
 	cfs_rq->throttled = 1;
-	cfs_rq->throttled_timestamp = rq->clock;
+	cfs_rq->throttled_clock = rq->clock;
 	raw_spin_lock(&cfs_b->lock);
 	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
 	raw_spin_unlock(&cfs_b->lock);
@@ -2046,10 +2072,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 
 	cfs_rq->throttled = 0;
 	raw_spin_lock(&cfs_b->lock);
-	cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
+	cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
 	list_del_rcu(&cfs_rq->throttled_list);
 	raw_spin_unlock(&cfs_b->lock);
-	cfs_rq->throttled_timestamp = 0;
 
 	update_rq_clock(rq);
 	/* update hierarchical throttle state */
@@ -2449,8 +2474,13 @@ void unthrottle_offline_cfs_rqs(struct rq *rq)
 }
 
 #else /* CONFIG_CFS_BANDWIDTH */
-static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {}
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
+{
+	return rq_of(cfs_rq)->clock_task;
+}
+
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+				     unsigned long delta_exec) {}
 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b48bbd7..60c0935 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -281,7 +281,8 @@ struct cfs_rq {
 	u64 runtime_expires;
 	s64 runtime_remaining;
 
-	u64 throttled_timestamp;
+	u64 throttled_clock, throttled_clock_task;
+	u64 throttled_clock_task_time;
 	int throttled, throttle_count;
 	struct list_head throttled_list;
 #endif /* CONFIG_CFS_BANDWIDTH */




* [PATCH 16/16] sched: introduce temporary FAIR_GROUP_SCHED dependency for load-tracking
  2012-06-28  2:24 [PATCH 00/16] Series short description Paul Turner
                   ` (9 preceding siblings ...)
  2012-06-28  2:24 ` [PATCH 11/16] sched: replace update_shares weight distribution with per-entity computation Paul Turner
@ 2012-06-28  2:24 ` Paul Turner
  2012-06-28  2:24 ` [PATCH 12/16] sched: refactor update_shares_cpu() -> update_blocked_avgs() Paul Turner
                   ` (4 subsequent siblings)
  15 siblings, 0 replies; 46+ messages in thread
From: Paul Turner @ 2012-06-28  2:24 UTC (permalink / raw)
  To: linux-kernel
  Cc: Venki Pallipadi, Srivatsa Vaddagiri, Vincent Guittot,
	Peter Zijlstra, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

While per-entity load-tracking is generally useful beyond computing shares
distribution, e.g. for
  runnable-based load-balance (in progress), governors, power-management, etc.,
these facilities are not yet consumers of this data.  This may be trivially
reverted when the information is required; until then, avoid paying the
overhead for calculations we will not use.

Signed-off-by: Paul Turner <pjt@google.com>
---
 include/linux/sched.h |    8 +++++++-
 kernel/sched/fair.c   |   14 +++++++++++---
 kernel/sched/sched.h  |    9 ++++++++-
 3 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a65c097..7c0c72d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1205,7 +1205,13 @@ struct sched_entity {
 	/* rq "owned" by this entity/group: */
 	struct cfs_rq		*my_q;
 #endif
-#ifdef CONFIG_SMP
+/*
+ * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
+ * removed when useful for applications beyond shares distribution (e.g.
+ * load-balance).
+ */
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+	/* Per-entity load-tracking */
 	struct sched_avg	avg;
 #endif
 };
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3d1aaa8..0b6e2df 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -882,7 +882,8 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
-#ifdef CONFIG_SMP
+/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
 /*
  * We choose a half-life close to 1 scheduling period.
  * Note: The tables below are dependent on this value.
@@ -3214,6 +3215,12 @@ unlock:
 }
 
 /*
+ * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
+ * removed when useful for applications beyond shares distribution (e.g.
+ * load-balance).
+ */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
  * cfs_rq_of(p) references at time of call are still valid and identify the
  * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
@@ -3235,7 +3242,7 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu) {
 		atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
 	}
 }
-
+#endif
 
 #endif /* CONFIG_SMP */
 
@@ -5927,8 +5934,9 @@ const struct sched_class fair_sched_class = {
 
 #ifdef CONFIG_SMP
 	.select_task_rq		= select_task_rq_fair,
+#ifdef CONFIG_FAIR_GROUP_SCHED
 	.migrate_task_rq	= migrate_task_rq_fair,
-
+#endif
 	.rq_online		= rq_online_fair,
 	.rq_offline		= rq_offline_fair,
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bb76895..d920943 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -225,6 +225,12 @@ struct cfs_rq {
 #endif
 
 #ifdef CONFIG_SMP
+/*
+ * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
+ * removed when useful for applications beyond shares distribution (e.g.
+ * load-balance).
+ */
+#ifdef CONFIG_FAIR_GROUP_SCHED
 	/*
 	 * CFS Load tracking
 	 * Under CFS, load is tracked on a per-entity basis and aggregated up.
@@ -234,7 +240,8 @@ struct cfs_rq {
 	u64 runnable_load_avg, blocked_load_avg;
 	atomic64_t decay_counter, removed_load;
 	u64 last_decay;
-
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+/* These always depend on CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	u32 tg_runnable_contrib, tg_usage_contrib;
 	u64 tg_load_contrib;




* [PATCH 13/16] sched: update_cfs_shares at period edge
  2012-06-28  2:24 [PATCH 00/16] Series short description Paul Turner
                   ` (12 preceding siblings ...)
  2012-06-28  2:24 ` [PATCH 15/16] sched: implement usage tracking Paul Turner
@ 2012-06-28  2:24 ` Paul Turner
  2012-06-28  2:24 ` [PATCH 10/16] sched: maintain runnable averages across throttled periods Paul Turner
  2012-06-28  2:24 ` [PATCH 14/16] sched: make __update_entity_runnable_avg() fast Paul Turner
  15 siblings, 0 replies; 46+ messages in thread
From: Paul Turner @ 2012-06-28  2:24 UTC (permalink / raw)
  To: linux-kernel
  Cc: Venki Pallipadi, Srivatsa Vaddagiri, Vincent Guittot,
	Peter Zijlstra, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

Now that our measurement intervals are small (~1ms), we can amortize the
posting of update_shares() to occur at roughly each period overflow.  This is
a large cost saving for frequently switching tasks.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Ben Segall <bsegall@google.com>
---
 kernel/sched/fair.c |   18 ++++++++++--------
 1 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dd1ef8a..f67842f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1169,6 +1169,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 	}
 
 	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
+	update_cfs_shares(cfs_rq);
 }
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
@@ -1379,9 +1380,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
-	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
 	account_entity_enqueue(cfs_rq, se);
-	update_cfs_shares(cfs_rq);
+	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
 
 	if (flags & ENQUEUE_WAKEUP) {
 		place_entity(cfs_rq, se, 0);
@@ -1454,7 +1454,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
-	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
 
 	update_stats_dequeue(cfs_rq, se);
 	if (flags & DEQUEUE_SLEEP) {
@@ -1474,8 +1473,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
-	se->on_rq = 0;
 	account_entity_dequeue(cfs_rq, se);
+	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
 
 	/*
 	 * Normalize the entity after updating the min_vruntime because the
@@ -1489,7 +1488,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	return_cfs_rq_runtime(cfs_rq);
 
 	update_min_vruntime(cfs_rq);
-	update_cfs_shares(cfs_rq);
+	se->on_rq = 0;
 }
 
 /*
@@ -2501,8 +2500,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 
-		update_cfs_shares(cfs_rq);
 		update_entity_load_avg(se, 1);
+		update_cfs_rq_blocked_load(cfs_rq, 0);
 	}
 
 	if (!se) {
@@ -2562,8 +2561,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 
-		update_cfs_shares(cfs_rq);
 		update_entity_load_avg(se, 1);
+		update_cfs_rq_blocked_load(cfs_rq, 0);
 	}
 
 	if (!se) {
@@ -5775,8 +5774,11 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 		se = tg->se[i];
 		/* Propagate contribution to hierarchy */
 		raw_spin_lock_irqsave(&rq->lock, flags);
-		for_each_sched_entity(se)
+		for_each_sched_entity(se) {
 			update_cfs_shares(group_cfs_rq(se));
+			/* update contribution to parent */
+			update_entity_load_avg(se, 1);
+		}
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 	}
 




* [PATCH 14/16] sched: make __update_entity_runnable_avg() fast
  2012-06-28  2:24 [PATCH 00/16] Series short description Paul Turner
                   ` (14 preceding siblings ...)
  2012-06-28  2:24 ` [PATCH 10/16] sched: maintain runnable averages across throttled periods Paul Turner
@ 2012-06-28  2:24 ` Paul Turner
  2012-07-04 15:41   ` Peter Zijlstra
  2012-07-04 16:51   ` Peter Zijlstra
  15 siblings, 2 replies; 46+ messages in thread
From: Paul Turner @ 2012-06-28  2:24 UTC (permalink / raw)
  To: linux-kernel
  Cc: Venki Pallipadi, Srivatsa Vaddagiri, Vincent Guittot,
	Peter Zijlstra, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

__update_entity_runnable_avg() forms the core of maintaining an entity's
runnable load average.  In this function we charge the accumulated run-time
since the last update and apply the appropriate decay.  In some cases, e.g. a
waking task, this time interval may be much larger than our period unit.

Fortunately we can exploit some properties of our series to perform decay for a
blocked update in constant time and account the contribution for a running
update in essentially-constant* time.

[*]: Any running entity should be performing updates at the tick, which gives
us a soft limit of 1 jiffy between updates, and we can compute up to a
32-jiffy update in a single pass.
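
A floating-point model of the constant-time decay (illustrative only; the
kernel code below uses the fixed-point inverse multiplies in
runnable_avg_yN_inv[] rather than pow()):

  #include <math.h>
  #include <stdint.h>

  #define LOAD_AVG_PERIOD 32      /* y^32 == 1/2 */

  /* val * y^n: shift once per full half-life, then model the remainder */
  static uint64_t decay_load_model(uint64_t val, unsigned int n)
  {
          if (n > LOAD_AVG_PERIOD * 63)
                  return 0;
          val >>= n / LOAD_AVG_PERIOD;            /* whole half-lives */
          n %= LOAD_AVG_PERIOD;                   /* remaining periods */
          return (uint64_t)(val * pow(0.5, n / 32.0));
  }
  /* e.g. decay_load_model(1024, 32) == 512, decay_load_model(1024, 64) == 256 */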

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Ben Segall <bsegall@google.com>
---
 kernel/sched/fair.c |  122 +++++++++++++++++++++++++++++++++++++++++----------
 1 files changed, 97 insertions(+), 25 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f67842f..89c353c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -884,17 +884,87 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 
 #ifdef CONFIG_SMP
 /*
+ * We choose a half-life close to 1 scheduling period.
+ * Note: The tables below are dependent on this value.
+ */
+#define LOAD_AVG_PERIOD 32
+#define LOAD_AVG_MAX 46742 /* maximum possible load avg */
+#define LOAD_AVG_MAX_N 516 /* number of full periods it takes to produce max */
+
+/* Precomputed fixed inverse multiplies for multiplication by y^n */
+static const u32 runnable_avg_yN_inv[] = {
+	0xffffffff, 0xfa83b2db, 0xf5257d15, 0xefe4b99b, 0xeac0c6e7, 0xe5b906e7,
+	0xe0ccdeec, 0xdbfbb797, 0xd744fcca, 0xd2a81d91, 0xce248c15, 0xc9b9bd86,
+	0xc5672a11, 0xc12c4cca, 0xbd08a39f, 0xb8fbaf47, 0xb504f333, 0xb123f581,
+	0xad583eea, 0xa9a15ab4, 0xa5fed6a9, 0xa2704303, 0x9ef53260, 0x9b8d39b9,
+	0x9837f051, 0x94f4efa8, 0x91c3d373, 0x8ea4398b, 0x8b95c1e3, 0x88980e80,
+	0x85aac367, 0x82cd8698,
+};
+
+/* Precomputed \Sum y^k { 1<=k<=n } */
+static const u32 runnable_avg_yN_sum[] = {
+	    0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
+	 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
+	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
+};
+
+/*
  * Approximate:
  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
  */
-static __always_inline u64 decay_load(u64 val, int n)
+static __always_inline u64 decay_load(u64 val, u64 n)
 {
-	for (; n && val; n--) {
-		val *= 4008;
-		val >>= 12;
+	int local_n;
+	if (!n)
+		return val;
+	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
+		return 0;
+
+	/* will be 32 bits if that's desirable */
+	local_n = n;
+
+	/*
+	 * As y^PERIOD = 1/2, we can combine
+	 *    y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
+	 * With a look-up table which covers k^n (n<PERIOD)
+	 *
+	 * To achieve constant time decay_load.
+	 */
+	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
+		val >>= local_n / LOAD_AVG_PERIOD;
+		local_n %= LOAD_AVG_PERIOD;
 	}
 
-	return val;
+	val *= runnable_avg_yN_inv[local_n];
+	return SRR(val, 32);
+}
+
+/*
+ * For updates fully spanning n periods, the contribution to runnable
+ * average will be: \Sum 1024*y^n
+ *
+ * We can compute this reasonably efficiently by combining:
+ *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
+ */
+static u32 __compute_runnable_contrib(int n)
+{
+	u32 contrib = 0;
+
+	if (likely(n <= LOAD_AVG_PERIOD))
+		return runnable_avg_yN_sum[n];
+	else if (unlikely(n >= LOAD_AVG_MAX_N))
+		return LOAD_AVG_MAX;
+
+	/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
+	do {
+		contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
+		contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
+
+		n -= LOAD_AVG_PERIOD;
+	} while (n > LOAD_AVG_PERIOD);
+
+	contrib = decay_load(contrib, n);
+	return contrib + runnable_avg_yN_sum[n];
 }
 
 /* We can represent the historical contribution to runnable average as the
@@ -928,7 +998,8 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
 							struct sched_avg *sa,
 							int runnable)
 {
-	u64 delta;
+	u64 delta, periods;
+	u32 runnable_contrib;
 	int delta_w, decayed = 0;
 
 	delta = now - sa->last_runnable_update;
@@ -962,25 +1033,26 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
 		 * period and accrue it.
 		 */
 		delta_w = 1024 - delta_w;
-		BUG_ON(delta_w > delta);
-		do {
-			if (runnable)
-				sa->runnable_avg_sum += delta_w;
-			sa->runnable_avg_period += delta_w;
-
-			/*
-			 * Remainder of delta initiates a new period, roll over
-			 * the previous.
-			 */
-			sa->runnable_avg_sum =
-				decay_load(sa->runnable_avg_sum, 1);
-			sa->runnable_avg_period =
-				decay_load(sa->runnable_avg_period, 1);
-
-			delta -= delta_w;
-			/* New period is empty */
-			delta_w = 1024;
-		} while (delta >= 1024);
+		if (runnable)
+			sa->runnable_avg_sum += delta_w;
+		sa->runnable_avg_period += delta_w;
+
+		delta -= delta_w;
+
+		/* Figure out how many additional periods this update spans */
+		periods = delta / 1024;
+		delta %= 1024;
+
+		sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
+						  periods + 1);
+		sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
+						     periods + 1);
+
+		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
+		runnable_contrib = __compute_runnable_contrib(periods);
+		if (runnable)
+			sa->runnable_avg_sum += runnable_contrib;
+		sa->runnable_avg_period += runnable_contrib;
 	}
 
 	/* Remainder of delta accrued against u_0` */




* [PATCH 11/16] sched: replace update_shares weight distribution with per-entity computation
  2012-06-28  2:24 [PATCH 00/16] Series short description Paul Turner
                   ` (8 preceding siblings ...)
  2012-06-28  2:24 ` [PATCH 03/16] sched: aggregate load contributed by task entities on parenting cfs_rq Paul Turner
@ 2012-06-28  2:24 ` Paul Turner
  2012-06-28  2:24 ` [PATCH 16/16] sched: introduce temporary FAIR_GROUP_SCHED dependency for load-tracking Paul Turner
                   ` (5 subsequent siblings)
  15 siblings, 0 replies; 46+ messages in thread
From: Paul Turner @ 2012-06-28  2:24 UTC (permalink / raw)
  To: linux-kernel
  Cc: Venki Pallipadi, Srivatsa Vaddagiri, Vincent Guittot,
	Peter Zijlstra, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

Now that the machinery is in place to compute contributed load in a bottom-up
fashion, replace the shares distribution code within update_shares()
accordingly.
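
For illustration, the general shape of the shares computation this enables
(a simplified sketch with made-up numbers; the real calc_cfs_shares() also
clamps the result between MIN_SHARES and tg->shares):

  #include <stdio.h>

  /* shares_i ~= tg->shares * load_i / \Sum_j load_j, using bottom-up sums */
  static long calc_shares(long tg_shares, long cfs_rq_load, long tg_load)
  {
          if (tg_load <= 0)
                  return tg_shares;
          return tg_shares * cfs_rq_load / tg_load;
  }

  int main(void)
  {
          /* group weight 1024, two cpus contributing load 300 and 100 */
          printf("%ld %ld\n",
                 calc_shares(1024, 300, 400),     /* -> 768 */
                 calc_shares(1024, 100, 400));    /* -> 256 */
          return 0;
  }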

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Ben Segall <bsegall@google.com>
---
 kernel/sched/debug.c |    8 ---
 kernel/sched/fair.c  |  152 +++++++-------------------------------------------
 kernel/sched/sched.h |   36 ++++--------
 3 files changed, 32 insertions(+), 164 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 9334c68..fbd1517 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -221,14 +221,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 #ifdef CONFIG_SMP
-	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "load_avg",
-			SPLIT_NS(cfs_rq->load_avg));
-	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "load_period",
-			SPLIT_NS(cfs_rq->load_period));
-	SEQ_printf(m, "  .%-30s: %ld\n", "load_contrib",
-			cfs_rq->load_contribution);
-	SEQ_printf(m, "  .%-30s: %d\n", "load_tg",
-			atomic_read(&cfs_rq->tg->load_weight));
 	SEQ_printf(m, "  .%-30s: %lld\n", "runnable_load_avg",
 			cfs_rq->runnable_load_avg);
 	SEQ_printf(m, "  .%-30s: %lld\n", "blocked_load_avg",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b78d03e..4a9a828 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -654,9 +654,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	return calc_delta_fair(sched_slice(cfs_rq, se), se);
 }
 
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
-static void update_cfs_shares(struct cfs_rq *cfs_rq);
-
 /*
  * Update the current task's runtime statistics. Skip current tasks that
  * are not in our scheduling class.
@@ -676,10 +673,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 
 	curr->vruntime += delta_exec_weighted;
 	update_min_vruntime(cfs_rq);
-
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-	cfs_rq->load_unacc_exec_time += delta_exec;
-#endif
 }
 
 static void update_curr(struct cfs_rq *cfs_rq)
@@ -806,72 +799,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-/* we need this in update_cfs_load and load-balance functions below */
-static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 # ifdef CONFIG_SMP
-static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
-					    int global_update)
-{
-	struct task_group *tg = cfs_rq->tg;
-	long load_avg;
-
-	load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
-	load_avg -= cfs_rq->load_contribution;
-
-	if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
-		atomic_add(load_avg, &tg->load_weight);
-		cfs_rq->load_contribution += load_avg;
-	}
-}
-
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
-	u64 period = sysctl_sched_shares_window;
-	u64 now, delta;
-	unsigned long load = cfs_rq->load.weight;
-
-	if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
-		return;
-
-	now = rq_of(cfs_rq)->clock_task;
-	delta = now - cfs_rq->load_stamp;
-
-	/* truncate load history at 4 idle periods */
-	if (cfs_rq->load_stamp > cfs_rq->load_last &&
-	    now - cfs_rq->load_last > 4 * period) {
-		cfs_rq->load_period = 0;
-		cfs_rq->load_avg = 0;
-		delta = period - 1;
-	}
-
-	cfs_rq->load_stamp = now;
-	cfs_rq->load_unacc_exec_time = 0;
-	cfs_rq->load_period += delta;
-	if (load) {
-		cfs_rq->load_last = now;
-		cfs_rq->load_avg += delta * load;
-	}
-
-	/* consider updating load contribution on each fold or truncate */
-	if (global_update || cfs_rq->load_period > period
-	    || !cfs_rq->load_period)
-		update_cfs_rq_load_contribution(cfs_rq, global_update);
-
-	while (cfs_rq->load_period > period) {
-		/*
-		 * Inline assembly required to prevent the compiler
-		 * optimising this loop into a divmod call.
-		 * See __iter_div_u64_rem() for another example of this.
-		 */
-		asm("" : "+rm" (cfs_rq->load_period));
-		cfs_rq->load_period /= 2;
-		cfs_rq->load_avg /= 2;
-	}
-
-	if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
-		list_del_leaf_cfs_rq(cfs_rq);
-}
-
 static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
 {
 	long tg_weight;
@@ -881,8 +809,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
 	 * to gain a more accurate current total weight. See
 	 * update_cfs_rq_load_contribution().
 	 */
-	tg_weight = atomic_read(&tg->load_weight);
-	tg_weight -= cfs_rq->load_contribution;
+	tg_weight = atomic64_read(&tg->load_avg);
+	tg_weight -= cfs_rq->tg_load_contrib;
 	tg_weight += cfs_rq->load.weight;
 
 	return tg_weight;
@@ -906,27 +834,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 
 	return shares;
 }
-
-static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
-	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
-		update_cfs_load(cfs_rq, 0);
-		update_cfs_shares(cfs_rq);
-	}
-}
 # else /* CONFIG_SMP */
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
-}
-
 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 {
 	return tg->shares;
 }
-
-static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
-}
 # endif /* CONFIG_SMP */
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 			    unsigned long weight)
@@ -944,6 +856,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 		account_entity_enqueue(cfs_rq, se);
 }
 
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
+
 static void update_cfs_shares(struct cfs_rq *cfs_rq)
 {
 	struct task_group *tg;
@@ -963,17 +877,9 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
 	reweight_entity(cfs_rq_of(se), se, shares);
 }
 #else /* CONFIG_FAIR_GROUP_SCHED */
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
-}
-
 static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 {
 }
-
-static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
-}
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_SMP
@@ -1473,7 +1379,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
-	update_cfs_load(cfs_rq, 0);
 	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
 	account_entity_enqueue(cfs_rq, se);
 	update_cfs_shares(cfs_rq);
@@ -1570,7 +1475,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	se->on_rq = 0;
-	update_cfs_load(cfs_rq, 0);
 	account_entity_dequeue(cfs_rq, se);
 
 	/*
@@ -1739,11 +1643,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	update_entity_load_avg(curr, 1);
 	update_cfs_rq_blocked_load(cfs_rq, 1);
 
-	/*
-	 * Update share accounting for long-running entities.
-	 */
-	update_entity_shares_tick(cfs_rq);
-
 #ifdef CONFIG_SCHED_HRTICK
 	/*
 	 * queued ticks are scheduled to match the slice, so don't bother
@@ -1988,18 +1887,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
 	cfs_rq->throttle_count--;
 #ifdef CONFIG_SMP
 	if (!cfs_rq->throttle_count) {
-		u64 delta = rq->clock_task - cfs_rq->load_stamp;
-
-		/* leaving throttled state, advance shares averaging windows */
-		cfs_rq->load_stamp += delta;
-		cfs_rq->load_last += delta;
-
 		/* adjust cfs_rq_clock_task() */
 		cfs_rq->throttled_clock_task_time += rq->clock_task -
 					     cfs_rq->throttled_clock_task;
-
-		/* update entity weight now that we are on_rq again */
-		update_cfs_shares(cfs_rq);
 	}
 #endif
 
@@ -2011,11 +1901,9 @@ static int tg_throttle_down(struct task_group *tg, void *data)
 	struct rq *rq = data;
 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
-	/* group is entering throttled state, record last load */
-	if (!cfs_rq->throttle_count) {
-		update_cfs_load(cfs_rq, 0);
+	/* group is entering throttled state, stop time */
+	if (!cfs_rq->throttle_count)
 		cfs_rq->throttled_clock_task = rq->clock_task;
-	}
 	cfs_rq->throttle_count++;
 
 	return 0;
@@ -2613,7 +2501,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 
-		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
 		update_entity_load_avg(se, 1);
 	}
@@ -2675,7 +2562,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 
-		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
 		update_entity_load_avg(se, 1);
 	}
@@ -3794,27 +3680,33 @@ out:
  */
 static int update_shares_cpu(struct task_group *tg, int cpu)
 {
+	struct sched_entity *se;
 	struct cfs_rq *cfs_rq;
 	unsigned long flags;
 	struct rq *rq;
 
-	if (!tg->se[cpu])
-		return 0;
 
 	rq = cpu_rq(cpu);
+	se = tg->se[cpu];
 	cfs_rq = tg->cfs_rq[cpu];
 
 	raw_spin_lock_irqsave(&rq->lock, flags);
 
 	update_rq_clock(rq);
-	update_cfs_load(cfs_rq, 1);
 	update_cfs_rq_blocked_load(cfs_rq, 1);
+	update_entity_load_avg(tg->se[cpu], 1);
 
-	/*
-	 * We need to update shares after updating tg->load_weight in
-	 * order to adjust the weight of groups with long running tasks.
-	 */
-	update_cfs_shares(cfs_rq);
+	if (se) {
+		/*
+		 * We can pivot on the runnable average decaying to zero for
+		 * list removal since the parent average will always be >=
+		 * child.
+		 */
+		if (se->avg.runnable_avg_sum)
+			update_cfs_shares(cfs_rq);
+		else
+			list_del_leaf_cfs_rq(cfs_rq);
+	}
 
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 
@@ -5830,10 +5722,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 
 	cfs_rq->tg = tg;
 	cfs_rq->rq = rq;
-#ifdef CONFIG_SMP
-	/* allow initial update_cfs_load() to truncate */
-	cfs_rq->load_stamp = 1;
-#endif
 	init_cfs_rq_runtime(cfs_rq);
 
 	tg->cfs_rq[cpu] = cfs_rq;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 60c0935..67cc5e1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -234,11 +234,21 @@ struct cfs_rq {
 	u64 runnable_load_avg, blocked_load_avg;
 	atomic64_t decay_counter, removed_load;
 	u64 last_decay;
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	u32 tg_runnable_contrib;
 	u64 tg_load_contrib;
-#endif
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+	/*
+	 *   h_load = weight * f(tg)
+	 *
+	 * Where f(tg) is the recursive weight fraction assigned to
+	 * this group.
+	 */
+	unsigned long h_load;
+#endif /* CONFIG_SMP */
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
 
@@ -254,28 +264,6 @@ struct cfs_rq {
 	struct list_head leaf_cfs_rq_list;
 	struct task_group *tg;	/* group that "owns" this runqueue */
 
-#ifdef CONFIG_SMP
-	/*
-	 *   h_load = weight * f(tg)
-	 *
-	 * Where f(tg) is the recursive weight fraction assigned to
-	 * this group.
-	 */
-	unsigned long h_load;
-
-	/*
-	 * Maintaining per-cpu shares distribution for group scheduling
-	 *
-	 * load_stamp is the last time we updated the load average
-	 * load_last is the last time we updated the load average and saw load
-	 * load_unacc_exec_time is currently unaccounted execution time
-	 */
-	u64 load_avg;
-	u64 load_period;
-	u64 load_stamp, load_last, load_unacc_exec_time;
-
-	unsigned long load_contribution;
-#endif /* CONFIG_SMP */
 #ifdef CONFIG_CFS_BANDWIDTH
 	int runtime_enabled;
 	u64 runtime_expires;




* Re: [PATCH 01/16] sched: track the runnable average on a per-task entitiy basis
  2012-06-28  2:24 ` [PATCH 01/16] sched: track the runnable average on a per-task entitiy basis Paul Turner
@ 2012-06-28  6:06   ` Namhyung Kim
  2012-07-12  0:14     ` Paul Turner
  2012-07-04 15:32   ` Peter Zijlstra
  1 sibling, 1 reply; 46+ messages in thread
From: Namhyung Kim @ 2012-06-28  6:06 UTC (permalink / raw)
  To: Paul Turner
  Cc: linux-kernel, Venki Pallipadi, Srivatsa Vaddagiri,
	Vincent Guittot, Peter Zijlstra, Nikunj A Dadhania,
	Mike Galbraith, Kamalesh Babulal, Ben Segall, Ingo Molnar,
	Paul E. McKenney, Morten Rasmussen, Vaidyanathan Srinivasan

Hi,

Some nitpicks and questions.


On Wed, 27 Jun 2012 19:24:14 -0700, Paul Turner wrote:
> Instead of tracking averaging the load parented by a cfs_rq, we can track
> entity load directly.  With the load for a given cfs_Rq then being the sum of

s/cfs_Rq/cfs_rq/


> its children.
>
> To do this we represent the historical contribution to runnable average within each
> trailing 1024us of execution as the coefficients of a geometric series.
>
> We can express this for a given task t as:
>   runnable_sum(t) = \Sum u_i * y^i ,
>   load(t) = weight_t * runnable_sum(t) / (\Sum 1024 * y^i)
>

This "\Sum 1024 *y^i" is the runnable(_avg)_period, right?


> Where: u_i is the usage in the last i`th 1024us period (approximately 1ms) ~ms
> and y is chosen such that y^k = 1/2.  We currently choose k to be 32 which
> roughly translates to about a sched period.
>
> Signed-off-by: Paul Turner <pjt@google.com>
> ---
>  include/linux/sched.h |    8 +++
>  kernel/sched/debug.c  |    4 ++
>  kernel/sched/fair.c   |  128 +++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 140 insertions(+), 0 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 9dced2e..5bf5c79 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1136,6 +1136,11 @@ struct load_weight {
>  	unsigned long weight, inv_weight;
>  };
>  
> +struct sched_avg {
> +	u32 runnable_avg_sum, runnable_avg_period;
> +	u64 last_runnable_update;
> +};
> +
>  #ifdef CONFIG_SCHEDSTATS
>  struct sched_statistics {
>  	u64			wait_start;
> @@ -1196,6 +1201,9 @@ struct sched_entity {
>  	/* rq "owned" by this entity/group: */
>  	struct cfs_rq		*my_q;
>  #endif
> +#ifdef CONFIG_SMP
> +	struct sched_avg	avg;
> +#endif
>  };
>  
>  struct sched_rt_entity {
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index c09a4e7..cd5ef23 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -85,6 +85,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
>  	P(se->statistics.wait_count);
>  #endif
>  	P(se->load.weight);
> +#ifdef CONFIG_SMP
> +	P(se->avg.runnable_avg_sum);
> +	P(se->avg.runnable_avg_period);
> +#endif
>  #undef PN
>  #undef P
>  }
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 3704ad3..864a122 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -976,6 +976,125 @@ static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
>  }
>  #endif /* CONFIG_FAIR_GROUP_SCHED */
>  
> +#ifdef CONFIG_SMP
> +/*
> + * Approximate:
> + *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
> + */
> +static __always_inline u64 decay_load(u64 val, int n)
> +{
> +	for (; n && val; n--) {
> +		val *= 4008;
> +		val >>= 12;
> +	}
> +
> +	return val;
> +}
> +
> +/* We can represent the historical contribution to runnable average as the
> + * coefficients of a geometric series.  To do this we sub-divide our runnable
> + * history into segments of approximately 1ms (1024us); label the segment that
> + * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
> + *
> + * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
> + *      p0            p1           p1
                                      p2

> + *     (now)       (~1ms ago)  (~2ms ago)
> + *
> + * Let u_i denote the fraction of p_i that the entity was runnable.
> + *
> + * We then designate the fractions u_i as our co-efficients, yielding the
> + * following representation of historical load:
> + *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
> + *
> + * We choose y based on the with of a reasonably scheduling period, fixing:
                               width ?
> + *   y^32 = 0.5
> + *
> + * This means that the contribution to load ~32ms ago (u_32) will be weighted
> + * approximately half as much as the contribution to load within the last ms
> + * (u_0).
> + *
> + * When a period "rolls over" and we have new u_0`, multiplying the previous
> + * sum again by y is sufficient to update:
> + *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
> + *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1]
                                                                    u_{i+1}]

> + */
> +static __always_inline int __update_entity_runnable_avg(u64 now,

Is the return value used elsewhere?

Thanks,
Namhyung


> +							struct sched_avg *sa,
> +							int runnable)
> +{
> +	u64 delta;
> +	int delta_w, decayed = 0;
> +
> +	delta = now - sa->last_runnable_update;
> +	/*
> +	 * This should only happen when time goes backwards, which it
> +	 * unfortunately does during sched clock init when we swap over to TSC.
> +	 */
> +	if ((s64)delta < 0) {
> +		sa->last_runnable_update = now;
> +		return 0;
> +	}
> +
> +	/*
> +	 * Use 1024ns as the unit of measurement since it's a reasonable
> +	 * approximation of 1us and fast to compute.
> +	 */
> +	delta >>= 10;
> +	if (!delta)
> +		return 0;
> +	sa->last_runnable_update = now;
> +
> +	/* delta_w is the amount already accumulated against our next period */
> +	delta_w = sa->runnable_avg_period % 1024;
> +	if (delta + delta_w >= 1024) {
> +		/* period roll-over */
> +		decayed = 1;
> +
> +		/*
> +		 * Now that we know we're crossing a period boundary, figure
> +		 * out how much from delta we need to complete the current
> +		 * period and accrue it.
> +		 */
> +		delta_w = 1024 - delta_w;
> +		BUG_ON(delta_w > delta);
> +		do {
> +			if (runnable)
> +				sa->runnable_avg_sum += delta_w;
> +			sa->runnable_avg_period += delta_w;
> +
> +			/*
> +			 * Remainder of delta initiates a new period, roll over
> +			 * the previous.
> +			 */
> +			sa->runnable_avg_sum =
> +				decay_load(sa->runnable_avg_sum, 1);
> +			sa->runnable_avg_period =
> +				decay_load(sa->runnable_avg_period, 1);
> +
> +			delta -= delta_w;
> +			/* New period is empty */
> +			delta_w = 1024;
> +		} while (delta >= 1024);
> +	}
> +
> +	/* Remainder of delta accrued against u_0` */
> +	if (runnable)
> +		sa->runnable_avg_sum += delta;
> +	sa->runnable_avg_period += delta;
> +
> +	return decayed;
> +}
> +
> +/* Update a sched_entity's runnable average */
> +static inline void update_entity_load_avg(struct sched_entity *se)
> +{
> +	__update_entity_runnable_avg(rq_of(cfs_rq_of(se))->clock_task, &se->avg,
> +				     se->on_rq);
> +}
> +#else
> +static inline void update_entity_load_avg(struct sched_entity *se) {}
> +#endif
> +
>  static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
>  #ifdef CONFIG_SCHEDSTATS
> @@ -1102,6 +1221,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>  	 */
>  	update_curr(cfs_rq);
>  	update_cfs_load(cfs_rq, 0);
> +	update_entity_load_avg(se);
>  	account_entity_enqueue(cfs_rq, se);
>  	update_cfs_shares(cfs_rq);
>  
> @@ -1176,6 +1296,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>  	 * Update run-time statistics of the 'current'.
>  	 */
>  	update_curr(cfs_rq);
> +	update_entity_load_avg(se);
>  
>  	update_stats_dequeue(cfs_rq, se);
>  	if (flags & DEQUEUE_SLEEP) {
> @@ -1345,6 +1466,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
>  		update_stats_wait_start(cfs_rq, prev);
>  		/* Put 'current' back into the tree. */
>  		__enqueue_entity(cfs_rq, prev);
> +		/* in !on_rq case, update occurred at dequeue */
> +		update_entity_load_avg(prev);
>  	}
>  	cfs_rq->curr = NULL;
>  }
> @@ -1358,6 +1481,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
>  	update_curr(cfs_rq);
>  
>  	/*
> +	 * Ensure that runnable average is periodically updated.
> +	 */
> +	update_entity_load_avg(curr);
> +
> +	/*
>  	 * Update share accounting for long-running entities.
>  	 */
>  	update_entity_shares_tick(cfs_rq);
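
For readers following along, here is a stand-alone user-space sketch of the
accumulation above (illustrative only: exact arithmetic via pow() stands in
for the kernel's decay_load(), the clock-went-backwards check is dropped, and
the workload is a synthetic task that is runnable every other millisecond):

#include <stdio.h>
#include <stdint.h>
#include <math.h>

struct sched_avg {
        uint32_t runnable_avg_sum, runnable_avg_period;
        uint64_t last_runnable_update;
};

/* y^n with y^32 = 0.5; exact here, fixed-point decay_load() in the kernel */
static uint64_t decay_load(uint64_t val, int n)
{
        return (uint64_t)(val * pow(0.5, n / 32.0));
}

static int update_runnable_avg(uint64_t now, struct sched_avg *sa, int runnable)
{
        uint64_t delta = (now - sa->last_runnable_update) >> 10; /* ns -> ~us */
        uint32_t delta_w;
        int decayed = 0;

        if (!delta)
                return 0;
        sa->last_runnable_update = now;

        /* amount already accumulated against the current 1024us period */
        delta_w = sa->runnable_avg_period % 1024;
        if (delta + delta_w >= 1024) {
                decayed = 1;
                delta_w = 1024 - delta_w;       /* finish the current period */
                do {
                        if (runnable)
                                sa->runnable_avg_sum += delta_w;
                        sa->runnable_avg_period += delta_w;
                        /* roll over: everything so far ages by one period */
                        sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, 1);
                        sa->runnable_avg_period = decay_load(sa->runnable_avg_period, 1);
                        delta -= delta_w;
                        delta_w = 1024;         /* new period starts empty */
                } while (delta >= 1024);
        }

        /* remainder accrues against the new, partial period */
        if (runnable)
                sa->runnable_avg_sum += delta;
        sa->runnable_avg_period += delta;

        return decayed;
}

int main(void)
{
        struct sched_avg sa = { 0, 0, 0 };
        uint64_t now = 0;
        int i;

        /* synthetic task: runnable every other ~1ms interval (~50%) */
        for (i = 0; i < 200; i++) {
                now += 1024 * 1024;             /* advance ~1ms in ns */
                update_runnable_avg(now, &sa, i & 1);
        }
        printf("sum=%u period=%u ratio=%.3f\n", sa.runnable_avg_sum,
               sa.runnable_avg_period,
               (double)sa.runnable_avg_sum / sa.runnable_avg_period);
        return 0;
}

For the 50%-runnable pattern the printed ratio settles around 0.5, which is
the intent of the scheme.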


* Re: [PATCH 03/16] sched: aggregate load contributed by task entities on parenting cfs_rq
  2012-06-28  2:24 ` [PATCH 03/16] sched: aggregate load contributed by task entities on parenting cfs_rq Paul Turner
@ 2012-06-28  6:33   ` Namhyung Kim
  2012-07-04 15:28   ` Peter Zijlstra
  1 sibling, 0 replies; 46+ messages in thread
From: Namhyung Kim @ 2012-06-28  6:33 UTC (permalink / raw)
  To: Paul Turner
  Cc: linux-kernel, Venki Pallipadi, Srivatsa Vaddagiri,
	Vincent Guittot, Peter Zijlstra, Nikunj A Dadhania,
	Mike Galbraith, Kamalesh Babulal, Ben Segall, Ingo Molnar,
	Paul E. McKenney, Morten Rasmussen, Vaidyanathan Srinivasan

On Wed, 27 Jun 2012 19:24:14 -0700, Paul Turner wrote:
> For a given task t, we can compute its contribution to load as:
>   task_load(t) = runnable_avg(t) * weight(t)
>
> On a parenting cfs_rq we can then aggregate
>   runnable_load(cfs_rq) = \Sum task_load(t), for all runnable children t
>
> Maintain this bottom up, with task entities adding their contributed load to
> the parenting cfs_rq sum.  When a task entities load changes we add the same
                                         entity's ?

> delta to the maintained sum.
>
> Signed-off-by: Paul Turner <pjt@google.com>
> Signed-off-by: Ben Segall <bsegall@google.com>
> ---
>  include/linux/sched.h |    1 +
>  kernel/sched/debug.c  |    3 +++
>  kernel/sched/fair.c   |   51 +++++++++++++++++++++++++++++++++++++++++++++----
>  kernel/sched/sched.h  |   10 +++++++++-
>  4 files changed, 60 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 5bf5c79..0c54ce0 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1139,6 +1139,7 @@ struct load_weight {
>  struct sched_avg {
>  	u32 runnable_avg_sum, runnable_avg_period;
>  	u64 last_runnable_update;
> +	unsigned long load_avg_contrib;
>  };
>  
>  #ifdef CONFIG_SCHEDSTATS
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 5d4a7dd..aeb74e3 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -94,6 +94,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
>  #ifdef CONFIG_SMP
>  	P(se->avg.runnable_avg_sum);
>  	P(se->avg.runnable_avg_period);
> +	P(se->avg.load_avg_contrib);
>  #endif
>  #undef PN
>  #undef P
> @@ -227,6 +228,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
>  			cfs_rq->load_contribution);
>  	SEQ_printf(m, "  .%-30s: %d\n", "load_tg",
>  			atomic_read(&cfs_rq->tg->load_weight));
> +	SEQ_printf(m, "  .%-30s: %lld\n", "runnable_load_avg",
> +			cfs_rq->runnable_load_avg);
>  #endif
>  
>  	print_cfs_group_stats(m, cpu, cfs_rq->tg);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 08bd3e0..8229766 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1085,20 +1085,63 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
>  	return decayed;
>  }
>  
> +/* Compute the current contribution to load_avg by se, return any delta */
> +static long __update_entity_load_avg_contrib(struct sched_entity *se)
> +{
> +	long old_contrib = se->avg.load_avg_contrib;
> +
> +	if (!entity_is_task(se))
> +		return 0;
> +
> +	se->avg.load_avg_contrib = div64_u64(se->avg.runnable_avg_sum *
> +					     se->load.weight,
> +					     se->avg.runnable_avg_period + 1);
> +
> +	return se->avg.load_avg_contrib - old_contrib;
> +}
> +
>  /* Update a sched_entity's runnable average */
>  static inline void update_entity_load_avg(struct sched_entity *se)
>  {
> -	__update_entity_runnable_avg(rq_of(cfs_rq_of(se))->clock_task, &se->avg,
> -				     se->on_rq);
> +	struct cfs_rq *cfs_rq = cfs_rq_of(se);
> +	long contrib_delta;
> +
> +	if (!__update_entity_runnable_avg(rq_of(cfs_rq)->clock_task, &se->avg,
> +					  se->on_rq))

Ok, now I see that the return value is used here.

Thanks,
Namhyung


> +		return;
> +
> +	contrib_delta = __update_entity_load_avg_contrib(se);
> +	if (se->on_rq)
> +		cfs_rq->runnable_load_avg += contrib_delta;
>  }
>  
>  static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
>  {
>  	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
>  }
> +
> +/* Add the load generated by se into cfs_rq's child load-average */
> +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
> +						  struct sched_entity *se)
> +{
> +	update_entity_load_avg(se);
> +	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
> +}
> +
> +/* Remove se's load from this cfs_rq child load-average */
> +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
> +						  struct sched_entity *se)
> +{
> +	update_entity_load_avg(se);
> +	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
> +}
>  #else
>  static inline void update_entity_load_avg(struct sched_entity *se) {}
>  static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
> +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
> +						  struct sched_entity *se) {}
> +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
> +						  struct sched_entity *se) {}
>  #endif
>  
>  static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
> @@ -1227,7 +1270,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>  	 */
>  	update_curr(cfs_rq);
>  	update_cfs_load(cfs_rq, 0);
> -	update_entity_load_avg(se);
> +	enqueue_entity_load_avg(cfs_rq, se);
>  	account_entity_enqueue(cfs_rq, se);
>  	update_cfs_shares(cfs_rq);
>  
> @@ -1302,7 +1345,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>  	 * Update run-time statistics of the 'current'.
>  	 */
>  	update_curr(cfs_rq);
> -	update_entity_load_avg(se);
> +	dequeue_entity_load_avg(cfs_rq, se);
>  
>  	update_stats_dequeue(cfs_rq, se);
>  	if (flags & DEQUEUE_SLEEP) {
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index bfdb119..26cc36f 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -222,6 +222,15 @@ struct cfs_rq {
>  	unsigned int nr_spread_over;
>  #endif
>  
> +#ifdef CONFIG_SMP
> +	/*
> +	 * CFS Load tracking
> +	 * Under CFS, load is tracked on a per-entity basis and aggregated up.
> +	 * This allows for the description of both thread and group usage (in
> +	 * the FAIR_GROUP_SCHED case).
> +	 */
> +	u64 runnable_load_avg;
> +#endif
>  #ifdef CONFIG_FAIR_GROUP_SCHED
>  	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
>  
> @@ -1204,4 +1213,3 @@ static inline void account_numa_dequeue(struct task_struct *p) { }
>  static inline void init_sched_numa(void) { }
>  
>  #endif /* CONFIG_NUMA */
> -
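
As a quick numeric illustration of the aggregation described in the changelog
above (stand-alone user-space C; the weight and sum/period values are made up):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* hypothetical task: roughly half runnable, default weight 1024 */
        uint32_t runnable_avg_sum = 23382, runnable_avg_period = 46764;
        unsigned long weight = 1024;
        unsigned long old_contrib = 0, contrib;
        uint64_t runnable_load_avg = 0;         /* parenting cfs_rq sum */

        /* task_load(t) = runnable_avg(t) * weight(t) */
        contrib = (uint64_t)runnable_avg_sum * weight /
                  (runnable_avg_period + 1);

        /* only the delta is applied to the maintained cfs_rq sum */
        runnable_load_avg += contrib - old_contrib;

        printf("load_avg_contrib=%lu runnable_load_avg=%llu\n",
               contrib, (unsigned long long)runnable_load_avg);
        return 0;
}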


* Re: [PATCH 04/16] sched: maintain the load contribution of blocked entities
  2012-06-28  2:24 ` [PATCH 04/16] sched: maintain the load contribution of blocked entities Paul Turner
@ 2012-06-29  1:27   ` Namhyung Kim
  0 siblings, 0 replies; 46+ messages in thread
From: Namhyung Kim @ 2012-06-29  1:27 UTC (permalink / raw)
  To: Paul Turner
  Cc: linux-kernel, Venki Pallipadi, Srivatsa Vaddagiri,
	Vincent Guittot, Peter Zijlstra, Nikunj A Dadhania,
	Mike Galbraith, Kamalesh Babulal, Ben Segall, Ingo Molnar,
	Paul E. McKenney, Morten Rasmussen, Vaidyanathan Srinivasan

Hi,

On Wed, 27 Jun 2012 19:24:14 -0700, Paul Turner wrote:
> We are currently maintaining:
>   runnable_load(cfs_rq) = \Sum task_load(t)
>
> For all running children t of cfs_rq.  While this can be naturally updated for
> tasks in a runnable state (as they are scheduled); this does not account for
> the load contributed by blocked task entities.
>
> This can be solved by introducing a separate accounting for blocked load:
>   blocked_load(cfs_rq) = \Sum runnable(b) * weight(b)
>
> Obviously we do not want to iterate over all blocked entities to account for
> their decay, we instead observe that:
>   runnable_load(t) = \Sum p_i*y^i
>
> and that to account for an additional idle period we only need to compute:
>   y*runnable_load(t).
>
> This means that we can compute all blocked entities at once by evaluating:
>   blocked_load(cfs_rq)` = y * blocked_load(cfs_rq)
>
> Finally we maintain a decay counter so that when a sleeping entity re-awakens
> we can determine how much of its load should be removed from the blocked sum.
>
> Signed-off-by: Paul Turner <pjt@google.com>
> Signed-off-by: Ben Segall <bsegall@google.com>
> ---
>  include/linux/sched.h |    1 
>  kernel/sched/core.c   |    3 +
>  kernel/sched/debug.c  |    3 +
>  kernel/sched/fair.c   |  130 ++++++++++++++++++++++++++++++++++++++++++++-----
>  kernel/sched/sched.h  |    4 +-
>  5 files changed, 126 insertions(+), 15 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 0c54ce0..842c4df 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1139,6 +1139,7 @@ struct load_weight {
>  struct sched_avg {
>  	u32 runnable_avg_sum, runnable_avg_period;
>  	u64 last_runnable_update;
> +	s64 decay_count;
>  	unsigned long load_avg_contrib;
>  };
>  
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 9bb7d28..aeb8e56 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1713,6 +1713,9 @@ static void __sched_fork(struct task_struct *p)
>  	p->se.vruntime			= 0;
>  	INIT_LIST_HEAD(&p->se.group_node);
>  
> +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
> +	p->se.avg.decay_count = 0;
> +#endif
>  #ifdef CONFIG_SCHEDSTATS
>  	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
>  #endif
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index aeb74e3..2aa60cf 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -95,6 +95,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
>  	P(se->avg.runnable_avg_sum);
>  	P(se->avg.runnable_avg_period);
>  	P(se->avg.load_avg_contrib);
> +	P(se->avg.decay_count);
>  #endif
>  #undef PN
>  #undef P
> @@ -230,6 +231,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
>  			atomic_read(&cfs_rq->tg->load_weight));
>  	SEQ_printf(m, "  .%-30s: %lld\n", "runnable_load_avg",
>  			cfs_rq->runnable_load_avg);
> +	SEQ_printf(m, "  .%-30s: %lld\n", "blocked_load_avg",
> +			cfs_rq->blocked_load_avg);
>  #endif
>  
>  	print_cfs_group_stats(m, cpu, cfs_rq->tg);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 8229766..6200d20 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1085,6 +1085,20 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
>  	return decayed;
>  }
>  
> +/* Synchronize an entity's decay with its parentin cfs_rq.*/
                                             parenting

> +static inline void __synchronize_entity_decay(struct sched_entity *se)
> +{
> +	struct cfs_rq *cfs_rq = cfs_rq_of(se);
> +	u64 decays = atomic64_read(&cfs_rq->decay_counter);
> +
> +	decays -= se->avg.decay_count;
> +	if (!decays)
> +		return;
> +
> +	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
> +	se->avg.decay_count += decays;
> +}
> +
>  /* Compute the current contribution to load_avg by se, return any delta */
>  static long __update_entity_load_avg_contrib(struct sched_entity *se)
>  {
> @@ -1100,8 +1114,18 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se)
>  	return se->avg.load_avg_contrib - old_contrib;
>  }
>  
> +static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
> +						 long load_contrib)
> +{
> +	if (likely(load_contrib < cfs_rq->blocked_load_avg))
> +		cfs_rq->blocked_load_avg -= load_contrib;
> +	else
> +		cfs_rq->blocked_load_avg = 0;
> +}
> +
>  /* Update a sched_entity's runnable average */
> -static inline void update_entity_load_avg(struct sched_entity *se)
> +static inline void update_entity_load_avg(struct sched_entity *se,
> +					  int update_cfs_rq)
>  {
>  	struct cfs_rq *cfs_rq = cfs_rq_of(se);
>  	long contrib_delta;
> @@ -1111,8 +1135,34 @@ static inline void update_entity_load_avg(struct sched_entity *se)
>  		return;
>  
>  	contrib_delta = __update_entity_load_avg_contrib(se);
> +
> +	if (!update_cfs_rq)
> +		return;
> +
>  	if (se->on_rq)
>  		cfs_rq->runnable_load_avg += contrib_delta;
> +	else
> +		subtract_blocked_load_contrib(cfs_rq, -contrib_delta);

Subtracting negative delta means an addition, right?


> +}
> +
> +/*
> + * Decay the load contributed by all blocked children and account this so that
> + * they their contribution may appropriately discounted when they wake up.

s/they their/their/ ?


> + */
> +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq)

I guess update_cfs_blocked_load is a bit more consistent name with
update_cfs_{load,shares}.

Thanks,
Namhyung


> +{
> +	u64 now = rq_of(cfs_rq)->clock_task >> 20;
> +	u64 decays;
> +
> +	decays = now - cfs_rq->last_decay;
> +	if (!decays)
> +		return;
> +
> +	cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
> +					      decays);
> +	atomic64_add(decays, &cfs_rq->decay_counter);
> +
> +	cfs_rq->last_decay = now;
>  }
>  
>  static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
> @@ -1122,26 +1172,56 @@ static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
>  
>  /* Add the load generated by se into cfs_rq's child load-average */
>  static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
> -						  struct sched_entity *se)
> -{
> -	update_entity_load_avg(se);
> +						  struct sched_entity *se,
> +						  int wakeup)
> +{
> +	/* we track migrations using entity decay_count == 0 */
> +	if (unlikely(!se->avg.decay_count)) {
> +		se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
> +		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
> +		wakeup = 0;
> +	} else {
> +		__synchronize_entity_decay(se);
> +	}
> +
> +	if (wakeup)
> +		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
> +
> +	update_entity_load_avg(se, 0);
>  	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
> +	update_cfs_rq_blocked_load(cfs_rq);
>  }
>  
> -/* Remove se's load from this cfs_rq child load-average */
> +/*
> + * Remove se's load from this cfs_rq child load-average, if the entity is
> + * transitioning to a blocked state we track its projected decay using
> + * blocked_load_avg.
> + */
>  static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
> -						  struct sched_entity *se)
> +						  struct sched_entity *se,
> +						  int sleep)
>  {
> -	update_entity_load_avg(se);
> +	update_entity_load_avg(se, 1);
> +
>  	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
> +	if (sleep) {
> +		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
> +		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
> +	} else {
> +		se->avg.decay_count = 0;
> +	}
>  }
>  #else
> -static inline void update_entity_load_avg(struct sched_entity *se) {}
> +static inline void update_entity_load_avg(struct sched_entity *se,
> +					  int update_cfs_rq) {}
>  static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
>  static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
> -						  struct sched_entity *se) {}
> +					   struct sched_entity *se,
> +					   int wakeup) {}
>  static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
> -						  struct sched_entity *se) {}
> +					   struct sched_entity *se,
> +					   int sleep) {}
> +static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq) {}
>  #endif
>  
>  static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
> @@ -1270,7 +1350,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>  	 */
>  	update_curr(cfs_rq);
>  	update_cfs_load(cfs_rq, 0);
> -	enqueue_entity_load_avg(cfs_rq, se);
> +	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
>  	account_entity_enqueue(cfs_rq, se);
>  	update_cfs_shares(cfs_rq);
>  
> @@ -1345,7 +1425,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>  	 * Update run-time statistics of the 'current'.
>  	 */
>  	update_curr(cfs_rq);
> -	dequeue_entity_load_avg(cfs_rq, se);
> +	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
>  
>  	update_stats_dequeue(cfs_rq, se);
>  	if (flags & DEQUEUE_SLEEP) {
> @@ -1516,7 +1596,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
>  		/* Put 'current' back into the tree. */
>  		__enqueue_entity(cfs_rq, prev);
>  		/* in !on_rq case, update occurred at dequeue */
> -		update_entity_load_avg(prev);
> +		update_entity_load_avg(prev, 1);
>  	}
>  	cfs_rq->curr = NULL;
>  }
> @@ -1532,7 +1612,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
>  	/*
>  	 * Ensure that runnable average is periodically updated.
>  	 */
> -	update_entity_load_avg(curr);
> +	update_entity_load_avg(curr, 1);
> +	update_cfs_rq_blocked_load(cfs_rq);
>  
>  	/*
>  	 * Update share accounting for long-running entities.
> @@ -2391,6 +2472,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  
>  		update_cfs_load(cfs_rq, 0);
>  		update_cfs_shares(cfs_rq);
> +		update_entity_load_avg(se, 1);
>  	}
>  
>  	if (!se) {
> @@ -2452,6 +2534,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  
>  		update_cfs_load(cfs_rq, 0);
>  		update_cfs_shares(cfs_rq);
> +		update_entity_load_avg(se, 1);
>  	}
>  
>  	if (!se) {
> @@ -3557,6 +3640,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
>  
>  	update_rq_clock(rq);
>  	update_cfs_load(cfs_rq, 1);
> +	update_cfs_rq_blocked_load(cfs_rq);
>  
>  	/*
>  	 * We need to update shares after updating tg->load_weight in
> @@ -5379,6 +5463,21 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
>  		place_entity(cfs_rq, se, 0);
>  		se->vruntime -= cfs_rq->min_vruntime;
>  	}
> +
> +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
> +	/*
> +	* Remove our load from contribution when we leave sched_fair
> +	* and ensure we don't carry in an old decay_count if we
> +	* switch back.
> +	*/
> +	if (p->se.avg.decay_count) {
> +		struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
> +		__synchronize_entity_decay(&p->se);
> +		subtract_blocked_load_contrib(cfs_rq,
> +				p->se.avg.load_avg_contrib);
> +		p->se.avg.decay_count = 0;
> +	}
> +#endif
>  }
>  
>  /*
> @@ -5425,6 +5524,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
>  #ifndef CONFIG_64BIT
>  	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
>  #endif
> +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
> +	atomic64_set(&cfs_rq->decay_counter, 1);
> +#endif
>  }
>  
>  #ifdef CONFIG_FAIR_GROUP_SCHED
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 26cc36f..a96adf1 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -229,7 +229,9 @@ struct cfs_rq {
>  	 * This allows for the description of both thread and group usage (in
>  	 * the FAIR_GROUP_SCHED case).
>  	 */
> -	u64 runnable_load_avg;
> +	u64 runnable_load_avg, blocked_load_avg;
> +	atomic64_t decay_counter;
> +	u64 last_decay;
>  #endif
>  #ifdef CONFIG_FAIR_GROUP_SCHED
>  	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
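
A toy user-space model of the blocked-load bookkeeping described in the
changelog above (illustrative only: exact arithmetic via pow() stands in for
decay_load(), and the tick/wake sequence is invented):

#include <stdio.h>
#include <stdint.h>
#include <math.h>

static uint64_t decay_load(uint64_t val, uint64_t n)
{
        return (uint64_t)(val * pow(0.5, n / 32.0));
}

struct cfs_rq { uint64_t blocked_load_avg, decay_counter; };
struct entity { uint64_t load_avg_contrib, decay_count; };

/* entity goes to sleep: park its contribution in the blocked sum */
static void block(struct cfs_rq *rq, struct entity *se)
{
        rq->blocked_load_avg += se->load_avg_contrib;
        se->decay_count = rq->decay_counter;
}

/* one ~1ms tick: decay all blocked load at once */
static void tick(struct cfs_rq *rq)
{
        rq->blocked_load_avg = decay_load(rq->blocked_load_avg, 1);
        rq->decay_counter++;
}

/* entity wakes: catch up its missed decays, then pull it out of the sum */
static void wake(struct cfs_rq *rq, struct entity *se)
{
        uint64_t missed = rq->decay_counter - se->decay_count;

        se->load_avg_contrib = decay_load(se->load_avg_contrib, missed);
        if (se->load_avg_contrib < rq->blocked_load_avg)
                rq->blocked_load_avg -= se->load_avg_contrib;
        else
                rq->blocked_load_avg = 0;
}

int main(void)
{
        struct cfs_rq rq = { 0, 1 };            /* counter starts at 1 */
        struct entity a = { 1024, 0 }, b = { 512, 0 };
        int i;

        block(&rq, &a);
        block(&rq, &b);
        for (i = 0; i < 32; i++)                /* ~32ms asleep: loads halve */
                tick(&rq);
        wake(&rq, &a);
        printf("a=%llu blocked=%llu\n",
               (unsigned long long)a.load_avg_contrib,
               (unsigned long long)rq.blocked_load_avg);
        return 0;
}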


* Re: [PATCH 05/16] sched: add an rq migration call-back to sched_class
  2012-06-28  2:24 ` [PATCH 05/16] sched: add an rq migration call-back to sched_class Paul Turner
@ 2012-06-29  1:32   ` Namhyung Kim
  0 siblings, 0 replies; 46+ messages in thread
From: Namhyung Kim @ 2012-06-29  1:32 UTC (permalink / raw)
  To: Paul Turner
  Cc: linux-kernel, Venki Pallipadi, Srivatsa Vaddagiri,
	Vincent Guittot, Peter Zijlstra, Nikunj A Dadhania,
	Mike Galbraith, Kamalesh Babulal, Ben Segall, Ingo Molnar,
	Paul E. McKenney, Morten Rasmussen, Vaidyanathan Srinivasan

On Wed, 27 Jun 2012 19:24:14 -0700, Paul Turner wrote:
> Since we are now doing bottom up load accumulation we need explicit
> notification when a task has been re-parented so that the old hierarchy can be
> updated.
>
> Adds task_migrate_rq(struct rq *prev, struct *rq new_rq);

It should be:
	migrate_task_rq(struct task_struct *p, int next_cpu);


>
> (The alternative is to do this out of __set_task_cpu, but it was suggested that
> this would be a cleaner encapsulation.)
>
> Signed-off-by: Paul Turner <pjt@google.com>
> ---
>  include/linux/sched.h |    1 +
>  kernel/sched/core.c   |    2 ++
>  kernel/sched/fair.c   |   12 ++++++++++++
>  3 files changed, 15 insertions(+), 0 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 842c4df..fdfdfab 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1102,6 +1102,7 @@ struct sched_class {
>  
>  #ifdef CONFIG_SMP
>  	int  (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
> +	void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
>  
>  	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
>  	void (*post_schedule) (struct rq *this_rq);
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index aeb8e56..c3686eb 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1109,6 +1109,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
>  	trace_sched_migrate_task(p, new_cpu);
>  
>  	if (task_cpu(p) != new_cpu) {
> +		if (p->sched_class->migrate_task_rq)
> +			p->sched_class->migrate_task_rq(p, new_cpu);
>  		p->se.nr_migrations++;
>  		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
>  	}
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6200d20..33f582a 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3089,6 +3089,17 @@ unlock:
>  
>  	return new_cpu;
>  }
> +
> +/*
> + * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
> + * cfs_rq_of(p) references at time of call are still valid and identify the
> + * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
> + * other assumptions, including rq->lock state, should be made.
> + * Caller guarantees p->pi_lock held, but nothing else.

Duplicate sentence?


> + */
> +static void
> +migrate_task_rq_fair(struct task_struct *p, int next_cpu) {

The opening brace should start on the next line.

Thanks,
Namhyung

> +}
>  #endif /* CONFIG_SMP */
>  
>  static unsigned long
> @@ -5754,6 +5765,7 @@ const struct sched_class fair_sched_class = {
>  
>  #ifdef CONFIG_SMP
>  	.select_task_rq		= select_task_rq_fair,
> +	.migrate_task_rq	= migrate_task_rq_fair,
>  
>  	.rq_online		= rq_online_fair,
>  	.rq_offline		= rq_offline_fair,


* Re: [PATCH 09/16] sched: normalize tg load contributions against runnable time
  2012-06-28  2:24 ` [PATCH 09/16] sched: normalize tg load contributions against runnable time Paul Turner
@ 2012-06-29  7:26   ` Namhyung Kim
  2012-07-04 19:48   ` Peter Zijlstra
  2012-07-06 12:23   ` Peter Zijlstra
  2 siblings, 0 replies; 46+ messages in thread
From: Namhyung Kim @ 2012-06-29  7:26 UTC (permalink / raw)
  To: Paul Turner
  Cc: linux-kernel, Venki Pallipadi, Srivatsa Vaddagiri,
	Vincent Guittot, Peter Zijlstra, Nikunj A Dadhania,
	Mike Galbraith, Kamalesh Babulal, Ben Segall, Ingo Molnar,
	Paul E. McKenney, Morten Rasmussen, Vaidyanathan Srinivasan

On Wed, 27 Jun 2012 19:24:14 -0700, Paul Turner wrote:
> Entities of equal weight should receive equitable distribution of cpu time.
> This is challenging in the case of a task_group's shares as execution may be
> occurring on multiple cpus simultaneously.
>
> To handle this we divide up the shares into weights proportionate with the load
> on each cfs_rq.  This does not however, account for the fact that the sum of
> the parts may be less than one cpu and so we need to normalize:
>   load(tg) = min(runnable_avg(tg), 1) * tg->shares
> Where runnable_avg is the aggregate time in which the task_group had runnable
> children.
>
> Signed-off-by: Paul Turner <pjt@google.com>
> Signed-off-by: Ben Segall <bsegall@google.com>.
> ---
>  kernel/sched/debug.c |    4 ++++
>  kernel/sched/fair.c  |   39 +++++++++++++++++++++++++++++++++++++++
>  kernel/sched/sched.h |    2 ++
>  3 files changed, 45 insertions(+), 0 deletions(-)
>
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 9268fb7..9334c68 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -237,6 +237,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
>  			atomic64_read(&cfs_rq->tg->load_avg));
>  	SEQ_printf(m, "  .%-30s: %lld\n", "tg_load_contrib",
>  			cfs_rq->tg_load_contrib);
> +	SEQ_printf(m, "  .%-30s: %d\n", "tg_runnable_contrib",
> +			cfs_rq->tg_runnable_contrib);
> +	SEQ_printf(m, "  .%-30s: %d\n", "tg->runnable_avg",
> +			atomic_read(&cfs_rq->tg->runnable_avg));
>  #endif
>  
>  	print_cfs_group_stats(m, cpu, cfs_rq->tg);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index a416296..91d0b21 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1117,19 +1117,56 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
>  	}
>  }
>  
> +/*
> + * Aggregate cfs_rq runnable averages into an equivalent task_group
> + * representation for computing load contributions.
> + */
> +static inline void __update_tg_runnable_avg(struct sched_avg *sa,
> +						  struct cfs_rq *cfs_rq)
> +{
> +	struct task_group *tg = cfs_rq->tg;
> +	long contrib;
> +
> +	contrib = div_u64(sa->runnable_avg_sum << 12,
> +			  sa->runnable_avg_period + 1);
> +	contrib -= cfs_rq->tg_runnable_contrib;
> +
> +	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
> +		atomic_add(contrib, &tg->runnable_avg);
> +		cfs_rq->tg_runnable_contrib += contrib;
> +	}
> +}
> +
>  static inline void __update_group_entity_contrib(struct sched_entity *se)
>  {
>  	struct cfs_rq *cfs_rq = group_cfs_rq(se);
>  	struct task_group *tg = cfs_rq->tg;
> +	int runnable_avg;
> +
>  	u64 contrib;
>  
>  	contrib = cfs_rq->tg_load_contrib * tg->shares;
>  	se->avg.load_avg_contrib = div64_u64(contrib,
>  					     atomic64_read(&tg->load_avg) + 1);
> +
> +	/*
> +	 * Unlike a task-entity, a group entity may be using >=1 cpu globally.
> +	 * However, in the case that it's using <1 cpu we need to form a
> +	 * correction term so that we contribute the same load as a task of
> +	 * equal weight. (Global runnable time is taken as a fraction over
> +	 * 2^12.)

Wouldn't it be better to use a symbolic name rather than the magic number?


> +	 */
> +	runnable_avg = atomic_read(&tg->runnable_avg);
> +	if (runnable_avg < (1<<12)) {
> +		se->avg.load_avg_contrib *= runnable_avg;
> +		se->avg.load_avg_contrib /= (1<<12);

Ditto.

Thanks,
Namhyung


> +	}
>  }
>  #else
>  static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
>  						 int force_update) {}
> +static inline void __update_tg_runnable_avg(struct sched_avg *sa,
> +						  struct cfs_rq *cfs_rq) {}
>  static inline void __update_group_entity_contrib(struct sched_entity *se) {}
>  #endif
>  
> @@ -1151,6 +1188,7 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se)
>  	if (entity_is_task(se)) {
>  		__update_task_entity_contrib(se);
>  	} else {
> +		__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
>  		__update_group_entity_contrib(se);
>  	}
>  
> @@ -1219,6 +1257,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
>  static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
>  {
>  	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
> +	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
>  }
>  
>  /* Add the load generated by se into cfs_rq's child load-average */
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 4d3b3ad..b48bbd7 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -113,6 +113,7 @@ struct task_group {
>  
>  	atomic_t load_weight;
>  	atomic64_t load_avg;
> +	atomic_t runnable_avg;
>  #endif
>  
>  #ifdef CONFIG_RT_GROUP_SCHED
> @@ -234,6 +235,7 @@ struct cfs_rq {
>  	atomic64_t decay_counter, removed_load;
>  	u64 last_decay;
>  #ifdef CONFIG_FAIR_GROUP_SCHED
> +	u32 tg_runnable_contrib;
>  	u64 tg_load_contrib;
>  #endif
>  #endif
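
To make the suggestion concrete, a stand-alone sketch of the correction term
with the 2^12 unit given a symbolic name (the names and input values here are
illustrative, not taken from the patch):

#include <stdio.h>
#include <stdint.h>

#define RUNNABLE_AVG_SHIFT      12
#define RUNNABLE_AVG_ONE        (1 << RUNNABLE_AVG_SHIFT)  /* "one cpu" */

int main(void)
{
        uint64_t shares = 1024;                 /* tg->shares */
        uint64_t tg_load_contrib = 512, tg_load_avg = 512;
        /* group runnable for ~25% of one cpu, expressed over 2^12 */
        int runnable_avg = RUNNABLE_AVG_ONE / 4;
        uint64_t contrib;

        contrib = tg_load_contrib * shares / (tg_load_avg + 1);

        /* using less than one cpu: scale down to match a task of equal weight */
        if (runnable_avg < RUNNABLE_AVG_ONE) {
                contrib *= runnable_avg;
                contrib >>= RUNNABLE_AVG_SHIFT;
        }
        printf("load_avg_contrib=%llu\n", (unsigned long long)contrib);
        return 0;
}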


* Re: [PATCH 12/16] sched: refactor update_shares_cpu() -> update_blocked_avgs()
  2012-06-28  2:24 ` [PATCH 12/16] sched: refactor update_shares_cpu() -> update_blocked_avgs() Paul Turner
@ 2012-06-29  7:28   ` Namhyung Kim
  2012-07-12  0:03     ` Paul Turner
  2012-07-05 11:58   ` Peter Zijlstra
  1 sibling, 1 reply; 46+ messages in thread
From: Namhyung Kim @ 2012-06-29  7:28 UTC (permalink / raw)
  To: Paul Turner
  Cc: linux-kernel, Venki Pallipadi, Srivatsa Vaddagiri,
	Vincent Guittot, Peter Zijlstra, Nikunj A Dadhania,
	Mike Galbraith, Kamalesh Babulal, Ben Segall, Ingo Molnar,
	Paul E. McKenney, Morten Rasmussen, Vaidyanathan Srinivasan

On Wed, 27 Jun 2012 19:24:15 -0700, Paul Turner wrote:
> Now that running entities maintain their own load-averages the work we must do
> in update_shares() is largely restricted to the periodic decay of blocked
> entities.  This allows us to be a little less pessimistic regarding our
> occupancy on rq->lock and the associated rq->clock updates required.
>
> Signed-off-by: Paul Turner <pjt@google.com>
> ---
>  kernel/sched/fair.c |   59 +++++++++++++++++++++++++++++----------------------
>  1 files changed, 33 insertions(+), 26 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 4a9a828..dd1ef8a 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3678,23 +3678,20 @@ out:
>  /*
>   * update tg->load_weight by folding this cpu's load_avg
>   */
> -static int update_shares_cpu(struct task_group *tg, int cpu)
> +static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
>  {
> -	struct sched_entity *se;
> -	struct cfs_rq *cfs_rq;
> -	unsigned long flags;
> -	struct rq *rq;
> -
> -
> -	rq = cpu_rq(cpu);
> -	se = tg->se[cpu];
> -	cfs_rq = tg->cfs_rq[cpu];
> +	struct sched_entity *se = tg->se[cpu];
> +	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
>  
> -	raw_spin_lock_irqsave(&rq->lock, flags);
> +	/* throttled entities do not contribute to load */
> +	if (throttled_hierarchy(cfs_rq))
> +		return;
>  
> -	update_rq_clock(rq);
>  	update_cfs_rq_blocked_load(cfs_rq, 1);
> -	update_entity_load_avg(tg->se[cpu], 1);
> +	if (se)
> +		update_entity_load_avg(se, 1);
> +	else
> +		update_rq_runnable_avg(rq_of(cfs_rq), 1);
>  
>  	if (se) {
>  		/*
> @@ -3707,29 +3704,39 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
>  		else
>  			list_del_leaf_cfs_rq(cfs_rq);
>  	}
> -
> -	raw_spin_unlock_irqrestore(&rq->lock, flags);
> -
> -	return 0;
>  }
>  
> -static void update_shares(int cpu)
> +static void update_blocked_averages(int cpu)
>  {
> -	struct cfs_rq *cfs_rq;
>  	struct rq *rq = cpu_rq(cpu);
> +	struct cfs_rq *cfs_rq;
> +
> +	unsigned long flags;
> +	int num_updates = 0;
>  
>  	rcu_read_lock();
> +	raw_spin_lock_irqsave(&rq->lock, flags);
> +	update_rq_clock(rq);
>  	/*
>  	 * Iterates the task_group tree in a bottom up fashion, see
>  	 * list_add_leaf_cfs_rq() for details.
>  	 */
>  	for_each_leaf_cfs_rq(rq, cfs_rq) {
> -		/* throttled entities do not contribute to load */
> -		if (throttled_hierarchy(cfs_rq))
> -			continue;
> +		__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
>  
> -		update_shares_cpu(cfs_rq->tg, cpu);
> +		/*
> +		 * Periodically release the lock so that a cfs_rq with many
> +		 * children cannot hold it for an arbitrary period of time.
> +		 */
> +		if (num_updates++ % 20 == 0) {

Should it be '++num_updates'? Otherwise, it'll release the lock at the
first iteration?

Thanks,
Namhyung


> +			raw_spin_unlock_irqrestore(&rq->lock, flags);
> +			cpu_relax();
> +			raw_spin_lock_irqsave(&rq->lock, flags);
> +			update_rq_clock(rq);
> +		}
>  	}
> +
> +	raw_spin_unlock_irqrestore(&rq->lock, flags);
>  	rcu_read_unlock();
>  }
>  
> @@ -3774,7 +3781,7 @@ unsigned long task_h_load(struct task_struct *p)
>  	return load;
>  }
>  #else
> -static inline void update_shares(int cpu)
> +static inline void update_blocked_averages(int cpu)
>  {
>  }
>  
> @@ -4936,7 +4943,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
>  	 */
>  	raw_spin_unlock(&this_rq->lock);
>  
> -	update_shares(this_cpu);
> +	update_blocked_averages(this_cpu);
>  	rcu_read_lock();
>  	for_each_domain(this_cpu, sd) {
>  		unsigned long interval;
> @@ -5196,7 +5203,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
>  	int update_next_balance = 0;
>  	int need_serialize;
>  
> -	update_shares(cpu);
> +	update_blocked_averages(cpu);
>  
>  	rcu_read_lock();
>  	for_each_domain(cpu, sd) {
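
A small stand-alone demo of the off-by-one being asked about: with the
post-increment form the lock-drop branch is taken on the very first
iteration, with the pre-increment form only after every 20th update.

#include <stdio.h>

int main(void)
{
        int i, num_updates;

        num_updates = 0;
        for (i = 0; i < 3; i++)
                if (num_updates++ % 20 == 0)
                        printf("post-increment: drop at iteration %d\n", i);

        num_updates = 0;
        for (i = 0; i < 40; i++)
                if (++num_updates % 20 == 0)
                        printf("pre-increment: drop at iteration %d\n", i);

        return 0;
}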


* Re: [PATCH 03/16] sched: aggregate load contributed by task entities on parenting cfs_rq
  2012-06-28  2:24 ` [PATCH 03/16] sched: aggregate load contributed by task entities on parenting cfs_rq Paul Turner
  2012-06-28  6:33   ` Namhyung Kim
@ 2012-07-04 15:28   ` Peter Zijlstra
  2012-07-06 14:53     ` Peter Zijlstra
  1 sibling, 1 reply; 46+ messages in thread
From: Peter Zijlstra @ 2012-07-04 15:28 UTC (permalink / raw)
  To: Paul Turner
  Cc: linux-kernel, Venki Pallipadi, Srivatsa Vaddagiri,
	Vincent Guittot, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

On Wed, 2012-06-27 at 19:24 -0700, Paul Turner wrote:
> For a given task t, we can compute its contribution to load as:
>   task_load(t) = runnable_avg(t) * weight(t)
> 
> On a parenting cfs_rq we can then aggregate
>   runnable_load(cfs_rq) = \Sum task_load(t), for all runnable children t
> 
> Maintain this bottom up, with task entities adding their contributed load to
> the parenting cfs_rq sum.  When a task entities load changes we add the same
> delta to the maintained sum.
> 
> Signed-off-by: Paul Turner <pjt@google.com>
> Signed-off-by: Ben Segall <bsegall@google.com> 

A lot of patches have this funny sob trail.. Ben never send me these
patches, so uhm. ?

Should that be Reviewed-by, or what is the deal with those?



* Re: [PATCH 01/16] sched: track the runnable average on a per-task entitiy basis
  2012-06-28  2:24 ` [PATCH 01/16] sched: track the runnable average on a per-task entitiy basis Paul Turner
  2012-06-28  6:06   ` Namhyung Kim
@ 2012-07-04 15:32   ` Peter Zijlstra
  2012-07-12  0:12     ` Paul Turner
  1 sibling, 1 reply; 46+ messages in thread
From: Peter Zijlstra @ 2012-07-04 15:32 UTC (permalink / raw)
  To: Paul Turner
  Cc: linux-kernel, Venki Pallipadi, Srivatsa Vaddagiri,
	Vincent Guittot, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

On Wed, 2012-06-27 at 19:24 -0700, Paul Turner wrote:
> Instead of tracking averaging the load parented by a cfs_rq, we can track
> entity load directly.  With the load for a given cfs_Rq then being the sum of
> its children.
> 
> To do this we represent the historical contribution to runnable average within each
> trailing 1024us of execution as the coefficients of a geometric series.
> 
> We can express this for a given task t as:
>   runnable_sum(t) = \Sum u_i * y^i ,
>   load(t) = weight_t * runnable_sum(t) / (\Sum 1024 * y^i)
> 
> Where: u_i is the usage in the last i`th 1024us period (approximately 1ms) ~ms
> and y is chosen such that y^k = 1/2.  We currently choose k to be 32 which
> roughly translates to about a sched period.
> 
> Signed-off-by: Paul Turner <pjt@google.com>
> ---
>  include/linux/sched.h |    8 +++
>  kernel/sched/debug.c  |    4 ++
>  kernel/sched/fair.c   |  128 +++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 140 insertions(+), 0 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 9dced2e..5bf5c79 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1136,6 +1136,11 @@ struct load_weight {
>         unsigned long weight, inv_weight;
>  };
>  
> +struct sched_avg {
> +       u32 runnable_avg_sum, runnable_avg_period;
> +       u64 last_runnable_update;
> +}; 


So we can use u32 because:

             n         1 
lim n->inf \Sum y^i = --- = ~ 46.66804636511427012122 ; y^32 = 0.5
             i=0      1-y

So the values should never be larger than ~47k, right?

/me goes add something like that in a comment.
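
For reference, a stand-alone double-precision check of that bound (nothing
kernel-specific is assumed here):

#include <stdio.h>
#include <math.h>

int main(void)
{
        double y = pow(0.5, 1.0 / 32.0);        /* y^32 = 0.5 */
        double sum = 0.0;
        int i;

        /* running sum 1024 * \Sum y^i; later terms are negligible */
        for (i = 0; i < 1000; i++)
                sum += 1024.0 * pow(y, i);

        printf("y=%.6f 1/(1-y)=%.6f 1024/(1-y)=%.2f sum=%.2f\n",
               y, 1.0 / (1.0 - y), 1024.0 / (1.0 - y), sum);
        return 0;
}

This prints y ~= 0.978572 and a limit of ~47788, comfortably inside u32.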



* Re: [PATCH 14/16] sched: make __update_entity_runnable_avg() fast
  2012-06-28  2:24 ` [PATCH 14/16] sched: make __update_entity_runnable_avg() fast Paul Turner
@ 2012-07-04 15:41   ` Peter Zijlstra
  2012-07-04 17:20     ` Peter Zijlstra
  2012-07-04 16:51   ` Peter Zijlstra
  1 sibling, 1 reply; 46+ messages in thread
From: Peter Zijlstra @ 2012-07-04 15:41 UTC (permalink / raw)
  To: Paul Turner
  Cc: linux-kernel, Venki Pallipadi, Srivatsa Vaddagiri,
	Vincent Guittot, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

On Wed, 2012-06-27 at 19:24 -0700, Paul Turner wrote:
> +#define LOAD_AVG_MAX 46742 /* maximum possible load avg */

With 1024 * 1/(1-y), with y^32 = 0.5 I get 47788.. sup?



* Re: [PATCH 14/16] sched: make __update_entity_runnable_avg() fast
  2012-06-28  2:24 ` [PATCH 14/16] sched: make __update_entity_runnable_avg() fast Paul Turner
  2012-07-04 15:41   ` Peter Zijlstra
@ 2012-07-04 16:51   ` Peter Zijlstra
  1 sibling, 0 replies; 46+ messages in thread
From: Peter Zijlstra @ 2012-07-04 16:51 UTC (permalink / raw)
  To: Paul Turner
  Cc: linux-kernel, Venki Pallipadi, Srivatsa Vaddagiri,
	Vincent Guittot, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

On Wed, 2012-06-27 at 19:24 -0700, Paul Turner wrote:
> +#define LOAD_AVG_MAX_N 516 /* number of full periods it takes to produce max */

y=sqrt(sqrt(sqrt(sqrt(sqrt(0.5)))))
l(1/1024)/l(y)
319.99999999999999998938
1024*y^320
.99999999999999998976

So after only 320 periods the contribution of the term is too small to
matter.



* Re: [PATCH 14/16] sched: make __update_entity_runnable_avg() fast
  2012-07-04 15:41   ` Peter Zijlstra
@ 2012-07-04 17:20     ` Peter Zijlstra
  2012-07-09 20:18       ` Benjamin Segall
  0 siblings, 1 reply; 46+ messages in thread
From: Peter Zijlstra @ 2012-07-04 17:20 UTC (permalink / raw)
  To: Paul Turner
  Cc: linux-kernel, Venki Pallipadi, Srivatsa Vaddagiri,
	Vincent Guittot, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

On Wed, 2012-07-04 at 17:41 +0200, Peter Zijlstra wrote:
> On Wed, 2012-06-27 at 19:24 -0700, Paul Turner wrote:
> > +#define LOAD_AVG_MAX 46742 /* maximum possible load avg */
> 
> With 1024 * 1/(1-y), with y^32 = 0.5 I get 47788.. sup?

Ah, if we do the series starting at 1 we get: 1024*y/(1-y) which yields:
46764. So you're not adding the first term u_0?
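
A stand-alone check that the two closed forms differ by exactly the first
term (u_0 = 1024):

#include <stdio.h>
#include <math.h>

int main(void)
{
        double y = pow(0.5, 1.0 / 32.0);
        double with_u0 = 1024.0 / (1.0 - y);        /* sum over i = 0.. */
        double without_u0 = 1024.0 * y / (1.0 - y); /* sum over i = 1.. */

        printf("%.2f - %.2f = %.2f\n",
               with_u0, without_u0, with_u0 - without_u0);
        return 0;
}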



* Re: [PATCH 09/16] sched: normalize tg load contributions against runnable time
  2012-06-28  2:24 ` [PATCH 09/16] sched: normalize tg load contributions against runnable time Paul Turner
  2012-06-29  7:26   ` Namhyung Kim
@ 2012-07-04 19:48   ` Peter Zijlstra
  2012-07-06 11:52     ` Peter Zijlstra
  2012-07-12  0:02     ` Paul Turner
  2012-07-06 12:23   ` Peter Zijlstra
  2 siblings, 2 replies; 46+ messages in thread
From: Peter Zijlstra @ 2012-07-04 19:48 UTC (permalink / raw)
  To: Paul Turner
  Cc: linux-kernel, Venki Pallipadi, Srivatsa Vaddagiri,
	Vincent Guittot, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

On Wed, 2012-06-27 at 19:24 -0700, Paul Turner wrote:
> Entities of equal weight should receive equitable distribution of cpu time.
> This is challenging in the case of a task_group's shares as execution may be
> occurring on multiple cpus simultaneously.
> 
> To handle this we divide up the shares into weights proportionate with the load
> on each cfs_rq.  This does not however, account for the fact that the sum of
> the parts may be less than one cpu and so we need to normalize:
>   load(tg) = min(runnable_avg(tg), 1) * tg->shares
> Where runnable_avg is the aggregate time in which the task_group had runnable
> children. 

I remember we had a bit of a discussion on this last time, I thought you
were going to convince me this approximation was 'right'.

Care to still do so.. the rationale used should at least live in a
comment somewhere, otherwise someone will go silly trying to understand
things later on.



* Re: [PATCH 12/16] sched: refactor update_shares_cpu() -> update_blocked_avgs()
  2012-06-28  2:24 ` [PATCH 12/16] sched: refactor update_shares_cpu() -> update_blocked_avgs() Paul Turner
  2012-06-29  7:28   ` Namhyung Kim
@ 2012-07-05 11:58   ` Peter Zijlstra
  2012-07-12  0:11     ` Paul Turner
  1 sibling, 1 reply; 46+ messages in thread
From: Peter Zijlstra @ 2012-07-05 11:58 UTC (permalink / raw)
  To: Paul Turner
  Cc: linux-kernel, Venki Pallipadi, Srivatsa Vaddagiri,
	Vincent Guittot, Nikunj A Dadhania

On Wed, 2012-06-27 at 19:24 -0700, Paul Turner wrote:
> Now that running entities maintain their own load-averages the work we must do
> in update_shares() is largely restricted to the periodic decay of blocked
> entities.  This allows us to be a little less pessimistic regarding our
> occupancy on rq->lock and the associated rq->clock updates required.

So what you're saying is that since 'weight' now includes runtime
behaviour (where we hope the recent past matches the near future) we
don't need to update shares quite as often since the effect of
sleep-wakeup cycles isn't near as big since they're already anticipated.

So how is the decay of blocked load still significant? Surely that too
is mostly part of the anticipated sleep/wake cycle already caught in the
runtime behaviour.

Or is this the primary place where we decay? If so that wasn't obvious
and thus wants a comment someplace.

> Signed-off-by: Paul Turner <pjt@google.com>
> ---

> +static void update_blocked_averages(int cpu)
>  {
>  	struct rq *rq = cpu_rq(cpu);
> +	struct cfs_rq *cfs_rq;
> +
> +	unsigned long flags;
> +	int num_updates = 0;
>  
>  	rcu_read_lock();
> +	raw_spin_lock_irqsave(&rq->lock, flags);
> +	update_rq_clock(rq);
>  	/*
>  	 * Iterates the task_group tree in a bottom up fashion, see
>  	 * list_add_leaf_cfs_rq() for details.
>  	 */
>  	for_each_leaf_cfs_rq(rq, cfs_rq) {
> +		__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
>  
> +		/*
> +		 * Periodically release the lock so that a cfs_rq with many
> +		 * children cannot hold it for an arbitrary period of time.
> +		 */
> +		if (num_updates++ % 20 == 0) {
> +			raw_spin_unlock_irqrestore(&rq->lock, flags);
> +			cpu_relax();
> +			raw_spin_lock_irqsave(&rq->lock, flags);

Gack.. that's not real pretty is it.. Esp. since we're still holding RCU
lock and are thus (mostly) still not preemptable.

How much of a problem was this? The changelog is silent on this.

> +			update_rq_clock(rq);
> +		}
>  	}
> +
> +	raw_spin_unlock_irqrestore(&rq->lock, flags);
>  	rcu_read_unlock();
>  }
>  





* Re: [PATCH 09/16] sched: normalize tg load contributions against runnable time
  2012-07-04 19:48   ` Peter Zijlstra
@ 2012-07-06 11:52     ` Peter Zijlstra
  2012-07-12  1:08       ` Andre Noll
  2012-07-12  0:02     ` Paul Turner
  1 sibling, 1 reply; 46+ messages in thread
From: Peter Zijlstra @ 2012-07-06 11:52 UTC (permalink / raw)
  To: Paul Turner
  Cc: linux-kernel, Venki Pallipadi, Srivatsa Vaddagiri,
	Vincent Guittot, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

On Wed, 2012-07-04 at 21:48 +0200, Peter Zijlstra wrote:
> On Wed, 2012-06-27 at 19:24 -0700, Paul Turner wrote:
> > Entities of equal weight should receive equitable distribution of cpu time.
> > This is challenging in the case of a task_group's shares as execution may be
> > occurring on multiple cpus simultaneously.
> > 
> > To handle this we divide up the shares into weights proportionate with the load
> > on each cfs_rq.  This does not however, account for the fact that the sum of
> > the parts may be less than one cpu and so we need to normalize:
> >   load(tg) = min(runnable_avg(tg), 1) * tg->shares
> > Where runnable_avg is the aggregate time in which the task_group had runnable
> > children. 
> 
> I remember we had a bit of a discussion on this last time, I thought you
> were going to convince me this approximation was 'right'.
> 
> Care to still do so.. the rationale used should at least live in a
> comment somewhere, otherwise someone will go silly trying to understand
> things later on.

So if we treat the per-cpu utilization u_i as probability, then we're
looking for:

  P(\Union_{i=1..n} u_i) := 
	\Sum_{k=1..n} (-1)^(k-1) P(\Intersection_{i=1..k} u_i)

Computing this however is far too expensive, what we can do is
approximate by setting u = avg(u_i) and then using:

  u_i == u_j for all i,j

and assuming all variables are independent, giving us:

  P(A \Intersection B) = P(A)P(B)

This then yields:

  P(\Union_{i=1..n} u_i) ~= \Sum_{k=1..n} (-1)^(k-1) (n choose k) u^k

Which unfortunately isn't a series I found a sane solution for, but
numerically (see below) we can see it very quickly approaches 1 when n
>> 1.

Therefore, the chosen approximation of min(1, \Sum_i u_i) is indeed a
sane approximation since for very small u_i and/or small n the sum is
less likely to exceed 1 and for big u_i and/or big n the clip to 1 is
indeed correct.

*phew*

Was this what you meant? :-)

Now all that is left is grok the actual code..


probability_union.bc
---


define f (x) {
	if (x <= 1) return (1);
	return (f(x-1) * x);
}

define choose (n,k) {
	return f(n) / (f(n-k) * f(k));
}

define pu (p,n) {
	auto s, k

	s = 0;
	for (k = 1; k <= n; k++) {
		s += (-1)^(k-1) * choose(n,k) * p^k;
	}

	return s;
}


for (n=2; n<128; n*=2) {
	print n, ": "
	for (p = 1; p < 11; p++) {
		print pu(p/10,n), " "
	}
	print "\n"
}
quit


* Re: [PATCH 09/16] sched: normalize tg load contributions against runnable time
  2012-06-28  2:24 ` [PATCH 09/16] sched: normalize tg load contributions against runnable time Paul Turner
  2012-06-29  7:26   ` Namhyung Kim
  2012-07-04 19:48   ` Peter Zijlstra
@ 2012-07-06 12:23   ` Peter Zijlstra
  2 siblings, 0 replies; 46+ messages in thread
From: Peter Zijlstra @ 2012-07-06 12:23 UTC (permalink / raw)
  To: Paul Turner
  Cc: linux-kernel, Venki Pallipadi, Srivatsa Vaddagiri,
	Vincent Guittot, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

On Wed, 2012-06-27 at 19:24 -0700, Paul Turner wrote:

> +       contrib = div_u64(sa->runnable_avg_sum << 12,
> +                         sa->runnable_avg_period + 1);

> +
> +       /*
> +        * Unlike a task-entity, a group entity may be using >=1 cpu globally.
> +        * However, in the case that it's using <1 cpu we need to form a
> +        * correction term so that we contribute the same load as a task of
> +        * equal weight. (Global runnable time is taken as a fraction over
> +        * 2^12.)
> +        */
> +       runnable_avg = atomic_read(&tg->runnable_avg);
> +       if (runnable_avg < (1<<12)) {
> +               se->avg.load_avg_contrib *= runnable_avg;
> +               se->avg.load_avg_contrib /= (1<<12);
> +       }


Did we really have to open-code a 4096 fixed point here? Couldn't we
re-use some of the existing _SCALE things.. if not why 12?




* Re: [PATCH 03/16] sched: aggregate load contributed by task entities on parenting cfs_rq
  2012-07-04 15:28   ` Peter Zijlstra
@ 2012-07-06 14:53     ` Peter Zijlstra
  2012-07-09  9:15       ` Ingo Molnar
  0 siblings, 1 reply; 46+ messages in thread
From: Peter Zijlstra @ 2012-07-06 14:53 UTC (permalink / raw)
  To: Paul Turner
  Cc: linux-kernel, Venki Pallipadi, Srivatsa Vaddagiri,
	Vincent Guittot, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

On Wed, 2012-07-04 at 17:28 +0200, Peter Zijlstra wrote:
> On Wed, 2012-06-27 at 19:24 -0700, Paul Turner wrote:
> > For a given task t, we can compute its contribution to load as:
> >   task_load(t) = runnable_avg(t) * weight(t)
> > 
> > On a parenting cfs_rq we can then aggregate
> >   runnable_load(cfs_rq) = \Sum task_load(t), for all runnable children t
> > 
> > Maintain this bottom up, with task entities adding their contributed load to
> > the parenting cfs_rq sum.  When a task entities load changes we add the same
> > delta to the maintained sum.
> > 
> > Signed-off-by: Paul Turner <pjt@google.com>
> > Signed-off-by: Ben Segall <bsegall@google.com> 
> 
> A lot of patches have this funny sob trail.. Ben never send me these
> patches, so uhm. ?
> 
> Should that be Reviewed-by, or what is the deal with those?

Ben could you clarify what your exact contribution was?

Ingo, supposing Ben is co-author and wrote a significant part of the
patch, what are we supposed to do with these tags?


* Re: [PATCH 03/16] sched: aggregate load contributed by task entities on parenting cfs_rq
  2012-07-06 14:53     ` Peter Zijlstra
@ 2012-07-09  9:15       ` Ingo Molnar
  0 siblings, 0 replies; 46+ messages in thread
From: Ingo Molnar @ 2012-07-09  9:15 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Paul Turner, linux-kernel, Venki Pallipadi, Srivatsa Vaddagiri,
	Vincent Guittot, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Wed, 2012-07-04 at 17:28 +0200, Peter Zijlstra wrote:
> > On Wed, 2012-06-27 at 19:24 -0700, Paul Turner wrote:
> > > For a given task t, we can compute its contribution to load as:
> > >   task_load(t) = runnable_avg(t) * weight(t)
> > > 
> > > On a parenting cfs_rq we can then aggregate
> > >   runnable_load(cfs_rq) = \Sum task_load(t), for all runnable children t
> > > 
> > > Maintain this bottom up, with task entities adding their contributed load to
> > > the parenting cfs_rq sum.  When a task entities load changes we add the same
> > > delta to the maintained sum.
> > > 
> > > Signed-off-by: Paul Turner <pjt@google.com>
> > > Signed-off-by: Ben Segall <bsegall@google.com> 
> > 
> > A lot of patches have this funny sob trail.. Ben never send me these
> > patches, so uhm. ?
> > 
> > Should that be Reviewed-by, or what is the deal with those?
> 
> Ben could you clarify what your exact contribution was?
> 
> Ingo, supposing Ben is co-author and wrote a significant part 
> of the patch, what are we supposed to do with these tags?

There can be only one author SOB - additional contributions 
should be reflected via credits in the changelog, copyright 
lines and/or Originally-From: tags, or so.

Thanks,

	Ingo


* Re: [PATCH 14/16] sched: make __update_entity_runnable_avg() fast
  2012-07-04 17:20     ` Peter Zijlstra
@ 2012-07-09 20:18       ` Benjamin Segall
  2012-07-10 10:51         ` Peter Zijlstra
  0 siblings, 1 reply; 46+ messages in thread
From: Benjamin Segall @ 2012-07-09 20:18 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Paul Turner, linux-kernel, Venki Pallipadi, Srivatsa Vaddagiri,
	Vincent Guittot, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

Correct, the sum is 1..n. The maximum was chosen as 516/46742 because
that is the point at which the loop + fixed-point approximation being
used reaches its maximum, even if the ideal y^n series would cap out
slightly differently.


* Re: [PATCH 14/16] sched: make __update_entity_runnable_avg() fast
  2012-07-09 20:18       ` Benjamin Segall
@ 2012-07-10 10:51         ` Peter Zijlstra
  2012-07-12  0:15           ` Paul Turner
  0 siblings, 1 reply; 46+ messages in thread
From: Peter Zijlstra @ 2012-07-10 10:51 UTC (permalink / raw)
  To: Benjamin Segall
  Cc: Paul Turner, linux-kernel, Venki Pallipadi, Srivatsa Vaddagiri,
	Vincent Guittot, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

On Mon, 2012-07-09 at 13:18 -0700, Benjamin Segall wrote:
> Correct, the sum is 1..n. The maximum was chosen as 516/46742 because
> that is the point when the approximation of loop + fixed point math
> being used reaches a maximum, even if the ideal y^n series would cap out
> slightly differently.

Can you run a simple kid like me through the math here.. it seems
strange that fixed point math would give a higher order term than the
regular case. And by such a wide margin as well, (516-320)/32 ~ 6, so
that's 1/64-th of the 320-th term.




* Re: [PATCH 09/16] sched: normalize tg load contributions against runnable time
  2012-07-04 19:48   ` Peter Zijlstra
  2012-07-06 11:52     ` Peter Zijlstra
@ 2012-07-12  0:02     ` Paul Turner
  1 sibling, 0 replies; 46+ messages in thread
From: Paul Turner @ 2012-07-12  0:02 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Venki Pallipadi, Srivatsa Vaddagiri,
	Vincent Guittot, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

On Wed, Jul 4, 2012 at 12:48 PM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
>
> On Wed, 2012-06-27 at 19:24 -0700, Paul Turner wrote:
> > Entities of equal weight should receive equitable distribution of cpu time.
> > This is challenging in the case of a task_group's shares as execution may be
> > occurring on multiple cpus simultaneously.
> >
> > To handle this we divide up the shares into weights proportionate with the load
> > on each cfs_rq.  This does not however, account for the fact that the sum of
> > the parts may be less than one cpu and so we need to normalize:
> >   load(tg) = min(runnable_avg(tg), 1) * tg->shares
> > Where runnable_avg is the aggregate time in which the task_group had runnable
> > children.
>
> I remember we had a bit of a discussion on this last time, I thought you
> were going to convince me this approximation was 'right'.


I thought I had :-)

Here's what I wrote previously:

"""
So, as an estimate it has the nice property that it's always a lower
bound on the true value.

Consider the two cases for runnable contributions across a pair of cpus:

Either they are disjoint by wall time, in which case they would have
accounted for the same amount were they actually on the same cpu.
Or they overlap, in which case that over-lap period would be an
additional run-delay incurred on one of them.

Then, the maximum amount we understate for a given overlap is n(n+1)/2
* k where k is the width of the overlap and n is the number of
cpus we over-lapped on.

But then, n is bounded by num_cpus -- so on a small machine our error
is bounded (e.g. we can be no more than 1/3 less than true on 2
cpus). On a larger machine this term can be larger, however, if k is
of consequential size we know we accounted at least n*k so we're still
going to approach the ceiling of 1 very quickly at which point it
doesn't matter that we under-estimated. The flip side on a larger
machine is that if k is of inconsequential size then n^2 * k is still
tiny relative to the size of the machine and the accuracy of the
approximation matters less.
"""

I approach it from a slightly different angle; nonetheless, it's
consistent with your analysis in your next reply.

What I forgot to do here was add the comments to this effect.  Will do.


>
>
> Care to still do so.. the rationale used should at least live in a
> comment somewhere, otherwise someone will go silly trying to understand
> things later on.
>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 12/16] sched: refactor update_shares_cpu() -> update_blocked_avgs()
  2012-06-29  7:28   ` Namhyung Kim
@ 2012-07-12  0:03     ` Paul Turner
  0 siblings, 0 replies; 46+ messages in thread
From: Paul Turner @ 2012-07-12  0:03 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: linux-kernel, Venki Pallipadi, Srivatsa Vaddagiri,
	Vincent Guittot, Peter Zijlstra, Nikunj A Dadhania,
	Mike Galbraith, Kamalesh Babulal, Ben Segall, Ingo Molnar,
	Paul E. McKenney, Morten Rasmussen, Vaidyanathan Srinivasan

On Fri, Jun 29, 2012 at 12:28 AM, Namhyung Kim <namhyung@kernel.org> wrote:
> On Wed, 27 Jun 2012 19:24:15 -0700, Paul Turner wrote:
>> Now that running entities maintain their own load-averages the work we must do
>> in update_shares() is largely restricted to the periodic decay of blocked
>> entities.  This allows us to be a little less pessimistic regarding our
>> occupancy on rq->lock and the associated rq->clock updates required.
>>
>> Signed-off-by: Paul Turner <pjt@google.com>
>> ---
>>  kernel/sched/fair.c |   59 +++++++++++++++++++++++++++++----------------------
>>  1 files changed, 33 insertions(+), 26 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 4a9a828..dd1ef8a 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -3678,23 +3678,20 @@ out:
>>  /*
>>   * update tg->load_weight by folding this cpu's load_avg
>>   */
>> -static int update_shares_cpu(struct task_group *tg, int cpu)
>> +static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
>>  {
>> -     struct sched_entity *se;
>> -     struct cfs_rq *cfs_rq;
>> -     unsigned long flags;
>> -     struct rq *rq;
>> -
>> -
>> -     rq = cpu_rq(cpu);
>> -     se = tg->se[cpu];
>> -     cfs_rq = tg->cfs_rq[cpu];
>> +     struct sched_entity *se = tg->se[cpu];
>> +     struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
>>
>> -     raw_spin_lock_irqsave(&rq->lock, flags);
>> +     /* throttled entities do not contribute to load */
>> +     if (throttled_hierarchy(cfs_rq))
>> +             return;
>>
>> -     update_rq_clock(rq);
>>       update_cfs_rq_blocked_load(cfs_rq, 1);
>> -     update_entity_load_avg(tg->se[cpu], 1);
>> +     if (se)
>> +             update_entity_load_avg(se, 1);
>> +     else
>> +             update_rq_runnable_avg(rq_of(cfs_rq), 1);
>>
>>       if (se) {
>>               /*
>> @@ -3707,29 +3704,39 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
>>               else
>>                       list_del_leaf_cfs_rq(cfs_rq);
>>       }
>> -
>> -     raw_spin_unlock_irqrestore(&rq->lock, flags);
>> -
>> -     return 0;
>>  }
>>
>> -static void update_shares(int cpu)
>> +static void update_blocked_averages(int cpu)
>>  {
>> -     struct cfs_rq *cfs_rq;
>>       struct rq *rq = cpu_rq(cpu);
>> +     struct cfs_rq *cfs_rq;
>> +
>> +     unsigned long flags;
>> +     int num_updates = 0;
>>
>>       rcu_read_lock();
>> +     raw_spin_lock_irqsave(&rq->lock, flags);
>> +     update_rq_clock(rq);
>>       /*
>>        * Iterates the task_group tree in a bottom up fashion, see
>>        * list_add_leaf_cfs_rq() for details.
>>        */
>>       for_each_leaf_cfs_rq(rq, cfs_rq) {
>> -             /* throttled entities do not contribute to load */
>> -             if (throttled_hierarchy(cfs_rq))
>> -                     continue;
>> +             __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
>>
>> -             update_shares_cpu(cfs_rq->tg, cpu);
>> +             /*
>> +              * Periodically release the lock so that a cfs_rq with many
>> +              * children cannot hold it for an arbitrary period of time.
>> +              */
>> +             if (num_updates++ % 20 == 0) {
>
> Should it be '++num_updates'? Otherwise, it'll release the lock at the
> first iteration?

Yes -- applied.  Thanks!

>
> Thanks,
> Namhyung
>
>
>> +                     raw_spin_unlock_irqrestore(&rq->lock, flags);
>> +                     cpu_relax();
>> +                     raw_spin_lock_irqsave(&rq->lock, flags);
>> +                     update_rq_clock(rq);
>> +             }
>>       }
>> +
>> +     raw_spin_unlock_irqrestore(&rq->lock, flags);
>>       rcu_read_unlock();
>>  }
>>
>> @@ -3774,7 +3781,7 @@ unsigned long task_h_load(struct task_struct *p)
>>       return load;
>>  }
>>  #else
>> -static inline void update_shares(int cpu)
>> +static inline void update_blocked_averages(int cpu)
>>  {
>>  }
>>
>> @@ -4936,7 +4943,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
>>        */
>>       raw_spin_unlock(&this_rq->lock);
>>
>> -     update_shares(this_cpu);
>> +     update_blocked_averages(this_cpu);
>>       rcu_read_lock();
>>       for_each_domain(this_cpu, sd) {
>>               unsigned long interval;
>> @@ -5196,7 +5203,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
>>       int update_next_balance = 0;
>>       int need_serialize;
>>
>> -     update_shares(cpu);
>> +     update_blocked_averages(cpu);
>>
>>       rcu_read_lock();
>>       for_each_domain(cpu, sd) {

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 12/16] sched: refactor update_shares_cpu() -> update_blocked_avgs()
  2012-07-05 11:58   ` Peter Zijlstra
@ 2012-07-12  0:11     ` Paul Turner
  2012-07-12 14:40       ` Peter Zijlstra
  0 siblings, 1 reply; 46+ messages in thread
From: Paul Turner @ 2012-07-12  0:11 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Venki Pallipadi, Srivatsa Vaddagiri,
	Vincent Guittot, Nikunj A Dadhania

On Thu, Jul 5, 2012 at 4:58 AM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> On Wed, 2012-06-27 at 19:24 -0700, Paul Turner wrote:
>> Now that running entities maintain their own load-averages the work we must do
>> in update_shares() is largely restricted to the periodic decay of blocked
>> entities.  This allows us to be a little less pessimistic regarding our
>> occupancy on rq->lock and the associated rq->clock updates required.
>
> So what you're saying is that since 'weight' now includes runtime
> behaviour (where we hope the recent past matches the near future) we
> don't need to update shares quite as often since the effect of
> sleep-wakeup cycles isn't near as big since they're already anticipated.


Not quite: This does not decrease the frequency of updates.

Rather:
The old code used to take and release rq->lock (nested under rcu)
around updating *every* single task-group.

This is because the amount of work that we would have to do to update
a group was not deterministic (we did not know how many times  we'd
have to fold our load average sums).  The new code just says, since
this work is deterministic, let's thrash that lock less.  I suspect 10
is an incredibly conservative value, we probably want something more
like 100.  (But on the flip-side, I don't want to time out on the
wacko machine with 2000 cgroups, so I do have to release at SOME
point.)


> So how is the decay of blocked load still significant, surely that too
> is mostly part of the anticipated sleep/wake cycle already caught in the
> runtime behaviour.

Right, but the run-time behavior only lets us update things while
they're running.

This maintains periodic updates (and hence load-decay) on a group
that's sent everything to sleep.

>
> Or is this the primary place where we decay? If so that wasn't obvious
> and thus wants a comment someplace.
>

For a group with no runnable entities, yes.  For a group with runnable
entities this will also occur naturally around updates for the
aforementioned runnables.

I'll add a comment calling out the blocked case.

>> Signed-off-by: Paul Turner <pjt@google.com>
>> ---
>
>> +static void update_blocked_averages(int cpu)
>>  {
>>       struct rq *rq = cpu_rq(cpu);
>> +     struct cfs_rq *cfs_rq;
>> +
>> +     unsigned long flags;
>> +     int num_updates = 0;
>>
>>       rcu_read_lock();
>> +     raw_spin_lock_irqsave(&rq->lock, flags);
>> +     update_rq_clock(rq);
>>       /*
>>        * Iterates the task_group tree in a bottom up fashion, see
>>        * list_add_leaf_cfs_rq() for details.
>>        */
>>       for_each_leaf_cfs_rq(rq, cfs_rq) {
>> +             __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
>>
>> +             /*
>> +              * Periodically release the lock so that a cfs_rq with many
>> +              * children cannot hold it for an arbitrary period of time.
>> +              */
>> +             if (num_updates++ % 20 == 0) {
>> +                     raw_spin_unlock_irqrestore(&rq->lock, flags);
>> +                     cpu_relax();
>> +                     raw_spin_lock_irqsave(&rq->lock, flags);
>
> Gack.. that's not real pretty is it.. Esp. since we're still holding RCU
> lock and are thus (mostly) still not preemptable.
>
> How much of a problem was this?, the changelog is silent on this.

So the holding of RCU around these operations is nothing new (and
indeed they should be much faster than before).

As above, the bound is only for the crazy-large-numbers-of-cgroups
case, where we don't want to sit on the lock with interrupts disabled
forever.
I suspect it wants to be larger, but picked a fairly conservative
number to start with since I also think it's not a big performance
factor either way.

>
>> +                     update_rq_clock(rq);
>> +             }
>>       }
>> +
>> +     raw_spin_unlock_irqrestore(&rq->lock, flags);
>>       rcu_read_unlock();
>>  }
>>
>
>
>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 01/16] sched: track the runnable average on a per-task entitiy basis
  2012-07-04 15:32   ` Peter Zijlstra
@ 2012-07-12  0:12     ` Paul Turner
  0 siblings, 0 replies; 46+ messages in thread
From: Paul Turner @ 2012-07-12  0:12 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Venki Pallipadi, Srivatsa Vaddagiri,
	Vincent Guittot, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

On Wed, Jul 4, 2012 at 8:32 AM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> On Wed, 2012-06-27 at 19:24 -0700, Paul Turner wrote:
>> Instead of tracking averaging the load parented by a cfs_rq, we can track
>> entity load directly.  With the load for a given cfs_Rq then being the sum of
>> its children.
>>
>> To do this we represent the historical contribution to runnable average within each
>> trailing 1024us of execution as the coefficients of a geometric series.
>>
>> We can express this for a given task t as:
>>   runnable_sum(t) = \Sum u_i * y^i ,
>>   load(t) = weight_t * runnable_sum(t) / (\Sum 1024 * y^i)
>>
>> Where: u_i is the usage in the last i`th 1024us period (approximately 1ms) ~ms
>> and y is chosen such that y^k = 1/2.  We currently choose k to be 32 which
>> roughly translates to about a sched period.
>>
>> Signed-off-by: Paul Turner <pjt@google.com>
>> ---
>>  include/linux/sched.h |    8 +++
>>  kernel/sched/debug.c  |    4 ++
>>  kernel/sched/fair.c   |  128 +++++++++++++++++++++++++++++++++++++++++++++++++
>>  3 files changed, 140 insertions(+), 0 deletions(-)
>>
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 9dced2e..5bf5c79 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -1136,6 +1136,11 @@ struct load_weight {
>>         unsigned long weight, inv_weight;
>>  };
>>
>> +struct sched_avg {
>> +       u32 runnable_avg_sum, runnable_avg_period;
>> +       u64 last_runnable_update;
>> +};
>
>
> So we can use u32 because:
>
>              n         1
> lim n->inf \Sum y^i = --- = ~ 46.66804636511427012122 ; y^32 = 0.5
>              i=0      1-y
>
> So the values should never be larger than ~47k, right?

Yes -- this is made explicit later in the series.
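
Spelling the arithmetic out (my numbers): each fully-runnable 1024us
period adds at most 1024 to runnable_avg_sum, so the steady-state
maximum is

  1024 * \Sum y^i = 1024 / (1 - y)  ~= 1024 * 46.668  ~= 47788

which comfortably fits in a u32.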
>
> /me goes add something like that in a comment.
>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 01/16] sched: track the runnable average on a per-task entitiy basis
  2012-06-28  6:06   ` Namhyung Kim
@ 2012-07-12  0:14     ` Paul Turner
  0 siblings, 0 replies; 46+ messages in thread
From: Paul Turner @ 2012-07-12  0:14 UTC (permalink / raw)
  To: Namhyung Kim
  Cc: linux-kernel, Venki Pallipadi, Srivatsa Vaddagiri,
	Vincent Guittot, Peter Zijlstra, Nikunj A Dadhania,
	Mike Galbraith, Kamalesh Babulal, Ben Segall, Ingo Molnar,
	Paul E. McKenney, Morten Rasmussen, Vaidyanathan Srinivasan

On Wed, Jun 27, 2012 at 11:06 PM, Namhyung Kim <namhyung@kernel.org> wrote:
> Hi,
>
> Some nitpicks and questions.
>
>
> On Wed, 27 Jun 2012 19:24:14 -0700, Paul Turner wrote:
>> Instead of tracking averaging the load parented by a cfs_rq, we can track
>> entity load directly.  With the load for a given cfs_Rq then being the sum of
>
> s/cfs_Rq/cfs_rq/
>
>
>> its children.
>>
>> To do this we represent the historical contribution to runnable average within each
>> trailing 1024us of execution as the coefficients of a geometric series.
>>
>> We can express this for a given task t as:
>>   runnable_sum(t) = \Sum u_i * y^i ,
>>   load(t) = weight_t * runnable_sum(t) / (\Sum 1024 * y^i)
>>
>
> This "\Sum 1024 *y^i" is the runnable(_avg)_period, right?

Yes.

>
>
>> Where: u_i is the usage in the last i`th 1024us period (approximately 1ms) ~ms
>> and y is chosen such that y^k = 1/2.  We currently choose k to be 32 which
>> roughly translates to about a sched period.
>>
>> Signed-off-by: Paul Turner <pjt@google.com>
>> ---
>>  include/linux/sched.h |    8 +++
>>  kernel/sched/debug.c  |    4 ++
>>  kernel/sched/fair.c   |  128 +++++++++++++++++++++++++++++++++++++++++++++++++
>>  3 files changed, 140 insertions(+), 0 deletions(-)
>>
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 9dced2e..5bf5c79 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -1136,6 +1136,11 @@ struct load_weight {
>>       unsigned long weight, inv_weight;
>>  };
>>
>> +struct sched_avg {
>> +     u32 runnable_avg_sum, runnable_avg_period;
>> +     u64 last_runnable_update;
>> +};
>> +
>>  #ifdef CONFIG_SCHEDSTATS
>>  struct sched_statistics {
>>       u64                     wait_start;
>> @@ -1196,6 +1201,9 @@ struct sched_entity {
>>       /* rq "owned" by this entity/group: */
>>       struct cfs_rq           *my_q;
>>  #endif
>> +#ifdef CONFIG_SMP
>> +     struct sched_avg        avg;
>> +#endif
>>  };
>>
>>  struct sched_rt_entity {
>> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
>> index c09a4e7..cd5ef23 100644
>> --- a/kernel/sched/debug.c
>> +++ b/kernel/sched/debug.c
>> @@ -85,6 +85,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
>>       P(se->statistics.wait_count);
>>  #endif
>>       P(se->load.weight);
>> +#ifdef CONFIG_SMP
>> +     P(se->avg.runnable_avg_sum);
>> +     P(se->avg.runnable_avg_period);
>> +#endif
>>  #undef PN
>>  #undef P
>>  }
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 3704ad3..864a122 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -976,6 +976,125 @@ static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
>>  }
>>  #endif /* CONFIG_FAIR_GROUP_SCHED */
>>
>> +#ifdef CONFIG_SMP
>> +/*
>> + * Approximate:
>> + *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
>> + */
>> +static __always_inline u64 decay_load(u64 val, int n)
>> +{
>> +     for (; n && val; n--) {
>> +             val *= 4008;
>> +             val >>= 12;
>> +     }
>> +
>> +     return val;
>> +}
>> +
>> +/* We can represent the historical contribution to runnable average as the
>> + * coefficients of a geometric series.  To do this we sub-divide our runnable
>> + * history into segments of approximately 1ms (1024us); label the segment that
>> + * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
>> + *
>> + * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
>> + *      p0            p1           p1
>                                       p2
>
>> + *     (now)       (~1ms ago)  (~2ms ago)
>> + *
>> + * Let u_i denote the fraction of p_i that the entity was runnable.
>> + *
>> + * We then designate the fractions u_i as our co-efficients, yielding the
>> + * following representation of historical load:
>> + *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
>> + *
>> + * We choose y based on the with of a reasonably scheduling period, fixing:
>                                width ?
>> + *   y^32 = 0.5
>> + *
>> + * This means that the contribution to load ~32ms ago (u_32) will be weighted
>> + * approximately half as much as the contribution to load within the last ms
>> + * (u_0).
>> + *
>> + * When a period "rolls over" and we have new u_0`, multiplying the previous
>> + * sum again by y is sufficient to update:
>> + *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
>> + *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1]
>                                                                     u_{i+1}]
>
>> + */
>> +static __always_inline int __update_entity_runnable_avg(u64 now,
>
> Is the return value used by elsewhere?
>

Yes, it's later used to trigger updates as we accumulate new usage
periods (1024us increments).

> Thanks,
> Namhyung
>
>
>> +                                                     struct sched_avg *sa,
>> +                                                     int runnable)
>> +{
>> +     u64 delta;
>> +     int delta_w, decayed = 0;
>> +
>> +     delta = now - sa->last_runnable_update;
>> +     /*
>> +      * This should only happen when time goes backwards, which it
>> +      * unfortunately does during sched clock init when we swap over to TSC.
>> +      */
>> +     if ((s64)delta < 0) {
>> +             sa->last_runnable_update = now;
>> +             return 0;
>> +     }
>> +
>> +     /*
>> +      * Use 1024ns as the unit of measurement since it's a reasonable
>> +      * approximation of 1us and fast to compute.
>> +      */
>> +     delta >>= 10;
>> +     if (!delta)
>> +             return 0;
>> +     sa->last_runnable_update = now;
>> +
>> +     /* delta_w is the amount already accumulated against our next period */
>> +     delta_w = sa->runnable_avg_period % 1024;
>> +     if (delta + delta_w >= 1024) {
>> +             /* period roll-over */
>> +             decayed = 1;
>> +
>> +             /*
>> +              * Now that we know we're crossing a period boundary, figure
>> +              * out how much from delta we need to complete the current
>> +              * period and accrue it.
>> +              */
>> +             delta_w = 1024 - delta_w;
>> +             BUG_ON(delta_w > delta);
>> +             do {
>> +                     if (runnable)
>> +                             sa->runnable_avg_sum += delta_w;
>> +                     sa->runnable_avg_period += delta_w;
>> +
>> +                     /*
>> +                      * Remainder of delta initiates a new period, roll over
>> +                      * the previous.
>> +                      */
>> +                     sa->runnable_avg_sum =
>> +                             decay_load(sa->runnable_avg_sum, 1);
>> +                     sa->runnable_avg_period =
>> +                             decay_load(sa->runnable_avg_period, 1);
>> +
>> +                     delta -= delta_w;
>> +                     /* New period is empty */
>> +                     delta_w = 1024;
>> +             } while (delta >= 1024);
>> +     }
>> +
>> +     /* Remainder of delta accrued against u_0` */
>> +     if (runnable)
>> +             sa->runnable_avg_sum += delta;
>> +     sa->runnable_avg_period += delta;
>> +
>> +     return decayed;
>> +}
>> +
>> +/* Update a sched_entity's runnable average */
>> +static inline void update_entity_load_avg(struct sched_entity *se)
>> +{
>> +     __update_entity_runnable_avg(rq_of(cfs_rq_of(se))->clock_task, &se->avg,
>> +                                  se->on_rq);
>> +}
>> +#else
>> +static inline void update_entity_load_avg(struct sched_entity *se) {}
>> +#endif
>> +
>>  static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
>>  {
>>  #ifdef CONFIG_SCHEDSTATS
>> @@ -1102,6 +1221,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>>        */
>>       update_curr(cfs_rq);
>>       update_cfs_load(cfs_rq, 0);
>> +     update_entity_load_avg(se);
>>       account_entity_enqueue(cfs_rq, se);
>>       update_cfs_shares(cfs_rq);
>>
>> @@ -1176,6 +1296,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>>        * Update run-time statistics of the 'current'.
>>        */
>>       update_curr(cfs_rq);
>> +     update_entity_load_avg(se);
>>
>>       update_stats_dequeue(cfs_rq, se);
>>       if (flags & DEQUEUE_SLEEP) {
>> @@ -1345,6 +1466,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
>>               update_stats_wait_start(cfs_rq, prev);
>>               /* Put 'current' back into the tree. */
>>               __enqueue_entity(cfs_rq, prev);
>> +             /* in !on_rq case, update occurred at dequeue */
>> +             update_entity_load_avg(prev);
>>       }
>>       cfs_rq->curr = NULL;
>>  }
>> @@ -1358,6 +1481,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
>>       update_curr(cfs_rq);
>>
>>       /*
>> +      * Ensure that runnable average is periodically updated.
>> +      */
>> +     update_entity_load_avg(curr);
>> +
>> +     /*
>>        * Update share accounting for long-running entities.
>>        */
>>       update_entity_shares_tick(cfs_rq);

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 14/16] sched: make __update_entity_runnable_avg() fast
  2012-07-10 10:51         ` Peter Zijlstra
@ 2012-07-12  0:15           ` Paul Turner
  2012-07-12 14:30             ` Peter Zijlstra
  0 siblings, 1 reply; 46+ messages in thread
From: Paul Turner @ 2012-07-12  0:15 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Benjamin Segall, linux-kernel, Venki Pallipadi,
	Srivatsa Vaddagiri, Vincent Guittot, Nikunj A Dadhania,
	Mike Galbraith, Kamalesh Babulal, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

So I've been trying to dig up the little proglets that originally
computed this stuff, since some specific adjustments were made, but for
the life of me[*] I cannot find them, so I am stuck trying to reverse
engineer it like you :-).
[*] Including some over-night greps on all my source trees.

The short answer is I can explain some of the differences, but not
all; I suspect I generated things using a wonky table.  Will update
the tables with the tweaked numbers below for the next posting.

Updated values:

inverses for fixed point multiplication by y^k
0: ffffffff
1: fa83b2da
2: f5257d14
3: efe4b99a
4: eac0c6e6
5: e5b906e6
6: e0ccdeeb
7: dbfbb796
8: d744fcc9
9: d2a81d91
10: ce248c14
11: c9b9bd85
12: c5672a10
13: c12c4cc9
14: bd08a39e
15: b8fbaf46
16: b504f333
17: b123f581
18: ad583ee9
19: a9a15ab4
20: a5fed6a9
21: a2704302
22: 9ef5325f
23: 9b8d39b9
24: 9837f050
25: 94f4efa8
26: 91c3d373
27: 8ea4398a
28: 8b95c1e3
29: 88980e80
30: 85aac367
31: 82cd8698
[ Delta versus previous is purely double vs float, no surprises on this one ]

convergence
345> 47765
[ This is accounting for the fact that we're not getting to use a
perfect value of y, but it is the value we will converge to with
maximal individual updates (1024 per period) and our fastmult y^1
approximation ]

sum y^n
[ Where:
exact_n  = (exact_{n-1} + 1024.0) * exact_y
floor_n  = FLOOR( (floor_{n-1} + 1024.0) * exact_y )
shift_n  = approximating exact_n using shift/div for the mult
fastmul1 = approximating exact_n using inverse mult of y^1 recursively
fastmul2 = \Sum 1024*y^n using inverse mult of y^k ]

Error terms for the approximations:
sum y^n
      exact     floor     shift  fastmul1 fastmul2
 1:     1002        -0         0         0         0
 2:     1983        -1         0         1         1
 3:     2942        -1        -1         1         1
 4:     3881        -1        -2         1         1
 5:     4800        -2        -3         1         1
 6:     5699        -2        -4         1         1
 7:     6579        -3        -5         1         1
 8:     7440        -3        -6         1         1
 9:     8283        -4        -6         2         2
10:     9108        -5        -7         1         2
11:     9914        -5        -8         1         2
12:    10704        -6        -9         1         2
13:    11477        -7        -9         2         3
14:    12233        -7       -10         2         3
15:    12973        -7       -11         2         3
16:    13697        -7       -12         2         3
17:    14405        -7       -13         1         3
18:    15099        -8       -14         1         3
19:    15777        -8       -16         0         3
20:    16441        -8       -17         0         3
21:    17091        -9       -18         0         3
22:    17727        -9       -18         1         4
23:    18349        -9       -20         0         3
24:    18958        -9       -20         1         4
25:    19554        -9       -21         1         4
26:    20137        -9       -22         1         4
27:    20707        -9       -24         1         4
28:    21266       -10       -25         1         4
29:    21812       -10       -27         0         3
30:    22347       -11       -28         1         4
31:    22870       -11       -30         0         3

The concern here is that we don't want approximations that
over-estimate, since that could make it possible to exceed our converged
max load sum above, which was accumulated using only single y^1 steps.

For this reason I prefer the most conservative floor approximation
which never over-estimates, with errors <0.1%.  I think this is what I
chose previously (the first terms all align), but I can't explain the
divergence for higher n.

(Exact values)
      exact     floor     shift  fastmul1 fastmul2
 1:     1002      1002      1002      1002      1002
 2:     1983      1982      1982      1983      1983
 3:     2942      2941      2941      2943      2943
 4:     3881      3880      3879      3882      3882
 5:     4800      4798      4797      4801      4801
 6:     5699      5697      5695      5700      5700
 7:     6579      6576      6574      6580      6580
 8:     7440      7437      7434      7441      7441
 9:     8283      8279      8276      8284      8284
10:     9108      9103      9100      9108      9109
11:     9914      9909      9906      9915      9916
12:    10704     10698     10695     10705     10706
13:    11477     11470     11467     11478     11479
14:    12233     12226     12222     12234     12235
15:    12973     12966     12961     12974     12975
16:    13697     13690     13684     13698     13699
17:    14405     14398     14392     14406     14408
18:    15099     15091     15084     15099     15101
19:    15777     15769     15761     15777     15780
20:    16441     16433     16424     16441     16444
21:    17091     17082     17073     17091     17094
22:    17727     17718     17708     17727     17730
23:    18349     18340     18329     18349     18352
24:    18958     18949     18937     18958     18961
25:    19554     19545     19532     19554     19557
26:    20137     20128     20114     20137     20140
27:    20707     20698     20683     20708     20711
28:    21266     21256     21240     21266     21269
29:    21812     21802     21785     21812     21815
30:    22347     22336     22318     22347     22350
31:    22870     22859     22840     22870     22873

And for posterity, a simple generator so that I don't lose it again:
#include <math.h>
#include <stdio.h>

#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
#define N 32
#define WMULT_SHIFT 32

const long WMULT_CONST = ((1UL << N) - 1);
const double y = .97857206208770013451;

long approx_decay(int c) {
	return (c * 4008) >> 12;
}

long mult_inv_array[N];
void calc_mult_inv() {
	int i;
	double yn = 0;

	printf("inverses\n");
	for (i = 0; i < N; i++) {
		yn = (double)WMULT_CONST * pow(y, i);
		mult_inv_array[i] = yn;
		printf("%d: %8lx\n", i, mult_inv_array[i]);
	}

	printf("\n");
}

long mult_inv(long c, int n) {
	return SRR(c * runnable_avg_yN_inv[n], WMULT_SHIFT);
}

void calc_yn_sum(int n)
{
	int i;
	double sum = 0, sum_fl = 0, diff = 0;
	long approx = 0, approx_fm = 0, approx_fm2 = 0;

	printf("sum y^n\n");
	printf("   %8s  %8s  %8s  %8s %8s\n", "exact", "floor", "shift",
	       "fastmul1", "fastmul2");
	for (i = 1; i < n; i++) {
		sum = (y * sum + y * 1024);
		sum_fl = floor(y * sum_fl+ y * 1024);
		approx = approx_decay(approx) + approx_decay(1024);
		approx_fm = mult_inv(approx_fm, 1) + mult_inv(1024, 1);
		approx_fm2 += mult_inv(1024, i);

		/*diff = sum;*/
		printf("%2d: %8.0f  %8.0f  %8ld  %8ld  %8ld\n", i, sum,
				sum_fl - diff,
				approx - (long)diff,
				approx_fm - (long)diff,
				approx_fm2 - (long)diff);

	}
	printf("\n");
}

void calc_conv(long n) {
	long old_n;
	int i = -1;

	printf("convergence\n");
	do {
		old_n = n;
		n = mult_inv(n, 1) + 1024;
		i++;
	} while (n != old_n);
	printf("%d> %ld\n", i - 1, n);
	printf("\n");
}

void main() {
	calc_mult_inv();
	calc_conv(1024);
	calc_yn_sum(N);
}
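
To regenerate the tables above (a sketch; the file name is mine, and note
it needs a 64-bit long for the 1UL << 32 in WMULT_CONST, plus mult_inv()
pointed at mult_inv_array per the s/runnable_avg_yN_inv/mult_inv_array/
note in the reply below):

  cc -O2 pjt_avg.c -lm -o pjt_avg && ./pjt_avg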

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 09/16] sched: normalize tg load contributions against runnable time
  2012-07-06 11:52     ` Peter Zijlstra
@ 2012-07-12  1:08       ` Andre Noll
  0 siblings, 0 replies; 46+ messages in thread
From: Andre Noll @ 2012-07-12  1:08 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Paul Turner, linux-kernel, Venki Pallipadi, Srivatsa Vaddagiri,
	Vincent Guittot, Nikunj A Dadhania, Mike Galbraith,
	Kamalesh Babulal, Ben Segall, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

[-- Attachment #1: Type: text/plain, Size: 491 bytes --]

On Fri, Jul 06, 13:52, Peter Zijlstra wrote:
> This then yields:
> 
>   P(\Union_{i=1..n} u_i) ~= \Sum_{k=1..n} (-1)^(k-1) (n choose k) u^k
> 
> Which unfortunately isn't a series I found a sane solution for, but
> numerically (see below) we can see it very quickly approaches 1 when n
> >> 1.

Isn't this series just 1 - (1 - u)^n? So yes, it converges quickly to 1
if u is a probability.
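
(By the binomial theorem (1 - u)^n = \Sum_{k=0..n} (n choose k) (-u)^k,
so moving the k=0 term across gives exactly the series above:
1 - (1 - u)^n = \Sum_{k=1..n} (-1)^(k-1) (n choose k) u^k.)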

Andre
-- 
The only person who always got his work done by Friday was Robinson Crusoe


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 14/16] sched: make __update_entity_runnable_avg() fast
  2012-07-12  0:15           ` Paul Turner
@ 2012-07-12 14:30             ` Peter Zijlstra
  0 siblings, 0 replies; 46+ messages in thread
From: Peter Zijlstra @ 2012-07-12 14:30 UTC (permalink / raw)
  To: Paul Turner
  Cc: Benjamin Segall, linux-kernel, Venki Pallipadi,
	Srivatsa Vaddagiri, Vincent Guittot, Nikunj A Dadhania,
	Mike Galbraith, Kamalesh Babulal, Ingo Molnar, Paul E. McKenney,
	Morten Rasmussen, Vaidyanathan Srinivasan

On Wed, 2012-07-11 at 17:15 -0700, Paul Turner wrote:
> convergence
> 345> 47765

Ah, 345 is much saner indeed!

> And for posterity, a simple generator so that I don't lose it again:
> #include <math.h>
> #include <stdio.h>
> 
> #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
> #define N 32
> #define WMULT_SHIFT 32
> 
> const long WMULT_CONST = ((1UL << N) - 1);
> const double y = .97857206208770013451;

I used pow(0.5, 1.0/N); no point in losing precision or mis-typing a
digit or some such nonsense ;-)

> 
> long approx_decay(int c) {
>         return (c * 4008) >> 12;
> }
> 
> long mult_inv_array[N];
> void calc_mult_inv() {
>         int i;
>         double yn = 0;
> 
>         printf("inverses\n");
>         for (i = 0; i < N; i++) {
>                 yn = (double)WMULT_CONST * pow(y, i);
>                 mult_inv_array[i] = yn;
>                 printf("%d: %8lx\n", i, mult_inv_array[i]);
>         }
> 
>         printf("\n");
> }
> 
> long mult_inv(long c, int n) {
>         return SRR(c * runnable_avg_yN_inv[n], WMULT_SHIFT);

This works much better when you do:

  s/runnable_avg_yN_inv/mult_inv_array/
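
i.e., with that substitution the quoted helper reads:

long mult_inv(long c, int n) {
	return SRR(c * mult_inv_array[n], WMULT_SHIFT);
}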

> }
> 
> void calc_yn_sum(int n)
> {
>         int i;
>         double sum = 0, sum_fl = 0, diff = 0;
>         long approx = 0, approx_fm = 0, approx_fm2 = 0;
> 
>         printf("sum y^n\n");
>         printf("   %8s  %8s  %8s  %8s %8s\n", "exact", "floor", "shift",
>                "fastmul1", "fastmul2");
>         for (i = 1; i < n; i++) {
>                 sum = (y * sum + y * 1024);
>                 sum_fl = floor(y * sum_fl+ y * 1024);
>                 approx = approx_decay(approx) + approx_decay(1024);
>                 approx_fm = mult_inv(approx_fm, 1) + mult_inv(1024, 1);
>                 approx_fm2 += mult_inv(1024, i);
> 
>                 /*diff = sum;*/
>                 printf("%2d: %8.0f  %8.0f  %8ld  %8ld  %8ld\n", i, sum,
>                                 sum_fl - diff,
>                                 approx - (long)diff,
>                                 approx_fm - (long)diff,
>                                 approx_fm2 - (long)diff);
> 
>         }
>         printf("\n");
> }
> 
> void calc_conv(long n) {
>         long old_n;
>         int i = -1;
> 
>         printf("convergence\n");
>         do {
>                 old_n = n;
>                 n = mult_inv(n, 1) + 1024;
>                 i++;
>         } while (n != old_n);
>         printf("%d> %ld\n", i - 1, n);
>         printf("\n");
> }
> 
> void main() {
>         calc_mult_inv();
>         calc_conv(1024);
>         calc_yn_sum(N);
> }
> 
> 

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 12/16] sched: refactor update_shares_cpu() -> update_blocked_avgs()
  2012-07-12  0:11     ` Paul Turner
@ 2012-07-12 14:40       ` Peter Zijlstra
  0 siblings, 0 replies; 46+ messages in thread
From: Peter Zijlstra @ 2012-07-12 14:40 UTC (permalink / raw)
  To: Paul Turner
  Cc: linux-kernel, Venki Pallipadi, Srivatsa Vaddagiri,
	Vincent Guittot, Nikunj A Dadhania

On Wed, 2012-07-11 at 17:11 -0700, Paul Turner wrote:
> >>       for_each_leaf_cfs_rq(rq, cfs_rq) {
> >> +             __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
> >>
> >> +             /*
> >> +              * Periodically release the lock so that a cfs_rq with many
> >> +              * children cannot hold it for an arbitrary period of time.
> >> +              */
> >> +             if (num_updates++ % 20 == 0) {
> >> +                     raw_spin_unlock_irqrestore(&rq->lock, flags);
> >> +                     cpu_relax();
> >> +                     raw_spin_lock_irqsave(&rq->lock, flags);
> >
> > Gack.. that's not real pretty is it.. Esp. since we're still holding RCU
> > lock and are thus (mostly) still not preemptable.
> >
> > How much of a problem was this?, the changelog is silent on this.
> 
> So the holding of RCU about these operations is nothing new (and
> indeed they should be much faster than before).
> 
> As above, the bound is only for the crazy-large-numbers of cgroups
> case where we don't want to sit on with interrupts disabled forever.
> I suspect it wants to be larger, but picked a fairly conservative
> number to start with since I also think it's not a big performance
> factor either way.

How about you leave this ugliness out for now (it's unrelated to the rest
of the changes anyway).. and we can revisit it later?



^ permalink raw reply	[flat|nested] 46+ messages in thread

* [patch 09/16] sched: normalize tg load contributions against runnable time
  2012-08-23 14:14 [patch 00/16] sched: per-entity load-tracking pjt
@ 2012-08-23 14:14 ` pjt
  0 siblings, 0 replies; 46+ messages in thread
From: pjt @ 2012-08-23 14:14 UTC (permalink / raw)
  To: linux-kernel
  Cc: Peter Zijlstra, Ingo Molnar, Vaidyanathan Srinivasan,
	Srivatsa Vaddagiri, Kamalesh Babulal, Venki Pallipadi,
	Ben Segall, Mike Galbraith, Vincent Guittot, Nikunj A Dadhania,
	Morten Rasmussen, Paul E. McKenney, Namhyung Kim

[-- Attachment #1: sched-normalize_runnable_shares.patch --]
[-- Type: text/plain, Size: 5440 bytes --]

From: Paul Turner <pjt@google.com>

Entities of equal weight should receive equitable distribution of cpu time.
This is challenging in the case of a task_group's shares as execution may be
occurring on multiple cpus simultaneously.

To handle this we divide up the shares into weights proportionate with the load
on each cfs_rq.  This does not however, account for the fact that the sum of
the parts may be less than one cpu and so we need to normalize:
  load(tg) = min(runnable_avg(tg), 1) * tg->shares
Where runnable_avg is the aggregate time in which the task_group had runnable
children.
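
As an illustrative example (numbers are mine, not from the code): a group
with tg->shares = 1024 whose aggregate runnable_avg works out to 0.5 cpu
contributes load(tg) = 0.5 * 1024 = 512, i.e. the same load as a single
task of equal weight that is runnable half the time, rather than the
full 1024.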

Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Ben Segall <bsegall@google.com>
---
 kernel/sched/debug.c |    4 ++++
 kernel/sched/fair.c  |   56 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h |    2 ++
 3 files changed, 62 insertions(+), 0 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2908923..71b0ea3 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -234,6 +234,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			atomic64_read(&cfs_rq->tg->load_avg));
 	SEQ_printf(m, "  .%-30s: %lld\n", "tg_load_contrib",
 			cfs_rq->tg_load_contrib);
+	SEQ_printf(m, "  .%-30s: %d\n", "tg_runnable_contrib",
+			cfs_rq->tg_runnable_contrib);
+	SEQ_printf(m, "  .%-30s: %d\n", "tg->runnable_avg",
+			atomic_read(&cfs_rq->tg->runnable_avg));
 #endif
 
 	print_cfs_group_stats(m, cpu, cfs_rq->tg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 92ef5f1..47a7998 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1112,19 +1112,73 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
 	}
 }
 
+/*
+ * Aggregate cfs_rq runnable averages into an equivalent task_group
+ * representation for computing load contributions.
+ */
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+						  struct cfs_rq *cfs_rq)
+{
+	struct task_group *tg = cfs_rq->tg;
+	long contrib;
+
+	/* The fraction of a cpu used by this cfs_rq */
+	contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
+			  sa->runnable_avg_period + 1);
+	contrib -= cfs_rq->tg_runnable_contrib;
+
+	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
+		atomic_add(contrib, &tg->runnable_avg);
+		cfs_rq->tg_runnable_contrib += contrib;
+	}
+}
+
 static inline void __update_group_entity_contrib(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq = group_cfs_rq(se);
 	struct task_group *tg = cfs_rq->tg;
+	int runnable_avg;
+
 	u64 contrib;
 
 	contrib = cfs_rq->tg_load_contrib * tg->shares;
 	se->avg.load_avg_contrib = div64_u64(contrib,
 					     atomic64_read(&tg->load_avg) + 1);
+
+	/*
+	 * For group entities we need to compute a correction term in the case
+	 * that they are consuming <1 cpu so that we would contribute the same
+	 * load as a task of equal weight.
+	 *
+	 * Explicitly co-ordinating this measurement would be expensive, but
+	 * fortunately the sum of each cpus contribution forms a usable
+	 * lower-bound on the true value.
+	 *
+	 * Consider the aggregate of 2 contributions.  Either they are disjoint
+	 * (and the sum represents the true value) or they overlap and we are
+	 * understating by the aggregate of their overlap.
+	 *
+	 * Extending this to N cpus, for a given overlap, the maximum amount we
+	 * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
+	 * cpus that overlap for this interval and w_i is the interval width.
+	 *
+	 * On a small machine, the first term is well-bounded which bounds the
+	 * total error since w_i is a subset of the period.  Whereas on a
+	 * larger machine, while this first term can be larger, if w_i is of
+	 * consequential size we are guaranteed to see n_i*w_i quickly converge
+	 * to our upper bound of 1-cpu.
+	 */
+	runnable_avg = atomic_read(&tg->runnable_avg);
+	if (runnable_avg < NICE_0_LOAD) {
+		se->avg.load_avg_contrib *= runnable_avg;
+		se->avg.load_avg_contrib >>= NICE_0_SHIFT;
+	}
 }
 #else
 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
 						 int force_update) {}
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+						  struct cfs_rq *cfs_rq) {}
 static inline void __update_group_entity_contrib(struct sched_entity *se) {}
 #endif
 
@@ -1146,6 +1200,7 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se)
 	if (entity_is_task(se)) {
 		__update_task_entity_contrib(se);
 	} else {
+		__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
 		__update_group_entity_contrib(se);
 	}
 
@@ -1214,6 +1269,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 {
 	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
+	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
 }
 
 /* Add the load generated by se into cfs_rq's child load-average */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0c453e7..1474bf2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -113,6 +113,7 @@ struct task_group {
 
 	atomic_t load_weight;
 	atomic64_t load_avg;
+	atomic_t runnable_avg;
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -234,6 +235,7 @@ struct cfs_rq {
 	atomic64_t decay_counter, removed_load;
 	u64 last_decay;
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	u32 tg_runnable_contrib;
 	u64 tg_load_contrib;
 #endif
 #endif



^ permalink raw reply related	[flat|nested] 46+ messages in thread

end of thread, other threads:[~2012-08-23 14:15 UTC | newest]

Thread overview: 46+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-06-28  2:24 [PATCH 00/16] Series short description Paul Turner
2012-06-28  2:24 ` [PATCH 09/16] sched: normalize tg load contributions against runnable time Paul Turner
2012-06-29  7:26   ` Namhyung Kim
2012-07-04 19:48   ` Peter Zijlstra
2012-07-06 11:52     ` Peter Zijlstra
2012-07-12  1:08       ` Andre Noll
2012-07-12  0:02     ` Paul Turner
2012-07-06 12:23   ` Peter Zijlstra
2012-06-28  2:24 ` [PATCH 06/16] sched: account for blocked load waking back up Paul Turner
2012-06-28  2:24 ` [PATCH 02/16] sched: maintain per-rq runnable averages Paul Turner
2012-06-28  2:24 ` [PATCH 01/16] sched: track the runnable average on a per-task entitiy basis Paul Turner
2012-06-28  6:06   ` Namhyung Kim
2012-07-12  0:14     ` Paul Turner
2012-07-04 15:32   ` Peter Zijlstra
2012-07-12  0:12     ` Paul Turner
2012-06-28  2:24 ` [PATCH 04/16] sched: maintain the load contribution of blocked entities Paul Turner
2012-06-29  1:27   ` Namhyung Kim
2012-06-28  2:24 ` [PATCH 07/16] sched: aggregate total task_group load Paul Turner
2012-06-28  2:24 ` [PATCH 05/16] sched: add an rq migration call-back to sched_class Paul Turner
2012-06-29  1:32   ` Namhyung Kim
2012-06-28  2:24 ` [PATCH 08/16] sched: compute load contribution by a group entity Paul Turner
2012-06-28  2:24 ` [PATCH 03/16] sched: aggregate load contributed by task entities on parenting cfs_rq Paul Turner
2012-06-28  6:33   ` Namhyung Kim
2012-07-04 15:28   ` Peter Zijlstra
2012-07-06 14:53     ` Peter Zijlstra
2012-07-09  9:15       ` Ingo Molnar
2012-06-28  2:24 ` [PATCH 11/16] sched: replace update_shares weight distribution with per-entity computation Paul Turner
2012-06-28  2:24 ` [PATCH 16/16] sched: introduce temporary FAIR_GROUP_SCHED dependency for load-tracking Paul Turner
2012-06-28  2:24 ` [PATCH 12/16] sched: refactor update_shares_cpu() -> update_blocked_avgs() Paul Turner
2012-06-29  7:28   ` Namhyung Kim
2012-07-12  0:03     ` Paul Turner
2012-07-05 11:58   ` Peter Zijlstra
2012-07-12  0:11     ` Paul Turner
2012-07-12 14:40       ` Peter Zijlstra
2012-06-28  2:24 ` [PATCH 15/16] sched: implement usage tracking Paul Turner
2012-06-28  2:24 ` [PATCH 13/16] sched: update_cfs_shares at period edge Paul Turner
2012-06-28  2:24 ` [PATCH 10/16] sched: maintain runnable averages across throttled periods Paul Turner
2012-06-28  2:24 ` [PATCH 14/16] sched: make __update_entity_runnable_avg() fast Paul Turner
2012-07-04 15:41   ` Peter Zijlstra
2012-07-04 17:20     ` Peter Zijlstra
2012-07-09 20:18       ` Benjamin Segall
2012-07-10 10:51         ` Peter Zijlstra
2012-07-12  0:15           ` Paul Turner
2012-07-12 14:30             ` Peter Zijlstra
2012-07-04 16:51   ` Peter Zijlstra
2012-08-23 14:14 [patch 00/16] sched: per-entity load-tracking pjt
2012-08-23 14:14 ` [patch 09/16] sched: normalize tg load contributions against runnable time pjt
