linux-kernel.vger.kernel.org archive mirror
From: Yuyang Du <yuyang.du@intel.com>
To: peterz@infradead.org, mingo@kernel.org, linux-kernel@vger.kernel.org
Cc: bsegall@google.com, pjt@google.com, morten.rasmussen@arm.com,
	vincent.guittot@linaro.org, dietmar.eggemann@arm.com,
	juri.lelli@arm.com, Yuyang Du <yuyang.du@intel.com>
Subject: [PATCH 2/4] sched/fair: Drop out incomplete current period when sched averages accrue
Date: Mon, 11 Apr 2016 06:36:03 +0800	[thread overview]
Message-ID: <1460327765-18024-3-git-send-email-yuyang.du@intel.com> (raw)
In-Reply-To: <1460327765-18024-1-git-send-email-yuyang.du@intel.com>

In __update_load_avg(), the current period is never complete. This
leads to a slightly over-decayed average: if, on average, half of the
current period has elapsed, we lose about 1.08% (= 1 - 0.5^(1/64)) of
the past avg. More importantly, the incomplete current period
significantly complicates the avg computation, even though a full
period is only about 1ms.
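
For reference, the 1.08% figure is easy to reproduce; the following is
just a userspace sanity check of the arithmetic, not part of the patch:

#include <math.h>
#include <stdio.h>

int main(void)
{
	/* y is the per-period decay factor, defined by y^32 = 0.5 */
	double y = pow(0.5, 1.0 / 32.0);

	/* an extra half period of decay on average costs y^0.5 = 0.5^(1/64) */
	printf("%.2f%%\n", 100.0 * (1.0 - pow(y, 0.5)));	/* ~1.08% */
	return 0;
}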

So we attempt to drop it. The outcome is that for any x.y periods to
update, we either lose the .y period or unduly gain the (1-.y) period.
How big is the impact? For a large x (say 20ms), the difference is
barely noticeable, at about plus/minus 1% (= (before-after)/before).
Moreover, the aggregated losses and gains should statistically even
out in the long run.
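
To make the new bookkeeping concrete, below is a minimal userspace
sketch of the update path after this patch. It is illustrative only:
weight and frequency/CPU scaling are omitted, the decay and accumulate
helpers use floating point instead of the kernel's lookup tables, and
LOAD_AVG_MAX is the kernel's maximum possible *_sum.

#include <math.h>
#include <stdint.h>
#include <stdio.h>

#define PERIOD_SHIFT	20		/* one period = 1024*1024ns, ~1ms */
#define LOAD_AVG_MAX	47742		/* kernel's maximum possible *_sum */

/* val * y^n, floating point stand-in for __decay_sum() */
static uint64_t decay(uint64_t val, uint32_t n)
{
	return (uint64_t)(val * pow(0.5, n / 32.0));
}

/* \Sum 1024*y^k for 1 <= k <= n, stand-in for __accumulate_sum() */
static uint32_t accumulate(uint32_t n)
{
	double y = pow(0.5, 1.0 / 32.0), sum = 0.0;
	uint32_t k;

	for (k = 1; k <= n; k++)
		sum += 1024.0 * pow(y, k);
	return (uint32_t)sum;
}

int main(void)
{
	uint64_t last = 0, load_sum = 0;
	uint64_t now = (20ULL << PERIOD_SHIFT) + 524288;	/* 20.5 periods */
	uint32_t periods;

	now &= ~((1ULL << PERIOD_SHIFT) - 1);	/* round down: the .5 is dropped */
	periods = (uint32_t)((now - last) >> PERIOD_SHIFT);

	load_sum = decay(load_sum, periods);		  /* step 1: decay old sum */
	load_sum += 1024 * (uint64_t)accumulate(periods); /* step 2: accrue, NICE_0 weight */

	/* an entity starting from zero that runs ~20ms ends up around a third of 1024 */
	printf("periods=%u load_avg=%llu\n", periods,
	       (unsigned long long)(load_sum / LOAD_AVG_MAX));
	return 0;
}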

Signed-off-by: Yuyang Du <yuyang.du@intel.com>
---
 include/linux/sched.h |   2 +-
 kernel/sched/fair.c   | 170 +++++++++++++++++++-------------------------------
 2 files changed, 66 insertions(+), 106 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 45e848c..c5948e1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1214,7 +1214,7 @@ struct load_weight {
  */
 struct sched_avg {
 	u64 last_update_time, load_sum;
-	u32 util_sum, period_contrib;
+	u32 util_sum;
 	unsigned long load_avg, util_avg;
 };
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6e0eec0..68273e8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -661,7 +661,7 @@ static unsigned long task_h_load(struct task_struct *p);
 
 /*
  * We choose a half-life close to 1 scheduling period.
- * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
+ * Note: The tables __decay_inv_multiply_N and __accumulated_sum_N are
  * dependent on this value.
  */
 #define LOAD_AVG_PERIOD 32
@@ -674,12 +674,6 @@ void init_entity_runnable_average(struct sched_entity *se)
 	struct sched_avg *sa = &se->avg;
 
 	sa->last_update_time = 0;
-	/*
-	 * sched_avg's period_contrib should be strictly less then 1024, so
-	 * we give it 1023 to make sure it is almost a period (1024us), and
-	 * will definitely be update (after enqueue).
-	 */
-	sa->period_contrib = 1023;
 	sa->load_avg = scale_load_down(se->load.weight);
 	sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
 	/*
@@ -2582,8 +2576,8 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_SMP
-/* Precomputed fixed inverse multiplies for multiplication by y^n */
-static const u32 runnable_avg_yN_inv[] = {
+/* Precomputed fixed inverse multipliers for multiplication by y^n */
+static const u32 __decay_inv_multiply_N[] = {
 	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
 	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
 	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
@@ -2593,10 +2587,10 @@ static const u32 runnable_avg_yN_inv[] = {
 };
 
 /*
- * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
+ * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
  * over-estimates when re-combining.
  */
-static const u32 runnable_avg_yN_sum[] = {
+static const u32 __accumulated_sum_N[] = {
 	    0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
 	 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
 	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
@@ -2612,58 +2606,54 @@ static const u32 __accumulated_sum_N32[] = {
 };
 
 /*
- * Approximate:
- *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
+ * val * y^n, where y^32 ~= 0.5 and the unit of n is one ~1ms period,
+ * so a 32-bit n covers a maximum of about 49 days (2^32/1000/3600/24)
  */
-static __always_inline u64 decay_load(u64 val, u64 n)
+static __always_inline u64 __decay_sum(u64 val, u32 n)
 {
-	unsigned int local_n;
-
 	if (!n)
 		return val;
 	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
 		return 0;
 
-	/* after bounds checking we can collapse to 32-bit */
-	local_n = n;
-
 	/*
-	 * As y^PERIOD = 1/2, we can combine
-	 *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
-	 * With a look-up table which covers y^n (n<PERIOD)
-	 *
-	 * To achieve constant time decay_load.
+	 * As y^32 = 1/2, we can accelerate the computation as:
+	 * y^n = 1/2^(n/32) * y^(n%32) with a look-up table which
+	 * covers y^x (x<32). This achieves a constant time __decay_sum.
 	 */
-	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
-		val >>= local_n / LOAD_AVG_PERIOD;
-		local_n %= LOAD_AVG_PERIOD;
+	if (unlikely(n >= LOAD_AVG_PERIOD)) {
+		val >>= n / LOAD_AVG_PERIOD;
+		n %= LOAD_AVG_PERIOD;
 	}
 
-	val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
+	val = mul_u64_u32_shr(val, __decay_inv_multiply_N[n], 32);
 	return val;
 }
 
 /*
- * For updates fully spanning n periods, the contribution to runnable
- * average will be: \Sum 1024*y^n
+ * For updates fully spanning n periods, the accumulated contribution
+ * will be: \Sum 1024*y^n, i.e., each period's contribution is itself
+ * decayed by the number of periods elapsed since it.
+ *
+ * We can compute this efficiently by combining:
+ * y^32 = 1/2 with precomputed \Sum 1024*y^n {for n < 32}
  *
- * We can compute this reasonably efficiently by combining:
- *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
+ * A 32-bit n is also large enough to hold the number of past periods
  */
-static u32 __compute_runnable_contrib(u64 n)
+static u32 __accumulate_sum(u32 n)
 {
 	u32 contrib = 0;
 
 	if (likely(n <= LOAD_AVG_PERIOD))
-		return runnable_avg_yN_sum[n];
+		return __accumulated_sum_N[n];
 	else if (unlikely(n >= LOAD_AVG_MAX_N))
 		return LOAD_AVG_MAX;
 
 	/* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
 	contrib = __accumulated_sum_N32[n>>5]; /* =n/LOAD_AVG_PERIOD */
 	n %= LOAD_AVG_PERIOD;
-	contrib = decay_load(contrib, n);
-	return contrib + runnable_avg_yN_sum[n];
+	contrib = __decay_sum(contrib, n);
+	return contrib + __accumulated_sum_N[n];
 }
 
 #if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
@@ -2704,11 +2694,14 @@ static __always_inline int
 __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 		  unsigned long weight, int running, struct cfs_rq *cfs_rq)
 {
-	u64 delta, scaled_delta, periods;
-	u32 contrib;
-	unsigned int delta_w, scaled_delta_w, decayed = 0;
+	u64 delta;
+	u32 contrib, periods;
 	unsigned long scale_freq, scale_cpu;
 
+	/*
+	 * Round now down to a period boundary (one period = 1024*1024ns)
+	 */
+	now = now & (u64)(~0xFFFFF);
 	delta = now - sa->last_update_time;
 	/*
 	 * This should only happen when time goes backwards, which it
@@ -2720,89 +2713,56 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 	}
 
 	/*
-	 * Use 1024ns as the unit of measurement since it's a reasonable
-	 * approximation of 1us and fast to compute.
+	 * Use 1024*1024ns as the period length, a close approximation of 1ms.
 	 */
-	delta >>= 10;
-	if (!delta)
+	periods = delta >> 20;
+	if (!periods)
 		return 0;
 	sa->last_update_time = now;
 
 	scale_freq = arch_scale_freq_capacity(NULL, cpu);
 	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
 
-	/* delta_w is the amount already accumulated against our next period */
-	delta_w = sa->period_contrib;
-	if (delta + delta_w >= 1024) {
-		decayed = 1;
-
-		/* how much left for next period will start over, we don't know yet */
-		sa->period_contrib = 0;
-
-		/*
-		 * Now that we know we're crossing a period boundary, figure
-		 * out how much from delta we need to complete the current
-		 * period and accrue it.
-		 */
-		delta_w = 1024 - delta_w;
-		scaled_delta_w = cap_scale(delta_w, scale_freq);
-		if (weight) {
-			sa->load_sum += weight * scaled_delta_w;
-			if (cfs_rq) {
-				cfs_rq->runnable_load_sum +=
-						weight * scaled_delta_w;
-			}
-		}
-		if (running)
-			sa->util_sum += scaled_delta_w * scale_cpu;
-
-		delta -= delta_w;
-
-		/* Figure out how many additional periods this update spans */
-		periods = delta / 1024;
-		delta %= 1024;
+	/*
+	 * Now we know we're crossing period boundaries, and the *_sum values
+	 * are updated in two steps:
+	 */
 
-		sa->load_sum = decay_load(sa->load_sum, periods + 1);
-		if (cfs_rq) {
-			cfs_rq->runnable_load_sum =
-				decay_load(cfs_rq->runnable_load_sum, periods + 1);
-		}
-		sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
-
-		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
-		contrib = __compute_runnable_contrib(periods);
-		contrib = cap_scale(contrib, scale_freq);
-		if (weight) {
-			sa->load_sum += weight * contrib;
-			if (cfs_rq)
-				cfs_rq->runnable_load_sum += weight * contrib;
-		}
-		if (running)
-			sa->util_sum += contrib * scale_cpu;
+	/*
+	 * Step 1: decay old *_sum
+	 */
+	sa->load_sum = __decay_sum(sa->load_sum, periods);
+	if (cfs_rq) {
+		cfs_rq->runnable_load_sum =
+			__decay_sum(cfs_rq->runnable_load_sum, periods);
 	}
+	sa->util_sum = __decay_sum((u64)(sa->util_sum), periods);
 
-	/* Remainder of delta accrued against u_0` */
-	scaled_delta = cap_scale(delta, scale_freq);
+	/*
+	 * Step 2: accumulate the new contribution, itself decayed over the
+	 * periods elapsed since last_update_time
+	 */
+	contrib = __accumulate_sum(periods);
+	contrib = cap_scale(contrib, scale_freq);
 	if (weight) {
-		sa->load_sum += weight * scaled_delta;
+		sa->load_sum += weight * contrib;
 		if (cfs_rq)
-			cfs_rq->runnable_load_sum += weight * scaled_delta;
+			cfs_rq->runnable_load_sum += weight * contrib;
 	}
 	if (running)
-		sa->util_sum += scaled_delta * scale_cpu;
+		sa->util_sum += contrib * scale_cpu;
 
-	sa->period_contrib += delta;
-
-	if (decayed) {
-		sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
-		if (cfs_rq) {
-			cfs_rq->runnable_load_avg =
-				div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
-		}
-		sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+	/*
+	 * The *_sum values have changed, so update the *_avg values
+	 */
+	sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+	if (cfs_rq) {
+		cfs_rq->runnable_load_avg =
+			div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
 	}
+	sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
 
-	return decayed;
+	return 1;
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-- 
2.1.4
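
For readers following the __decay_sum() comment in the patch, here is a
standalone userspace sketch of the constant-time lookup-table decay it
describes. The table is rebuilt with pow() at startup rather than
precomputed, so entries may differ from the kernel's
__decay_inv_multiply_N[] in the last bits; the helper names are
illustrative, not kernel API.

#include <math.h>
#include <stdint.h>
#include <stdio.h>

#define LOAD_AVG_PERIOD	32	/* half-life: y^32 = 1/2 */

static uint32_t decay_inv[LOAD_AVG_PERIOD];

static void init_decay_table(void)
{
	/* decay_inv[n] ~= y^n in 32-bit fixed point, i.e. ~y^n * 2^32 */
	int n;

	for (n = 0; n < LOAD_AVG_PERIOD; n++)
		decay_inv[n] = (uint32_t)(4294967295.0 * pow(0.5, n / 32.0));
}

/*
 * val * y^n in constant time: one shift halves val for every 32 full
 * periods, then one fixed-point multiply handles the n % 32 remainder.
 */
static uint64_t decay_sum(uint64_t val, uint32_t n)
{
	if (n > LOAD_AVG_PERIOD * 63)
		return 0;
	if (n >= LOAD_AVG_PERIOD) {
		val >>= n / LOAD_AVG_PERIOD;
		n %= LOAD_AVG_PERIOD;
	}
	return (val * decay_inv[n]) >> 32;
}

int main(void)
{
	init_decay_table();
	/* after 32 periods (one half-life) the sum roughly halves */
	printf("%llu\n", (unsigned long long)decay_sum(47742, 32)); /* ~23871 */
	return 0;
}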


Thread overview: 39+ messages
2016-04-10 22:36 [PATCH 0/4] Optimize sched avg computation and implement flat util hierarchy Yuyang Du
2016-04-10 22:36 ` [PATCH 1/4] sched/fair: Optimize sum computation with a lookup table Yuyang Du
2016-04-11  9:08   ` Vincent Guittot
2016-04-11 10:41   ` Juri Lelli
2016-04-11 19:12     ` Yuyang Du
2016-04-12 10:14       ` Juri Lelli
2016-04-12 18:07         ` Yuyang Du
2016-04-13  9:11           ` Juri Lelli
2016-04-11 16:59   ` Dietmar Eggemann
2016-04-11 19:17     ` Yuyang Du
2016-04-12 14:19       ` Peter Zijlstra
2016-04-12 18:12         ` Yuyang Du
2016-04-11 23:21     ` Joe Perches
2016-04-12 12:02       ` Juri Lelli
2016-04-11 23:07   ` Joe Perches
2016-04-10 22:36 ` Yuyang Du [this message]
2016-04-11  9:08   ` [PATCH 2/4] sched/fair: Drop out incomplete current period when sched averages accrue Vincent Guittot
2016-04-11 19:41     ` Yuyang Du
2016-04-12 11:56       ` Vincent Guittot
2016-04-12 21:09         ` Yuyang Du
2016-04-13 11:11           ` Vincent Guittot
2016-04-12 12:02   ` Dietmar Eggemann
2016-04-12 20:14     ` Yuyang Du
2016-04-13  4:04       ` Joe Perches
2016-04-13  8:40       ` Morten Rasmussen
2016-04-13 15:13   ` Dietmar Eggemann
2016-04-13 15:28     ` Vincent Guittot
2016-04-13 16:20       ` Vincent Guittot
2016-04-13 18:44       ` Yuyang Du
2016-04-14 12:52         ` Dietmar Eggemann
2016-04-14 20:05           ` Yuyang Du
2016-04-18 17:59             ` Yuyang Du
2016-04-10 22:36 ` [PATCH 3/4] sched/fair: Modify accumulated sums for load/util averages Yuyang Du
2016-04-11 17:14   ` Dietmar Eggemann
2016-04-10 22:36 ` [PATCH 4/4] sched/fair: Implement flat hierarchical structure for util_avg Yuyang Du
2016-04-11 12:29   ` Vincent Guittot
2016-04-11 20:37     ` Yuyang Du
2016-04-13 11:27       ` Vincent Guittot
2016-04-13 18:20         ` Yuyang Du
