From: Peter Zijlstra <peterz@infradead.org>
To: Yuyang Du <yuyang.du@intel.com>
Cc: mingo@kernel.org, linux-kernel@vger.kernel.org, pjt@google.com,
	bsegall@google.com, morten.rasmussen@arm.com,
	vincent.guittot@linaro.org, dietmar.eggemann@arm.com,
	matt@codeblueprint.co.uk, umgwanakikbuti@gmail.com
Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()
Date: Tue, 28 Mar 2017 16:46:25 +0200
Message-ID: <20170328144625.GX3093@worktop>
In-Reply-To: <1486935863-25251-3-git-send-email-yuyang.du@intel.com>

On Mon, Feb 13, 2017 at 05:44:23AM +0800, Yuyang Du wrote:
> __update_load_avg() has the following steps:
> 
>   1. add the remainder of the last incomplete period
>   2. decay old sum
>   3. accumulate new sum in full periods since last_update_time
>   4. accumulate the current incomplete period
>   5. update averages
> 
> However, there is no need to separately compute steps 1, 3, and 4.
> 
> Illustration:
> 
>              c1          c3           c4
>              ^           ^            ^
>              |           |            |
>            |<->|<----------------->|<--->|
>    ... |---x---|------| ... |------|-----x (now)
> 
> c1, c3, and c4 are the accumulated (and meanwhile decayed) time
> contributions of steps 1, 3, and 4 respectively.
> 
> With them, the accumulated contribution to load_sum, for example, is:
> 
> contrib = c1 * weight * freq_scaled;
> contrib += c3 * weight * freq_scaled;
> contrib += c4 * weight * freq_scaled;
> 
> Obviously, we can optimize the above, since it is equivalent to:
> 
> contrib = c1 + c3 + c4;
> contrib *= weight * freq_scaled;
> 
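
To make that factoring concrete, here is a minimal stand-alone sketch of
the identity (function and parameter names are illustrative, not the
kernel's; kernel-style integer types assumed):

/*
 * All three decayed segments carry the same weight and frequency
 * scaling, so sum them first and scale once, saving two multiplies
 * per update.
 */
static inline u64 contrib_scaled(u32 c1, u32 c3, u32 c4,
				 unsigned long weight, u64 freq_scaled)
{
	return (u64)(c1 + c3 + c4) * weight * freq_scaled;
}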

So I figured out what it is you're doing; how's this? I still need to
rewrite the Changelog to make this clearer, but I think the code now has
understandable comments.

---


--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2767,7 +2767,7 @@ static const u32 __accumulated_sum_N32[]
  * Approximate:
  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
  */
-static __always_inline u64 decay_load(u64 val, u64 n)
+static u64 decay_load(u64 val, u64 n)
 {
 	unsigned int local_n;
 
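
For reference, decay_load() computes val * y^n with y^32 == 1/2: whole
multiples of 32 periods fold into a right shift, and the remaining 0..31
periods are fixed up via a precomputed table. A user-space sketch of the
same structure (pow() standing in for the kernel's fixed-point table, so
illustrative only):

#include <stdint.h>
#include <math.h>

static uint64_t decay_load_sketch(uint64_t val, uint64_t n)
{
	if (n >= 32 * 64)	/* fully decayed; also avoids a >= 64-bit shift */
		return 0;

	val >>= n / 32;		/* every full 32 periods halve val */
	n %= 32;

	/* fixup for the remaining partial span: val *= y^n */
	return (uint64_t)(val * pow(0.5, (double)n / 32.0));
}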
@@ -2795,32 +2795,111 @@ static __always_inline u64 decay_load(u6
 	return val;
 }
 
-/*
- * For updates fully spanning n periods, the contribution to runnable
- * average will be: \Sum 1024*y^n
- *
- * We can compute this reasonably efficiently by combining:
- *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
- */
-static u32 __compute_runnable_contrib(u64 n)
+static u32 __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder)
 {
-	u32 contrib = 0;
+	u32 contrib;
+
+	if (!periods)
+		return remainder - period_contrib;
 
-	if (likely(n <= LOAD_AVG_PERIOD))
-		return runnable_avg_yN_sum[n];
-	else if (unlikely(n >= LOAD_AVG_MAX_N))
+	if (unlikely(periods >= LOAD_AVG_MAX_N))
 		return LOAD_AVG_MAX;
 
-	/* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
-	contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
-	n %= LOAD_AVG_PERIOD;
-	contrib = decay_load(contrib, n);
-	return contrib + runnable_avg_yN_sum[n];
+	/*
+	 * c1 y^(p+1) + c3 y^0
+	 */
+	remainder += decay_load((u64)(1024 - period_contrib), periods);
+
+	periods -= 1;
+	/*
+	 * For updates fully spanning n periods, the contribution to runnable
+	 * average will be: 1024 \Sum y^n
+	 *
+	 * We can compute this reasonably efficiently by combining:
+	 *
+	 *   y^PERIOD = 1/2 with precomputed 1024 \Sum y^n {for: n < PERIOD}
+	 */
+	if (likely(periods <= LOAD_AVG_PERIOD)) {
+		contrib = runnable_avg_yN_sum[periods];
+	} else {
+		contrib = __accumulated_sum_N32[periods/LOAD_AVG_PERIOD];
+		periods %= LOAD_AVG_PERIOD;
+		contrib = decay_load(contrib, periods);
+		contrib += runnable_avg_yN_sum[periods];
+	}
+
+	return contrib + remainder;
 }
 
 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
 
 /*
+ * Accumulate the three separate parts of the sum; c1 the remainder
+ * of the last (incomplete) period, c2 the span of full periods and c3
+ * the remainder of the (incomplete) current period.
+ *
+ *           c1          c2           c3
+ *           ^           ^            ^
+ *           |           |            |
+ *         |<->|<----------------->|<--->|
+ * ... |---x---|------| ... |------|-----x (now)
+ *
+ *                                p
+ * u' = (u + c1) y^(p+1) + 1024 \Sum y^n + c3 y^0
+ *                               n=1
+ *
+ *    = u y^(p+1) +				(Step 1)
+ *
+ *                          p
+ *      c1 y^(p+1) + 1024 \Sum y^n + c3 y^0	(Step 2)
+ *                         n=1
+ */
+static __always_inline u32
+accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
+	       unsigned long weight, int running, struct cfs_rq *cfs_rq)
+{
+	unsigned long scale_freq, scale_cpu;
+	u64 periods;
+	u32 contrib;
+
+	scale_freq = arch_scale_freq_capacity(NULL, cpu);
+	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+	delta += sa->period_contrib;
+	periods = delta / 1024; /* A period is 1024us (~1ms) */
+
+	/*
+	 * Step 1: decay old *_sum if we crossed period boundaries.
+	 */
+	if (periods) {
+		sa->load_sum = decay_load(sa->load_sum, periods);
+		if (cfs_rq) {
+			cfs_rq->runnable_load_sum =
+				decay_load(cfs_rq->runnable_load_sum, periods);
+		}
+		sa->util_sum = decay_load((u64)(sa->util_sum), periods);
+	}
+
+	/*
+	 * Step 2
+	 */
+	delta %= 1024;
+	contrib = __accumulate_sum(periods, sa->period_contrib, delta);
+	sa->period_contrib = delta;
+
+	contrib = cap_scale(contrib, scale_freq);
+	if (weight) {
+		sa->load_sum += weight * contrib;
+		if (cfs_rq)
+			cfs_rq->runnable_load_sum += weight * contrib;
+	}
+	if (running)
+		sa->util_sum += contrib * scale_cpu;
+
+	return periods;
+}
+
+/*
  * We can represent the historical contribution to runnable average as the
  * coefficients of a geometric series.  To do this we sub-divide our runnable
  * history into segments of approximately 1ms (1024us); label the segment that
@@ -2852,10 +2931,7 @@ static __always_inline int
 ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 		  unsigned long weight, int running, struct cfs_rq *cfs_rq)
 {
-	u64 delta, scaled_delta, periods;
-	u32 contrib;
-	unsigned int delta_w, scaled_delta_w, decayed = 0;
-	unsigned long scale_freq, scale_cpu;
+	u64 delta;
 
 	delta = now - sa->last_update_time;
 	/*
@@ -2876,81 +2952,27 @@ ___update_load_avg(u64 now, int cpu, str
 		return 0;
 	sa->last_update_time = now;
 
-	scale_freq = arch_scale_freq_capacity(NULL, cpu);
-	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
-	/* delta_w is the amount already accumulated against our next period */
-	delta_w = sa->period_contrib;
-	if (delta + delta_w >= 1024) {
-		decayed = 1;
-
-		/* how much left for next period will start over, we don't know yet */
-		sa->period_contrib = 0;
-
-		/*
-		 * Now that we know we're crossing a period boundary, figure
-		 * out how much from delta we need to complete the current
-		 * period and accrue it.
-		 */
-		delta_w = 1024 - delta_w;
-		scaled_delta_w = cap_scale(delta_w, scale_freq);
-		if (weight) {
-			sa->load_sum += weight * scaled_delta_w;
-			if (cfs_rq) {
-				cfs_rq->runnable_load_sum +=
-						weight * scaled_delta_w;
-			}
-		}
-		if (running)
-			sa->util_sum += scaled_delta_w * scale_cpu;
-
-		delta -= delta_w;
-
-		/* Figure out how many additional periods this update spans */
-		periods = delta / 1024;
-		delta %= 1024;
-
-		sa->load_sum = decay_load(sa->load_sum, periods + 1);
-		if (cfs_rq) {
-			cfs_rq->runnable_load_sum =
-				decay_load(cfs_rq->runnable_load_sum, periods + 1);
-		}
-		sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
-
-		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
-		contrib = __compute_runnable_contrib(periods);
-		contrib = cap_scale(contrib, scale_freq);
-		if (weight) {
-			sa->load_sum += weight * contrib;
-			if (cfs_rq)
-				cfs_rq->runnable_load_sum += weight * contrib;
-		}
-		if (running)
-			sa->util_sum += contrib * scale_cpu;
-	}
-
-	/* Remainder of delta accrued against u_0` */
-	scaled_delta = cap_scale(delta, scale_freq);
-	if (weight) {
-		sa->load_sum += weight * scaled_delta;
-		if (cfs_rq)
-			cfs_rq->runnable_load_sum += weight * scaled_delta;
-	}
-	if (running)
-		sa->util_sum += scaled_delta * scale_cpu;
-
-	sa->period_contrib += delta;
+	/*
+	 * Now we know we crossed measurement unit boundaries. The *_avg
+	 * accrues by two steps:
+	 *
+	 * Step 1: accumulate *_sum since last_update_time. If we haven't
+	 * crossed period boundaries, finish.
+	 */
+	if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
+		return 0;
 
-	if (decayed) {
-		sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
-		if (cfs_rq) {
-			cfs_rq->runnable_load_avg =
-				div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
-		}
-		sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+	/*
+	 * Step 2: update *_avg.
+	 */
+	sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+	if (cfs_rq) {
+		cfs_rq->runnable_load_avg =
+			div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
 	}
+	sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
 
-	return decayed;
+	return 1;
 }
 
 static int
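
As a worked example of the two steps (numbers illustrative): suppose the
previous update left period_contrib = 300us in its then-current period,
and delta = 2948us have elapsed since. Then delta + period_contrib =
3248us, so periods = 3 boundaries were crossed and the new remainder is
3248 % 1024 = 176us. Matching the c1/c2/c3 picture above:

  c1 = 1024 - 300 = 724us, decayed by y^3 (three boundaries old)
  c2 = 1024 * (y^1 + y^2), the two fully elapsed periods
  c3 = 176us, undecayed (y^0), carried forward as the new period_contrib

Step 1 decays the old *_sum by y^3; Step 2 has __accumulate_sum() add
c1 y^3 + c2 + c3, scaled once by weight/frequency as per the Changelog.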

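The Step 2 division normalizes the sums against LOAD_AVG_MAX, the value
load_sum saturates at for an always-running entity. A quick user-space
sanity check of that constant (illustrative; it sums in float where the
kernel sums integer-rounded terms, hence the small difference from the
kernel's 47742):

#include <stdio.h>
#include <math.h>

int main(void)
{
	double y = pow(0.5, 1.0 / 32.0);	/* y^32 == 1/2 */
	double sum = 0.0;
	int n;

	/* LOAD_AVG_MAX_N == 345 full periods to effective saturation */
	for (n = 0; n < 345; n++)
		sum += 1024.0 * pow(y, n);

	/* prints the truncated sum (~47.7k) and the limit 1024/(1 - y) */
	printf("sum = %.0f, limit = %.0f\n", sum, 1024.0 / (1.0 - y));
	return 0;
}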