From: Peter Zijlstra <peterz@infradead.org>
To: Yuyang Du <yuyang.du@intel.com>
Cc: mingo@kernel.org, linux-kernel@vger.kernel.org, pjt@google.com,
	bsegall@google.com, morten.rasmussen@arm.com,
	vincent.guittot@linaro.org, dietmar.eggemann@arm.com,
	matt@codeblueprint.co.uk, umgwanakikbuti@gmail.com
Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()
Date: Tue, 28 Mar 2017 16:46:25 +0200
Message-ID: <20170328144625.GX3093@worktop>
In-Reply-To: <1486935863-25251-3-git-send-email-yuyang.du@intel.com>

On Mon, Feb 13, 2017 at 05:44:23AM +0800, Yuyang Du wrote:
> __update_load_avg() has the following steps:
> 
>   1. add the remainder of the last incomplete period
>   2. decay old sum
>   3. accumulate new sum in full periods since last_update_time
>   4. accumulate the current incomplete period
>   5. update averages
> 
> However, there is no need to separately compute steps 1, 3, and 4.
> 
> Illustration:
> 
>              c1          c3           c4
>              ^           ^            ^
>              |           |            |
>            |<->|<----------------->|<--->|
>    ... |---x---|------| ... |------|-----x (now)
> 
> c1, c3, and c4 are the accumulated (and meanwhile decayed) time
> contributions of steps 1, 3, and 4 respectively.
> 
> With them, the accumulated contribution to load_sum, for example, is:
> 
> contrib = c1 * weight * freq_scaled;
> contrib += c3 * weight * freq_scaled;
> contrib += c4 * weight * freq_scaled;
> 
> Obviously, we can optimize the above, since it is equivalent to:
> 
> contrib = c1 + c3 + c4;
> contrib *= weight * freq_scaled;
> 
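
To make that factoring concrete, here is a minimal stand-alone sketch of
the identity (function and parameter names are illustrative, not the
kernel's; kernel-style integer types assumed):

/*
 * All three decayed segments carry the same weight and frequency
 * scaling, so sum them first and scale once, saving two multiplies
 * per update.
 */
static inline u64 contrib_scaled(u32 c1, u32 c3, u32 c4,
				 unsigned long weight, u64 freq_scaled)
{
	return (u64)(c1 + c3 + c4) * weight * freq_scaled;
}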

So I figured out what it is you're doing; how's this? I still need to
rewrite the Changelog to make this clearer, but I think the code now has
understandable comments.

---


--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2767,7 +2767,7 @@ static const u32 __accumulated_sum_N32[]
  * Approximate:
  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
  */
-static __always_inline u64 decay_load(u64 val, u64 n)
+static u64 decay_load(u64 val, u64 n)
 {
 	unsigned int local_n;
 
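
For reference, decay_load() computes val * y^n with y^32 == 1/2: whole
multiples of 32 periods fold into a right shift, and the remaining 0..31
periods are fixed up via a precomputed table. A user-space sketch of the
same structure (pow() standing in for the kernel's fixed-point table, so
illustrative only):

#include <stdint.h>
#include <math.h>

static uint64_t decay_load_sketch(uint64_t val, uint64_t n)
{
	if (n >= 32 * 64)	/* fully decayed; also avoids a >= 64-bit shift */
		return 0;

	val >>= n / 32;		/* every full 32 periods halve val */
	n %= 32;

	/* fixup for the remaining partial span: val *= y^n */
	return (uint64_t)(val * pow(0.5, (double)n / 32.0));
}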
@@ -2795,32 +2795,111 @@ static __always_inline u64 decay_load(u6
 	return val;
 }
 
-/*
- * For updates fully spanning n periods, the contribution to runnable
- * average will be: \Sum 1024*y^n
- *
- * We can compute this reasonably efficiently by combining:
- *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
- */
-static u32 __compute_runnable_contrib(u64 n)
+static u32 __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder)
 {
-	u32 contrib = 0;
+	u32 contrib;
+
+	if (!periods)
+		return remainder - period_contrib;
 
-	if (likely(n <= LOAD_AVG_PERIOD))
-		return runnable_avg_yN_sum[n];
-	else if (unlikely(n >= LOAD_AVG_MAX_N))
+	if (unlikely(periods >= LOAD_AVG_MAX_N))
 		return LOAD_AVG_MAX;
 
-	/* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
-	contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
-	n %= LOAD_AVG_PERIOD;
-	contrib = decay_load(contrib, n);
-	return contrib + runnable_avg_yN_sum[n];
+	/*
+	 * c1 y^(p+1) + c3 y^0
+	 */
+	remainder += decay_load((u64)(1024 - period_contrib), periods);
+
+	periods -= 1;
+	/*
+	 * For updates fully spanning n periods, the contribution to runnable
+	 * average will be: 1024 \Sum y^n
+	 *
+	 * We can compute this reasonably efficiently by combining:
+	 *
+	 *   y^PERIOD = 1/2 with precomputed 1024 \Sum y^n {for: n < PERIOD}
+	 */
+	if (likely(periods <= LOAD_AVG_PERIOD)) {
+		contrib = runnable_avg_yN_sum[periods];
+	} else {
+		contrib = __accumulated_sum_N32[periods/LOAD_AVG_PERIOD];
+		periods %= LOAD_AVG_PERIOD;
+		contrib = decay_load(contrib, periods);
+		contrib += runnable_avg_yN_sum[periods];
+	}
+
+	return contrib + remainder;
 }
 
 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
 
 /*
+ * Accumulate the three separate parts of the sum; c1 the remainder
+ * of the last (incomplete) period, c2 the span of full periods and c3
+ * the remainder of the (incomplete) current period.
+ *
+ *           c1          c2           c3
+ *           ^           ^            ^
+ *           |           |            |
+ *         |<->|<----------------->|<--->|
+ * ... |---x---|------| ... |------|-----x (now)
+ *
+ *                                p
+ * u' = (u + c1) y^(p+1) + 1024 \Sum y^n + c3 y^0
+ *                               n=1
+ *
+ *    = u y^(p+1) +				(Step 1)
+ *
+ *                          p
+ *      c1 y^(p+1) + 1024 \Sum y^n + c3 y^0	(Step 2)
+ *                         n=1
+ */
+static __always_inline u32
+accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
+	       unsigned long weight, int running, struct cfs_rq *cfs_rq)
+{
+	unsigned long scale_freq, scale_cpu;
+	u64 periods;
+	u32 contrib;
+
+	scale_freq = arch_scale_freq_capacity(NULL, cpu);
+	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+	delta += sa->period_contrib;
+	periods = delta / 1024; /* A period is 1024us (~1ms) */
+
+	/*
+	 * Step 1: decay old *_sum if we crossed period boundaries.
+	 */
+	if (periods) {
+		sa->load_sum = decay_load(sa->load_sum, periods);
+		if (cfs_rq) {
+			cfs_rq->runnable_load_sum =
+				decay_load(cfs_rq->runnable_load_sum, periods);
+		}
+		sa->util_sum = decay_load((u64)(sa->util_sum), periods);
+	}
+
+	/*
+	 * Step 2
+	 */
+	delta %= 1024;
+	contrib = __accumulate_sum(periods, sa->period_contrib, delta);
+	sa->period_contrib = delta;
+
+	contrib = cap_scale(contrib, scale_freq);
+	if (weight) {
+		sa->load_sum += weight * contrib;
+		if (cfs_rq)
+			cfs_rq->runnable_load_sum += weight * contrib;
+	}
+	if (running)
+		sa->util_sum += contrib * scale_cpu;
+
+	return periods;
+}
+
+/*
  * We can represent the historical contribution to runnable average as the
  * coefficients of a geometric series.  To do this we sub-divide our runnable
  * history into segments of approximately 1ms (1024us); label the segment that
@@ -2852,10 +2931,7 @@ static __always_inline int
 ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 		  unsigned long weight, int running, struct cfs_rq *cfs_rq)
 {
-	u64 delta, scaled_delta, periods;
-	u32 contrib;
-	unsigned int delta_w, scaled_delta_w, decayed = 0;
-	unsigned long scale_freq, scale_cpu;
+	u64 delta;
 
 	delta = now - sa->last_update_time;
 	/*
@@ -2876,81 +2952,27 @@ ___update_load_avg(u64 now, int cpu, str
 		return 0;
 	sa->last_update_time = now;
 
-	scale_freq = arch_scale_freq_capacity(NULL, cpu);
-	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
-	/* delta_w is the amount already accumulated against our next period */
-	delta_w = sa->period_contrib;
-	if (delta + delta_w >= 1024) {
-		decayed = 1;
-
-		/* how much left for next period will start over, we don't know yet */
-		sa->period_contrib = 0;
-
-		/*
-		 * Now that we know we're crossing a period boundary, figure
-		 * out how much from delta we need to complete the current
-		 * period and accrue it.
-		 */
-		delta_w = 1024 - delta_w;
-		scaled_delta_w = cap_scale(delta_w, scale_freq);
-		if (weight) {
-			sa->load_sum += weight * scaled_delta_w;
-			if (cfs_rq) {
-				cfs_rq->runnable_load_sum +=
-						weight * scaled_delta_w;
-			}
-		}
-		if (running)
-			sa->util_sum += scaled_delta_w * scale_cpu;
-
-		delta -= delta_w;
-
-		/* Figure out how many additional periods this update spans */
-		periods = delta / 1024;
-		delta %= 1024;
-
-		sa->load_sum = decay_load(sa->load_sum, periods + 1);
-		if (cfs_rq) {
-			cfs_rq->runnable_load_sum =
-				decay_load(cfs_rq->runnable_load_sum, periods + 1);
-		}
-		sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
-
-		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
-		contrib = __compute_runnable_contrib(periods);
-		contrib = cap_scale(contrib, scale_freq);
-		if (weight) {
-			sa->load_sum += weight * contrib;
-			if (cfs_rq)
-				cfs_rq->runnable_load_sum += weight * contrib;
-		}
-		if (running)
-			sa->util_sum += contrib * scale_cpu;
-	}
-
-	/* Remainder of delta accrued against u_0` */
-	scaled_delta = cap_scale(delta, scale_freq);
-	if (weight) {
-		sa->load_sum += weight * scaled_delta;
-		if (cfs_rq)
-			cfs_rq->runnable_load_sum += weight * scaled_delta;
-	}
-	if (running)
-		sa->util_sum += scaled_delta * scale_cpu;
-
-	sa->period_contrib += delta;
+	/*
+	 * Now we know we crossed measurement unit boundaries. The *_avg
+	 * accrues by two steps:
+	 *
+	 * Step 1: accumulate *_sum since last_update_time. If we haven't
+	 * crossed period boundaries, finish.
+	 */
+	if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
+		return 0;
 
-	if (decayed) {
-		sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
-		if (cfs_rq) {
-			cfs_rq->runnable_load_avg =
-				div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
-		}
-		sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+	/*
+	 * Step 2: update *_avg.
+	 */
+	sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+	if (cfs_rq) {
+		cfs_rq->runnable_load_avg =
+			div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
 	}
+	sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
 
-	return decayed;
+	return 1;
 }
 
 static int
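
As a worked example of the two steps (numbers illustrative): suppose the
previous update left period_contrib = 300us in its then-current period,
and delta = 2948us have elapsed since. Then delta + period_contrib =
3248us, so periods = 3 boundaries were crossed and the new remainder is
3248 % 1024 = 176us. Matching the c1/c2/c3 picture above:

  c1 = 1024 - 300 = 724us, decayed by y^3 (three boundaries old)
  c2 = 1024 * (y^1 + y^2), the two fully elapsed periods
  c3 = 176us, undecayed (y^0), carried forward as the new period_contrib

Step 1 decays the old *_sum by y^3; Step 2 has __accumulate_sum() add
c1 y^3 + c2 + c3, scaled once by weight/frequency as per the Changelog.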

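The Step 2 division normalizes the sums against LOAD_AVG_MAX, the value
load_sum saturates at for an always-running entity. A quick user-space
sanity check of that constant (illustrative; it sums in float where the
kernel sums integer-rounded terms, hence the small difference from the
kernel's 47742):

#include <stdio.h>
#include <math.h>

int main(void)
{
	double y = pow(0.5, 1.0 / 32.0);	/* y^32 == 1/2 */
	double sum = 0.0;
	int n;

	/* LOAD_AVG_MAX_N == 345 full periods to effective saturation */
	for (n = 0; n < 345; n++)
		sum += 1024.0 * pow(y, n);

	/* prints the truncated sum (~47.7k) and the limit 1024/(1 - y) */
	printf("sum = %.0f, limit = %.0f\n", sum, 1024.0 / (1.0 - y));
	return 0;
}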