From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path:
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1752734AbdEDU33 (ORCPT );
	Thu, 4 May 2017 16:29:29 -0400
Received: from mail-yb0-f194.google.com ([209.85.213.194]:33400 "EHLO
	mail-yb0-f194.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
	with ESMTP id S1751214AbdEDU31 (ORCPT );
	Thu, 4 May 2017 16:29:27 -0400
Date: Thu, 4 May 2017 16:29:25 -0400
From: Tejun Heo
To: Ingo Molnar, Peter Zijlstra
Cc: linux-kernel@vger.kernel.org, Linus Torvalds, Vincent Guittot,
	Mike Galbraith, Paul Turner, Chris Mason, kernel-team@fb.com
Subject: [PATCH 1/3] sched/fair: Peter's shares_type patch
Message-ID: <20170504202925.GB2647@htj.duckdns.org>
References: <20170504202838.GA2647@htj.duckdns.org>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <20170504202838.GA2647@htj.duckdns.org>
User-Agent: Mutt/1.8.0 (2017-02-23)
Sender: linux-kernel-owner@vger.kernel.org
List-ID:
X-Mailing-List: linux-kernel@vger.kernel.org

From: Peter Zijlstra

This patch is a combination of

  http://lkml.kernel.org/r/20170502081905.GA4626@worktop.programming.kicks-ass.net
  +
  http://lkml.kernel.org/r/20170502083009.GA3377@worktop.programming.kicks-ass.net
  +
  a build fix & the use of shares_avg instead of shares_runnable for
  propagating load_avg.

This fixes the propagation problem described in the message below while
keeping the group se->avg.load_avg in line with the matching
cfs_rq->avg.load_avg.

  http://lkml.kernel.org/r/20170424201415.GB14169@wtj.duckdns.org

---
 kernel/sched/fair.c | 98 +++++++++++++++++++++++++---------------------------
 1 file changed, 48 insertions(+), 50 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2636,26 +2636,57 @@ account_entity_dequeue(struct cfs_rq *cf
 	cfs_rq->nr_running--;
 }
 
+enum shares_type {
+	shares_runnable,
+	shares_avg,
+	shares_weight,
+};
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 # ifdef CONFIG_SMP
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+static long
+calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
+		enum shares_type shares_type)
 {
-	long tg_weight, load, shares;
+	long tg_weight, tg_shares, load, shares;
 
-	/*
-	 * This really should be: cfs_rq->avg.load_avg, but instead we use
-	 * cfs_rq->load.weight, which is its upper bound. This helps ramp up
-	 * the shares for small weight interactive tasks.
-	 */
-	load = scale_load_down(cfs_rq->load.weight);
+	tg_shares = READ_ONCE(tg->shares);
+
+	switch (shares_type) {
+	case shares_runnable:
+		/*
+		 * Instead of the correct cfs_rq->avg.load_avg we use
+		 * cfs_rq->runnable_load_avg, which does not include the
+		 * blocked load.
+		 */
+		load = cfs_rq->runnable_load_avg;
+		break;
+
+	case shares_avg:
+		load = cfs_rq->avg.load_avg;
+		break;
+
+	case shares_weight:
+		/*
+		 * Instead of the correct cfs_rq->avg.load_avg we use
+		 * cfs_rq->load.weight, which is its upper bound. This helps
+		 * ramp up the shares for small weight interactive tasks.
+		 */
+		load = scale_load_down(cfs_rq->load.weight);
+		break;
+	}
 
 	tg_weight = atomic_long_read(&tg->load_avg);
 
-	/* Ensure tg_weight >= load */
+	/*
+	 * This ensures the sum is up-to-date for this CPU, in case of the other
+	 * two approximations it biases the sum towards their value and in case
+	 * of (near) UP ensures the division ends up <= 1.
+	 */
 	tg_weight -= cfs_rq->tg_load_avg_contrib;
 	tg_weight += load;
 
-	shares = (tg->shares * load);
+	shares = (tg_shares * load);
 	if (tg_weight)
 		shares /= tg_weight;
 
@@ -2671,15 +2702,11 @@ static long calc_cfs_shares(struct cfs_r
 	 * case no task is runnable on a CPU MIN_SHARES=2 should be returned
 	 * instead of 0.
 	 */
-	if (shares < MIN_SHARES)
-		shares = MIN_SHARES;
-	if (shares > tg->shares)
-		shares = tg->shares;
-
-	return shares;
+	return clamp_t(long, shares, MIN_SHARES, tg_shares);
 }
 # else /* CONFIG_SMP */
-static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+static inline long
+calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, enum shares_type shares_type)
 {
 	return tg->shares;
 }
@@ -2721,7 +2748,7 @@ static void update_cfs_shares(struct sch
 	if (likely(se->load.weight == tg->shares))
 		return;
 #endif
-	shares = calc_cfs_shares(cfs_rq, tg);
+	shares = calc_cfs_shares(cfs_rq, tg, shares_weight);
 
 	reweight_entity(cfs_rq_of(se), se, shares);
 }
@@ -3078,39 +3105,10 @@ static inline void
 update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
-	long delta, load = gcfs_rq->avg.load_avg;
-
-	/*
-	 * If the load of group cfs_rq is null, the load of the
-	 * sched_entity will also be null so we can skip the formula
-	 */
-	if (load) {
-		long tg_load;
-
-		/* Get tg's load and ensure tg_load > 0 */
-		tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
-
-		/* Ensure tg_load >= load and updated with current load*/
-		tg_load -= gcfs_rq->tg_load_avg_contrib;
-		tg_load += load;
-
-		/*
-		 * We need to compute a correction term in the case that the
-		 * task group is consuming more CPU than a task of equal
-		 * weight. A task with a weight equals to tg->shares will have
-		 * a load less or equal to scale_load_down(tg->shares).
-		 * Similarly, the sched_entities that represent the task group
-		 * at parent level, can't have a load higher than
-		 * scale_load_down(tg->shares). And the Sum of sched_entities'
-		 * load must be <= scale_load_down(tg->shares).
-		 */
-		if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
-			/* scale gcfs_rq's load into tg's shares*/
-			load *= scale_load_down(gcfs_rq->tg->shares);
-			load /= tg_load;
-		}
-	}
+	long load, delta;
+	load = scale_load_down(calc_cfs_shares(gcfs_rq, gcfs_rq->tg,
+					       shares_avg));
 
 	delta = load - se->avg.load_avg;
 
 	/* Nothing to update */
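
A note for readers following along outside the kernel tree: all three
shares_type variants above feed the same arithmetic, namely tg->shares scaled
by this CPU's portion of the group-wide load sum and then clamped to
[MIN_SHARES, tg->shares]; only the "load" estimate differs. Below is a minimal
standalone C sketch of that formula. The names sample_group, sample_cfs_rq and
approx_group_shares, and the numbers in main(), are hypothetical and exist
only for illustration; they are not part of this patch or of the kernel.

/*
 * Standalone illustration (not kernel code) of the shares formula used by
 * the new calc_cfs_shares():
 *
 *	shares = tg->shares * load / (tg_load_avg - tg_load_avg_contrib + load)
 *
 * clamped to [MIN_SHARES, tg->shares].  "load" is whichever estimate the
 * caller selected via enum shares_type.  Everything below other than the
 * formula itself is simplified and hypothetical.
 */
#include <stdio.h>

#define MIN_SHARES	2L	/* same floor the kernel applies */

struct sample_group {
	long shares;		/* tg->shares */
	long tg_load_avg;	/* sum of every CPU's published contribution */
};

struct sample_cfs_rq {
	long load;			/* the per-CPU load estimate */
	long tg_load_avg_contrib;	/* this CPU's last published contribution */
};

static long approx_group_shares(const struct sample_cfs_rq *cfs_rq,
				const struct sample_group *tg)
{
	long tg_weight, shares;

	/* Refresh the group-wide sum with this CPU's current load. */
	tg_weight = tg->tg_load_avg;
	tg_weight -= cfs_rq->tg_load_avg_contrib;
	tg_weight += cfs_rq->load;

	shares = tg->shares * cfs_rq->load;
	if (tg_weight)
		shares /= tg_weight;

	/* Never below MIN_SHARES, never above the group's configured shares. */
	if (shares < MIN_SHARES)
		shares = MIN_SHARES;
	if (shares > tg->shares)
		shares = tg->shares;
	return shares;
}

int main(void)
{
	/* Hypothetical numbers: a group with shares=1024 spread over two CPUs. */
	struct sample_group tg = { .shares = 1024, .tg_load_avg = 800 };
	struct sample_cfs_rq cpu0 = { .load = 600, .tg_load_avg_contrib = 500 };
	struct sample_cfs_rq cpu1 = { .load = 300, .tg_load_avg_contrib = 300 };

	printf("cpu0 shares: %ld\n", approx_group_shares(&cpu0, &tg));	/* 682 */
	printf("cpu1 shares: %ld\n", approx_group_shares(&cpu1, &tg));	/* 384 */
	return 0;
}

With shares_avg, update_tg_cfs_load() above propagates this clamped value
computed from cfs_rq->avg.load_avg, which is what keeps the group se's
load_avg in line with its cfs_rq.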