From: Peter Zijlstra <peterz@infradead.org>
To: Vincent Guittot <vincent.guittot@linaro.org>
Cc: mingo@kernel.org, linux-kernel@vger.kernel.org,
	dietmar.eggemann@arm.com, yuyang.du@intel.com,
	Morten.Rasmussen@arm.com, pjt@google.com, bsegall@google.com,
	kernellwp@gmail.com
Subject: Re: [PATCH 4/6 v7] sched: propagate load during synchronous attach/detach
Date: Wed, 9 Nov 2016 16:03:19 +0100
Message-ID: <20161109150319.GR3117@twins.programming.kicks-ass.net>
In-Reply-To: <1478598827-32372-5-git-send-email-vincent.guittot@linaro.org>

On Tue, Nov 08, 2016 at 10:53:45AM +0100, Vincent Guittot wrote:
> When a task moves from/to a cfs_rq, we set a flag which is then used to
> propagate the change at the parent level (sched_entity and cfs_rq) during
> the next update. If the cfs_rq is throttled, the flag stays pending until
> the cfs_rq is unthrottled.
> 
> For propagating the utilization, we copy the utilization of the group
> cfs_rq to the sched_entity.
> 
> For propagating the load, we have to take into account the load of the
> whole task group in order to evaluate the load of the sched_entity.
> As was done before the rewrite of PELT, we add a correction factor in
> case the task group's load is greater than its share, so that the group
> contributes the same load as a task of equal weight.
> 
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
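
The correction factor amounts to scaling the group cfs_rq's load by the
entity's weight relative to the whole task group's load. With made-up
numbers (the variable names here are illustrative, not the patch's):

	long gcfs_rq_load = 2048;	/* load of this group cfs_rq */
	long tg_load      = 4096;	/* load of the whole task group */
	long se_weight    = 1024;	/* weight of the group entity */

	/*
	 * Since tg_load >= gcfs_rq_load, the result can never exceed
	 * se_weight: the group contributes at most what a single task
	 * of equal weight would.
	 */
	long se_load = gcfs_rq_load * se_weight / tg_load;	/* 512 */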


I did the below on top; it basically moves code about a bit to reduce
some of the #ifdef clutter and kills a few comments that I thought were of the:

	i++; /* increment by one */

quality.

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2918,6 +2918,26 @@ __update_load_avg(u64 now, int cpu, stru
 	return decayed;
 }
 
+/*
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define add_positive(_ptr, _val) do {                           \
+	typeof(_ptr) ptr = (_ptr);                              \
+	typeof(_val) val = (_val);                              \
+	typeof(*ptr) res, var = READ_ONCE(*ptr);                \
+								\
+	res = var + val;                                        \
+								\
+	if (val < 0 && res > var)                               \
+		res = 0;                                        \
+								\
+	WRITE_ONCE(*ptr, res);                                  \
+} while (0)
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /**
  * update_tg_load_avg - update the tg's load avg
@@ -2997,59 +3017,12 @@ void set_task_rq_fair(struct sched_entit
 		se->avg.last_update_time = n_last_update_time;
 	}
 }
-#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
 
-static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
-{
-	if (&this_rq()->cfs == cfs_rq) {
-		/*
-		 * There are a few boundary cases this might miss but it should
-		 * get called often enough that that should (hopefully) not be
-		 * a real problem -- added to that it only calls on the local
-		 * CPU, so if we enqueue remotely we'll miss an update, but
-		 * the next tick/schedule should update.
-		 *
-		 * It will not get called when we go idle, because the idle
-		 * thread is a different class (!fair), nor will the utilization
-		 * number include things like RT tasks.
-		 *
-		 * As is, the util number is not freq-invariant (we'd have to
-		 * implement arch_scale_freq_capacity() for that).
-		 *
-		 * See cpu_util().
-		 */
-		cpufreq_update_util(rq_of(cfs_rq), 0);
-	}
-}
-
-/*
- * Signed add and clamp on underflow.
- *
- * Explicitly do a load-store to ensure the intermediate value never hits
- * memory. This allows lockless observations without ever seeing the negative
- * values.
- */
-#define add_positive(_ptr, _val) do {                           \
-	typeof(_ptr) ptr = (_ptr);                              \
-	typeof(_val) val = (_val);                              \
-	typeof(*ptr) res, var = READ_ONCE(*ptr);                \
-								\
-	res = var + val;                                        \
-								\
-	if (val < 0 && res > var)                               \
-		res = 0;                                        \
-								\
-	WRITE_ONCE(*ptr, res);                                  \
-} while (0)
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
 /* Take into account change of utilization of a child task group */
 static inline void
 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	struct cfs_rq *gcfs_rq =  group_cfs_rq(se);
+	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
 	long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
 
 	/* Nothing to update */
@@ -3130,22 +3103,17 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq
 
 static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
 {
-	/* set cfs_rq's flag */
 	cfs_rq->propagate_avg = 1;
 }
 
 static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
 {
-	/* Get my cfs_rq */
 	struct cfs_rq *cfs_rq = group_cfs_rq(se);
 
-	/* Nothing to propagate */
 	if (!cfs_rq->propagate_avg)
 		return 0;
 
-	/* Clear my cfs_rq's flag */
 	cfs_rq->propagate_avg = 0;
-
 	return 1;
 }
 
@@ -3160,28 +3128,51 @@ static inline int propagate_entity_load_
 	if (!test_and_clear_tg_cfs_propagate(se))
 		return 0;
 
-	/* Get parent cfs_rq */
 	cfs_rq = cfs_rq_of(se);
 
-	/* Propagate to parent */
 	set_tg_cfs_propagate(cfs_rq);
 
-	/* Update utilization */
 	update_tg_cfs_util(cfs_rq, se);
-
-	/* Update load */
 	update_tg_cfs_load(cfs_rq, se);
 
 	return 1;
 }
-#else
+
+#else /* CONFIG_FAIR_GROUP_SCHED */
+
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
+
 static inline int propagate_entity_load_avg(struct sched_entity *se)
 {
 	return 0;
 }
 
 static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
-#endif
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+{
+	if (&this_rq()->cfs == cfs_rq) {
+		/*
+		 * There are a few boundary cases this might miss but it should
+		 * get called often enough that that should (hopefully) not be
+		 * a real problem -- added to that it only calls on the local
+		 * CPU, so if we enqueue remotely we'll miss an update, but
+		 * the next tick/schedule should update.
+		 *
+		 * It will not get called when we go idle, because the idle
+		 * thread is a different class (!fair), nor will the utilization
+		 * number include things like RT tasks.
+		 *
+		 * As is, the util number is not freq-invariant (we'd have to
+		 * implement arch_scale_freq_capacity() for that).
+		 *
+		 * See cpu_util().
+		 */
+		cpufreq_update_util(rq_of(cfs_rq), 0);
+	}
+}
 
 /*
  * Unsigned subtract and clamp on underflow.
@@ -3276,8 +3267,7 @@ static inline void update_load_avg(struc
 			  cfs_rq->curr == se, NULL);
 	}
 
-	decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
-
+	decayed  = update_cfs_rq_load_avg(now, cfs_rq, true);
 	decayed |= propagate_entity_load_avg(se);
 
 	if (decayed && (flags & UPDATE_TG))
@@ -8993,11 +8983,6 @@ static void detach_entity_cfs_rq(struct
 	update_load_avg(se, 0);
 	detach_entity_load_avg(cfs_rq, se);
 	update_tg_load_avg(cfs_rq, false);
-
-	/*
-	 * Propagate the detach across the tg tree to make it visible to the
-	 * root
-	 */
 	propagate_entity_cfs_rq(se);
 }
 
@@ -9017,11 +9002,6 @@ static void attach_entity_cfs_rq(struct
 	update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
 	attach_entity_load_avg(cfs_rq, se);
 	update_tg_load_avg(cfs_rq, false);
-
-	/*
-	 * Propagate the attach across the tg tree to make it visible to the
-	 * root
-	 */
 	propagate_entity_cfs_rq(se);
 }
 

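As a footnote on add_positive: the clamp relies on the accumulator being
unsigned (as the load/util average fields are) while the delta is signed.
A quick illustration with made-up values, not part of the patch:

	unsigned long load_sum = 100;

	/*
	 * 100 + (-150) wraps the unsigned accumulator, so the "res > var"
	 * test catches the underflow and the sum is clamped to 0 instead
	 * of being left as a huge bogus value. The single WRITE_ONCE also
	 * means lockless readers never observe that intermediate wrap.
	 */
	add_positive(&load_sum, -150);	/* load_sum == 0 */
	add_positive(&load_sum, 42);	/* load_sum == 42 */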