From: Peter Zijlstra <peterz@infradead.org>
To: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Yuyang Du <yuyang.du@intel.com>, Ingo Molnar <mingo@kernel.org>,
	linux-kernel <linux-kernel@vger.kernel.org>,
	Mike Galbraith <umgwanakikbuti@gmail.com>,
	Benjamin Segall <bsegall@google.com>,
	Paul Turner <pjt@google.com>,
	Morten Rasmussen <morten.rasmussen@arm.com>,
	Dietmar Eggemann <dietmar.eggemann@arm.com>,
	Matt Fleming <matt@codeblueprint.co.uk>
Subject: Re: [PATCH v6 1/4] sched/fair: Fix attaching task sched avgs twice when switching to fair or changing task group
Date: Wed, 15 Jun 2016 17:22:17 +0200
Message-ID: <20160615152217.GN30921@twins.programming.kicks-ass.net>
In-Reply-To: <CAKfTPtC6SqsTkH4u6GT_64wwVr9t0mf8J0TchxSQGfbH6oAX9A@mail.gmail.com>

On Wed, Jun 15, 2016 at 09:46:53AM +0200, Vincent Guittot wrote:
> I still have concerns with this change of behavior that attaches
> the task only when it is enqueued. The load avg of the task will not
> be decayed between the time we move it into its new group and its
> enqueue. With this change, a task's load can stay high even though it
> has slept for the last couple of seconds. Its load and utilization
> are then not accounted anywhere in the meantime, just because we have
> moved the task, which will be enqueued on the same rq.
> A task should always be attached to a cfs_rq, and its load/utilization
> should always be accounted on a cfs_rq and decayed over its sleep
> period.

OK; so I think I agree with that. Does the below (completely untested,
hasn't even been near a compiler) look reasonable?

The general idea is to always attach to a cfs_rq -- through
post_init_entity_util_avg(). This covers both the case where the new
task isn't attached yet and the case where it was never in the fair
class to begin with.
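
For reference, the decay Vincent is pointing at: PELT ages load in
1024us periods with a 32-period half-life, so a task that has slept
for a couple of seconds should have essentially no load left; if it
sits detached during that sleep, none of that aging happens and it
comes back with its stale contribution. A standalone illustration of
the aging -- not kernel code, and treating one period as roughly 1ms:

#include <math.h>
#include <stdio.h>

int main(void)
{
	/* per-period decay factor y, chosen such that y^32 == 1/2 */
	double y = pow(0.5, 1.0 / 32.0);
	double load = 1024.0;	/* some initial load_avg contribution */
	int ms[] = { 0, 32, 100, 500, 2000 };

	for (int i = 0; i < 5; i++)
		printf("asleep %4d ms: load_avg ~ %6.1f\n",
		       ms[i], load * pow(y, ms[i]));

	return 0;
}

After ~2 seconds the remaining contribution is effectively zero, which
is exactly what the detached case fails to reflect.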

That only leaves a tiny hole in fork(), where the task is hashed but
hasn't yet passed through wake_up_new_task(), during which someone can
do a cgroup move on it. That hole is closed with TASK_NEW and
can_attach() refusing such tasks.
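
Roughly, the ordering that closes the hole -- a sketch of the relevant
call sites, not exact code:

	copy_process()
	  sched_fork()			/* p->state = TASK_NEW */
	  /*
	   * task becomes visible in the pid-hash; a cgroup migration can
	   * now find it, but cpu_cgroup_can_attach() sees TASK_NEW and
	   * returns -EINVAL
	   */
	wake_up_new_task()
	  p->state = TASK_RUNNING
	  post_init_entity_util_avg()	/* attach to the cfs_rq */
	  activate_task()		/* enqueue */

So by the time sched_move_task() can get at the task, its averages are
attached and can be detached/attached like for any other task.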

I think this covers all scenarios, but my brain is completely fried;
I'll go over it (and my many callgraph notes) again tomorrow.


(There's an optimization to the fork path squashed in here; I'll split
it out into a separate patch. It avoids calling set_task_cpu() twice
and doing the IRQ disable/enable dance twice.)
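
The before/after of that fork path, roughly -- only the locking and
cpu-setting shown:

	/* before */
	sched_fork()
	  task_fork_fair()
	    raw_spin_lock_irqsave(&rq->lock)	/* IRQ dance #1 */
	    __set_task_cpu(p, this_cpu)		/* set cpu #1 */
	    raw_spin_unlock_irqrestore(&rq->lock)
	  raw_spin_lock_irqsave(&p->pi_lock)	/* IRQ dance #2 */
	  set_task_cpu(p, cpu)			/* set cpu #2 */
	  raw_spin_unlock_irqrestore(&p->pi_lock)

	/* after */
	sched_fork()
	  raw_spin_lock_irqsave(&p->pi_lock)	/* one IRQ dance */
	  set_task_cpu(p, cpu)			/* one cpu set */
	  task_fork_fair()
	    raw_spin_lock(&rq->lock)		/* IRQs already off */
	    raw_spin_unlock(&rq->lock)
	  raw_spin_unlock_irqrestore(&p->pi_lock)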

---
 include/linux/sched.h |  5 +++--
 kernel/sched/core.c   | 25 +++++++++++++++++++------
 kernel/sched/fair.c   | 42 +++++++++++++++++-------------------------
 3 files changed, 39 insertions(+), 33 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e31758b962ec..648126572c34 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -220,9 +220,10 @@ extern void proc_sched_set_task(struct task_struct *p);
 #define TASK_WAKING		256
 #define TASK_PARKED		512
 #define TASK_NOLOAD		1024
-#define TASK_STATE_MAX		2048
+#define TASK_NEW		2048
+#define TASK_STATE_MAX		4096
 
-#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN"
+#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn"
 
 extern char ___assert_task_state[1 - 2*!!(
 		sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 31c30e5bc338..de5522260dc6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2340,11 +2340,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 
 	__sched_fork(clone_flags, p);
 	/*
-	 * We mark the process as running here. This guarantees that
+	 * We mark the process as NEW here. This guarantees that
 	 * nobody will actually run it, and a signal or other external
 	 * event cannot wake it up and insert it on the runqueue either.
 	 */
-	p->state = TASK_RUNNING;
+	p->state = TASK_NEW;
 
 	/*
 	 * Make sure we do not leak PI boosting priority to the child.
@@ -2381,8 +2381,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 		p->sched_class = &fair_sched_class;
 	}
 
-	if (p->sched_class->task_fork)
-		p->sched_class->task_fork(p);
+	init_entity_runnable_average(&p->se);
 
 	/*
 	 * The child is not yet in the pid-hash so no cgroup attach races,
@@ -2393,6 +2392,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 	 */
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	set_task_cpu(p, cpu);
+	if (p->sched_class->task_fork)
+		p->sched_class->task_fork(p);
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 #ifdef CONFIG_SCHED_INFO
@@ -2524,9 +2525,8 @@ void wake_up_new_task(struct task_struct *p)
 	struct rq_flags rf;
 	struct rq *rq;
 
-	/* Initialize new task's runnable average */
-	init_entity_runnable_average(&p->se);
 	raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
 	/*
 	 * Fork balancing, do it here and not earlier because:
@@ -8220,6 +8220,19 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
 		if (task->sched_class != &fair_sched_class)
 			return -EINVAL;
 #endif
+		/*
+		 * Serialize against wake_up_new_task() such
+		 * that if it's running, we're sure to observe
+		 * its full state.
+		 */
+		raw_spin_unlock_wait(&task->pi_lock);
+		/*
+		 * Avoid calling sched_move_task() before wake_up_new_task()
+		 * has happened. This would lead to problems with PELT. See
+		 * XXX.
+		 */
+		if (task->state == TASK_NEW)
+			return -EINVAL;
 	}
 	return 0;
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f75930bdd326..8c5f59fc205a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -733,6 +733,8 @@ void post_init_entity_util_avg(struct sched_entity *se)
 		}
 		sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
 	}
+
+	attach_entity_load_avg(cfs_rq, se);
 }
 
 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
@@ -2941,6 +2943,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 	/*
 	 * If we got migrated (either between CPUs or between cgroups) we'll
 	 * have aged the average right before clearing @last_update_time.
+	 *
+	 * Or we're fresh through post_init_entity_util_avg().
 	 */
 	if (se->avg.last_update_time) {
 		__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
@@ -3046,11 +3050,14 @@ void remove_entity_load_avg(struct sched_entity *se)
 	u64 last_update_time;
 
 	/*
-	 * Newly created task or never used group entity should not be removed
-	 * from its (source) cfs_rq
+	 * tasks cannot exit without having gone through wake_up_new_task() ->
+	 * post_init_entity_util_avg() which will have added things to the
+	 * cfs_rq, so we can remove unconditionally.
+	 *
+	 * Similarly for groups, they will have passed through
+	 * post_init_entity_util_avg() before unregister_fair_sched_group()
+	 * calls this.
 	 */
-	if (se->avg.last_update_time == 0)
-		return;
 
 	last_update_time = cfs_rq_last_update_time(cfs_rq);
 
@@ -4418,7 +4425,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		 *
 		 * note: in the case of encountering a throttled cfs_rq we will
 		 * post the final h_nr_running increment below.
-		*/
+		 */
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 		cfs_rq->h_nr_running++;
@@ -8255,31 +8262,17 @@ static void task_fork_fair(struct task_struct *p)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se, *curr;
-	int this_cpu = smp_processor_id();
 	struct rq *rq = this_rq();
-	unsigned long flags;
-
-	raw_spin_lock_irqsave(&rq->lock, flags);
 
+	raw_spin_lock(&rq->lock);
 	update_rq_clock(rq);
 
 	cfs_rq = task_cfs_rq(current);
 	curr = cfs_rq->curr;
-
-	/*
-	 * Not only the cpu but also the task_group of the parent might have
-	 * been changed after parent->se.parent,cfs_rq were copied to
-	 * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
-	 * of child point to valid ones.
-	 */
-	rcu_read_lock();
-	__set_task_cpu(p, this_cpu);
-	rcu_read_unlock();
-
-	update_curr(cfs_rq);
-
-	if (curr)
+	if (curr) {
+		update_curr(cfs_rq);
 		se->vruntime = curr->vruntime;
+	}
 	place_entity(cfs_rq, se, 1);
 
 	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
@@ -8292,8 +8285,7 @@ static void task_fork_fair(struct task_struct *p)
 	}
 
 	se->vruntime -= cfs_rq->min_vruntime;
-
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
+	raw_spin_unlock(&rq->lock);
 }
 
 /*
