Message-Id: <20160510174613.902178264@infradead.org>
User-Agent: quilt/0.61-1
Date: Tue, 10 May 2016 19:43:16 +0200
From: Peter Zijlstra
To: mingo@kernel.org, linux-kernel@vger.kernel.org
Cc: peterz@infradead.org, Pavan Kondeti, Ben Segall, Matt Fleming,
	Morten Rasmussen, Paul Turner, Thomas Gleixner, byungchul.park@lge.com,
	Andrew Hunter, Mike Galbraith
Subject: [PATCH 2/3] sched,fair: Fix local starvation
References: <20160510174314.355953085@infradead.org>
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Disposition: inline; filename=peterz-sched-fix-local-starvation.patch

Mike reported that the recent commit 3a47d5124a95 ("sched/fair: Fix
fairness issue on migration") broke interactivity and the signal starve
test.

The problem is that I assumed ENQUEUE_WAKING was only set when we do a
cross-cpu wakeup (migration), which isn't true. This means we now
destroy the vruntime history of tasks and wakeup-preemption suffers.

Cure this by making my assumption true: only call
sched_class::task_waking() when we do a cross-cpu wakeup. This avoids
the indirect call in the case where we do a local wakeup.

Cc: Pavan Kondeti
Cc: Ben Segall
Cc: Matt Fleming
Cc: Morten Rasmussen
Cc: Paul Turner
Cc: Thomas Gleixner
Cc: byungchul.park@lge.com
Cc: Andrew Hunter
Fixes: 3a47d5124a95 ("sched/fair: Fix fairness issue on migration")
Reported-by: Mike Galbraith
Signed-off-by: Peter Zijlstra (Intel)
---
 kernel/sched/core.c | 29 +++++++++++++++++++++--------
 kernel/sched/fair.c | 41 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 61 insertions(+), 9 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1709,14 +1709,22 @@ static void
 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 		 struct pin_cookie cookie)
 {
+	int en_flags = ENQUEUE_WAKEUP;
+
 	lockdep_assert_held(&rq->lock);

 #ifdef CONFIG_SMP
 	if (p->sched_contributes_to_load)
 		rq->nr_uninterruptible--;
+
+	/*
+	 * If we migrated, we must have called sched_class::task_waking().
+	 */
+	if (wake_flags & WF_MIGRATED)
+		en_flags |= ENQUEUE_WAKING;
 #endif

-	ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
+	ttwu_activate(rq, p, en_flags);
 	ttwu_do_wakeup(rq, p, wake_flags, cookie);
 }

@@ -1762,7 +1770,11 @@ void sched_ttwu_pending(void)
 	while (llist) {
 		p = llist_entry(llist, struct task_struct, wake_entry);
 		llist = llist_next(llist);
-		ttwu_do_activate(rq, p, 0, cookie);
+		/*
+		 * See ttwu_queue(); we only call ttwu_queue_remote() when
+		 * it's a cross-cpu wakeup.
+		 */
+		ttwu_do_activate(rq, p, WF_MIGRATED, cookie);
 	}

 	lockdep_unpin_lock(&rq->lock, cookie);
@@ -1849,7 +1861,7 @@ bool cpus_share_cache(int this_cpu, int
 }
 #endif /* CONFIG_SMP */

-static void ttwu_queue(struct task_struct *p, int cpu)
+static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
 {
 	struct rq *rq = cpu_rq(cpu);
 	struct pin_cookie cookie;
@@ -1864,7 +1876,7 @@ static void ttwu_queue(struct task_struc

 	raw_spin_lock(&rq->lock);
 	cookie = lockdep_pin_lock(&rq->lock);
-	ttwu_do_activate(rq, p, 0, cookie);
+	ttwu_do_activate(rq, p, wake_flags, cookie);
 	lockdep_unpin_lock(&rq->lock, cookie);
 	raw_spin_unlock(&rq->lock);
 }
@@ -2034,17 +2046,18 @@ try_to_wake_up(struct task_struct *p, un
 	p->sched_contributes_to_load = !!task_contributes_to_load(p);
 	p->state = TASK_WAKING;

-	if (p->sched_class->task_waking)
-		p->sched_class->task_waking(p);
-
 	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
 	if (task_cpu(p) != cpu) {
 		wake_flags |= WF_MIGRATED;
+
+		if (p->sched_class->task_waking)
+			p->sched_class->task_waking(p);
+
 		set_task_cpu(p, cpu);
 	}
 #endif /* CONFIG_SMP */

-	ttwu_queue(p, cpu);
+	ttwu_queue(p, cpu, wake_flags);
 stat:
 	if (schedstat_enabled())
 		ttwu_stat(p, cpu, wake_flags);
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3247,6 +3247,37 @@ static inline void check_schedstat_requi
 	}
 #endif
 }

+
+/*
+ * MIGRATION
+ *
+ *	dequeue
+ *	  update_curr()
+ *	    update_min_vruntime()
+ *	  vruntime -= min_vruntime
+ *
+ *	enqueue
+ *	  update_curr()
+ *	    update_min_vruntime()
+ *	  vruntime += min_vruntime
+ *
+ * this way the vruntime transition between RQs is done when both
+ * min_vruntime are up-to-date.
+ *
+ * WAKEUP (remote)
+ *
+ *	->task_waking_fair()
+ *	  vruntime -= min_vruntime
+ *
+ *	enqueue
+ *	  update_curr()
+ *	    update_min_vruntime()
+ *	  vruntime += min_vruntime
+ *
+ * this way we don't have the most up-to-date min_vruntime on the originating
+ * CPU, but we do have an up-to-date min_vruntime on the destination CPU.
+ */
+
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
@@ -3264,7 +3295,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, st

 	/*
 	 * Otherwise, renormalise after, such that we're placed at the current
-	 * moment in time, instead of some random moment in the past.
+	 * moment in time, instead of some random moment in the past. Being
+	 * placed in the past could significantly boost this task to the
+	 * fairness detriment of existing tasks.
 	 */
 	if (renorm && !curr)
 		se->vruntime += cfs_rq->min_vruntime;
@@ -4811,6 +4844,12 @@ static unsigned long cpu_avg_load_per_ta
 	return 0;
 }

+/*
+ * Called to migrate a waking task; as blocked tasks retain absolute vruntime
+ * the migration needs to deal with this by subtracting the old and adding the
+ * new min_vruntime -- the latter is done by enqueue_entity() when placing
+ * the task on the new runqueue.
+ */
 static void task_waking_fair(struct task_struct *p)
 {
 	struct sched_entity *se = &p->se;
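
For reference, the relative-vruntime scheme the new fair.c comment documents
can be shown with a stand-alone user-space sketch. Everything below is made
up for illustration (toy_rq, toy_task, toy_leave_rq, toy_join_rq are not
kernel names); it only models the subtract-on-leave / add-on-enqueue pairing
that keeps a task's lag intact across a cross-cpu move.

/*
 * toy_vruntime.c - stand-alone sketch of the relative-vruntime scheme
 * described in the enqueue_entity() comment above. Illustrative only,
 * not kernel code.
 */
#include <stdio.h>

struct toy_rq {			/* stand-in for struct cfs_rq */
	unsigned long long min_vruntime;
};

struct toy_task {		/* stand-in for struct sched_entity */
	unsigned long long vruntime;
};

/* Source side (task_waking / migration dequeue): make vruntime relative. */
static void toy_leave_rq(struct toy_task *p, struct toy_rq *src)
{
	p->vruntime -= src->min_vruntime;
}

/* Destination side (enqueue with ENQUEUE_WAKING): make it absolute again. */
static void toy_join_rq(struct toy_task *p, struct toy_rq *dst)
{
	p->vruntime += dst->min_vruntime;
}

int main(void)
{
	struct toy_rq src = { .min_vruntime = 1000 };
	struct toy_rq dst = { .min_vruntime = 5000 };
	struct toy_task p = { .vruntime = 1200 };	/* 200 ahead of src's min_vruntime */

	unsigned long long lag_before = p.vruntime - src.min_vruntime;

	toy_leave_rq(&p, &src);		/* vruntime becomes 200 (relative) */
	toy_join_rq(&p, &dst);		/* vruntime becomes 5200 (absolute) */

	printf("lag before: %llu, lag after: %llu\n",
	       lag_before, p.vruntime - dst.min_vruntime);
	return 0;
}

Built with "cc toy_vruntime.c", it prints the same 200-unit lag before and
after the move; losing either half of that pair, or applying it against a
different runqueue than intended, is what corrupts a task's vruntime history.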