Message-Id: <20160510174613.902178264@infradead.org>
User-Agent: quilt/0.61-1
Date: Tue, 10 May 2016 19:43:16 +0200
From: Peter Zijlstra
To: mingo@kernel.org, linux-kernel@vger.kernel.org
Cc: peterz@infradead.org, Pavan Kondeti, Ben Segall, Matt Fleming,
	Morten Rasmussen, Paul Turner, Thomas Gleixner, byungchul.park@lge.com,
	Andrew Hunter, Mike Galbraith
Subject: [PATCH 2/3] sched,fair: Fix local starvation
References: <20160510174314.355953085@infradead.org>
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Disposition: inline; filename=peterz-sched-fix-local-starvation.patch

Mike reported that the recent commit 3a47d5124a95 ("sched/fair: Fix
fairness issue on migration") broke interactivity and the signal starve
test.

The problem is that I assumed ENQUEUE_WAKING was only set when we do a
cross-cpu wakeup (migration), which isn't true. This means we now
destroy the vruntime history of tasks and wakeup-preemption suffers.

Cure this by making my assumption true: only call
sched_class::task_waking() when we do a cross-cpu wakeup. This avoids
the indirect call in the case where we do a local wakeup.

Cc: Pavan Kondeti
Cc: Ben Segall
Cc: Matt Fleming
Cc: Morten Rasmussen
Cc: Paul Turner
Cc: Thomas Gleixner
Cc: byungchul.park@lge.com
Cc: Andrew Hunter
Fixes: 3a47d5124a95 ("sched/fair: Fix fairness issue on migration")
Reported-by: Mike Galbraith
Signed-off-by: Peter Zijlstra (Intel)
---
 kernel/sched/core.c | 29 +++++++++++++++++++++--------
 kernel/sched/fair.c | 41 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 61 insertions(+), 9 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1709,14 +1709,22 @@ static void
 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 		 struct pin_cookie cookie)
 {
+	int en_flags = ENQUEUE_WAKEUP;
+
 	lockdep_assert_held(&rq->lock);

 #ifdef CONFIG_SMP
 	if (p->sched_contributes_to_load)
 		rq->nr_uninterruptible--;
+
+	/*
+	 * If we migrated, we must have called sched_class::task_waking().
+	 */
+	if (wake_flags & WF_MIGRATED)
+		en_flags |= ENQUEUE_WAKING;
 #endif

-	ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
+	ttwu_activate(rq, p, en_flags);
 	ttwu_do_wakeup(rq, p, wake_flags, cookie);
 }

@@ -1762,7 +1770,11 @@ void sched_ttwu_pending(void)
 	while (llist) {
 		p = llist_entry(llist, struct task_struct, wake_entry);
 		llist = llist_next(llist);
-		ttwu_do_activate(rq, p, 0, cookie);
+		/*
+		 * See ttwu_queue(); we only call ttwu_queue_remote() when
+		 * it's a cross-cpu wakeup.
+		 */
+		ttwu_do_activate(rq, p, WF_MIGRATED, cookie);
 	}

 	lockdep_unpin_lock(&rq->lock, cookie);
@@ -1849,7 +1861,7 @@ bool cpus_share_cache(int this_cpu, int
 }
 #endif /* CONFIG_SMP */

-static void ttwu_queue(struct task_struct *p, int cpu)
+static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
 {
 	struct rq *rq = cpu_rq(cpu);
 	struct pin_cookie cookie;
@@ -1864,7 +1876,7 @@ static void ttwu_queue(struct task_struc

 	raw_spin_lock(&rq->lock);
 	cookie = lockdep_pin_lock(&rq->lock);
-	ttwu_do_activate(rq, p, 0, cookie);
+	ttwu_do_activate(rq, p, wake_flags, cookie);
 	lockdep_unpin_lock(&rq->lock, cookie);
 	raw_spin_unlock(&rq->lock);
 }
@@ -2034,17 +2046,18 @@ try_to_wake_up(struct task_struct *p, un
 	p->sched_contributes_to_load = !!task_contributes_to_load(p);
 	p->state = TASK_WAKING;

-	if (p->sched_class->task_waking)
-		p->sched_class->task_waking(p);
-
 	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
 	if (task_cpu(p) != cpu) {
 		wake_flags |= WF_MIGRATED;
+
+		if (p->sched_class->task_waking)
+			p->sched_class->task_waking(p);
+
 		set_task_cpu(p, cpu);
 	}
 #endif /* CONFIG_SMP */

-	ttwu_queue(p, cpu);
+	ttwu_queue(p, cpu, wake_flags);
 stat:
 	if (schedstat_enabled())
 		ttwu_stat(p, cpu, wake_flags);
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3247,6 +3247,37 @@ static inline void check_schedstat_requi
 	}
 #endif
 }

+
+/*
+ * MIGRATION
+ *
+ *	dequeue
+ *	  update_curr()
+ *	    update_min_vruntime()
+ *	  vruntime -= min_vruntime
+ *
+ *	enqueue
+ *	  update_curr()
+ *	    update_min_vruntime()
+ *	  vruntime += min_vruntime
+ *
+ * this way the vruntime transition between RQs is done when both
+ * min_vruntime are up-to-date.
+ *
+ * WAKEUP (remote)
+ *
+ *	->task_waking_fair()
+ *	  vruntime -= min_vruntime
+ *
+ *	enqueue
+ *	  update_curr()
+ *	    update_min_vruntime()
+ *	  vruntime += min_vruntime
+ *
+ * this way we don't have the most up-to-date min_vruntime on the originating
+ * CPU, but we do have an up-to-date min_vruntime on the destination CPU.
+ */
+
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
@@ -3264,7 +3295,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, st

 	/*
 	 * Otherwise, renormalise after, such that we're placed at the current
-	 * moment in time, instead of some random moment in the past.
+	 * moment in time, instead of some random moment in the past. Being
+	 * placed in the past could significantly boost this task to the
+	 * fairness detriment of existing tasks.
 	 */
 	if (renorm && !curr)
 		se->vruntime += cfs_rq->min_vruntime;
@@ -4811,6 +4844,12 @@ static unsigned long cpu_avg_load_per_ta
 	return 0;
 }

+/*
+ * Called to migrate a waking task; as blocked tasks retain absolute vruntime
+ * the migration needs to deal with this by subtracting the old and adding the
+ * new min_vruntime -- the latter is done by enqueue_entity() when placing
+ * the task on the new runqueue.
+ */
 static void task_waking_fair(struct task_struct *p)
 {
 	struct sched_entity *se = &p->se;
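
For reference, the relative-vruntime scheme the new fair.c comment documents
can be shown with a stand-alone user-space sketch. Everything below is made
up for illustration (toy_rq, toy_task, toy_leave_rq, toy_join_rq are not
kernel names); it only models the subtract-on-leave / add-on-enqueue pairing
that keeps a task's lag intact across a cross-cpu move.

/*
 * toy_vruntime.c - stand-alone sketch of the relative-vruntime scheme
 * described in the enqueue_entity() comment above. Illustrative only,
 * not kernel code.
 */
#include <stdio.h>

struct toy_rq {			/* stand-in for struct cfs_rq */
	unsigned long long min_vruntime;
};

struct toy_task {		/* stand-in for struct sched_entity */
	unsigned long long vruntime;
};

/* Source side (task_waking / migration dequeue): make vruntime relative. */
static void toy_leave_rq(struct toy_task *p, struct toy_rq *src)
{
	p->vruntime -= src->min_vruntime;
}

/* Destination side (enqueue with ENQUEUE_WAKING): make it absolute again. */
static void toy_join_rq(struct toy_task *p, struct toy_rq *dst)
{
	p->vruntime += dst->min_vruntime;
}

int main(void)
{
	struct toy_rq src = { .min_vruntime = 1000 };
	struct toy_rq dst = { .min_vruntime = 5000 };
	struct toy_task p = { .vruntime = 1200 };	/* 200 ahead of src's min_vruntime */

	unsigned long long lag_before = p.vruntime - src.min_vruntime;

	toy_leave_rq(&p, &src);		/* vruntime becomes 200 (relative) */
	toy_join_rq(&p, &dst);		/* vruntime becomes 5200 (absolute) */

	printf("lag before: %llu, lag after: %llu\n",
	       lag_before, p.vruntime - dst.min_vruntime);
	return 0;
}

Built with "cc toy_vruntime.c", it prints the same 200-unit lag before and
after the move; losing either half of that pair, or applying it against a
different runqueue than intended, is what corrupts a task's vruntime history.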