From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751381AbdFXH1H (ORCPT ); Sat, 24 Jun 2017 03:27:07 -0400 Received: from terminus.zytor.com ([65.50.211.136]:49599 "EHLO terminus.zytor.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751247AbdFXH1F (ORCPT ); Sat, 24 Jun 2017 03:27:05 -0400 Date: Sat, 24 Jun 2017 00:23:24 -0700 From: tip-bot for Rik van Riel Message-ID: Cc: mingo@kernel.org, torvalds@linux-foundation.org, riel@redhat.com, tglx@linutronix.de, peterz@infradead.org, efault@gmx.de, linux-kernel@vger.kernel.org, hpa@zytor.com, mgorman@suse.de Reply-To: torvalds@linux-foundation.org, tglx@linutronix.de, peterz@infradead.org, riel@redhat.com, mingo@kernel.org, hpa@zytor.com, mgorman@suse.de, efault@gmx.de, linux-kernel@vger.kernel.org In-Reply-To: <20170623165530.22514-4-riel@redhat.com> References: <20170623165530.22514-4-riel@redhat.com> To: linux-tip-commits@vger.kernel.org Subject: [tip:sched/core] sched/numa: Implement NUMA node level wake_affine() Git-Commit-ID: 3fed382b46baac83703130fe4cd3d9147f427fb9 X-Mailer: tip-git-log-daemon Robot-ID: Robot-Unsubscribe: Contact to get blacklisted from these emails MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Content-Type: text/plain; charset=UTF-8 Content-Disposition: inline Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Commit-ID: 3fed382b46baac83703130fe4cd3d9147f427fb9 Gitweb: http://git.kernel.org/tip/3fed382b46baac83703130fe4cd3d9147f427fb9 Author: Rik van Riel AuthorDate: Fri, 23 Jun 2017 12:55:29 -0400 Committer: Ingo Molnar CommitDate: Sat, 24 Jun 2017 08:57:52 +0200 sched/numa: Implement NUMA node level wake_affine() Since select_idle_sibling() can place a task anywhere on a socket, comparing loads between individual CPU cores makes no real sense for deciding whether to do an affine wakeup across sockets, either. Instead, compare the load between the sockets in a similar way the load balancer and the numa balancing code do. Signed-off-by: Rik van Riel Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: jhladky@redhat.com Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20170623165530.22514-4-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 130 ++++++++++++++++++++++++++++------------------------ 1 file changed, 71 insertions(+), 59 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fe19016..79ac078 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2586,6 +2586,60 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) } } } + +/* + * Can a task be moved from prev_cpu to this_cpu without causing a load + * imbalance that would trigger the load balancer? + */ +static inline bool numa_wake_affine(struct sched_domain *sd, + struct task_struct *p, int this_cpu, + int prev_cpu, int sync) +{ + struct numa_stats prev_load, this_load; + s64 this_eff_load, prev_eff_load; + + update_numa_stats(&prev_load, cpu_to_node(prev_cpu)); + update_numa_stats(&this_load, cpu_to_node(this_cpu)); + + /* + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current CPU: + */ + if (sync) { + unsigned long current_load = task_h_load(current); + + if (this_load.load > current_load) + this_load.load -= current_load; + else + this_load.load = 0; + } + + /* + * In low-load situations, where this_cpu's node is idle due to the + * sync cause above having dropped this_load.load to 0, move the task. + * Moving to an idle socket will not create a bad imbalance. + * + * Otherwise check if the nodes are near enough in load to allow this + * task to be woken on this_cpu's node. + */ + if (this_load.load > 0) { + unsigned long task_load = task_h_load(p); + + this_eff_load = 100; + this_eff_load *= prev_load.compute_capacity; + + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= this_load.compute_capacity; + + this_eff_load *= this_load.load + task_load; + prev_eff_load *= prev_load.load - task_load; + + return this_eff_load <= prev_eff_load; + } + + return true; +} #else static void task_tick_numa(struct rq *rq, struct task_struct *curr) { @@ -2598,6 +2652,13 @@ static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) { } + +static inline bool numa_wake_affine(struct sched_domain *sd, + struct task_struct *p, int this_cpu, + int prev_cpu, int sync) +{ + return true; +} #endif /* CONFIG_NUMA_BALANCING */ static void @@ -5407,74 +5468,25 @@ static int wake_wide(struct task_struct *p) static int wake_affine(struct sched_domain *sd, struct task_struct *p, int prev_cpu, int sync) { - s64 this_load, load; - s64 this_eff_load, prev_eff_load; - int idx, this_cpu; - struct task_group *tg; - unsigned long weight; - int balanced; - - idx = sd->wake_idx; - this_cpu = smp_processor_id(); - load = source_load(prev_cpu, idx); - this_load = target_load(this_cpu, idx); + int this_cpu = smp_processor_id(); + bool affine = false; /* * Common case: CPUs are in the same socket, and select_idle_sibling() * will do its thing regardless of what we return: */ if (cpus_share_cache(prev_cpu, this_cpu)) - return true; - - /* - * If sync wakeup then subtract the (maximum possible) - * effect of the currently running task from the load - * of the current CPU: - */ - if (sync) { - tg = task_group(current); - weight = current->se.avg.load_avg; - - this_load += effective_load(tg, this_cpu, -weight, -weight); - load += effective_load(tg, prev_cpu, 0, -weight); - } - - tg = task_group(p); - weight = p->se.avg.load_avg; - - /* - * In low-load situations, where prev_cpu is idle and this_cpu is idle - * due to the sync cause above having dropped this_load to 0, we'll - * always have an imbalance, but there's really nothing you can do - * about that, so that's good too. - * - * Otherwise check if either cpus are near enough in load to allow this - * task to be woken on this_cpu. - */ - this_eff_load = 100; - this_eff_load *= capacity_of(prev_cpu); - - prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; - prev_eff_load *= capacity_of(this_cpu); - - if (this_load > 0) { - this_eff_load *= this_load + - effective_load(tg, this_cpu, weight, weight); - - prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); - } - - balanced = this_eff_load <= prev_eff_load; + affine = true; + else + affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync); schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); + if (affine) { + schedstat_inc(sd->ttwu_move_affine); + schedstat_inc(p->se.statistics.nr_wakeups_affine); + } - if (!balanced) - return 0; - - schedstat_inc(sd->ttwu_move_affine); - schedstat_inc(p->se.statistics.nr_wakeups_affine); - - return 1; + return affine; } static inline int task_util(struct task_struct *p);