Date: Sat, 30 Apr 2016 14:47:31 +0200
From: Peter Zijlstra
To: Chris Mason, Ingo Molnar, Matt Fleming, Mike Galbraith,
	linux-kernel@vger.kernel.org
Subject: Re: sched: tweak select_idle_sibling to look for idle threads
Message-ID: <20160430124731.GE2975@worktop.cust.blueprintrf.com>
References: <20160405180822.tjtyyc3qh4leflfj@floor.thefacebook.com>
 <20160409190554.honue3gtian2p6vr@floor.thefacebook.com>
In-Reply-To: <20160409190554.honue3gtian2p6vr@floor.thefacebook.com>

On Sat, Apr 09, 2016 at 03:05:54PM -0400, Chris Mason wrote:
> select_task_rq_fair() can leave cpu utilization a little lumpy,
> especially as the workload ramps up to the maximum capacity of the
> machine. The end result can be high p99 response times as apps
> wait to get scheduled, even when boxes are mostly idle.
>
> I wrote schbench to try and measure this:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/mason/schbench.git

Can you guys have a play with this? I think one- and two-node tbench
results are good, but I seem to be getting significant run-to-run
variance on that, so maybe I'm not doing it right.

schbench numbers with:

  ./schbench -m2 -t 20 -c 30000 -s 30000 -r 30

on my ivb-ep (2 sockets, 10 cores/socket, 2 threads/core) appear to be
decent.

I've also not run anything other than schbench/tbench, so maybe I
completely wrecked something else (as per usual..).

I've not thought about that bounce_to_target() thing much.. I'll go
give that a ponder.

---
 kernel/sched/fair.c      | 180 +++++++++++++++++++++++++++++++++++------------
 kernel/sched/features.h  |   1 +
 kernel/sched/idle_task.c |   4 +-
 kernel/sched/sched.h     |   1 +
 kernel/time/tick-sched.c |  10 +--
 5 files changed, 146 insertions(+), 50 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b8a33abce650..b9d8d1dc5183 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1501,8 +1501,10 @@ static void task_numa_compare(struct task_numa_env *env,
 	 * One idle CPU per node is evaluated for a task numa move.
 	 * Call select_idle_sibling to maybe find a better one.
 	 */
-	if (!cur)
+	if (!cur) {
+		// XXX borken
 		env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+	}
 
 assign:
 	assigned = true;
@@ -4491,6 +4493,17 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 }
 
 #ifdef CONFIG_SMP
+
+/*
+ * Working cpumask for:
+ *   load_balance,
+ *   load_balance_newidle,
+ *   select_idle_core.
+ *
+ * Assumes softirqs are disabled when in use.
+ */
+DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
+
 #ifdef CONFIG_NO_HZ_COMMON
 /*
  * per rq 'load' arrray crap; XXX kill this.
@@ -5162,65 +5175,147 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 
 	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
 }
 
+#ifdef CONFIG_SCHED_SMT
+
+static inline void clear_idle_cores(int cpu)
+{
+	struct sched_domain *sd = rcu_dereference(per_cpu(sd_busy, cpu));
+	if (!sd)
+		return;
+
+	WRITE_ONCE(sd->groups->sgc->has_idle_cores, 0);
+}
+
+static inline void set_idle_cores(int cpu)
+{
+	struct sched_domain *sd = rcu_dereference(per_cpu(sd_busy, cpu));
+	if (!sd)
+		return;
+
+	WRITE_ONCE(sd->groups->sgc->has_idle_cores, 1);
+}
+
+static inline bool test_idle_cores(int cpu)
+{
+	struct sched_domain *sd = rcu_dereference(per_cpu(sd_busy, cpu));
+	if (!sd)
+		return false;
+
+	// XXX static key for !SMT topologies
+
+	return READ_ONCE(sd->groups->sgc->has_idle_cores);
+}
+
+void update_idle_core(struct rq *rq)
+{
+	int core = cpu_of(rq);
+	int cpu;
+
+	rcu_read_lock();
+	if (test_idle_cores(core))
+		goto unlock;
+
+	for_each_cpu(cpu, cpu_smt_mask(core)) {
+		if (cpu == core)
+			continue;
+
+		if (!idle_cpu(cpu))
+			goto unlock;
+	}
+
+	set_idle_cores(core);
+unlock:
+	rcu_read_unlock();
+}
+
+static int select_idle_core(struct task_struct *p, int target)
+{
+	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
+	struct sched_domain *sd;
+	int core, cpu;
+
+	sd = rcu_dereference(per_cpu(sd_llc, target));
+	cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
+	for_each_cpu(core, cpus) {
+		bool idle = true;
+
+		for_each_cpu(cpu, cpu_smt_mask(core)) {
+			cpumask_clear_cpu(cpu, cpus);
+			if (!idle_cpu(cpu))
+				idle = false;
+		}
+
+		if (idle)
+			break;
+	}
+
+	return core;
+}
+
+#else /* CONFIG_SCHED_SMT */
+
+static inline void clear_idle_cores(int cpu) { }
+static inline void set_idle_cores(int cpu) { }
+
+static inline bool test_idle_cores(int cpu)
+{
+	return false;
+}
+
+void update_idle_core(struct rq *rq) { }
+
+static inline int select_idle_core(struct task_struct *p, int target)
+{
+	return -1;
+}
+
+#endif /* CONFIG_SCHED_SMT */
+
 /*
- * Try and locate an idle CPU in the sched_domain.
+ * Try and locate an idle core/thread in the LLC cache domain.
  */
 static int select_idle_sibling(struct task_struct *p, int target)
 {
 	struct sched_domain *sd;
-	struct sched_group *sg;
 	int i = task_cpu(p);
 
 	if (idle_cpu(target))
 		return target;
 
 	/*
-	 * If the prevous cpu is cache affine and idle, don't be stupid.
+	 * If the previous cpu is cache affine and idle, don't be stupid.
 	 */
 	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
 		return i;
 
+	sd = rcu_dereference(per_cpu(sd_llc, target));
+	if (!sd)
+		return target;
+
 	/*
-	 * Otherwise, iterate the domains and find an eligible idle cpu.
-	 *
-	 * A completely idle sched group at higher domains is more
-	 * desirable than an idle group at a lower level, because lower
-	 * domains have smaller groups and usually share hardware
-	 * resources which causes tasks to contend on them, e.g. x86
-	 * hyperthread siblings in the lowest domain (SMT) can contend
-	 * on the shared cpu pipeline.
-	 *
-	 * However, while we prefer idle groups at higher domains
-	 * finding an idle cpu at the lowest domain is still better than
-	 * returning 'target', which we've already established, isn't
-	 * idle.
+	 * If there are idle cores to be had, go find one.
 	 */
-	sd = rcu_dereference(per_cpu(sd_llc, target));
-	for_each_lower_domain(sd) {
-		sg = sd->groups;
-		do {
-			if (!cpumask_intersects(sched_group_cpus(sg),
-						tsk_cpus_allowed(p)))
-				goto next;
-
-			/* Ensure the entire group is idle */
-			for_each_cpu(i, sched_group_cpus(sg)) {
-				if (i == target || !idle_cpu(i))
-					goto next;
-			}
+	if (sched_feat(IDLE_CORE) && test_idle_cores(target)) {
+		i = select_idle_core(p, target);
+		if ((unsigned)i < nr_cpumask_bits)
+			return i;
 
-			/*
-			 * It doesn't matter which cpu we pick, the
-			 * whole group is idle.
-			 */
-			target = cpumask_first_and(sched_group_cpus(sg),
-					tsk_cpus_allowed(p));
-			goto done;
-next:
-			sg = sg->next;
-		} while (sg != sd->groups);
+		/*
+		 * Failed to find an idle core; stop looking for one.
+		 */
+		clear_idle_cores(target);
 	}
-done:
+
+	/*
+	 * Otherwise, settle for anything idle in this cache domain.
+	 */
+	for_each_cpu(i, sched_domain_span(sd)) {
+		if (!cpumask_test_cpu(i, tsk_cpus_allowed(p)))
+			continue;
+		if (idle_cpu(i))
+			return i;
+	}
+
 	return target;
 }
 
@@ -7229,9 +7324,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
  */
 #define MAX_PINNED_INTERVAL	512
 
-/* Working cpumask for load_balance and load_balance_newidle. */
-DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
-
 static int need_active_balance(struct lb_env *env)
 {
 	struct sched_domain *sd = env->sd;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 69631fa46c2f..76bb8814649a 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -69,3 +69,4 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
+SCHED_FEAT(IDLE_CORE, true)
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 47ce94931f1b..cb394db407e4 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -23,11 +23,13 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
 	resched_curr(rq);
 }
 
+extern void update_idle_core(struct rq *rq);
+
 static struct task_struct *
 pick_next_task_idle(struct rq *rq, struct task_struct *prev)
 {
 	put_prev_task(rq, prev);
-
+	update_idle_core(rq);
 	schedstat_inc(rq, sched_goidle);
 	return rq->idle;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 69da6fcaa0e8..5994794bfc85 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -866,6 +866,7 @@ struct sched_group_capacity {
 	 * Number of busy cpus in this group.
 	 */
 	atomic_t nr_busy_cpus;
+	int	has_idle_cores;
 
 	unsigned long cpumask[0]; /* iteration mask */
 };
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 31872bc53bc4..6e42cd218ba5 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -933,11 +933,11 @@ void tick_nohz_idle_enter(void)
 	WARN_ON_ONCE(irqs_disabled());
 
 	/*
-	 * Update the idle state in the scheduler domain hierarchy
-	 * when tick_nohz_stop_sched_tick() is called from the idle loop.
-	 * State will be updated to busy during the first busy tick after
-	 * exiting idle.
-	 */
+	 * Update the idle state in the scheduler domain hierarchy
+	 * when tick_nohz_stop_sched_tick() is called from the idle loop.
+	 * State will be updated to busy during the first busy tick after
+	 * exiting idle.
+	 */
 	set_cpu_sd_state_idle();
 
 	local_irq_disable();
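
For anyone who wants to see the shape of the thing without the kernel
scaffolding, here is a small stand-alone C sketch of the same bookkeeping.
Everything in it (the fixed 4-core/2-thread topology, the cpu_idle[] array,
the function names) is invented for illustration; it is not the patch above,
just a toy model of the has_idle_cores hint plus the idle-core scan.

/*
 * Toy userspace model of the idle-core bookkeeping in the patch above.
 * Topology, state arrays and names are made up; not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS		8	/* one LLC: 4 cores x 2 SMT threads */
#define SMT_WIDTH	2

static bool cpu_idle[NR_CPUS];
static bool llc_has_idle_cores;		/* models sgc->has_idle_cores */

/* First SMT sibling of the core that 'cpu' belongs to. */
static int core_of(int cpu)
{
	return cpu - (cpu % SMT_WIDTH);
}

/*
 * Called when 'cpu' goes idle: if all of its SMT siblings are now idle
 * too, advertise that this LLC has at least one fully idle core.
 */
static void cpu_goes_idle(int cpu)
{
	int core = core_of(cpu), i;

	cpu_idle[cpu] = true;

	if (llc_has_idle_cores)
		return;

	for (i = core; i < core + SMT_WIDTH; i++)
		if (!cpu_idle[i])
			return;

	llc_has_idle_cores = true;
}

/*
 * Scan for a core whose threads are all idle; clear the hint when the
 * scan fails so later wakeups skip it until a core goes idle again.
 */
static int find_idle_core(void)
{
	int core, i;

	if (!llc_has_idle_cores)
		return -1;

	for (core = 0; core < NR_CPUS; core += SMT_WIDTH) {
		bool idle = true;

		for (i = core; i < core + SMT_WIDTH; i++)
			if (!cpu_idle[i])
				idle = false;

		if (idle)
			return core;
	}

	llc_has_idle_cores = false;
	return -1;
}

/* Prefer a whole idle core; otherwise settle for any idle thread. */
static int pick_cpu(int target)
{
	int cpu = find_idle_core();

	if (cpu >= 0)
		return cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		if (cpu_idle[cpu])
			return cpu;

	return target;
}

int main(void)
{
	/* CPUs 4+5 form a fully idle core; CPU 7's sibling stays busy. */
	cpu_goes_idle(4);
	cpu_goes_idle(5);
	cpu_goes_idle(7);

	printf("picked cpu %d (expect 4: first fully idle core)\n",
	       pick_cpu(0));
	return 0;
}

The trade-off the hint buys is that the wakeup path only pays for the full
SMT scan while it can plausibly succeed; once a scan comes up empty the flag
is cleared and subsequent wakeups fall straight through to the cheaper "any
idle thread in this LLC" loop, until a CPU going idle notices its whole core
is idle and re-arms the flag (update_idle_core() in the patch).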