From: Peter Zijlstra <peterz@infradead.org>
To: Chris Mason <clm@fb.com>, Ingo Molnar <mingo@kernel.org>,
Matt Fleming <matt@codeblueprint.co.uk>,
Mike Galbraith <mgalbraith@suse.de>,
linux-kernel@vger.kernel.org
Subject: Re: sched: tweak select_idle_sibling to look for idle threads
Date: Sat, 30 Apr 2016 14:47:31 +0200 [thread overview]
Message-ID: <20160430124731.GE2975@worktop.cust.blueprintrf.com> (raw)
In-Reply-To: <20160409190554.honue3gtian2p6vr@floor.thefacebook.com>
On Sat, Apr 09, 2016 at 03:05:54PM -0400, Chris Mason wrote:
> select_task_rq_fair() can leave cpu utilization a little lumpy,
> especially as the workload ramps up to the maximum capacity of the
> machine. The end result can be high p99 response times as apps
> wait to get scheduled, even when boxes are mostly idle.
>
> I wrote schbench to try and measure this:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/mason/schbench.git
Can you guys have a play with this; I think one- and two-node tbench are
good, but I seem to be getting significant run-to-run variance on that,
so maybe I'm not doing it right.
schbench numbers with: ./schbench -m2 -t 20 -c 30000 -s 30000 -r 30
on my ivb-ep (2 sockets, 10 cores/socket, 2 threads/core) appear to be
decent.
I've also not run anything other than schbench/tbench so maybe I
completely wrecked something else (as per usual..).
I've not thought about that bounce_to_target() thing much.. I'll go give
that a ponder.
---
kernel/sched/fair.c | 180 +++++++++++++++++++++++++++++++++++------------
kernel/sched/features.h | 1 +
kernel/sched/idle_task.c | 4 +-
kernel/sched/sched.h | 1 +
kernel/time/tick-sched.c | 10 +--
5 files changed, 146 insertions(+), 50 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b8a33abce650..b9d8d1dc5183 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1501,8 +1501,10 @@ static void task_numa_compare(struct task_numa_env *env,
* One idle CPU per node is evaluated for a task numa move.
* Call select_idle_sibling to maybe find a better one.
*/
- if (!cur)
+ if (!cur) {
+ // XXX borken
env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+ }
assign:
assigned = true;
@@ -4491,6 +4493,17 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
}
#ifdef CONFIG_SMP
+
+/*
+ * Working cpumask for:
+ * load_balance,
+ * load_balance_newidle,
+ * select_idle_core.
+ *
+ * Assumes softirqs are disabled when in use.
+ */
+DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
+
#ifdef CONFIG_NO_HZ_COMMON
/*
* per rq 'load' arrray crap; XXX kill this.
@@ -5162,65 +5175,147 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
+#ifdef CONFIG_SCHED_SMT
+
+static inline void clear_idle_cores(int cpu)
+{
+ struct sched_domain *sd = rcu_dereference(per_cpu(sd_busy, cpu));
+ if (!sd)
+ return;
+
+ WRITE_ONCE(sd->groups->sgc->has_idle_cores, 0);
+}
+
+static inline void set_idle_cores(int cpu)
+{
+ struct sched_domain *sd = rcu_dereference(per_cpu(sd_busy, cpu));
+ if (!sd)
+ return;
+
+ WRITE_ONCE(sd->groups->sgc->has_idle_cores, 1);
+}
+
+static inline bool test_idle_cores(int cpu)
+{
+ struct sched_domain *sd = rcu_dereference(per_cpu(sd_busy, cpu));
+ if (!sd)
+ return false;
+
+ // XXX static key for !SMT topologies
+
+ return READ_ONCE(sd->groups->sgc->has_idle_cores);
+}
+
+void update_idle_core(struct rq *rq)
+{
+ int core = cpu_of(rq);
+ int cpu;
+
+ rcu_read_lock();
+ if (test_idle_cores(core))
+ goto unlock;
+
+ for_each_cpu(cpu, cpu_smt_mask(core)) {
+ if (cpu == core)
+ continue;
+
+ if (!idle_cpu(cpu))
+ goto unlock;
+ }
+
+ set_idle_cores(core);
+unlock:
+ rcu_read_unlock();
+}
+
+static int select_idle_core(struct task_struct *p, int target)
+{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
+ struct sched_domain *sd;
+ int core, cpu;
+
+ sd = rcu_dereference(per_cpu(sd_llc, target));
+ cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
+ for_each_cpu(core, cpus) {
+ bool idle = true;
+
+ for_each_cpu(cpu, cpu_smt_mask(core)) {
+ cpumask_clear_cpu(cpu, cpus);
+ if (!idle_cpu(cpu))
+ idle = false;
+ }
+
+ if (idle)
+ break;
+ }
+
+ return core;
+}
+
+#else /* CONFIG_SCHED_SMT */
+
+static inline void clear_idle_cores(int cpu) { }
+static inline void set_idle_cores(int cpu) { }
+
+static inline bool test_idle_cores(int cpu)
+{
+ return false;
+}
+
+void update_idle_core(struct rq *rq) { }
+
+static inline int select_idle_core(struct task_struct *p, int target)
+{
+ return -1;
+}
+
+#endif /* CONFIG_SCHED_SMT */
+
/*
- * Try and locate an idle CPU in the sched_domain.
+ * Try and locate an idle core/thread in the LLC cache domain.
*/
static int select_idle_sibling(struct task_struct *p, int target)
{
struct sched_domain *sd;
- struct sched_group *sg;
int i = task_cpu(p);
if (idle_cpu(target))
return target;
/*
- * If the prevous cpu is cache affine and idle, don't be stupid.
+ * If the previous cpu is cache affine and idle, don't be stupid.
*/
if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
return i;
+ sd = rcu_dereference(per_cpu(sd_llc, target));
+ if (!sd)
+ return target;
+
/*
- * Otherwise, iterate the domains and find an eligible idle cpu.
- *
- * A completely idle sched group at higher domains is more
- * desirable than an idle group at a lower level, because lower
- * domains have smaller groups and usually share hardware
- * resources which causes tasks to contend on them, e.g. x86
- * hyperthread siblings in the lowest domain (SMT) can contend
- * on the shared cpu pipeline.
- *
- * However, while we prefer idle groups at higher domains
- * finding an idle cpu at the lowest domain is still better than
- * returning 'target', which we've already established, isn't
- * idle.
+ * If there are idle cores to be had, go find one.
*/
- sd = rcu_dereference(per_cpu(sd_llc, target));
- for_each_lower_domain(sd) {
- sg = sd->groups;
- do {
- if (!cpumask_intersects(sched_group_cpus(sg),
- tsk_cpus_allowed(p)))
- goto next;
-
- /* Ensure the entire group is idle */
- for_each_cpu(i, sched_group_cpus(sg)) {
- if (i == target || !idle_cpu(i))
- goto next;
- }
+ if (sched_feat(IDLE_CORE) && test_idle_cores(target)) {
+ i = select_idle_core(p, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;
- /*
- * It doesn't matter which cpu we pick, the
- * whole group is idle.
- */
- target = cpumask_first_and(sched_group_cpus(sg),
- tsk_cpus_allowed(p));
- goto done;
-next:
- sg = sg->next;
- } while (sg != sd->groups);
+ /*
+ * Failed to find an idle core; stop looking for one.
+ */
+ clear_idle_cores(target);
}
-done:
+
+ /*
+ * Otherwise, settle for anything idle in this cache domain.
+ */
+ for_each_cpu(i, sched_domain_span(sd)) {
+ if (!cpumask_test_cpu(i, tsk_cpus_allowed(p)))
+ continue;
+ if (idle_cpu(i))
+ return i;
+ }
+
return target;
}
@@ -7229,9 +7324,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
*/
#define MAX_PINNED_INTERVAL 512
-/* Working cpumask for load_balance and load_balance_newidle. */
-DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
-
static int need_active_balance(struct lb_env *env)
{
struct sched_domain *sd = env->sd;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 69631fa46c2f..76bb8814649a 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -69,3 +69,4 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
SCHED_FEAT(ATTACH_AGE_LOAD, true)
+SCHED_FEAT(IDLE_CORE, true)
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 47ce94931f1b..cb394db407e4 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -23,11 +23,13 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
resched_curr(rq);
}
+extern void update_idle_core(struct rq *rq);
+
static struct task_struct *
pick_next_task_idle(struct rq *rq, struct task_struct *prev)
{
put_prev_task(rq, prev);
-
+ update_idle_core(rq);
schedstat_inc(rq, sched_goidle);
return rq->idle;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 69da6fcaa0e8..5994794bfc85 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -866,6 +866,7 @@ struct sched_group_capacity {
* Number of busy cpus in this group.
*/
atomic_t nr_busy_cpus;
+ int has_idle_cores;
unsigned long cpumask[0]; /* iteration mask */
};
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 31872bc53bc4..6e42cd218ba5 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -933,11 +933,11 @@ void tick_nohz_idle_enter(void)
WARN_ON_ONCE(irqs_disabled());
/*
- * Update the idle state in the scheduler domain hierarchy
- * when tick_nohz_stop_sched_tick() is called from the idle loop.
- * State will be updated to busy during the first busy tick after
- * exiting idle.
- */
+ * Update the idle state in the scheduler domain hierarchy
+ * when tick_nohz_stop_sched_tick() is called from the idle loop.
+ * State will be updated to busy during the first busy tick after
+ * exiting idle.
+ */
set_cpu_sd_state_idle();
local_irq_disable();
next prev parent reply other threads:[~2016-04-30 12:47 UTC|newest]
Thread overview: 80+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-04-05 18:08 [PATCH RFC] select_idle_sibling experiments Chris Mason
2016-04-05 18:43 ` Bastien Bastien Philbert
2016-04-05 19:28 ` Chris Mason
2016-04-05 20:03 ` Matt Fleming
2016-04-05 21:05 ` Bastien Philbert
2016-04-06 0:44 ` Chris Mason
2016-04-06 7:27 ` Mike Galbraith
2016-04-06 13:36 ` Chris Mason
2016-04-09 17:30 ` Chris Mason
2016-04-12 21:45 ` Matt Fleming
2016-04-13 3:40 ` Mike Galbraith
2016-04-13 15:54 ` Chris Mason
2016-04-28 12:00 ` Peter Zijlstra
2016-04-28 13:17 ` Mike Galbraith
2016-05-02 5:35 ` Mike Galbraith
2016-04-07 15:17 ` Chris Mason
2016-04-09 19:05 ` sched: tweak select_idle_sibling to look for idle threads Chris Mason
2016-04-10 10:04 ` Mike Galbraith
2016-04-10 12:35 ` Chris Mason
2016-04-10 12:46 ` Mike Galbraith
2016-04-10 19:55 ` Chris Mason
2016-04-11 4:54 ` Mike Galbraith
2016-04-12 0:30 ` Chris Mason
2016-04-12 4:44 ` Mike Galbraith
2016-04-12 13:27 ` Chris Mason
2016-04-12 18:16 ` Mike Galbraith
2016-04-12 20:07 ` Chris Mason
2016-04-13 3:18 ` Mike Galbraith
2016-04-13 13:44 ` Chris Mason
2016-04-13 14:22 ` Mike Galbraith
2016-04-13 14:36 ` Chris Mason
2016-04-13 15:05 ` Mike Galbraith
2016-04-13 15:34 ` Mike Galbraith
2016-04-30 12:47 ` Peter Zijlstra [this message]
2016-05-01 7:12 ` Mike Galbraith
2016-05-01 8:53 ` Peter Zijlstra
2016-05-01 9:20 ` Mike Galbraith
2016-05-07 1:24 ` Yuyang Du
2016-05-08 8:08 ` Mike Galbraith
2016-05-08 18:57 ` Yuyang Du
2016-05-09 3:45 ` Mike Galbraith
2016-05-08 20:22 ` Yuyang Du
2016-05-09 7:44 ` Mike Galbraith
2016-05-09 1:13 ` Yuyang Du
2016-05-09 9:39 ` Mike Galbraith
2016-05-09 23:26 ` Yuyang Du
2016-05-10 7:49 ` Mike Galbraith
2016-05-10 15:26 ` Mike Galbraith
2016-05-10 19:16 ` Yuyang Du
2016-05-11 4:17 ` Mike Galbraith
2016-05-11 1:23 ` Yuyang Du
2016-05-11 9:56 ` Mike Galbraith
2016-05-18 6:41 ` Mike Galbraith
2016-05-09 3:52 ` Mike Galbraith
2016-05-08 20:31 ` Yuyang Du
2016-05-02 8:46 ` Peter Zijlstra
2016-05-02 14:50 ` Mike Galbraith
2016-05-02 14:58 ` Peter Zijlstra
2016-05-02 15:47 ` Chris Mason
2016-05-03 14:32 ` Peter Zijlstra
2016-05-03 15:11 ` Chris Mason
2016-05-04 10:37 ` Peter Zijlstra
2016-05-04 15:31 ` Peter Zijlstra
2016-05-05 22:03 ` Matt Fleming
2016-05-06 18:54 ` Mike Galbraith
2016-05-09 8:33 ` Peter Zijlstra
2016-05-09 8:56 ` Mike Galbraith
2016-05-04 15:45 ` Peter Zijlstra
2016-05-04 17:46 ` Chris Mason
2016-05-05 9:33 ` Peter Zijlstra
2016-05-05 13:58 ` Chris Mason
2016-05-06 7:12 ` Peter Zijlstra
2016-05-06 17:27 ` Chris Mason
2016-05-06 7:25 ` Peter Zijlstra
2016-05-02 17:30 ` Mike Galbraith
2016-05-02 15:01 ` Peter Zijlstra
2016-05-02 16:04 ` Ingo Molnar
2016-05-03 11:31 ` Peter Zijlstra
2016-05-03 18:22 ` Peter Zijlstra
2016-05-02 15:10 ` Peter Zijlstra
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20160430124731.GE2975@worktop.cust.blueprintrf.com \
--to=peterz@infradead.org \
--cc=clm@fb.com \
--cc=linux-kernel@vger.kernel.org \
--cc=matt@codeblueprint.co.uk \
--cc=mgalbraith@suse.de \
--cc=mingo@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).