From: Peter Zijlstra <peterz@infradead.org>
To: Chris Mason <clm@fb.com>, Ingo Molnar <mingo@kernel.org>,
	Matt Fleming <matt@codeblueprint.co.uk>,
	Mike Galbraith <mgalbraith@suse.de>,
	linux-kernel@vger.kernel.org
Subject: Re: sched: tweak select_idle_sibling to look for idle threads
Date: Sat, 30 Apr 2016 14:47:31 +0200
Message-ID: <20160430124731.GE2975@worktop.cust.blueprintrf.com>
In-Reply-To: <20160409190554.honue3gtian2p6vr@floor.thefacebook.com>

On Sat, Apr 09, 2016 at 03:05:54PM -0400, Chris Mason wrote:
> select_task_rq_fair() can leave cpu utilization a little lumpy,
> especially as the workload ramps up to the maximum capacity of the
> machine.  The end result can be high p99 response times as apps
> wait to get scheduled, even when boxes are mostly idle.
> 
> I wrote schbench to try and measure this:
> 
> git://git.kernel.org/pub/scm/linux/kernel/git/mason/schbench.git

Can you guys have a play with this? I think one- and two-node tbench
results are good, but I'm seeing significant run-to-run variance there,
so maybe I'm not running it right.

schbench numbers with: ./schbench -m2 -t 20 -c 30000 -s 30000 -r 30
on my ivb-ep (2 sockets, 10 cores/socket, 2 threads/core) appear to be
decent.
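
Something like this (assuming CONFIG_SCHED_DEBUG=y and debugfs mounted
at /sys/kernel/debug) makes for an easy A/B run of the new path, since
the patch adds an IDLE_CORE sched feature:

  # old behaviour
  echo NO_IDLE_CORE > /sys/kernel/debug/sched_features
  ./schbench -m2 -t 20 -c 30000 -s 30000 -r 30

  # new behaviour
  echo IDLE_CORE > /sys/kernel/debug/sched_features
  ./schbench -m2 -t 20 -c 30000 -s 30000 -r 30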

I've also not run anything other than schbench/tbench, so maybe I've
completely wrecked something else (as per usual..).

I've not thought about that bounce_to_target() thing much yet; I'll go
give it a ponder.
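
In short: each LLC gets a has_idle_cores hint in the shared
sched_group_capacity; the idle task sets it once a core has all its SMT
siblings idle, and select_idle_sibling() only does the (expensive)
whole-core scan while that hint is set, otherwise it settles for any
idle CPU in the LLC.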

---
 kernel/sched/fair.c      | 180 +++++++++++++++++++++++++++++++++++------------
 kernel/sched/features.h  |   1 +
 kernel/sched/idle_task.c |   4 +-
 kernel/sched/sched.h     |   1 +
 kernel/time/tick-sched.c |  10 +--
 5 files changed, 146 insertions(+), 50 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b8a33abce650..b9d8d1dc5183 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1501,8 +1501,10 @@ static void task_numa_compare(struct task_numa_env *env,
 	 * One idle CPU per node is evaluated for a task numa move.
 	 * Call select_idle_sibling to maybe find a better one.
 	 */
-	if (!cur)
+	if (!cur) {
+		/* XXX broken */
 		env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+	}
 
 assign:
 	assigned = true;
@@ -4491,6 +4493,17 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 }
 
 #ifdef CONFIG_SMP
+
+/*
+ * Working cpumask for:
+ *   load_balance,
+ *   load_balance_newidle,
+ *   select_idle_core.
+ *
+ * Assumes softirqs are disabled when in use.
+ */
+DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
+
 #ifdef CONFIG_NO_HZ_COMMON
 /*
  * per rq 'load' arrray crap; XXX kill this.
@@ -5162,65 +5175,147 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
 }
 
+#ifdef CONFIG_SCHED_SMT
+
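+/*
+ * Track, per LLC, whether at least one core has all its SMT siblings
+ * idle.  The flag lives in the shared sched_group_capacity of the
+ * sd_busy domain's group: set from update_idle_core() when a core goes
+ * completely idle, cleared when a full scan in select_idle_sibling()
+ * fails to find an idle core.
+ */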
+static inline void clear_idle_cores(int cpu)
+{
+	struct sched_domain *sd = rcu_dereference(per_cpu(sd_busy, cpu));
+	if (!sd)
+		return;
+
+	WRITE_ONCE(sd->groups->sgc->has_idle_cores, 0);
+}
+
+static inline void set_idle_cores(int cpu)
+{
+	struct sched_domain *sd = rcu_dereference(per_cpu(sd_busy, cpu));
+	if (!sd)
+		return;
+
+	WRITE_ONCE(sd->groups->sgc->has_idle_cores, 1);
+}
+
+static inline bool test_idle_cores(int cpu)
+{
+	struct sched_domain *sd = rcu_dereference(per_cpu(sd_busy, cpu));
+	if (!sd)
+		return false;
+
+	/* XXX static key for !SMT topologies */
+
+	return READ_ONCE(sd->groups->sgc->has_idle_cores);
+}
+
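+/*
+ * Called from pick_next_task_idle(): when this CPU stops running tasks,
+ * check whether all of its SMT siblings are idle too and, if so, flag
+ * the LLC as having an idle core.
+ */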
+void update_idle_core(struct rq *rq)
+{
+	int core = cpu_of(rq);
+	int cpu;
+
+	rcu_read_lock();
+	if (test_idle_cores(core))
+		goto unlock;
+
+	for_each_cpu(cpu, cpu_smt_mask(core)) {
+		if (cpu == core)
+			continue;
+
+		if (!idle_cpu(cpu))
+			goto unlock;
+	}
+
+	set_idle_cores(core);
+unlock:
+	rcu_read_unlock();
+}
+
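+/*
+ * Scan the LLC domain for a core whose SMT siblings are all idle, using
+ * this CPU's load_balance_mask as scratch space so every sibling of a
+ * rejected core is pruned from the search.  Returns a value >=
+ * nr_cpumask_bits when no idle core is found.
+ */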
+static int select_idle_core(struct task_struct *p, int target)
+{
+	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
+	struct sched_domain *sd;
+	int core, cpu;
+
+	sd = rcu_dereference(per_cpu(sd_llc, target));
+	cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
+	for_each_cpu(core, cpus) {
+		bool idle = true;
+
+		for_each_cpu(cpu, cpu_smt_mask(core)) {
+			cpumask_clear_cpu(cpu, cpus);
+			if (!idle_cpu(cpu))
+				idle = false;
+		}
+
+		if (idle)
+			break;
+	}
+
+	return core;
+}
+
+#else /* CONFIG_SCHED_SMT */
+
+static inline void clear_idle_cores(int cpu) { }
+static inline void set_idle_cores(int cpu) { }
+
+static inline bool test_idle_cores(int cpu)
+{
+	return false;
+}
+
+void update_idle_core(struct rq *rq) { }
+
+static inline int select_idle_core(struct task_struct *p, int target)
+{
+	return -1;
+}
+
+#endif /* CONFIG_SCHED_SMT */
+
 /*
- * Try and locate an idle CPU in the sched_domain.
+ * Try and locate an idle core/thread in the LLC cache domain.
  */
 static int select_idle_sibling(struct task_struct *p, int target)
 {
 	struct sched_domain *sd;
-	struct sched_group *sg;
 	int i = task_cpu(p);
 
 	if (idle_cpu(target))
 		return target;
 
 	/*
-	 * If the prevous cpu is cache affine and idle, don't be stupid.
+	 * If the previous cpu is cache affine and idle, don't be stupid.
 	 */
 	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
 		return i;
 
+	sd = rcu_dereference(per_cpu(sd_llc, target));
+	if (!sd)
+		return target;
+
 	/*
-	 * Otherwise, iterate the domains and find an eligible idle cpu.
-	 *
-	 * A completely idle sched group at higher domains is more
-	 * desirable than an idle group at a lower level, because lower
-	 * domains have smaller groups and usually share hardware
-	 * resources which causes tasks to contend on them, e.g. x86
-	 * hyperthread siblings in the lowest domain (SMT) can contend
-	 * on the shared cpu pipeline.
-	 *
-	 * However, while we prefer idle groups at higher domains
-	 * finding an idle cpu at the lowest domain is still better than
-	 * returning 'target', which we've already established, isn't
-	 * idle.
+	 * If there are idle cores to be had, go find one.
 	 */
-	sd = rcu_dereference(per_cpu(sd_llc, target));
-	for_each_lower_domain(sd) {
-		sg = sd->groups;
-		do {
-			if (!cpumask_intersects(sched_group_cpus(sg),
-						tsk_cpus_allowed(p)))
-				goto next;
-
-			/* Ensure the entire group is idle */
-			for_each_cpu(i, sched_group_cpus(sg)) {
-				if (i == target || !idle_cpu(i))
-					goto next;
-			}
+	if (sched_feat(IDLE_CORE) && test_idle_cores(target)) {
+		i = select_idle_core(p, target);
+		if ((unsigned)i < nr_cpumask_bits)
+			return i;
 
-			/*
-			 * It doesn't matter which cpu we pick, the
-			 * whole group is idle.
-			 */
-			target = cpumask_first_and(sched_group_cpus(sg),
-					tsk_cpus_allowed(p));
-			goto done;
-next:
-			sg = sg->next;
-		} while (sg != sd->groups);
+		/*
+		 * Failed to find an idle core; stop looking for one.
+		 */
+		clear_idle_cores(target);
 	}
-done:
+
+	/*
+	 * Otherwise, settle for anything idle in this cache domain.
+	 */
+	for_each_cpu(i, sched_domain_span(sd)) {
+		if (!cpumask_test_cpu(i, tsk_cpus_allowed(p)))
+			continue;
+		if (idle_cpu(i))
+			return i;
+	}
+
 	return target;
 }
 
@@ -7229,9 +7324,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
  */
 #define MAX_PINNED_INTERVAL	512
 
-/* Working cpumask for load_balance and load_balance_newidle. */
-DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
-
 static int need_active_balance(struct lb_env *env)
 {
 	struct sched_domain *sd = env->sd;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 69631fa46c2f..76bb8814649a 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -69,3 +69,4 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
+SCHED_FEAT(IDLE_CORE, true)
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 47ce94931f1b..cb394db407e4 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -23,11 +23,13 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
 	resched_curr(rq);
 }
 
+extern void update_idle_core(struct rq *rq);
+
 static struct task_struct *
 pick_next_task_idle(struct rq *rq, struct task_struct *prev)
 {
 	put_prev_task(rq, prev);
-
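+	/* Update the LLC idle-core hint now that this CPU is going idle. */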
+	update_idle_core(rq);
 	schedstat_inc(rq, sched_goidle);
 	return rq->idle;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 69da6fcaa0e8..5994794bfc85 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -866,6 +866,7 @@ struct sched_group_capacity {
 	 * Number of busy cpus in this group.
 	 */
 	atomic_t nr_busy_cpus;
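+	/* Set when this LLC has at least one completely idle core. */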
+	int	has_idle_cores;
 
 	unsigned long cpumask[0]; /* iteration mask */
 };
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 31872bc53bc4..6e42cd218ba5 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -933,11 +933,11 @@ void tick_nohz_idle_enter(void)
 	WARN_ON_ONCE(irqs_disabled());
 
 	/*
- 	 * Update the idle state in the scheduler domain hierarchy
- 	 * when tick_nohz_stop_sched_tick() is called from the idle loop.
- 	 * State will be updated to busy during the first busy tick after
- 	 * exiting idle.
- 	 */
+	 * Update the idle state in the scheduler domain hierarchy
+	 * when tick_nohz_stop_sched_tick() is called from the idle loop.
+	 * State will be updated to busy during the first busy tick after
+	 * exiting idle.
+	 */
 	set_cpu_sd_state_idle();
 
 	local_irq_disable();
