From: Peter Zijlstra <peterz@infradead.org>
To: Mike Galbraith <mgalbraith@suse.de>
Cc: Chris Mason <clm@fb.com>, Ingo Molnar <mingo@kernel.org>,
	Matt Fleming <matt@codeblueprint.co.uk>,
	linux-kernel@vger.kernel.org
Subject: Re: sched: tweak select_idle_sibling to look for idle threads
Date: Mon, 2 May 2016 10:46:15 +0200
Message-ID: <20160502084615.GB3430@twins.programming.kicks-ass.net>
In-Reply-To: <1462086753.9717.29.camel@suse.de>

On Sun, May 01, 2016 at 09:12:33AM +0200, Mike Galbraith wrote:

> Nah, tbench is just variance prone.  It got dinged up at clients=cores
> on my desktop box, on 4 sockets the high end got seriously dinged up.


Ha! Check this:

root@ivb-ep:~# echo OLD_IDLE > /debug/sched_features
root@ivb-ep:~# echo NO_ORDER_IDLE > /debug/sched_features
root@ivb-ep:~# echo IDLE_CORE > /debug/sched_features
root@ivb-ep:~# echo NO_FORCE_CORE > /debug/sched_features
root@ivb-ep:~# tbench 20 -t 10

Throughput 5956.32 MB/sec  20 clients  20 procs  max_latency=0.126 ms


root@ivb-ep:~# echo OLD_IDLE > /debug/sched_features
root@ivb-ep:~# echo ORDER_IDLE > /debug/sched_features
root@ivb-ep:~# echo IDLE_CORE > /debug/sched_features
root@ivb-ep:~# echo NO_FORCE_CORE > /debug/sched_features
root@ivb-ep:~# tbench 20 -t 10

Throughput 5011.86 MB/sec  20 clients  20 procs  max_latency=0.116 ms



That little ORDER_IDLE thing hurts badly. It's a small patch I had
lying around because some people complained that tasks hop around the
cache domain instead of being stuck to a CPU.

I suspect what happens is that because all CPUs start looking for an
idle CPU at the same place (the first cpu in the LLC domain), they all
find the same idle cpu and pile up on it.

The old behaviour, where each CPU starts iterating from where the task
previously ran, avoids some of that, at the cost of making tasks hop around.

Let's see if I can get the same behaviour out of the cpumask iteration
code; roughly along the lines of the sketch below.
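
A rough sketch only (the helper name and the two-pass wrap are mine for
illustration, not what will end up in the patch): start the scan at the
CPU the task came from and wrap around the LLC span, so different CPUs
start at different places instead of all converging on the same idle CPU.

/*
 * Sketch, not part of the patch below: helper name and the two-pass
 * wrap are made up for illustration.  Keep the per-task starting point
 * (scan from @target) while still walking the LLC span as a plain
 * cpumask, wrapping around so the whole domain is still covered.
 */
static int select_idle_cpu_wrap(struct task_struct *p,
				struct sched_domain *sd, int target)
{
	int cpu;

	/* First pass: target .. end of the LLC span. */
	for_each_cpu(cpu, sched_domain_span(sd)) {
		if (cpu < target)
			continue;
		if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
			continue;
		if (idle_cpu(cpu))
			return cpu;
	}

	/* Second pass: wrap to the start of the span .. target. */
	for_each_cpu(cpu, sched_domain_span(sd)) {
		if (cpu >= target)
			break;
		if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
			continue;
		if (idle_cpu(cpu))
			return cpu;
	}

	return -1;
}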





---
 kernel/sched/fair.c      | 218 +++++++++++++++++++++++++++++++++++++----------
 kernel/sched/features.h  |   4 +
 kernel/sched/idle_task.c |   4 +-
 kernel/sched/sched.h     |   1 +
 kernel/time/tick-sched.c |  10 +--
 5 files changed, 188 insertions(+), 49 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b8a33ab..b7626a4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1501,8 +1501,10 @@ balance:
 	 * One idle CPU per node is evaluated for a task numa move.
 	 * Call select_idle_sibling to maybe find a better one.
 	 */
-	if (!cur)
+	if (!cur) {
+		// XXX borken
 		env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+	}
 
 assign:
 	assigned = true;
@@ -4491,6 +4493,17 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 }
 
 #ifdef CONFIG_SMP
+
+/* 
+ * Working cpumask for:
+ *   load_balance, 
+ *   load_balance_newidle,
+ *   select_idle_core.
+ *
+ * Assumes softirqs are disabled when in use.
+ */
+DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
+
 #ifdef CONFIG_NO_HZ_COMMON
 /*
  * per rq 'load' arrray crap; XXX kill this.
@@ -5162,65 +5175,187 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
 }
 
+#ifdef CONFIG_SCHED_SMT
+
+static inline void clear_idle_cores(int cpu)
+{
+	struct sched_domain *sd = rcu_dereference(per_cpu(sd_busy, cpu));
+	if (!sd)
+		return;
+
+	WRITE_ONCE(sd->groups->sgc->has_idle_cores, 0);
+}
+
+static inline void set_idle_cores(int cpu)
+{
+	struct sched_domain *sd = rcu_dereference(per_cpu(sd_busy, cpu));
+	if (!sd)
+		return;
+
+	WRITE_ONCE(sd->groups->sgc->has_idle_cores, 1);
+}
+
+static inline bool test_idle_cores(int cpu)
+{
+	struct sched_domain *sd = rcu_dereference(per_cpu(sd_busy, cpu));
+	if (!sd)
+		return false;
+
+	// XXX static key for !SMT topologies
+
+	return READ_ONCE(sd->groups->sgc->has_idle_cores);
+}
+
+void update_idle_core(struct rq *rq)
+{
+	int core = cpu_of(rq);
+	int cpu;
+
+	rcu_read_lock();
+	if (test_idle_cores(core))
+		goto unlock;
+
+	for_each_cpu(cpu, cpu_smt_mask(core)) {
+		if (cpu == core)
+			continue;
+
+		if (!idle_cpu(cpu))
+			goto unlock;
+	}
+
+	set_idle_cores(core);
+unlock:
+	rcu_read_unlock();
+}
+
+static int select_idle_core(struct task_struct *p, int target)
+{
+	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
+	struct sched_domain *sd;
+	int core, cpu;
+
+	sd = rcu_dereference(per_cpu(sd_llc, target));
+	cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
+	for_each_cpu(core, cpus) {
+		bool idle = true;
+
+		for_each_cpu(cpu, cpu_smt_mask(core)) {
+			cpumask_clear_cpu(cpu, cpus);
+			if (!idle_cpu(cpu))
+				idle = false;
+		}
+
+		if (idle)
+			break;
+	}
+
+	return core;
+}
+
+#else /* CONFIG_SCHED_SMT */
+
+static inline void clear_idle_cores(int cpu) { }
+static inline void set_idle_cores(int cpu) { }
+
+static inline bool test_idle_cores(int cpu)
+{
+	return false;
+}
+
+void update_idle_core(struct rq *rq) { }
+
+static inline int select_idle_core(struct task_struct *p, int target)
+{
+	return -1;
+}
+
+#endif /* CONFIG_SCHED_SMT */
+
 /*
- * Try and locate an idle CPU in the sched_domain.
+ * Try and locate an idle core/thread in the LLC cache domain.
  */
 static int select_idle_sibling(struct task_struct *p, int target)
 {
 	struct sched_domain *sd;
-	struct sched_group *sg;
 	int i = task_cpu(p);
 
 	if (idle_cpu(target))
 		return target;
 
 	/*
-	 * If the prevous cpu is cache affine and idle, don't be stupid.
+	 * If the previous cpu is cache affine and idle, don't be stupid.
 	 */
 	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
 		return i;
 
-	/*
-	 * Otherwise, iterate the domains and find an eligible idle cpu.
-	 *
-	 * A completely idle sched group at higher domains is more
-	 * desirable than an idle group at a lower level, because lower
-	 * domains have smaller groups and usually share hardware
-	 * resources which causes tasks to contend on them, e.g. x86
-	 * hyperthread siblings in the lowest domain (SMT) can contend
-	 * on the shared cpu pipeline.
-	 *
-	 * However, while we prefer idle groups at higher domains
-	 * finding an idle cpu at the lowest domain is still better than
-	 * returning 'target', which we've already established, isn't
-	 * idle.
-	 */
-	sd = rcu_dereference(per_cpu(sd_llc, target));
-	for_each_lower_domain(sd) {
-		sg = sd->groups;
-		do {
-			if (!cpumask_intersects(sched_group_cpus(sg),
-						tsk_cpus_allowed(p)))
-				goto next;
+	i = target;
+	if (sched_feat(ORDER_IDLE))
+		i = per_cpu(sd_llc_id, target); /* first cpu in llc domain */
+	sd = rcu_dereference(per_cpu(sd_llc, i));
+	if (!sd)
+		return target;
+
+	if (sched_feat(OLD_IDLE)) {
+		struct sched_group *sg;
 
-			/* Ensure the entire group is idle */
-			for_each_cpu(i, sched_group_cpus(sg)) {
-				if (i == target || !idle_cpu(i))
+		for_each_lower_domain(sd) {
+			sg = sd->groups;
+			do {
+				if (!cpumask_intersects(sched_group_cpus(sg),
+							tsk_cpus_allowed(p)))
 					goto next;
-			}
 
-			/*
-			 * It doesn't matter which cpu we pick, the
-			 * whole group is idle.
-			 */
-			target = cpumask_first_and(sched_group_cpus(sg),
-					tsk_cpus_allowed(p));
-			goto done;
+				/* Ensure the entire group is idle */
+				for_each_cpu(i, sched_group_cpus(sg)) {
+					if (i == target || !idle_cpu(i))
+						goto next;
+				}
+
+				/*
+				 * It doesn't matter which cpu we pick, the
+				 * whole group is idle.
+				 */
+				target = cpumask_first_and(sched_group_cpus(sg),
+						tsk_cpus_allowed(p));
+				goto done;
 next:
-			sg = sg->next;
-		} while (sg != sd->groups);
-	}
+				sg = sg->next;
+			} while (sg != sd->groups);
+		}
 done:
+		return target;
+	}
+
+	/*
+	 * If there are idle cores to be had, go find one.
+	 */
+	if (sched_feat(IDLE_CORE) && test_idle_cores(target)) {
+		i = select_idle_core(p, target);
+		if ((unsigned)i < nr_cpumask_bits)
+			return i;
+
+		/*
+		 * Failed to find an idle core; stop looking for one.
+		 */
+		clear_idle_cores(target);
+	}
+
+	if (sched_feat(FORCE_CORE)) {
+		i = select_idle_core(p, target);
+		if ((unsigned)i < nr_cpumask_bits)
+			return i;
+	}
+
+	/*
+	 * Otherwise, settle for anything idle in this cache domain.
+	 */
+	for_each_cpu(i, sched_domain_span(sd)) {
+		if (!cpumask_test_cpu(i, tsk_cpus_allowed(p)))
+			continue;
+		if (idle_cpu(i))
+			return i;
+	}
+
 	return target;
 }
 
@@ -7229,9 +7364,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
  */
 #define MAX_PINNED_INTERVAL	512
 
-/* Working cpumask for load_balance and load_balance_newidle. */
-DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
-
 static int need_active_balance(struct lb_env *env)
 {
 	struct sched_domain *sd = env->sd;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 69631fa..11ab7ab 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -69,3 +69,7 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
+SCHED_FEAT(OLD_IDLE, false)
+SCHED_FEAT(ORDER_IDLE, false)
+SCHED_FEAT(IDLE_CORE, true)
+SCHED_FEAT(FORCE_CORE, false)
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 47ce949..cb394db 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -23,11 +23,13 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
 	resched_curr(rq);
 }
 
+extern void update_idle_core(struct rq *rq);
+
 static struct task_struct *
 pick_next_task_idle(struct rq *rq, struct task_struct *prev)
 {
 	put_prev_task(rq, prev);
-
+	update_idle_core(rq);
 	schedstat_inc(rq, sched_goidle);
 	return rq->idle;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 69da6fc..5994794 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -866,6 +866,7 @@ struct sched_group_capacity {
 	 * Number of busy cpus in this group.
 	 */
 	atomic_t nr_busy_cpus;
+	int	has_idle_cores;
 
 	unsigned long cpumask[0]; /* iteration mask */
 };
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 31872bc..6e42cd2 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -933,11 +933,11 @@ void tick_nohz_idle_enter(void)
 	WARN_ON_ONCE(irqs_disabled());
 
 	/*
- 	 * Update the idle state in the scheduler domain hierarchy
- 	 * when tick_nohz_stop_sched_tick() is called from the idle loop.
- 	 * State will be updated to busy during the first busy tick after
- 	 * exiting idle.
- 	 */
+	 * Update the idle state in the scheduler domain hierarchy
+	 * when tick_nohz_stop_sched_tick() is called from the idle loop.
+	 * State will be updated to busy during the first busy tick after
+	 * exiting idle.
+	 */
 	set_cpu_sd_state_idle();
 
 	local_irq_disable();
