Re: [PATCH RESEND] sched: prefer an idle cpu vs an idle sibling for BALANCE_WAKE

From: Mike Galbraith <umgwanakikbuti@gmail.com>
To: Josef Bacik <jbacik@fb.com>
Cc: Peter Zijlstra <peterz@infradead.org>,
	riel@redhat.com, mingo@redhat.com, linux-kernel@vger.kernel.org,
	morten.rasmussen@arm.com, kernel-team <Kernel-team@fb.com>
Subject: Re: [PATCH RESEND] sched: prefer an idle cpu vs an idle sibling for BALANCE_WAKE
Date: Sat, 04 Jul 2015 17:57:42 +0200	[thread overview]
Message-ID: <1436025462.17152.37.camel@gmail.com> (raw)
In-Reply-To: <1435905658.6418.52.camel@gmail.com>

On Fri, 2015-07-03 at 08:40 +0200, Mike Galbraith wrote:

> Hm.  Seems what this load should like best is if we detect 1:N, skip all
> of the routine gyrations, ie move the N (workers) infrequently, expend
> search cycles frequently only on the 1 (dispatch).
> 
> Ponder..

Since it was too hot to do outside chores (any excuse will do;)...

If we're (read /me) on track, the bellow should help.  Per my tracing,
it may want a wee bit of toning down actually, though when I trace
virgin source I expect to see the same, namely Xorg and friends having
"wide-load" tattooed across their hindquarters earlier than they should.
It doesn't seem to hurt anything, but then demolishing a single llc box
is a tad more difficult than demolishing a NUMA box.


sched: beef up wake_wide()

Josef Bacik reported that Facebook sees better performance with their
1:N load (1 dispatch/node, N workers/node) when carrying an old patch
to try very hard to wake to an idle CPU.  While looking at wake_wide(),
I noticed that it doesn't pay attention to wakeup of the 1:N waker,
returning 1 only when waking one of its N minions.

Correct that, and give the user the option to do an expensive balance IFF
select_idle_sibling() doesn't find an idle CPU, and IFF the wakee is the
the 1:N dispatcher of work, thus worth some extra effort.

Not-Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com>
---
 kernel/sched/fair.c     |   89 +++++++++++++++++++++++++-----------------------
 kernel/sched/features.h |    6 +++
 2 files changed, 54 insertions(+), 41 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -666,7 +666,7 @@ static u64 sched_vslice(struct cfs_rq *c
 }
 
 #ifdef CONFIG_SMP
-static int select_idle_sibling(struct task_struct *p, int cpu);
+static int select_idle_sibling(struct task_struct *p, int cpu, void *clear);
 static unsigned long task_h_load(struct task_struct *p);
 
 static inline void __update_task_entity_contrib(struct sched_entity *se);
@@ -1375,7 +1375,7 @@ static void task_numa_compare(struct tas
 	 * Call select_idle_sibling to maybe find a better one.
 	 */
 	if (!cur)
-		env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+		env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu, NULL);
 
 assign:
 	task_numa_assign(env, cur, imp);
@@ -4730,26 +4730,30 @@ static long effective_load(struct task_g
 
 #endif
 
+/*
+ * Detect 1:N waker/wakee relationship via a switching-frequency heuristic.
+ * A waker of many should wake a different task than the one last awakened
+ * at a frequency roughly N times higher than one of its wakees.  In order
+ * to determine whether we should let the load spread vs consolodating to
+ * shared cache, we look for a minimum 'flip' frequency of llc_size in one
+ * partner, and a factor of lls_size higher frequency in the other.  With
+ * both conditions met, we can be relatively sure that we are seeing a 1:N
+ * relationship, and that load size exceeds socket size.
+ */
 static int wake_wide(struct task_struct *p)
 {
-	int factor = this_cpu_read(sd_llc_size);
-
-	/*
-	 * Yeah, it's the switching-frequency, could means many wakee or
-	 * rapidly switch, use factor here will just help to automatically
-	 * adjust the loose-degree, so bigger node will lead to more pull.
-	 */
-	if (p->wakee_flips > factor) {
-		/*
-		 * wakee is somewhat hot, it needs certain amount of cpu
-		 * resource, so if waker is far more hot, prefer to leave
-		 * it alone.
-		 */
-		if (current->wakee_flips > (factor * p->wakee_flips))
-			return 1;
+	unsigned long waker_flips = current->wakee_flips;
+	unsigned long wakee_flips = p->wakee_flips;
+	int factor = this_cpu_read(sd_llc_size), ret = 1;
+
+	if (waker_flips < wakee_flips) {
+		swap(waker_flips, wakee_flips);
+		/* Tell the caller that we're waking a 1:N waker */
+		ret += sched_feat(WAKE_WIDE_BALANCE);
 	}
-
-	return 0;
+	if (wakee_flips < factor || waker_flips < wakee_flips * factor)
+		return 0;
+	return ret;
 }
 
 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
@@ -4761,13 +4765,6 @@ static int wake_affine(struct sched_doma
 	unsigned long weight;
 	int balanced;
 
-	/*
-	 * If we wake multiple tasks be careful to not bounce
-	 * ourselves around too much.
-	 */
-	if (wake_wide(p))
-		return 0;
-
 	idx	  = sd->wake_idx;
 	this_cpu  = smp_processor_id();
 	prev_cpu  = task_cpu(p);
@@ -4935,20 +4932,22 @@ find_idlest_cpu(struct sched_group *grou
 /*
  * Try and locate an idle CPU in the sched_domain.
  */
-static int select_idle_sibling(struct task_struct *p, int target)
+static int select_idle_sibling(struct task_struct *p, int target, void *clear)
 {
 	struct sched_domain *sd;
 	struct sched_group *sg;
 	int i = task_cpu(p);
 
 	if (idle_cpu(target))
-		return target;
+		goto done;
 
 	/*
 	 * If the prevous cpu is cache affine and idle, don't be stupid.
 	 */
-	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
-		return i;
+	if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) {
+		target = i;
+		goto done;
+	}
 
 	/*
 	 * Otherwise, iterate the domains and find an elegible idle cpu.
@@ -4973,7 +4972,11 @@ static int select_idle_sibling(struct ta
 			sg = sg->next;
 		} while (sg != sd->groups);
 	}
+	return target;
 done:
+	if (clear)
+		*(void **)clear = 0;
+
 	return target;
 }
 /*
@@ -5021,14 +5024,19 @@ select_task_rq_fair(struct task_struct *
 {
 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
 	int cpu = smp_processor_id();
-	int new_cpu = cpu;
-	int want_affine = 0;
+	int new_cpu = prev_cpu;
+	int want_affine = 0, want_balance = 0;
 	int sync = wake_flags & WF_SYNC;
 
-	if (sd_flag & SD_BALANCE_WAKE)
-		want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
-
 	rcu_read_lock();
+	if (sd_flag & SD_BALANCE_WAKE) {
+		want_affine = wake_wide(p);
+		want_balance = want_affine > 1;
+		want_affine = !want_affine && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+		if (!want_affine && !want_balance)
+			goto select;
+	}
+
 	for_each_domain(cpu, tmp) {
 		if (!(tmp->flags & SD_LOAD_BALANCE))
 			continue;
@@ -5043,23 +5051,23 @@ select_task_rq_fair(struct task_struct *
 			break;
 		}
 
-		if (tmp->flags & sd_flag)
+		if (tmp->flags & sd_flag || want_balance)
 			sd = tmp;
 	}
 
 	if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
-		prev_cpu = cpu;
+		new_cpu = cpu;
 
 	if (sd_flag & SD_BALANCE_WAKE) {
-		new_cpu = select_idle_sibling(p, prev_cpu);
-		goto unlock;
+select:
+		new_cpu = select_idle_sibling(p, new_cpu, &sd);
 	}
 
 	while (sd) {
 		struct sched_group *group;
 		int weight;
 
-		if (!(sd->flags & sd_flag)) {
+		if (!(sd->flags & sd_flag) && !want_balance) {
 			sd = sd->child;
 			continue;
 		}
@@ -5089,7 +5097,6 @@ select_task_rq_fair(struct task_struct *
 		}
 		/* while loop will break here if sd == NULL */
 	}
-unlock:
 	rcu_read_unlock();
 
 	return new_cpu;
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -96,3 +96,9 @@ SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
  */
 SCHED_FEAT(NUMA_RESIST_LOWER, false)
 #endif
+
+/*
+ * Perform expensive full wake balance for 1:N wakers when the
+ * selected cpu is not completely idle.
+ */
+SCHED_FEAT(WAKE_WIDE_BALANCE, false)