* [PATCH 2/7] sched,numa: move power adjustment into load_too_imbalanced
       [not found] <1403538378-31571-1-git-send-email-riel@redhat.com>
@ 2014-06-23 15:46 ` riel
  2014-07-05 10:44   ` [tip:sched/core] sched/numa: Move power adjustment into load_too_imbalanced() tip-bot for Rik van Riel
  2014-06-23 15:46 ` [PATCH 3/7] sched,numa: use effective_load to balance NUMA loads riel
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 9+ messages in thread
From: riel @ 2014-06-23 15:46 UTC (permalink / raw)
  To: linux-kernel; +Cc: peterz, mingo, chegu_vinod, mgorman

From: Rik van Riel <riel@redhat.com>

Currently the NUMA code scales the load on each node with the
amount of CPU power available on that node, but it does not
apply any adjustment to the load of the task that is being
moved over.

On systems with SMT/HT, this results in a task being weighed
much more heavily than a CPU core, so a task move that would
even out the load between nodes can end up being disallowed.

The correct approach is to apply the power correction to the
load numbers only after the moved tasks' loads have been
applied to them.

This also allows us to do the power correction with a multiplication,
rather than a division.
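
For illustration, here is a minimal standalone sketch (not the kernel
code; the helper name and all numbers are made up) of how comparing
src_load/src_capacity against dst_load/dst_capacity can be done with a
cross-multiplication, keeping everything in integer arithmetic:

  #include <stdio.h>

  /* Illustrative stand-in for the capacity-scaled imbalance check. */
  static int too_imbalanced(long src_load, long dst_load,
                            long src_capacity, long dst_capacity,
                            long imbalance_pct)
  {
          long tmp;

          /* Only the size of the imbalance matters, not its direction. */
          if (dst_load < src_load) {
                  tmp = dst_load; dst_load = src_load; src_load = tmp;
          }

          /* dst_load/dst_capacity vs src_load/src_capacity, cross-multiplied. */
          return dst_load * src_capacity * 100 >
                 src_load * dst_capacity * imbalance_pct;
  }

  int main(void)
  {
          /* SMT: the second node has twice the compute capacity. */
          printf("%d\n", too_imbalanced(1024, 2048, 1024, 2048, 125)); /* 0 */
          printf("%d\n", too_imbalanced(1024, 2048, 1024, 1024, 125)); /* 1 */
          return 0;
  }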

Also drop two function arguments from load_too_imbalanced(), since it
already takes those factors from env.

Signed-off-by: Rik van Riel <riel@redhat.com>
---
 kernel/sched/fair.c | 39 ++++++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4a58e79..612c963 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1062,7 +1062,6 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
 	if (!cpus)
 		return;
 
-	ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity;
 	ns->task_capacity =
 		DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
 	ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
@@ -1096,18 +1095,30 @@ static void task_numa_assign(struct task_numa_env *env,
 	env->best_cpu = env->dst_cpu;
 }
 
-static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
-				long src_load, long dst_load,
+static bool load_too_imbalanced(long src_load, long dst_load,
 				struct task_numa_env *env)
 {
 	long imb, old_imb;
+	long orig_src_load, orig_dst_load;
+	long src_capacity, dst_capacity;
+
+	/*
+	 * The load is corrected for the CPU capacity available on each node.
+	 *
+	 * src_load        dst_load
+	 * ------------ vs ---------
+	 * src_capacity    dst_capacity
+	 */
+	src_capacity = env->src_stats.compute_capacity;
+	dst_capacity = env->dst_stats.compute_capacity;
 
 	/* We care about the slope of the imbalance, not the direction. */
 	if (dst_load < src_load)
 		swap(dst_load, src_load);
 
 	/* Is the difference below the threshold? */
-	imb = dst_load * 100 - src_load * env->imbalance_pct;
+	imb = dst_load * src_capacity * 100 -
+	      src_load * dst_capacity * env->imbalance_pct;
 	if (imb <= 0)
 		return false;
 
@@ -1115,10 +1126,14 @@ static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
 	 * The imbalance is above the allowed threshold.
 	 * Compare it with the old imbalance.
 	 */
+	orig_src_load = env->src_stats.load;
+	orig_dst_load = env->dst_stats.load;
+
 	if (orig_dst_load < orig_src_load)
 		swap(orig_dst_load, orig_src_load);
 
-	old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct;
+	old_imb = orig_dst_load * src_capacity * 100 -
+		  orig_src_load * dst_capacity * env->imbalance_pct;
 
 	/* Would this change make things worse? */
 	return (imb > old_imb);
@@ -1136,8 +1151,7 @@ static void task_numa_compare(struct task_numa_env *env,
 	struct rq *src_rq = cpu_rq(env->src_cpu);
 	struct rq *dst_rq = cpu_rq(env->dst_cpu);
 	struct task_struct *cur;
-	long orig_src_load, src_load;
-	long orig_dst_load, dst_load;
+	long src_load, dst_load;
 	long load;
 	long imp = (groupimp > 0) ? groupimp : taskimp;
 
@@ -1211,13 +1225,9 @@ static void task_numa_compare(struct task_numa_env *env,
 	 * In the overloaded case, try and keep the load balanced.
 	 */
 balance:
-	orig_dst_load = env->dst_stats.load;
-	orig_src_load = env->src_stats.load;
-
-	/* XXX missing capacity terms */
 	load = task_h_load(env->p);
-	dst_load = orig_dst_load + load;
-	src_load = orig_src_load - load;
+	dst_load = env->dst_stats.load + load;
+	src_load = env->src_stats.load - load;
 
 	if (cur) {
 		load = task_h_load(cur);
@@ -1225,8 +1235,7 @@ static void task_numa_compare(struct task_numa_env *env,
 		src_load += load;
 	}
 
-	if (load_too_imbalanced(orig_src_load, orig_dst_load,
-				src_load, dst_load, env))
+	if (load_too_imbalanced(src_load, dst_load, env))
 		goto unlock;
 
 assign:
-- 
1.8.5.3



* [PATCH 3/7] sched,numa: use effective_load to balance NUMA loads
       [not found] <1403538378-31571-1-git-send-email-riel@redhat.com>
  2014-06-23 15:46 ` [PATCH 2/7] sched,numa: move power adjustment into load_too_imbalanced riel
@ 2014-06-23 15:46 ` riel
  2014-07-05 10:44   ` [tip:sched/core] sched/numa: Use effective_load() " tip-bot for Rik van Riel
  2014-06-23 15:46 ` [PATCH 4/7] sched,numa: simplify task_numa_compare riel
  2014-06-23 15:46 ` [PATCH 5/7] sched,numa: examine a task move when examining a task swap riel
  3 siblings, 1 reply; 9+ messages in thread
From: riel @ 2014-06-23 15:46 UTC (permalink / raw)
  To: linux-kernel; +Cc: peterz, mingo, chegu_vinod, mgorman

From: Rik van Riel <riel@redhat.com>

When CONFIG_FAIR_GROUP_SCHED is enabled, the load that a task places
on a CPU is determined by the group the task is in. This is conveniently
calculated for us by effective_load(), which task_numa_compare should
use.

The active groups on the source and destination CPU can be different,
so the calculation needs to be done separately for each CPU.
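
As a rough standalone sketch of the per-CPU accounting (the
effective_load() stand-in below just returns the raw delta and all
numbers are invented; the real function walks the task group
hierarchy), with env->p and cur moving in opposite directions:

  #include <stdio.h>

  /* Stand-in for effective_load(): returns the raw weight delta.  The
   * real kernel function computes the group-hierarchy-adjusted load. */
  static long effective_load_stub(int cpu, long wl)
  {
          (void)cpu;
          return wl;
  }

  int main(void)
  {
          long src_load = 4096, dst_load = 3072;  /* node loads (invented) */
          long p_load = 1024, cur_load = 512;     /* task weights (invented) */

          /* env->p moves src -> dst ... */
          src_load += effective_load_stub(0, -p_load);
          dst_load += effective_load_stub(1, p_load);

          /* ... while cur, if any, moves dst -> src. */
          src_load += effective_load_stub(0, cur_load);
          dst_load += effective_load_stub(1, -cur_load);

          printf("src_load=%ld dst_load=%ld\n", src_load, dst_load);
          return 0;
  }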

Signed-off-by: Rik van Riel <riel@redhat.com>
---
 kernel/sched/fair.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 612c963..41b75a6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1151,6 +1151,7 @@ static void task_numa_compare(struct task_numa_env *env,
 	struct rq *src_rq = cpu_rq(env->src_cpu);
 	struct rq *dst_rq = cpu_rq(env->dst_cpu);
 	struct task_struct *cur;
+	struct task_group *tg;
 	long src_load, dst_load;
 	long load;
 	long imp = (groupimp > 0) ? groupimp : taskimp;
@@ -1225,14 +1226,21 @@ static void task_numa_compare(struct task_numa_env *env,
 	 * In the overloaded case, try and keep the load balanced.
 	 */
 balance:
+	src_load = env->src_stats.load;
+	dst_load = env->dst_stats.load;
+
+	/* Calculate the effect of moving env->p from src to dst. */
 	load = task_h_load(env->p);
-	dst_load = env->dst_stats.load + load;
-	src_load = env->src_stats.load - load;
+	tg = task_group(env->p);
+	src_load += effective_load(tg, env->src_cpu, -load, -load);
+	dst_load += effective_load(tg, env->dst_cpu, load, load);
 
 	if (cur) {
+		/* Cur moves in the opposite direction. */
 		load = task_h_load(cur);
-		dst_load -= load;
-		src_load += load;
+		tg = task_group(cur);
+		src_load += effective_load(tg, env->src_cpu, load, load);
+		dst_load += effective_load(tg, env->dst_cpu, -load, -load);
 	}
 
 	if (load_too_imbalanced(src_load, dst_load, env))
-- 
1.8.5.3



* [PATCH 4/7] sched,numa: simplify task_numa_compare
       [not found] <1403538378-31571-1-git-send-email-riel@redhat.com>
  2014-06-23 15:46 ` [PATCH 2/7] sched,numa: move power adjustment into load_too_imbalanced riel
  2014-06-23 15:46 ` [PATCH 3/7] sched,numa: use effective_load to balance NUMA loads riel
@ 2014-06-23 15:46 ` riel
  2014-07-05 10:45   ` [tip:sched/core] sched/numa: Simplify task_numa_compare() tip-bot for Rik van Riel
  2014-06-23 15:46 ` [PATCH 5/7] sched,numa: examine a task move when examining a task swap riel
  3 siblings, 1 reply; 9+ messages in thread
From: riel @ 2014-06-23 15:46 UTC (permalink / raw)
  To: linux-kernel; +Cc: peterz, mingo, chegu_vinod, mgorman

From: Rik van Riel <riel@redhat.com>

When a task is part of a numa_group, the comparison should always use
the group weight, in order to make workloads converge.
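
Purely for illustration (the values below are made up), the change
amounts to the following difference in how the improvement score is
chosen:

  #include <stdio.h>

  int main(void)
  {
          long taskimp = 30, groupimp = -10;  /* invented improvement scores */
          int in_numa_group = 1;

          /* Old: fall back to the task weight whenever groupimp <= 0. */
          long imp_old = (groupimp > 0) ? groupimp : taskimp;
          /* New: a task in a numa_group is always judged by the group weight,
           * even when the group improvement is zero or negative. */
          long imp_new = in_numa_group ? groupimp : taskimp;

          printf("old=%ld new=%ld\n", imp_old, imp_new);
          return 0;
  }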

Signed-off-by: Rik van Riel <riel@redhat.com>
---
 kernel/sched/fair.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 41b75a6..2eb845c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1154,7 +1154,7 @@ static void task_numa_compare(struct task_numa_env *env,
 	struct task_group *tg;
 	long src_load, dst_load;
 	long load;
-	long imp = (groupimp > 0) ? groupimp : taskimp;
+	long imp = env->p->numa_group ? groupimp : taskimp;
 
 	rcu_read_lock();
 	cur = ACCESS_ONCE(dst_rq->curr);
@@ -1192,11 +1192,6 @@ static void task_numa_compare(struct task_numa_env *env,
 			 * itself (not part of a group), use the task weight
 			 * instead.
 			 */
-			if (env->p->numa_group)
-				imp = groupimp;
-			else
-				imp = taskimp;
-
 			if (cur->numa_group)
 				imp += group_weight(cur, env->src_nid) -
 				       group_weight(cur, env->dst_nid);
-- 
1.8.5.3



* [PATCH 5/7] sched,numa: examine a task move when examining a task swap
       [not found] <1403538378-31571-1-git-send-email-riel@redhat.com>
                   ` (2 preceding siblings ...)
  2014-06-23 15:46 ` [PATCH 4/7] sched,numa: simplify task_numa_compare riel
@ 2014-06-23 15:46 ` riel
  3 siblings, 0 replies; 9+ messages in thread
From: riel @ 2014-06-23 15:46 UTC (permalink / raw)
  To: linux-kernel; +Cc: peterz, mingo, chegu_vinod, mgorman

From: Rik van Riel <riel@redhat.com>

Running "perf bench numa mem -0 -m -P 1000 -p 8 -t 20" on a 4
node system results in 160 runnable threads on a system with 80
CPU threads.

Once a process has nearly converged, with 39 threads on one node
and 1 thread on another node, the remaining thread will be unable
to migrate to its preferred node through a task swap.

However, a simple task move would make the workload converge,
without causing an imbalance.

Test for this unlikely occurrence, and attempt a task move to
the preferred nid when it happens.

 # Running main, "perf bench numa mem -p 8 -t 20 -0 -m -P 1000"

 ###
 # 160 tasks will execute (on 4 nodes, 80 CPUs):
 #         -1x     0MB global  shared mem operations
 #         -1x  1000MB process shared mem operations
 #         -1x     0MB thread  local  mem operations
 ###

 ###
 #
 #    0.0%  [0.2 mins]  0/0   1/1  36/2   0/0  [36/3 ] l:  0-0   (  0) {0-2}
 #    0.0%  [0.3 mins] 43/3  37/2  39/2  41/3  [ 6/10] l:  0-1   (  1) {1-2}
 #    0.0%  [0.4 mins] 42/3  38/2  40/2  40/2  [ 4/9 ] l:  1-2   (  1) [50.0%] {1-2}
 #    0.0%  [0.6 mins] 41/3  39/2  40/2  40/2  [ 2/9 ] l:  2-4   (  2) [50.0%] {1-2}
 #    0.0%  [0.7 mins] 40/2  40/2  40/2  40/2  [ 0/8 ] l:  3-5   (  2) [40.0%] (  41.8s converged)

Without this patch, the same perf bench numa mem run had to
rely on the scheduler load balancer to first balance out the
load (by moving a random task) before a task swap could
complete the NUMA convergence.

The load balancer does not normally take action unless the load
difference exceeds 25%. Convergence times of over half an hour
have been observed without this patch.

With this patch, the NUMA balancing code will simply migrate the
task, if that does not cause an imbalance.

Also skip examining a CPU in detail if the improvement on that CPU
is no more than the best we already have.
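
As a rough standalone sketch of the decision (the imbalance check and
every number below are invented stand-ins, not the kernel code): when
the plain-move score beats both the best swap found on this CPU and the
best score so far, and the move would not create an imbalance, the move
is preferred and scored just below moveimp so a truly idle CPU still
wins:

  #include <stdio.h>
  #include <stdbool.h>

  /* Invented stand-in for load_too_imbalanced(); the real check also
   * scales by CPU capacity. */
  static bool too_imbalanced(long src_load, long dst_load)
  {
          return dst_load * 100 > src_load * 125;
  }

  int main(void)
  {
          long best_imp = 40;
          long imp = 35;       /* best swap score on this CPU */
          long moveimp = 50;   /* score of simply moving the task */
          long src_load = 4096, dst_load = 3072;
          bool swap_with_cur = true;

          if (moveimp > imp && moveimp > best_imp &&
              !too_imbalanced(src_load, dst_load)) {
                  /* Prefer the plain move, scored just below moveimp. */
                  imp = moveimp - 1;
                  swap_with_cur = false;
          }

          printf("imp=%ld swap=%d\n", imp, swap_with_cur);
          return 0;
  }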

Signed-off-by: Rik van Riel <riel@redhat.com>
---
 kernel/sched/fair.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2eb845c..d525451 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1155,6 +1155,7 @@ static void task_numa_compare(struct task_numa_env *env,
 	long src_load, dst_load;
 	long load;
 	long imp = env->p->numa_group ? groupimp : taskimp;
+	long moveimp = imp;
 
 	rcu_read_lock();
 	cur = ACCESS_ONCE(dst_rq->curr);
@@ -1201,7 +1202,7 @@ static void task_numa_compare(struct task_numa_env *env,
 		}
 	}
 
-	if (imp < env->best_imp)
+	if (imp <= env->best_imp && moveimp <= env->best_imp)
 		goto unlock;
 
 	if (!cur) {
@@ -1214,7 +1215,8 @@ static void task_numa_compare(struct task_numa_env *env,
 	}
 
 	/* Balance doesn't matter much if we're running a task per cpu */
-	if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
+	if (imp > env->best_imp && src_rq->nr_running == 1 &&
+			dst_rq->nr_running == 1)
 		goto assign;
 
 	/*
@@ -1230,6 +1232,23 @@ static void task_numa_compare(struct task_numa_env *env,
 	src_load += effective_load(tg, env->src_cpu, -load, -load);
 	dst_load += effective_load(tg, env->dst_cpu, load, load);
 
+	if (moveimp > imp && moveimp > env->best_imp) {
+		/*
+		 * If the improvement from just moving env->p direction is
+		 * better than swapping tasks around, check if a move is
+		 * possible. Store a slightly smaller score than moveimp,
+		 * so an actually idle CPU will win.
+		 */
+		if (!load_too_imbalanced(src_load, dst_load, env)) {
+			imp = moveimp - 1;
+			cur = NULL;
+			goto assign;
+		}
+	}
+
+	if (imp <= env->best_imp)
+		goto unlock;
+
 	if (cur) {
 		/* Cur moves in the opposite direction. */
 		load = task_h_load(cur);
-- 
1.8.5.3



* [tip:sched/core] sched/numa: Move power adjustment into load_too_imbalanced()
  2014-06-23 15:46 ` [PATCH 2/7] sched,numa: move power adjustment into load_too_imbalanced riel
@ 2014-07-05 10:44   ` tip-bot for Rik van Riel
  0 siblings, 0 replies; 9+ messages in thread
From: tip-bot for Rik van Riel @ 2014-07-05 10:44 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: linux-kernel, riel, hpa, mingo, torvalds, peterz, tglx

Commit-ID:  28a21745190a0ca613cab817bfe3dc65373158bf
Gitweb:     http://git.kernel.org/tip/28a21745190a0ca613cab817bfe3dc65373158bf
Author:     Rik van Riel <riel@redhat.com>
AuthorDate: Mon, 23 Jun 2014 11:46:13 -0400
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Sat, 5 Jul 2014 11:17:34 +0200

sched/numa: Move power adjustment into load_too_imbalanced()

Currently the NUMA code scales the load on each node with the
amount of CPU power available on that node, but it does not
apply any adjustment to the load of the task that is being
moved over.

On systems with SMT/HT, this results in a task being weighed
much more heavily than a CPU core, so a task move that would
even out the load between nodes can end up being disallowed.

The correct approach is to apply the power correction to the
load numbers only after the moved tasks' loads have been
applied to them.

This also allows us to do the power correction with a multiplication,
rather than a division.

Also drop two function arguments from load_too_imbalanced(), since it
already takes those factors from env.

Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: chegu_vinod@hp.com
Cc: mgorman@suse.de
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1403538378-31571-2-git-send-email-riel@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 39 ++++++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 96b2d39..f287d0b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1062,7 +1062,6 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
 	if (!cpus)
 		return;
 
-	ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity;
 	ns->task_capacity =
 		DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
 	ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
@@ -1096,18 +1095,30 @@ static void task_numa_assign(struct task_numa_env *env,
 	env->best_cpu = env->dst_cpu;
 }
 
-static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
-				long src_load, long dst_load,
+static bool load_too_imbalanced(long src_load, long dst_load,
 				struct task_numa_env *env)
 {
 	long imb, old_imb;
+	long orig_src_load, orig_dst_load;
+	long src_capacity, dst_capacity;
+
+	/*
+	 * The load is corrected for the CPU capacity available on each node.
+	 *
+	 * src_load        dst_load
+	 * ------------ vs ---------
+	 * src_capacity    dst_capacity
+	 */
+	src_capacity = env->src_stats.compute_capacity;
+	dst_capacity = env->dst_stats.compute_capacity;
 
 	/* We care about the slope of the imbalance, not the direction. */
 	if (dst_load < src_load)
 		swap(dst_load, src_load);
 
 	/* Is the difference below the threshold? */
-	imb = dst_load * 100 - src_load * env->imbalance_pct;
+	imb = dst_load * src_capacity * 100 -
+	      src_load * dst_capacity * env->imbalance_pct;
 	if (imb <= 0)
 		return false;
 
@@ -1115,10 +1126,14 @@ static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
 	 * The imbalance is above the allowed threshold.
 	 * Compare it with the old imbalance.
 	 */
+	orig_src_load = env->src_stats.load;
+	orig_dst_load = env->dst_stats.load;
+
 	if (orig_dst_load < orig_src_load)
 		swap(orig_dst_load, orig_src_load);
 
-	old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct;
+	old_imb = orig_dst_load * src_capacity * 100 -
+		  orig_src_load * dst_capacity * env->imbalance_pct;
 
 	/* Would this change make things worse? */
 	return (imb > old_imb);
@@ -1136,8 +1151,7 @@ static void task_numa_compare(struct task_numa_env *env,
 	struct rq *src_rq = cpu_rq(env->src_cpu);
 	struct rq *dst_rq = cpu_rq(env->dst_cpu);
 	struct task_struct *cur;
-	long orig_src_load, src_load;
-	long orig_dst_load, dst_load;
+	long src_load, dst_load;
 	long load;
 	long imp = (groupimp > 0) ? groupimp : taskimp;
 
@@ -1211,13 +1225,9 @@ static void task_numa_compare(struct task_numa_env *env,
 	 * In the overloaded case, try and keep the load balanced.
 	 */
 balance:
-	orig_dst_load = env->dst_stats.load;
-	orig_src_load = env->src_stats.load;
-
-	/* XXX missing capacity terms */
 	load = task_h_load(env->p);
-	dst_load = orig_dst_load + load;
-	src_load = orig_src_load - load;
+	dst_load = env->dst_stats.load + load;
+	src_load = env->src_stats.load - load;
 
 	if (cur) {
 		load = task_h_load(cur);
@@ -1225,8 +1235,7 @@ balance:
 		src_load += load;
 	}
 
-	if (load_too_imbalanced(orig_src_load, orig_dst_load,
-				src_load, dst_load, env))
+	if (load_too_imbalanced(src_load, dst_load, env))
 		goto unlock;
 
 assign:


* [tip:sched/core] sched/numa: Use effective_load() to balance NUMA loads
  2014-06-23 15:46 ` [PATCH 3/7] sched,numa: use effective_load to balance NUMA loads riel
@ 2014-07-05 10:44   ` tip-bot for Rik van Riel
  2014-07-09 16:02     ` Rik van Riel
  0 siblings, 1 reply; 9+ messages in thread
From: tip-bot for Rik van Riel @ 2014-07-05 10:44 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: linux-kernel, riel, hpa, mingo, torvalds, peterz, tglx

Commit-ID:  6dc1a672ab15604947361dcd02e459effa09bad5
Gitweb:     http://git.kernel.org/tip/6dc1a672ab15604947361dcd02e459effa09bad5
Author:     Rik van Riel <riel@redhat.com>
AuthorDate: Mon, 23 Jun 2014 11:46:14 -0400
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Sat, 5 Jul 2014 11:17:35 +0200

sched/numa: Use effective_load() to balance NUMA loads

When CONFIG_FAIR_GROUP_SCHED is enabled, the load that a task places
on a CPU is determined by the group the task is in. The active groups
on the source and destination CPU can be different, resulting in a
different load contribution by the same task at its source and at its
destination. As a result, the load needs to be calculated separately
for each CPU, instead of estimated once with task_h_load().

Getting this calculation right allows some workloads to converge,
where previously the last thread could get stuck on another node,
without being able to migrate to its final destination.

Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: mgorman@suse.de
Cc: chegu_vinod@hp.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1403538378-31571-3-git-send-email-riel@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f287d0b..d6526d2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1151,6 +1151,7 @@ static void task_numa_compare(struct task_numa_env *env,
 	struct rq *src_rq = cpu_rq(env->src_cpu);
 	struct rq *dst_rq = cpu_rq(env->dst_cpu);
 	struct task_struct *cur;
+	struct task_group *tg;
 	long src_load, dst_load;
 	long load;
 	long imp = (groupimp > 0) ? groupimp : taskimp;
@@ -1225,14 +1226,21 @@ static void task_numa_compare(struct task_numa_env *env,
 	 * In the overloaded case, try and keep the load balanced.
 	 */
 balance:
-	load = task_h_load(env->p);
-	dst_load = env->dst_stats.load + load;
-	src_load = env->src_stats.load - load;
+	src_load = env->src_stats.load;
+	dst_load = env->dst_stats.load;
+
+	/* Calculate the effect of moving env->p from src to dst. */
+	load = env->p->se.load.weight;
+	tg = task_group(env->p);
+	src_load += effective_load(tg, env->src_cpu, -load, -load);
+	dst_load += effective_load(tg, env->dst_cpu, load, load);
 
 	if (cur) {
-		load = task_h_load(cur);
-		dst_load -= load;
-		src_load += load;
+		/* Cur moves in the opposite direction. */
+		load = cur->se.load.weight;
+		tg = task_group(cur);
+		src_load += effective_load(tg, env->src_cpu, load, load);
+		dst_load += effective_load(tg, env->dst_cpu, -load, -load);
 	}
 
 	if (load_too_imbalanced(src_load, dst_load, env))


* [tip:sched/core] sched/numa: Simplify task_numa_compare()
  2014-06-23 15:46 ` [PATCH 4/7] sched,numa: simplify task_numa_compare riel
@ 2014-07-05 10:45   ` tip-bot for Rik van Riel
  0 siblings, 0 replies; 9+ messages in thread
From: tip-bot for Rik van Riel @ 2014-07-05 10:45 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: linux-kernel, riel, hpa, mingo, torvalds, peterz, tglx

Commit-ID:  1c5d3eb3759013bc7ee4197aa0a9f245bdb6eb90
Gitweb:     http://git.kernel.org/tip/1c5d3eb3759013bc7ee4197aa0a9f245bdb6eb90
Author:     Rik van Riel <riel@redhat.com>
AuthorDate: Mon, 23 Jun 2014 11:46:15 -0400
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Sat, 5 Jul 2014 11:17:37 +0200

sched/numa: Simplify task_numa_compare()

When a task is part of a numa_group, the comparison should always use
the group weight, in order to make workloads converge.

Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: chegu_vinod@hp.com
Cc: mgorman@suse.de
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1403538378-31571-4-git-send-email-riel@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d6526d2..cebb312 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1154,7 +1154,7 @@ static void task_numa_compare(struct task_numa_env *env,
 	struct task_group *tg;
 	long src_load, dst_load;
 	long load;
-	long imp = (groupimp > 0) ? groupimp : taskimp;
+	long imp = env->p->numa_group ? groupimp : taskimp;
 
 	rcu_read_lock();
 	cur = ACCESS_ONCE(dst_rq->curr);
@@ -1192,11 +1192,6 @@ static void task_numa_compare(struct task_numa_env *env,
 			 * itself (not part of a group), use the task weight
 			 * instead.
 			 */
-			if (env->p->numa_group)
-				imp = groupimp;
-			else
-				imp = taskimp;
-
 			if (cur->numa_group)
 				imp += group_weight(cur, env->src_nid) -
 				       group_weight(cur, env->dst_nid);


* Re: [tip:sched/core] sched/numa: Use effective_load() to balance NUMA loads
  2014-07-05 10:44   ` [tip:sched/core] sched/numa: Use effective_load() " tip-bot for Rik van Riel
@ 2014-07-09 16:02     ` Rik van Riel
  2014-07-11 13:56       ` Peter Zijlstra
  0 siblings, 1 reply; 9+ messages in thread
From: Rik van Riel @ 2014-07-09 16:02 UTC (permalink / raw)
  To: mingo, hpa, linux-kernel, torvalds, peterz, tglx

On 07/05/2014 06:44 AM, tip-bot for Rik van Riel wrote:
> Commit-ID:  6dc1a672ab15604947361dcd02e459effa09bad5
> Gitweb:     http://git.kernel.org/tip/6dc1a672ab15604947361dcd02e459effa09bad5
> Author:     Rik van Riel <riel@redhat.com>
> AuthorDate: Mon, 23 Jun 2014 11:46:14 -0400
> Committer:  Ingo Molnar <mingo@kernel.org>
> CommitDate: Sat, 5 Jul 2014 11:17:35 +0200
> 
> sched/numa: Use effective_load() to balance NUMA loads
> 
> When CONFIG_FAIR_GROUP_SCHED is enabled, the load that a task places
> on a CPU is determined by the group the task is in. The active groups
> on the source and destination CPU can be different, resulting in a
> different load contribution by the same task at its source and at its
> destination. As a result, the load needs to be calculated separately
> for each CPU, instead of estimated once with task_h_load().
> 
> Getting this calculation right allows some workloads to converge,
> where previously the last thread could get stuck on another node,
> without being able to migrate to its final destination.

Self-NAK

This patch should be reverted.

It turns out that the tree I am working on was missing
changeset a003a25b227d59ded9197ced109517f037d01c27,
which makes weighted_cpuload and update_numa_stats use
p->se.avg.load_avg_contrib instead of p->se.load.weight.

This means using effective_load, which operates on
load.weight, is inappropriate.

Sorry for the noise.

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index f287d0b..d6526d2 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1151,6 +1151,7 @@ static void task_numa_compare(struct task_numa_env *env,
>  	struct rq *src_rq = cpu_rq(env->src_cpu);
>  	struct rq *dst_rq = cpu_rq(env->dst_cpu);
>  	struct task_struct *cur;
> +	struct task_group *tg;
>  	long src_load, dst_load;
>  	long load;
>  	long imp = (groupimp > 0) ? groupimp : taskimp;
> @@ -1225,14 +1226,21 @@ static void task_numa_compare(struct task_numa_env *env,
>  	 * In the overloaded case, try and keep the load balanced.
>  	 */
>  balance:
> -	load = task_h_load(env->p);
> -	dst_load = env->dst_stats.load + load;
> -	src_load = env->src_stats.load - load;
> +	src_load = env->src_stats.load;
> +	dst_load = env->dst_stats.load;
> +
> +	/* Calculate the effect of moving env->p from src to dst. */
> +	load = env->p->se.load.weight;
> +	tg = task_group(env->p);
> +	src_load += effective_load(tg, env->src_cpu, -load, -load);
> +	dst_load += effective_load(tg, env->dst_cpu, load, load);
>  
>  	if (cur) {
> -		load = task_h_load(cur);
> -		dst_load -= load;
> -		src_load += load;
> +		/* Cur moves in the opposite direction. */
> +		load = cur->se.load.weight;
> +		tg = task_group(cur);
> +		src_load += effective_load(tg, env->src_cpu, load, load);
> +		dst_load += effective_load(tg, env->dst_cpu, -load, -load);
>  	}
>  
>  	if (load_too_imbalanced(src_load, dst_load, env))
> 



* Re: [tip:sched/core] sched/numa: Use effective_load() to balance NUMA loads
  2014-07-09 16:02     ` Rik van Riel
@ 2014-07-11 13:56       ` Peter Zijlstra
  0 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2014-07-11 13:56 UTC (permalink / raw)
  To: Rik van Riel; +Cc: mingo, hpa, linux-kernel, torvalds, tglx

On Wed, Jul 09, 2014 at 12:02:03PM -0400, Rik van Riel wrote:
> On 07/05/2014 06:44 AM, tip-bot for Rik van Riel wrote:
> > Commit-ID:  6dc1a672ab15604947361dcd02e459effa09bad5
> > Gitweb:     http://git.kernel.org/tip/6dc1a672ab15604947361dcd02e459effa09bad5
> > Author:     Rik van Riel <riel@redhat.com>
> > AuthorDate: Mon, 23 Jun 2014 11:46:14 -0400
> > Committer:  Ingo Molnar <mingo@kernel.org>
> > CommitDate: Sat, 5 Jul 2014 11:17:35 +0200
> > 
> > sched/numa: Use effective_load() to balance NUMA loads
> > 
> > When CONFIG_FAIR_GROUP_SCHED is enabled, the load that a task places
> > on a CPU is determined by the group the task is in. The active groups
> > on the source and destination CPU can be different, resulting in a
> > different load contribution by the same task at its source and at its
> > destination. As a result, the load needs to be calculated separately
> > for each CPU, instead of estimated once with task_h_load().
> > 
> > Getting this calculation right allows some workloads to converge,
> > where previously the last thread could get stuck on another node,
> > without being able to migrate to its final destination.
> 
> Self-NAK
> 
> This patch should be reverted.
> 
> It turns out that the tree I am working on was missing
> changeset a003a25b227d59ded9197ced109517f037d01c27,
> which makes weighted_cpuload and update_numa_stats use
> p->se.avg.load_avg_contrib instead of p->se.load.weight.
> 
> This means using effective_load, which operates on the
> load.weight is inappropriate.
> 
> Sorry for the noise.

n/p I'll take it out. Thanks!


