From: tip-bot for Rik van Riel <tipbot@zytor.com>
To: linux-tip-commits@vger.kernel.org
Cc: mingo@kernel.org, torvalds@linux-foundation.org, riel@redhat.com,
	tglx@linutronix.de, peterz@infradead.org, efault@gmx.de,
	linux-kernel@vger.kernel.org, hpa@zytor.com, mgorman@suse.de
Subject: [tip:sched/core] sched/numa: Implement NUMA node level wake_affine()
Date: Sat, 24 Jun 2017 00:23:24 -0700	[thread overview]
Message-ID: <tip-3fed382b46baac83703130fe4cd3d9147f427fb9@git.kernel.org> (raw)
In-Reply-To: <20170623165530.22514-4-riel@redhat.com>

Commit-ID:  3fed382b46baac83703130fe4cd3d9147f427fb9
Gitweb:     http://git.kernel.org/tip/3fed382b46baac83703130fe4cd3d9147f427fb9
Author:     Rik van Riel <riel@redhat.com>
AuthorDate: Fri, 23 Jun 2017 12:55:29 -0400
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Sat, 24 Jun 2017 08:57:52 +0200

sched/numa: Implement NUMA node level wake_affine()

Since select_idle_sibling() can place a task anywhere on a socket,
comparing loads between individual CPU cores makes no real sense
for deciding whether to do an affine wakeup across sockets, either.

Instead, compare the load between the sockets in a similar way to what the
load balancer and the NUMA balancing code do.

Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: jhladky@redhat.com
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20170623165530.22514-4-riel@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 130 ++++++++++++++++++++++++++++------------------------
 1 file changed, 71 insertions(+), 59 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fe19016..79ac078 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2586,6 +2586,60 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
 		}
 	}
 }
+
+/*
+ * Can a task be moved from prev_cpu to this_cpu without causing a load
+ * imbalance that would trigger the load balancer?
+ */
+static inline bool numa_wake_affine(struct sched_domain *sd,
+				    struct task_struct *p, int this_cpu,
+				    int prev_cpu, int sync)
+{
+	struct numa_stats prev_load, this_load;
+	s64 this_eff_load, prev_eff_load;
+
+	update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
+	update_numa_stats(&this_load, cpu_to_node(this_cpu));
+
+	/*
+	 * If sync wakeup then subtract the (maximum possible)
+	 * effect of the currently running task from the load
+	 * of the current CPU:
+	 */
+	if (sync) {
+		unsigned long current_load = task_h_load(current);
+
+		if (this_load.load > current_load)
+			this_load.load -= current_load;
+		else
+			this_load.load = 0;
+	}
+
+	/*
+	 * In low-load situations, where this_cpu's node is idle due to the
+	 * sync cause above having dropped this_load.load to 0, move the task.
+	 * Moving to an idle socket will not create a bad imbalance.
+	 *
+	 * Otherwise check if the nodes are near enough in load to allow this
+	 * task to be woken on this_cpu's node.
+	 */
+	if (this_load.load > 0) {
+		unsigned long task_load = task_h_load(p);
+
+		this_eff_load = 100;
+		this_eff_load *= prev_load.compute_capacity;
+
+		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+		prev_eff_load *= this_load.compute_capacity;
+
+		this_eff_load *= this_load.load + task_load;
+		prev_eff_load *= prev_load.load - task_load;
+
+		return this_eff_load <= prev_eff_load;
+	}
+
+	return true;
+}
 #else
 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 {
@@ -2598,6 +2652,13 @@ static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
 {
 }
+
+static inline bool numa_wake_affine(struct sched_domain *sd,
+				    struct task_struct *p, int this_cpu,
+				    int prev_cpu, int sync)
+{
+	return true;
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 static void
@@ -5407,74 +5468,25 @@ static int wake_wide(struct task_struct *p)
 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 		       int prev_cpu, int sync)
 {
-	s64 this_load, load;
-	s64 this_eff_load, prev_eff_load;
-	int idx, this_cpu;
-	struct task_group *tg;
-	unsigned long weight;
-	int balanced;
-
-	idx	  = sd->wake_idx;
-	this_cpu  = smp_processor_id();
-	load	  = source_load(prev_cpu, idx);
-	this_load = target_load(this_cpu, idx);
+	int this_cpu = smp_processor_id();
+	bool affine = false;
 
 	/*
 	 * Common case: CPUs are in the same socket, and select_idle_sibling()
 	 * will do its thing regardless of what we return:
 	 */
 	if (cpus_share_cache(prev_cpu, this_cpu))
-		return true;
-
-	/*
-	 * If sync wakeup then subtract the (maximum possible)
-	 * effect of the currently running task from the load
-	 * of the current CPU:
-	 */
-	if (sync) {
-		tg = task_group(current);
-		weight = current->se.avg.load_avg;
-
-		this_load += effective_load(tg, this_cpu, -weight, -weight);
-		load += effective_load(tg, prev_cpu, 0, -weight);
-	}
-
-	tg = task_group(p);
-	weight = p->se.avg.load_avg;
-
-	/*
-	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
-	 * due to the sync cause above having dropped this_load to 0, we'll
-	 * always have an imbalance, but there's really nothing you can do
-	 * about that, so that's good too.
-	 *
-	 * Otherwise check if either cpus are near enough in load to allow this
-	 * task to be woken on this_cpu.
-	 */
-	this_eff_load = 100;
-	this_eff_load *= capacity_of(prev_cpu);
-
-	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-	prev_eff_load *= capacity_of(this_cpu);
-
-	if (this_load > 0) {
-		this_eff_load *= this_load +
-			effective_load(tg, this_cpu, weight, weight);
-
-		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
-	}
-
-	balanced = this_eff_load <= prev_eff_load;
+		affine = true;
+	else
+		affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync);
 
 	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
+	if (affine) {
+		schedstat_inc(sd->ttwu_move_affine);
+		schedstat_inc(p->se.statistics.nr_wakeups_affine);
+	}
 
-	if (!balanced)
-		return 0;
-
-	schedstat_inc(sd->ttwu_move_affine);
-	schedstat_inc(p->se.statistics.nr_wakeups_affine);
-
-	return 1;
+	return affine;
 }
 
 static inline int task_util(struct task_struct *p);
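
To make the new decision rule concrete, below is a minimal, standalone C
sketch of the node-level comparison performed by numa_wake_affine() in the
first hunk above. It is not kernel code: the struct, the function name, the
loads, the capacities and the imbalance_pct value of 125 are made-up
illustration values, and the sync-wakeup adjustment (subtracting the
currently running task's load) is omitted for brevity; at runtime the real
figures come from update_numa_stats() and the waking CPU's sched_domain.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Per-node aggregate, standing in for the kernel's struct numa_stats. */
struct node_stats {
	int64_t load;			/* runnable load summed over the node */
	int64_t compute_capacity;	/* CPU capacity summed over the node */
};

/*
 * Mirrors the comparison in numa_wake_affine(): returns true if waking
 * the task on this_node would not create an imbalance that the load
 * balancer would immediately want to undo.
 */
static bool would_allow_affine_wakeup(int imbalance_pct,
				      struct node_stats this_node,
				      struct node_stats prev_node,
				      int64_t task_load)
{
	/* An idle destination node can always take the task. */
	if (this_node.load == 0)
		return true;

	/* Destination load if the task moves, scaled by the source node's
	 * capacity so that differently sized nodes compare fairly. */
	int64_t this_eff = 100 * prev_node.compute_capacity *
			   (this_node.load + task_load);

	/* Source load after the task leaves, scaled by the destination
	 * capacity and inflated by half the imbalance margin. */
	int64_t prev_eff = (100 + (imbalance_pct - 100) / 2) *
			   this_node.compute_capacity *
			   (prev_node.load - task_load);

	return this_eff <= prev_eff;
}

int main(void)
{
	/* Hypothetical 4-CPU nodes (capacity 4 * 1024) and made-up loads. */
	struct node_stats busy = { .load = 2048, .compute_capacity = 4096 };
	struct node_stats calm = { .load = 1024, .compute_capacity = 4096 };
	int64_t task_load = 512;
	int imbalance_pct = 125;	/* assumed typical sched_domain value */

	/* Waking on the calmer node: 100*4096*1536 <= 112*4096*1536 -> allowed. */
	printf("wake on calmer node: %s\n",
	       would_allow_affine_wakeup(imbalance_pct, calm, busy, task_load) ?
	       "allowed" : "refused");

	/* Waking on the busier node: 100*4096*2560 > 112*4096*512 -> refused. */
	printf("wake on busier node: %s\n",
	       would_allow_affine_wakeup(imbalance_pct, busy, calm, task_load) ?
	       "allowed" : "refused");

	return 0;
}

The cross-multiplication by the other node's compute capacity lets nodes of
different sizes be compared on load per unit of capacity, and the
(imbalance_pct - 100) / 2 term gives the previous node a modest bonus so
that near-ties keep the task on the node where its cache and memory are
likely to be.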

Thread overview: 27+ messages
2017-06-23 16:55 [PATCH 0/4] NUMA improvements with task wakeup and load balancing riel
2017-06-23 16:55 ` [PATCH 1/4] sched,numa: override part of migrate_degrades_locality when idle balancing riel
2017-06-24  6:58   ` Ingo Molnar
2017-06-24 23:45     ` Rik van Riel
2017-06-24  7:22   ` [tip:sched/core] sched/numa: Override part of migrate_degrades_locality() " tip-bot for Rik van Riel
2017-06-23 16:55 ` [PATCH 2/4] sched: simplify wake_affine for single socket case riel
2017-06-24  7:22   ` [tip:sched/core] sched/fair: Simplify wake_affine() for the " tip-bot for Rik van Riel
2017-06-23 16:55 ` [PATCH 3/4] sched,numa: implement numa node level wake_affine riel
2017-06-24  7:23   ` tip-bot for Rik van Riel [this message]
2017-06-26 14:43   ` Peter Zijlstra
2017-06-23 16:55 ` [PATCH 4/4] sched,fair: remove effective_load riel
2017-06-24  7:23   ` [tip:sched/core] sched/fair: Remove effective_load() tip-bot for Rik van Riel
2017-06-26 14:44   ` [PATCH 4/4] sched,fair: remove effective_load Peter Zijlstra
2017-06-26 14:46     ` Peter Zijlstra
2017-06-26 14:55       ` Rik van Riel
2017-06-26 15:04         ` Peter Zijlstra
2017-06-26 15:20           ` Rik van Riel
2017-06-26 16:12             ` Peter Zijlstra
2017-06-26 19:34               ` Rik van Riel
2017-06-27  5:39                 ` Peter Zijlstra
2017-06-27 14:55                   ` Rik van Riel
2017-08-01 12:19                     ` [PATCH] sched/fair: Fix wake_affine() for !NUMA_BALANCING Peter Zijlstra
2017-08-01 19:26                       ` Josef Bacik
2017-08-01 21:43                         ` Peter Zijlstra
2017-08-24 22:29                           ` Chris Wilson
2017-08-25 15:46                           ` Chris Wilson
2017-06-27 18:27               ` [PATCH 4/4] sched,fair: remove effective_load Rik van Riel
