From: Ingo Molnar <mingo@kernel.org>
To: linux-kernel@vger.kernel.org, linux-mm@kvack.org
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>,
Paul Turner <pjt@google.com>,
Lee Schermerhorn <Lee.Schermerhorn@hp.com>,
Christoph Lameter <cl@linux.com>, Rik van Riel <riel@redhat.com>,
Mel Gorman <mgorman@suse.de>,
Andrew Morton <akpm@linux-foundation.org>,
Andrea Arcangeli <aarcange@redhat.com>,
Linus Torvalds <torvalds@linux-foundation.org>,
Thomas Gleixner <tglx@linutronix.de>,
Johannes Weiner <hannes@cmpxchg.org>,
Hugh Dickins <hughd@google.com>
Subject: [PATCH 02/10] sched: Move the NUMA placement logic to a worklet
Date: Fri, 30 Nov 2012 20:58:33 +0100
Message-ID: <1354305521-11583-3-git-send-email-mingo@kernel.org>
In-Reply-To: <1354305521-11583-1-git-send-email-mingo@kernel.org>

As an implementation detail, to be able to do directed task placement
we have to change how task_numa_fault() interfaces with the scheduler:
instead of executing the placement logic directly from the fault path,
we now trigger a worklet, similar to how the NUMA hinting fault work
is driven.

This moves placement into process context and allows the directed
task-flipping code to be executed via sched_rebalance_to().

This further decouples the NUMA hinting fault engine from the actual
NUMA placement logic.
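
For illustration, here is a stand-alone user-space model of the worklet
mechanism both functions rely on (the generic task_work facility). This
is a sketch, not kernel code, and the helper names are made up; the key
invariant it demonstrates is the one this patch depends on: a
callback_head whose ->next points at itself means "not queued", which
is what the double-add checks in task_tick_numa_scan()/_placement()
test, and which each worklet re-establishes first thing.

#include <stdbool.h>
#include <stdio.h>

/* User-space model of the kernel's struct callback_head: */
struct callback_head {
	struct callback_head *next;
	void (*func)(struct callback_head *);
};

/* A self-pointing ->next encodes "not queued": */
static bool queued(struct callback_head *work)
{
	return work->next != work;
}

/* Tick side: arm the worklet at most once. */
static void tick_arm(struct callback_head *work)
{
	if (queued(work))
		return;			/* already pending, don't add twice */
	work->next = NULL;		/* stand-in for task_work_add() */
}

/* Worklet side, mirroring task_numa_placement_work(): */
static void placement_worklet(struct callback_head *work)
{
	work->next = work;		/* protect against double add */
	printf("placement worklet ran\n");
}

static void run_pending(struct callback_head *work)
{
	if (queued(work))
		work->func(work);
}

int main(void)
{
	struct callback_head w = { .next = &w, .func = placement_worklet };

	tick_arm(&w);			/* arms the worklet */
	tick_arm(&w);			/* no-op: still pending */
	run_pending(&w);		/* runs once, re-marks "not queued" */
	tick_arm(&w);			/* can be armed again */
	return 0;
}

A tick arriving while the worklet is still pending thus cannot queue it
a second time, and no locking is needed for the arm/re-arm handshake.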
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
include/linux/sched.h | 3 +-
kernel/sched/core.c | 21 ++++++-
kernel/sched/fair.c | 151 +++++++++++++++++++++++++++++++-------------------
kernel/sched/sched.h | 6 ++
4 files changed, 123 insertions(+), 58 deletions(-)
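
As a side note, the ->numa_faults[] allocation moves out of the fault
path into __sched_fork() below. A stand-alone sketch of the resulting
layout (user-space C, with calloc() standing in for kzalloc(); the
function and parameter names here are illustrative only):

#include <stdlib.h>

/*
 * Two slots per node (indexed by 2*node + priv), two arrays
 * carved out of one zeroed allocation:
 */
int alloc_numa_faults(unsigned long **faults, unsigned long **faults_curr,
		      int nr_node_ids)
{
	int entries = 2 * nr_node_ids;
	size_t size = sizeof(**faults) * entries;

	*faults = calloc(2, size);	/* kzalloc(2*size, GFP_KERNEL) analogue */
	if (!*faults)
		return -1;
	*faults_curr = *faults + entries;
	return 0;
}

With the buffer set up at fork time, task_numa_fault() can assume it
exists (hence the BUG_ON(!p->numa_faults)) and simply do
p->numa_faults_curr[2*node + priv] += pages.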
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 696492e..ce9ccd7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1512,7 +1512,8 @@ struct task_struct {
unsigned long numa_weight;
unsigned long *numa_faults;
unsigned long *numa_faults_curr;
- struct callback_head numa_work;
+ struct callback_head numa_scan_work;
+ struct callback_head numa_placement_work;
struct task_struct *shared_buddy, *shared_buddy_curr;
unsigned long shared_buddy_faults, shared_buddy_faults_curr;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index cad6c89..0324d5e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -39,6 +39,7 @@
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/perf_event.h>
+#include <linux/task_work.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
@@ -1558,7 +1559,6 @@ static void __sched_fork(struct task_struct *p)
p->numa_migrate_seq = 2;
p->numa_faults = NULL;
p->numa_scan_period = sysctl_sched_numa_scan_delay;
- p->numa_work.next = &p->numa_work;
p->shared_buddy = NULL;
p->shared_buddy_faults = 0;
@@ -1570,6 +1570,25 @@ static void __sched_fork(struct task_struct *p)
p->numa_policy.v.preferred_node = 0;
p->numa_policy.v.nodes = node_online_map;
+ init_task_work(&p->numa_scan_work, task_numa_scan_work);
+ p->numa_scan_work.next = &p->numa_scan_work;
+
+ init_task_work(&p->numa_placement_work, task_numa_placement_work);
+ p->numa_placement_work.next = &p->numa_placement_work;
+
+ if (p->mm) {
+ int entries = 2*nr_node_ids;
+ int size = sizeof(*p->numa_faults) * entries;
+
+ /*
+ * For efficiency reasons we allocate ->numa_faults[]
+ * and ->numa_faults_curr[] at once and split the
+ * buffer we get. They are separate otherwise.
+ */
+ p->numa_faults = kzalloc(2*size, GFP_KERNEL);
+ if (p->numa_faults)
+ p->numa_faults_curr = p->numa_faults + entries;
+ }
#endif /* CONFIG_NUMA_BALANCING */
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 54c1e7b..fda1b63 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1063,19 +1063,18 @@ clear_buddy:
p->ideal_cpu_curr = -1;
}
-static void task_numa_placement(struct task_struct *p)
+/*
+ * Called every couple of hundred milliseconds in the task's
+ * execution life-time, this function decides whether to
+ * change placement parameters:
+ */
+static void task_numa_placement_tick(struct task_struct *p)
{
- int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
unsigned long total[2] = { 0, 0 };
unsigned long faults, max_faults = 0;
int node, priv, shared, max_node = -1;
int this_node;
- if (p->numa_scan_seq == seq)
- return;
-
- p->numa_scan_seq = seq;
-
/*
* Update the fault average with the result of the latest
* scan:
@@ -1280,43 +1279,24 @@ void task_numa_fault(int node, int last_cpu, int pages)
int idx = 2*node + priv;
WARN_ON_ONCE(last_cpu == -1 || node == -1);
-
- if (unlikely(!p->numa_faults)) {
- int entries = 2*nr_node_ids;
- int size = sizeof(*p->numa_faults) * entries;
-
- p->numa_faults = kzalloc(2*size, GFP_KERNEL);
- if (!p->numa_faults)
- return;
- /*
- * For efficiency reasons we allocate ->numa_faults[]
- * and ->numa_faults_curr[] at once and split the
- * buffer we get. They are separate otherwise.
- */
- p->numa_faults_curr = p->numa_faults + entries;
- }
+ BUG_ON(!p->numa_faults);
p->numa_faults_curr[idx] += pages;
shared_fault_tick(p, node, last_cpu, pages);
- task_numa_placement(p);
}
/*
* The expensive part of numa migration is done from task_work context.
* Triggered from task_tick_numa().
*/
-void task_numa_work(struct callback_head *work)
+void task_numa_placement_work(struct callback_head *work)
{
- long pages_total, pages_left, pages_changed;
- unsigned long migrate, next_scan, now = jiffies;
- unsigned long start0, start, end;
struct task_struct *p = current;
- struct mm_struct *mm = p->mm;
- struct vm_area_struct *vma;
- WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+ WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_placement_work));
work->next = work; /* protect against double add */
+
/*
* Who cares about NUMA placement when they're dying.
*
@@ -1328,6 +1308,29 @@ void task_numa_work(struct callback_head *work)
if (p->flags & PF_EXITING)
return;
+ task_numa_placement_tick(p);
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_scan_work(struct callback_head *work)
+{
+ long pages_total, pages_left, pages_changed;
+ unsigned long migrate, next_scan, now = jiffies;
+ unsigned long start0, start, end;
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+ struct vm_area_struct *vma;
+
+ WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_scan_work));
+
+ work->next = work; /* protect against double add */
+
+ if (p->flags & PF_EXITING)
+ return;
+
/*
* Enforce maximal scan/migration frequency..
*/
@@ -1383,15 +1386,12 @@ out:
/*
* Drive the periodic memory faults..
*/
-void task_tick_numa(struct rq *rq, struct task_struct *curr)
+static void task_tick_numa_scan(struct rq *rq, struct task_struct *curr)
{
- struct callback_head *work = &curr->numa_work;
+ struct callback_head *work = &curr->numa_scan_work;
u64 period, now;
- /*
- * We don't care about NUMA placement if we don't have memory.
- */
- if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+ if (work->next != work)
return;
/*
@@ -1403,28 +1403,67 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
now = curr->se.sum_exec_runtime;
period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
- if (now - curr->node_stamp > period) {
- curr->node_stamp += period;
- curr->numa_scan_period = sysctl_sched_numa_scan_period_min;
+ if (now - curr->node_stamp <= period)
+ return;
- /*
- * We are comparing runtime to wall clock time here, which
- * puts a maximum scan frequency limit on the task work.
- *
- * This, together with the limits in task_numa_work() filters
- * us from over-sampling if there are many threads: if all
- * threads happen to come in at the same time we don't create a
- * spike in overhead.
- *
- * We also avoid multiple threads scanning at once in parallel to
- * each other.
- */
- if (!time_before(jiffies, curr->mm->numa_next_scan)) {
- init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
- task_work_add(curr, work, true);
- }
- }
+ curr->node_stamp += period;
+ curr->numa_scan_period = sysctl_sched_numa_scan_period_min;
+
+ /*
+ * We are comparing runtime to wall clock time here, which
+ * puts a maximum scan frequency limit on the task work.
+ *
+ * This, together with the limits in task_numa_work() filters
+ * us from over-sampling if there are many threads: if all
+ * threads happen to come in at the same time we don't create a
+ * spike in overhead.
+ *
+ * We also avoid multiple threads scanning at once in parallel to
+ * each other.
+ */
+ if (time_before(jiffies, curr->mm->numa_next_scan))
+ return;
+
+ task_work_add(curr, work, true);
}
+
+/*
+ * Drive the placement logic:
+ */
+static void task_tick_numa_placement(struct rq *rq, struct task_struct *curr)
+{
+ struct callback_head *work = &curr->numa_placement_work;
+ int seq;
+
+ if (work->next != work)
+ return;
+
+ /*
+ * Check whether we should run task_numa_placement(),
+ * and if yes, activate the worklet:
+ */
+ seq = ACCESS_ONCE(curr->mm->numa_scan_seq);
+
+ if (curr->numa_scan_seq == seq)
+ return;
+
+ curr->numa_scan_seq = seq;
+ task_work_add(curr, work, true);
+}
+
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+ /*
+ * We don't care about NUMA placement if we don't have memory
+ * or are exiting:
+ */
+ if (!curr->mm || (curr->flags & PF_EXITING))
+ return;
+
+ task_tick_numa_scan(rq, curr);
+ task_tick_numa_placement(rq, curr);
+}
+
#else /* !CONFIG_NUMA_BALANCING: */
#ifdef CONFIG_SMP
static inline int task_ideal_cpu(struct task_struct *p) { return -1; }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f3a284e..7e53cbf 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1259,6 +1259,12 @@ static inline u64 irq_time_read(int cpu)
return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
}
#endif /* CONFIG_64BIT */
+
+#ifdef CONFIG_NUMA_BALANCING
+extern void task_numa_scan_work(struct callback_head *work);
+extern void task_numa_placement_work(struct callback_head *work);
+#endif
+
#ifdef CONFIG_SMP
extern void sched_rebalance_to(int dest_cpu, int flip_tasks);
#else
--
1.7.11.7