* [PATCH REBASE V3 1/4] sched/numa: Apply the scan delay to every new vma
2023-03-01 12:18 [PATCH REBASE V3 0/4] sched/numa: Enhance vma scanning Raghavendra K T
@ 2023-03-01 12:19 ` Raghavendra K T
2023-03-01 12:19 ` [PATCH REBASE V3 2/4] sched/numa: Enhance vma scanning logic Raghavendra K T
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: Raghavendra K T @ 2023-03-01 12:19 UTC (permalink / raw)
To: linux-kernel, linux-mm
Cc: Ingo Molnar, Peter Zijlstra, Mel Gorman, Andrew Morton,
David Hildenbrand, rppt, Bharata B Rao, Disha Talreja,
Mel Gorman, Raghavendra K T
From: Mel Gorman <mgorman@techsingularity.net>
Currently whenever a new task is created we wait for
sysctl_numa_balancing_scan_delay to avoid unnecessary scanning
overhead. Extend the same logic to new or very short-lived VMAs.
(Raghavendra: Add initialization in vm_area_dup())
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
---
include/linux/mm.h | 16 ++++++++++++++++
include/linux/mm_types.h | 7 +++++++
kernel/fork.c | 2 ++
kernel/sched/fair.c | 19 +++++++++++++++++++
4 files changed, 44 insertions(+)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3d4bb18dfcb7..2cce434a5e55 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -29,6 +29,7 @@
#include <linux/pgtable.h>
#include <linux/kasan.h>
#include <linux/memremap.h>
+#include <linux/slab.h>
struct mempolicy;
struct anon_vma;
@@ -626,6 +627,20 @@ struct vm_operations_struct {
unsigned long addr);
};
+#ifdef CONFIG_NUMA_BALANCING
+static inline void vma_numab_state_init(struct vm_area_struct *vma)
+{
+ vma->numab_state = NULL;
+}
+static inline void vma_numab_state_free(struct vm_area_struct *vma)
+{
+ kfree(vma->numab_state);
+}
+#else
+static inline void vma_numab_state_init(struct vm_area_struct *vma) {}
+static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
+#endif /* CONFIG_NUMA_BALANCING */
+
#ifdef CONFIG_PER_VMA_LOCK
/*
* Try to read-lock a vma. The function is allowed to occasionally yield false
@@ -727,6 +742,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
vma->vm_ops = &dummy_vm_ops;
INIT_LIST_HEAD(&vma->anon_vma_chain);
vma_mark_detached(vma, false);
+ vma_numab_state_init(vma);
}
/* Use when VMA is not part of the VMA tree and needs no locking */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 89bbf7d8a312..1cea78f60011 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -475,6 +475,10 @@ struct vma_lock {
struct rw_semaphore lock;
};
+struct vma_numab_state {
+ unsigned long next_scan;
+};
+
/*
* This struct describes a virtual memory area. There is one of these
* per VM-area/task. A VM area is any part of the process virtual memory
@@ -565,6 +569,9 @@ struct vm_area_struct {
#endif
#ifdef CONFIG_NUMA
struct mempolicy *vm_policy; /* NUMA policy for the VMA */
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+ struct vma_numab_state *numab_state; /* NUMA Balancing state */
#endif
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
} __randomize_layout;
diff --git a/kernel/fork.c b/kernel/fork.c
index 75792157f51a..305f963359dc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -516,6 +516,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
return NULL;
}
INIT_LIST_HEAD(&new->anon_vma_chain);
+ vma_numab_state_init(new);
dup_anon_vma_name(orig, new);
return new;
@@ -523,6 +524,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
void __vm_area_free(struct vm_area_struct *vma)
{
+ vma_numab_state_free(vma);
free_anon_vma_name(vma);
vma_lock_free(vma);
kmem_cache_free(vm_area_cachep, vma);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a1b1f855b96..7c2bbc8d618b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3027,6 +3027,25 @@ static void task_numa_work(struct callback_head *work)
if (!vma_is_accessible(vma))
continue;
+ /* Initialise new per-VMA NUMAB state. */
+ if (!vma->numab_state) {
+ vma->numab_state = kzalloc(sizeof(struct vma_numab_state),
+ GFP_KERNEL);
+ if (!vma->numab_state)
+ continue;
+
+ vma->numab_state->next_scan = now +
+ msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+ }
+
+ /*
+ * Scanning the VMA's of short lived tasks add more overhead. So
+ * delay the scan for new VMAs.
+ */
+ if (mm->numa_scan_seq && time_before(jiffies,
+ vma->numab_state->next_scan))
+ continue;
+
do {
start = max(start, vma->vm_start);
end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
--
2.34.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH REBASE V3 2/4] sched/numa: Enhance vma scanning logic
2023-03-01 12:18 [PATCH REBASE V3 0/4] sched/numa: Enhance vma scanning Raghavendra K T
2023-03-01 12:19 ` [PATCH REBASE V3 1/4] sched/numa: Apply the scan delay to every new vma Raghavendra K T
@ 2023-03-01 12:19 ` Raghavendra K T
2023-03-01 12:19 ` [PATCH REBASE V3 3/4] sched/numa: implement access PID reset logic Raghavendra K T
2023-03-01 12:19 ` [PATCH REBASE V3 4/4] sched/numa: Use hash_32 to mix up PIDs accessing VMA Raghavendra K T
3 siblings, 0 replies; 5+ messages in thread
From: Raghavendra K T @ 2023-03-01 12:19 UTC (permalink / raw)
To: linux-kernel, linux-mm
Cc: Ingo Molnar, Peter Zijlstra, Mel Gorman, Andrew Morton,
David Hildenbrand, rppt, Bharata B Rao, Disha Talreja,
Raghavendra K T
During the Numa scanning make sure only relevant vmas of the
tasks are scanned.
Before:
All the tasks of a process participate in scanning the vma
even if they do not access the vma in its lifespan.
Now:
Except for the first few unconditional scans, if a process does
not touch a vma (excluding false-positive cases of PID collisions)
tasks no longer scan all vmas.
Logic used:
1) 6 bits of PID used to mark active bit in vma numab status during
fault to remember PIDs accessing vma. (Thanks Mel)
2) Subsequently in scan path, vma scanning is skipped if current PID
had not accessed vma.
3) First two times we do allow unconditional scan to preserve earlier
behaviour of scanning.
Acknowledgement to Bharata B Rao <bharata@amd.com> for initial patch
to store pid information and Peter Zijlstra <peterz@infradead.org>
(Usage of test and set bit)
Suggested-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
---
include/linux/mm.h | 14 ++++++++++++++
include/linux/mm_types.h | 1 +
kernel/sched/fair.c | 19 +++++++++++++++++++
mm/memory.c | 3 +++
4 files changed, 37 insertions(+)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2cce434a5e55..b7e4484af05b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1666,6 +1666,16 @@ static inline int xchg_page_access_time(struct page *page, int time)
last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS);
return last_time << PAGE_ACCESS_TIME_BUCKETS;
}
+
+static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
+{
+ unsigned int pid_bit;
+
+ pid_bit = current->pid % BITS_PER_LONG;
+ if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids)) {
+ __set_bit(pid_bit, &vma->numab_state->access_pids);
+ }
+}
#else /* !CONFIG_NUMA_BALANCING */
static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
{
@@ -1715,6 +1725,10 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
{
return false;
}
+
+static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
+{
+}
#endif /* CONFIG_NUMA_BALANCING */
#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 1cea78f60011..df4e0bc66d17 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -477,6 +477,7 @@ struct vma_lock {
struct vma_numab_state {
unsigned long next_scan;
+ unsigned long access_pids;
};
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7c2bbc8d618b..9443ae9db028 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2928,6 +2928,21 @@ static void reset_ptenuma_scan(struct task_struct *p)
p->mm->numa_scan_offset = 0;
}
+static bool vma_is_accessed(struct vm_area_struct *vma)
+{
+ /*
+ * Allow unconditional access first two times, so that all the (pages)
+ * of VMAs get prot_none fault introduced irrespective of accesses.
+ * This is also done to avoid any side effect of task scanning
+ * amplifying the unfairness of disjoint set of VMAs' access.
+ */
+ if (READ_ONCE(current->mm->numa_scan_seq) < 2)
+ return true;
+
+ return test_bit(current->pid % BITS_PER_LONG,
+ &vma->numab_state->access_pids);
+}
+
/*
* The expensive part of numa migration is done from task_work context.
* Triggered from task_tick_numa().
@@ -3046,6 +3061,10 @@ static void task_numa_work(struct callback_head *work)
vma->numab_state->next_scan))
continue;
+ /* Do not scan the VMA if task has not accessed */
+ if (!vma_is_accessed(vma))
+ continue;
+
do {
start = max(start, vma->vm_start);
end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
diff --git a/mm/memory.c b/mm/memory.c
index 255b2f4fdd4a..8fac837cde9e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4647,6 +4647,9 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
{
get_page(page);
+ /* Record the current PID acceesing VMA */
+ vma_set_access_pid_bit(vma);
+
count_vm_numa_event(NUMA_HINT_FAULTS);
if (page_nid == numa_node_id()) {
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
--
2.34.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH REBASE V3 3/4] sched/numa: implement access PID reset logic
2023-03-01 12:18 [PATCH REBASE V3 0/4] sched/numa: Enhance vma scanning Raghavendra K T
2023-03-01 12:19 ` [PATCH REBASE V3 1/4] sched/numa: Apply the scan delay to every new vma Raghavendra K T
2023-03-01 12:19 ` [PATCH REBASE V3 2/4] sched/numa: Enhance vma scanning logic Raghavendra K T
@ 2023-03-01 12:19 ` Raghavendra K T
2023-03-01 12:19 ` [PATCH REBASE V3 4/4] sched/numa: Use hash_32 to mix up PIDs accessing VMA Raghavendra K T
3 siblings, 0 replies; 5+ messages in thread
From: Raghavendra K T @ 2023-03-01 12:19 UTC (permalink / raw)
To: linux-kernel, linux-mm
Cc: Ingo Molnar, Peter Zijlstra, Mel Gorman, Andrew Morton,
David Hildenbrand, rppt, Bharata B Rao, Disha Talreja,
Raghavendra K T
This helps to ensure that only recently accessed PIDs scan the
VMAs.
Current implementation: (idea supported by PeterZ)
1. Accessing PID information is maintained in two windows.
access_pids[1] being newest.
2. Reset old access PID info i.e. access_pid[0] every
(4 * sysctl_numa_balancing_scan_delay) interval after initial
scan delay period expires.
The above interval seemed to be experimentally optimum since it
avoids frequent reset of access info as well as helps clearing
the old access info regularly.
The reset logic is implemented in scan path.
Suggested-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
---
include/linux/mm.h | 4 ++--
include/linux/mm_types.h | 3 ++-
kernel/sched/fair.c | 23 +++++++++++++++++++++--
3 files changed, 25 insertions(+), 5 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b7e4484af05b..5232ebb34145 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1672,8 +1672,8 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
unsigned int pid_bit;
pid_bit = current->pid % BITS_PER_LONG;
- if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids)) {
- __set_bit(pid_bit, &vma->numab_state->access_pids);
+ if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids[1])) {
+ __set_bit(pid_bit, &vma->numab_state->access_pids[1]);
}
}
#else /* !CONFIG_NUMA_BALANCING */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index df4e0bc66d17..e17bdd10dc15 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -477,7 +477,8 @@ struct vma_lock {
struct vma_numab_state {
unsigned long next_scan;
- unsigned long access_pids;
+ unsigned long next_pid_reset;
+ unsigned long access_pids[2];
};
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9443ae9db028..a93e7a33281f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2930,6 +2930,7 @@ static void reset_ptenuma_scan(struct task_struct *p)
static bool vma_is_accessed(struct vm_area_struct *vma)
{
+ unsigned long pids;
/*
* Allow unconditional access first two times, so that all the (pages)
* of VMAs get prot_none fault introduced irrespective of accesses.
@@ -2939,10 +2940,12 @@ static bool vma_is_accessed(struct vm_area_struct *vma)
if (READ_ONCE(current->mm->numa_scan_seq) < 2)
return true;
- return test_bit(current->pid % BITS_PER_LONG,
- &vma->numab_state->access_pids);
+ pids = vma->numab_state->access_pids[0] | vma->numab_state->access_pids[1];
+ return test_bit(current->pid % BITS_PER_LONG, &pids);
}
+#define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
+
/*
* The expensive part of numa migration is done from task_work context.
* Triggered from task_tick_numa().
@@ -3051,6 +3054,10 @@ static void task_numa_work(struct callback_head *work)
vma->numab_state->next_scan = now +
msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+
+ /* Reset happens after 4 times scan delay of scan start */
+ vma->numab_state->next_pid_reset = vma->numab_state->next_scan +
+ msecs_to_jiffies(VMA_PID_RESET_PERIOD);
}
/*
@@ -3065,6 +3072,18 @@ static void task_numa_work(struct callback_head *work)
if (!vma_is_accessed(vma))
continue;
+ /*
+ * RESET access PIDs regularly for old VMAs. Resetting after checking
+ * vma for recent access to avoid clearing PID info before access..
+ */
+ if (mm->numa_scan_seq &&
+ time_after(jiffies, vma->numab_state->next_pid_reset)) {
+ vma->numab_state->next_pid_reset = vma->numab_state->next_pid_reset +
+ msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+ vma->numab_state->access_pids[0] = READ_ONCE(vma->numab_state->access_pids[1]);
+ vma->numab_state->access_pids[1] = 0;
+ }
+
do {
start = max(start, vma->vm_start);
end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
--
2.34.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH REBASE V3 4/4] sched/numa: Use hash_32 to mix up PIDs accessing VMA
2023-03-01 12:18 [PATCH REBASE V3 0/4] sched/numa: Enhance vma scanning Raghavendra K T
` (2 preceding siblings ...)
2023-03-01 12:19 ` [PATCH REBASE V3 3/4] sched/numa: implement access PID reset logic Raghavendra K T
@ 2023-03-01 12:19 ` Raghavendra K T
3 siblings, 0 replies; 5+ messages in thread
From: Raghavendra K T @ 2023-03-01 12:19 UTC (permalink / raw)
To: linux-kernel, linux-mm
Cc: Ingo Molnar, Peter Zijlstra, Mel Gorman, Andrew Morton,
David Hildenbrand, rppt, Bharata B Rao, Disha Talreja,
Raghavendra K T
before: last 6 bits of PID is used as index to store
information about tasks accessing VMA's.
after: hash_32 is used to take care of cases where tasks are
created over a period of time, and thus reduce collision
probability.
Result:
The patch series overall improves autonuma cost.
Kernbench showed more than 5% improvement and
system time in mmtest autonuma showed more than 80%
improvement.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
---
include/linux/mm.h | 2 +-
kernel/sched/fair.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5232ebb34145..1b9be34a24fb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1671,7 +1671,7 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
{
unsigned int pid_bit;
- pid_bit = current->pid % BITS_PER_LONG;
+ pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids[1])) {
__set_bit(pid_bit, &vma->numab_state->access_pids[1]);
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a93e7a33281f..8592941dd565 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2941,7 +2941,7 @@ static bool vma_is_accessed(struct vm_area_struct *vma)
return true;
pids = vma->numab_state->access_pids[0] | vma->numab_state->access_pids[1];
- return test_bit(current->pid % BITS_PER_LONG, &pids);
+ return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids);
}
#define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
--
2.34.1
^ permalink raw reply related [flat|nested] 5+ messages in thread