linux-doc.vger.kernel.org archive mirror
* [PATCH v3] sched/numa: add per-process numa_balancing
@ 2021-12-06  2:45 Gang Li
  2021-12-13  6:49 ` Gang Li
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Gang Li @ 2021-12-06  2:45 UTC (permalink / raw)
  To: Jonathan Corbet, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Mel Gorman, Daniel Bristot de Oliveira
  Cc: Gang Li, linux-api, linux-kernel, linux-fsdevel, linux-doc

This patch adds a new prctl API, PR_NUMA_BALANCING.

While NUMA balancing is running, the large number of page faults it
triggers causes a performance hit. Processes that care about worst-case
performance therefore need NUMA balancing disabled. Others can tolerate
a temporary slowdown in exchange for higher average performance, so
enabling NUMA balancing is better for them.

Currently NUMA balancing can only be controlled globally via
/proc/sys/kernel/numa_balancing. For the cases above, we want to be
able to enable or disable it per process instead.

Add numab_enabled to mm_struct and use it in task_tick_fair().

Set per-process numa balancing:
	prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_DISABLED); //disable
	prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_ENABLED);  //enable
	prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_DEFAULT);  //follow global
Get numa_balancing state:
	prctl(PR_NUMA_BALANCING, PR_GET_NUMAB, &ret);
	cat /proc/<pid>/status | grep NumaB_enabled
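
A minimal userspace sketch (not part of this patch) of how a
latency-sensitive process could use the new interface. The numeric
values mirror the definitions added below (NUMAB_DISABLED=0,
NUMAB_ENABLED=1, NUMAB_DEFAULT=2, PR_GET_NUMAB=3) and are spelled out
here only because released uapi headers do not carry them yet:

	#include <stdio.h>
	#include <sys/prctl.h>

	#ifndef PR_NUMA_BALANCING
	#define PR_NUMA_BALANCING	63
	#define PR_SET_NUMAB_DISABLED	0
	#define PR_SET_NUMAB_ENABLED	1
	#define PR_SET_NUMAB_DEFAULT	2
	#define PR_GET_NUMAB		3
	#endif

	int main(void)
	{
		int state;

		/* Opt this process out of NUMA balancing, regardless of
		 * the global kernel.numa_balancing sysctl. */
		if (prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_DISABLED))
			perror("prctl(PR_NUMA_BALANCING, set)");

		/* Read the per-process state back: 0 disabled, 1 enabled,
		 * 2 default (follow the global setting). */
		if (prctl(PR_NUMA_BALANCING, PR_GET_NUMAB, &state) == 0)
			printf("numab_enabled: %d\n", state);

		return 0;
	}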

Cc: linux-api@vger.kernel.org
Signed-off-by: Gang Li <ligang.bdlg@bytedance.com>
---

Changes in v3:
- Fix compile error.

Changes in v2:
- PR_NUMA_BALANCING now supports three states: enabled, disabled and
  default. enabled and disabled override the global setting, while
  default follows it.
---
 Documentation/filesystems/proc.rst   |  2 ++
 fs/proc/task_mmu.c                   | 19 +++++++++++++++
 include/linux/mm_types.h             |  3 +++
 include/linux/sched/numa_balancing.h |  6 +++++
 include/uapi/linux/prctl.h           |  7 ++++++
 kernel/fork.c                        |  3 +++
 kernel/sched/fair.c                  |  6 ++++-
 kernel/sys.c                         | 35 ++++++++++++++++++++++++++++
 8 files changed, 80 insertions(+), 1 deletion(-)

diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 061744c436d9..767e8d893fb5 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -192,6 +192,7 @@ read the file /proc/PID/status::
   VmLib:      1412 kB
   VmPTE:        20 kb
   VmSwap:        0 kB
+  NumaB_enabled:  default
   HugetlbPages:          0 kB
   CoreDumping:    0
   THP_enabled:	  1
@@ -273,6 +274,7 @@ It's slow but very precise.
  VmPTE                       size of page table entries
  VmSwap                      amount of swap used by anonymous private data
                              (shmem swap usage is not included)
+ NumaB_enabled               numa balancing state, set by prctl(PR_NUMA_BALANCING, ...)
  HugetlbPages                size of hugetlb memory portions
  CoreDumping                 process's memory is currently being dumped
                              (killing the process may lead to a corrupted core)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e6998652fd67..c5bc00ab0460 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -19,6 +19,7 @@
 #include <linux/shmem_fs.h>
 #include <linux/uaccess.h>
 #include <linux/pkeys.h>
+#include <linux/sched/numa_balancing.h>
 
 #include <asm/elf.h>
 #include <asm/tlb.h>
@@ -75,6 +76,24 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		    " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8);
 	SEQ_PUT_DEC(" kB\nVmSwap:\t", swap);
 	seq_puts(m, " kB\n");
+#ifdef CONFIG_NUMA_BALANCING
+	seq_puts(m, "NumaB_enabled:\t");
+	switch (mm->numab_enabled) {
+	case NUMAB_DEFAULT:
+		seq_puts(m, "default");
+		break;
+	case NUMAB_DISABLED:
+		seq_puts(m, "disabled");
+		break;
+	case NUMAB_ENABLED:
+		seq_puts(m, "enabled");
+		break;
+	default:
+		seq_puts(m, "unknown");
+		break;
+	}
+	seq_putc(m, '\n');
+#endif
 	hugetlb_report_usage(m, mm);
 }
 #undef SEQ_PUT_DEC
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 850e71986b9d..96607e43e00f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -638,6 +638,9 @@ struct mm_struct {
 
 		/* numa_scan_seq prevents two threads setting pte_numa */
 		int numa_scan_seq;
+
+		/* Controls whether NUMA balancing is active for this mm. */
+		int numab_enabled;
 #endif
 		/*
 		 * An operation with batched TLB flushing is going on. Anything
diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index 3988762efe15..35a1c79925ea 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -16,6 +16,12 @@
 #define TNF_MIGRATE_FAIL 0x10
 
 #ifdef CONFIG_NUMA_BALANCING
+enum {
+	NUMAB_DISABLED,
+	NUMAB_ENABLED,
+	NUMAB_DEFAULT
+};
+DECLARE_STATIC_KEY_FALSE(sched_numa_balancing);
 extern void task_numa_fault(int last_node, int node, int pages, int flags);
 extern pid_t task_numa_group_id(struct task_struct *p);
 extern void set_numabalancing_state(bool enabled);
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index e998764f0262..d120cdde5d27 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -275,4 +275,11 @@ struct prctl_mm_map {
 #define PR_SET_VMA		0x53564d41
 # define PR_SET_VMA_ANON_NAME		0
 
+/* Set/get enabled per-process numa_balancing */
+#define PR_NUMA_BALANCING		63
+# define PR_SET_NUMAB_DISABLED		NUMAB_DISABLED
+# define PR_SET_NUMAB_ENABLED		NUMAB_ENABLED
+# define PR_SET_NUMAB_DEFAULT		NUMAB_DEFAULT
+# define PR_GET_NUMAB			3
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 7c06be0ca31b..5d4f876b588b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1061,6 +1061,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	init_tlb_flush_pending(mm);
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
 	mm->pmd_huge_pte = NULL;
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+	mm->numab_enabled = NUMAB_DEFAULT;
 #endif
 	mm_init_uprobes_state(mm);
 	hugetlb_count_init(mm);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 884f29d07963..2980f33ac61f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11169,8 +11169,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		entity_tick(cfs_rq, se, queued);
 	}
 
-	if (static_branch_unlikely(&sched_numa_balancing))
+#ifdef CONFIG_NUMA_BALANCING
+	if (curr->mm && (curr->mm->numab_enabled == NUMAB_ENABLED
+	    || (static_branch_unlikely(&sched_numa_balancing)
+	    && curr->mm->numab_enabled == NUMAB_DEFAULT)))
 		task_tick_numa(rq, curr);
+#endif
 
 	update_misfit_status(curr, rq);
 	update_overutilized_status(task_rq(curr));
diff --git a/kernel/sys.c b/kernel/sys.c
index 2450a9f33cb0..4a5a2bd57248 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -58,6 +58,7 @@
 #include <linux/sched/coredump.h>
 #include <linux/sched/task.h>
 #include <linux/sched/cputime.h>
+#include <linux/sched/numa_balancing.h>
 #include <linux/rcupdate.h>
 #include <linux/uidgid.h>
 #include <linux/cred.h>
@@ -2081,6 +2082,23 @@ static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
 	return 0;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+static int prctl_pid_numa_balancing_write(int numa_balancing)
+{
+	if (numa_balancing != PR_SET_NUMAB_DEFAULT
+	    && numa_balancing != PR_SET_NUMAB_DISABLED
+	    && numa_balancing != PR_SET_NUMAB_ENABLED)
+		return -EINVAL;
+	current->mm->numab_enabled = numa_balancing;
+	return 0;
+}
+
+static int prctl_pid_numa_balancing_read(void)
+{
+	return current->mm->numab_enabled;
+}
+#endif
+
 static int prctl_set_mm(int opt, unsigned long addr,
 			unsigned long arg4, unsigned long arg5)
 {
@@ -2585,6 +2603,23 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		error = set_syscall_user_dispatch(arg2, arg3, arg4,
 						  (char __user *) arg5);
 		break;
+#ifdef CONFIG_NUMA_BALANCING
+	case PR_NUMA_BALANCING:
+		switch (arg2) {
+		case PR_SET_NUMAB_DEFAULT:
+		case PR_SET_NUMAB_DISABLED:
+		case PR_SET_NUMAB_ENABLED:
+			error = prctl_pid_numa_balancing_write((int)arg2);
+			break;
+		case PR_GET_NUMAB:
+			error = put_user(prctl_pid_numa_balancing_read(), (int __user *)arg3);
+			break;
+		default:
+			error = -EINVAL;
+			break;
+		}
+		break;
+#endif
 #ifdef CONFIG_SCHED_CORE
 	case PR_SCHED_CORE:
 		error = sched_core_share_pid(arg2, arg3, arg4, arg5);
-- 
2.20.1



* Re: [PATCH v3] sched/numa: add per-process numa_balancing
  2021-12-06  2:45 [PATCH v3] sched/numa: add per-process numa_balancing Gang Li
@ 2021-12-13  6:49 ` Gang Li
  2022-01-06  8:08 ` Gang Li
  2022-01-12 14:43 ` Peter Zijlstra
  2 siblings, 0 replies; 5+ messages in thread
From: Gang Li @ 2021-12-13  6:49 UTC (permalink / raw)
  To: Jonathan Corbet, Ingo Molnar, Peter Zijlstra, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Mel Gorman, Daniel Bristot de Oliveira
  Cc: linux-api, linux-kernel, linux-fsdevel, linux-doc

Hi,
Any comments, please? ;)

On 2021/12/6 10:45, Gang Li wrote:
> This patch adds a new prctl API, PR_NUMA_BALANCING.
> 
> While NUMA balancing is running, the large number of page faults it
> triggers causes a performance hit. Processes that care about worst-case
> performance therefore need NUMA balancing disabled. Others can tolerate
> a temporary slowdown in exchange for higher average performance, so
> enabling NUMA balancing is better for them.
> 
> Currently NUMA balancing can only be controlled globally via
> /proc/sys/kernel/numa_balancing. For the cases above, we want to be
> able to enable or disable it per process instead.
> 
> Add numab_enabled to mm_struct and use it in task_tick_fair().
> 
> Set per-process numa balancing:
> 	prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_DISABLED); //disable
> 	prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_ENABLED);  //enable
> 	prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_DEFAULT);  //follow global
> Get numa_balancing state:
> 	prctl(PR_NUMA_BALANCING, PR_GET_NUMAB, &ret);
> 	cat /proc/<pid>/status | grep NumaB_enabled
> 
> Cc: linux-api@vger.kernel.org
> Signed-off-by: Gang Li <ligang.bdlg@bytedance.com>




* Re: [PATCH v3] sched/numa: add per-process numa_balancing
  2021-12-06  2:45 [PATCH v3] sched/numa: add per-process numa_balancing Gang Li
  2021-12-13  6:49 ` Gang Li
@ 2022-01-06  8:08 ` Gang Li
  2022-01-12 14:43 ` Peter Zijlstra
  2 siblings, 0 replies; 5+ messages in thread
From: Gang Li @ 2022-01-06  8:08 UTC (permalink / raw)
  To: Mel Gorman
  Cc: linux-api, linux-kernel, linux-fsdevel, linux-doc, songmuchun,
	zhengqi.arch

Hi,
Any comments please?

;)

On 2021/12/6 10:45, Gang Li wrote:
> This patch adds a new prctl API, PR_NUMA_BALANCING.
> 
> While NUMA balancing is running, the large number of page faults it
> triggers causes a performance hit. Processes that care about worst-case
> performance therefore need NUMA balancing disabled. Others can tolerate
> a temporary slowdown in exchange for higher average performance, so
> enabling NUMA balancing is better for them.
> 
> Currently NUMA balancing can only be controlled globally via
> /proc/sys/kernel/numa_balancing. For the cases above, we want to be
> able to enable or disable it per process instead.
> 
> Add numab_enabled to mm_struct and use it in task_tick_fair().
> 
> Set per-process numa balancing:
> 	prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_DISABLED); //disable
> 	prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_ENABLED);  //enable
> 	prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_DEFAULT);  //follow global
> Get numa_balancing state:
> 	prctl(PR_NUMA_BALANCING, PR_GET_NUMAB, &ret);
> 	cat /proc/<pid>/status | grep NumaB_enabled
> 
> Cc: linux-api@vger.kernel.org
> Signed-off-by: Gang Li <ligang.bdlg@bytedance.com>
> ---
> 
> Changes in v3:
> - Fix compile error.
> 
> Changes in v2:
> - PR_NUMA_BALANCING now supports three states: enabled, disabled and
>    default. enabled and disabled override the global setting, while
>    default follows it.
-- 
Thanks
Gang Li



* Re: [PATCH v3] sched/numa: add per-process numa_balancing
  2021-12-06  2:45 [PATCH v3] sched/numa: add per-process numa_balancing Gang Li
  2021-12-13  6:49 ` Gang Li
  2022-01-06  8:08 ` Gang Li
@ 2022-01-12 14:43 ` Peter Zijlstra
  2022-02-08  3:46   ` Gang Li
  2 siblings, 1 reply; 5+ messages in thread
From: Peter Zijlstra @ 2022-01-12 14:43 UTC (permalink / raw)
  To: Gang Li
  Cc: Jonathan Corbet, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-api, linux-kernel,
	linux-fsdevel, linux-doc

On Mon, Dec 06, 2021 at 10:45:28AM +0800, Gang Li wrote:
> This patch adds a new prctl API, PR_NUMA_BALANCING.
> 
> While NUMA balancing is running, the large number of page faults it
> triggers causes a performance hit. Processes that care about worst-case
> performance therefore need NUMA balancing disabled. Others can tolerate
> a temporary slowdown in exchange for higher average performance, so
> enabling NUMA balancing is better for them.
> 
> Currently NUMA balancing can only be controlled globally via
> /proc/sys/kernel/numa_balancing. For the cases above, we want to be
> able to enable or disable it per process instead.
> 
> Add numab_enabled to mm_struct and use it in task_tick_fair().
> 
> Set per-process numa balancing:
> 	prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_DISABLED); //disable
> 	prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_ENABLED);  //enable
> 	prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_DEFAULT);  //follow global

This seems to imply you can prctl(ENABLE) even if the global is
disabled, IOW sched_numa_balancing is off.

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 884f29d07963..2980f33ac61f 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -11169,8 +11169,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
>  		entity_tick(cfs_rq, se, queued);
>  	}
>  
> -	if (static_branch_unlikely(&sched_numa_balancing))
> +#ifdef CONFIG_NUMA_BALANCING
> +	if (curr->mm && (curr->mm->numab_enabled == NUMAB_ENABLED
> +	    || (static_branch_unlikely(&sched_numa_balancing)
> +	    && curr->mm->numab_enabled == NUMAB_DEFAULT)))
>  		task_tick_numa(rq, curr);
> +#endif
>  
>  	update_misfit_status(curr, rq);
>  	update_overutilized_status(task_rq(curr));

There's just about everything wrong there... not least of all the
horrific coding style.


* Re: Re: [PATCH v3] sched/numa: add per-process numa_balancing
  2022-01-12 14:43 ` Peter Zijlstra
@ 2022-02-08  3:46   ` Gang Li
  0 siblings, 0 replies; 5+ messages in thread
From: Gang Li @ 2022-02-08  3:46 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Jonathan Corbet, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, linux-api, linux-kernel,
	linux-fsdevel, linux-doc, songmuchun, zhengqi.arch

On 2022/1/12 22:43, Peter Zijlstra wrote:
> On Mon, Dec 06, 2021 at 10:45:28AM +0800, Gang Li wrote:
>> This patch adds a new prctl API, PR_NUMA_BALANCING.
>>
>> While NUMA balancing is running, the large number of page faults it
>> triggers causes a performance hit. Processes that care about worst-case
>> performance therefore need NUMA balancing disabled. Others can tolerate
>> a temporary slowdown in exchange for higher average performance, so
>> enabling NUMA balancing is better for them.
>>
>> Currently NUMA balancing can only be controlled globally via
>> /proc/sys/kernel/numa_balancing. For the cases above, we want to be
>> able to enable or disable it per process instead.
>>
>> Add numab_enabled to mm_struct and use it in task_tick_fair().
>>
>> Set per-process numa balancing:
>> 	prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_DISABLED); //disable
>> 	prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_ENABLED);  //enable
>> 	prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_DEFAULT);  //follow global
> 
> This seems to imply you can prctl(ENABLE) even if the global is
> disabled, IOW sched_numa_balancing is off.
> 

Of course. These semantics have been discussed here, FYI:
  https://lore.kernel.org/all/20211118085819.GD3301@suse.de/

On 11/18/21 4:58 PM, Mel Gorman wrote:
 > On Thu, Nov 18, 2021 at 11:26:30AM +0800, Gang Li wrote:
 >> 3. prctl(PR_NUMA_BALANCING, PR_SET_NUMAB_ENABLE);  //enable
 >
 > If PR_SET_NUMAB_ENABLE enables numa balancing for a task when
 > kernel.numa_balancing == 0 instead of returning an error then sure.
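
So, to summarise the intended semantics (per the v2 changelog and the
discussion above):

  numab_enabled    kernel.numa_balancing    NUMA balancing for the task
  NUMAB_ENABLED    0 or 1                   on
  NUMAB_DISABLED   0 or 1                   off
  NUMAB_DEFAULT    0                        off
  NUMAB_DEFAULT    1                        on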


>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 884f29d07963..2980f33ac61f 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -11169,8 +11169,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
>>   		entity_tick(cfs_rq, se, queued);
>>   	}
>>   
>> -	if (static_branch_unlikely(&sched_numa_balancing))
>> +#ifdef CONFIG_NUMA_BALANCING
>> +	if (curr->mm && (curr->mm->numab_enabled == NUMAB_ENABLED
>> +	    || (static_branch_unlikely(&sched_numa_balancing)
>> +	    && curr->mm->numab_enabled == NUMAB_DEFAULT)))
>>   		task_tick_numa(rq, curr);
>> +#endif
>>   
>>   	update_misfit_status(curr, rq);
>>   	update_overutilized_status(task_rq(curr));
> 
> There's just about everything wrong there... not least of all the
> horrific coding style.

Horrible code, yes. I'll clean it up for the next version.
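
Something along these lines, perhaps (untested sketch, naming and
placement not final), so task_tick_fair() keeps a single readable
condition:

	#ifdef CONFIG_NUMA_BALANCING
	/* Decide whether this task's mm should be scanned on this tick. */
	static bool numa_tick_enabled(struct task_struct *curr)
	{
		if (!curr->mm)
			return false;

		switch (curr->mm->numab_enabled) {
		case NUMAB_ENABLED:
			return true;
		case NUMAB_DISABLED:
			return false;
		default: /* NUMAB_DEFAULT: follow the global setting */
			return static_branch_unlikely(&sched_numa_balancing);
		}
	}
	#else
	static bool numa_tick_enabled(struct task_struct *curr)
	{
		return false;
	}
	#endif

and in task_tick_fair():

	if (numa_tick_enabled(curr))
		task_tick_numa(rq, curr);
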
-- 
Thanks
Gang Li


