From: Gang Li <ligang.bdlg@bytedance.com>
To: Jonathan Corbet <corbet@lwn.net>, Ingo Molnar <mingo@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Juri Lelli <juri.lelli@redhat.com>,
	Vincent Guittot <vincent.guittot@linaro.org>,
	Dietmar Eggemann <dietmar.eggemann@arm.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Ben Segall <bsegall@google.com>, Mel Gorman <mgorman@suse.de>,
	Daniel Bristot de Oliveira <bristot@redhat.com>,
	Andrew Morton <akpm@linux-foundation.org>
Cc: songmuchun@bytedance.com, zhengqi.arch@bytedance.com,
	Gang Li <ligang.bdlg@bytedance.com>,
	linux-api@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-fsdevel@vger.kernel.org, linux-doc@vger.kernel.org,
	linux-mm@kvack.org
Subject: [PATCH] sched/numa: add per-process numa_balancing
Date: Thu, 24 Feb 2022 15:52:25 +0800
Message-ID: <20220224075227.27127-1-ligang.bdlg@bytedance.com>

This patch adds a new prctl API: PR_PROCESS_NUMAB.

Numa balancing triggers a large number of page faults while it is
running, which costs performance. Processes that care about worst-case
performance therefore need numa balancing disabled, while others accept
a temporary performance loss in exchange for higher average performance
and are better off with it enabled.

However, numa balancing can currently only be controlled globally via
/proc/sys/kernel/numa_balancing. To serve both kinds of workload on the
same machine, allow numa_balancing to be enabled or disabled per
process as well.

Add a numab_enabled field to mm_struct, and consult it in
task_tick_fair() and the other numa balancing paths.

Set per-process numa balancing:
	prctl(PR_PROCESS_NUMAB, PR_SET_PROCESS_NUMAB_DISABLED);
	prctl(PR_PROCESS_NUMAB, PR_SET_PROCESS_NUMAB_ENABLED);
	prctl(PR_PROCESS_NUMAB, PR_SET_PROCESS_NUMAB_DEFAULT);
Get numa_balancing state:
	prctl(PR_PROCESS_NUMAB, PR_GET_PROCESS_NUMAB, &ret);
	grep NumaB_enabled /proc/<pid>/status
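
As an illustration, here is a minimal userspace sketch. The PR_* values
mirror the patched include/uapi/linux/prctl.h and are defined locally
so the example builds against unpatched headers; it only does anything
useful on a kernel carrying this patch:

	#include <stdio.h>
	#include <sys/prctl.h>

	#ifndef PR_PROCESS_NUMAB
	#define PR_PROCESS_NUMAB		63
	#define PR_SET_PROCESS_NUMAB_DISABLED	0
	#define PR_SET_PROCESS_NUMAB_ENABLED	1
	#define PR_SET_PROCESS_NUMAB_DEFAULT	2
	#define PR_GET_PROCESS_NUMAB		3
	#endif

	int main(void)
	{
		int state = -1;

		/* Force numa balancing off for this process, regardless
		 * of the global /proc/sys/kernel/numa_balancing setting. */
		if (prctl(PR_PROCESS_NUMAB, PR_SET_PROCESS_NUMAB_DISABLED)) {
			perror("prctl set");
			return 1;
		}

		/* Read the per-process state back; the kernel fills it
		 * in via put_user(). */
		if (prctl(PR_PROCESS_NUMAB, PR_GET_PROCESS_NUMAB, &state)) {
			perror("prctl get");
			return 1;
		}
		printf("numab_enabled: %d (0=disabled 1=enabled 2=default)\n",
		       state);
		return 0;
	}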

Cc: linux-api@vger.kernel.org
Signed-off-by: Gang Li <ligang.bdlg@bytedance.com>
---

Changes in v4:
- Adapt to the new memory tiering feature, "optimize page placement for
  memory tiering system":
  https://lore.kernel.org/all/20220128082751.593478-3-ying.huang@intel.com/
- Wrap the sched_numa_balancing and mm->numab_enabled checks in
  process_sched_numab_enabled().

Changes in v3:
- Fix compile error.

Changes in v2:
- PR_NUMA_BALANCING now supports three states: enabled, disabled, and
  default. enabled and disabled ignore the global setting, while
  default follows it.
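
For clarity, the resulting semantics can be sketched in plain C
(illustrative only; the in-kernel equivalents are
process_sched_numab_enabled() and process_sysctl_numab_mode() in the
diff below):

	/* Per-process state overrides the global switch unless "default". */
	static bool numab_effective(int numab_enabled, bool global_enabled)
	{
		switch (numab_enabled) {
		case PROCESS_NUMAB_ENABLED:	/* force on, ignore global  */
			return true;
		case PROCESS_NUMAB_DISABLED:	/* force off, ignore global */
			return false;
		case PROCESS_NUMAB_DEFAULT:	/* follow the global switch */
		default:
			return global_enabled;
		}
	}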

---
 Documentation/filesystems/proc.rst   |  2 ++
 fs/proc/task_mmu.c                   | 19 +++++++++++++++
 include/linux/mm_types.h             |  3 +++
 include/linux/sched/numa_balancing.h |  6 +++++
 include/linux/sched/sysctl.h         | 19 +++++++++++++++
 include/uapi/linux/prctl.h           |  7 ++++++
 kernel/fork.c                        |  3 +++
 kernel/sched/fair.c                  | 34 ++++++++++++++++++++++-----
 kernel/sys.c                         | 35 ++++++++++++++++++++++++++++
 mm/huge_memory.c                     |  2 +-
 mm/mprotect.c                        |  6 ++---
 11 files changed, 126 insertions(+), 10 deletions(-)

diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 061744c436d9..00f6503f0793 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -192,6 +192,7 @@ read the file /proc/PID/status::
   VmLib:      1412 kB
   VmPTE:        20 kb
   VmSwap:        0 kB
+  NumaB_enabled:  default
   HugetlbPages:          0 kB
   CoreDumping:    0
   THP_enabled:	  1
@@ -273,6 +274,7 @@ It's slow but very precise.
  VmPTE                       size of page table entries
  VmSwap                      amount of swap used by anonymous private data
                              (shmem swap usage is not included)
+ NumaB_enabled               numa balancing state, set by prctl(PR_PROCESS_NUMAB, ...)
  HugetlbPages                size of hugetlb memory portions
  CoreDumping                 process's memory is currently being dumped
                              (killing the process may lead to a corrupted core)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6e97ed775074..b1aa100b8711 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -20,6 +20,7 @@
 #include <linux/shmem_fs.h>
 #include <linux/uaccess.h>
 #include <linux/pkeys.h>
+#include <linux/sched/numa_balancing.h>
 
 #include <asm/elf.h>
 #include <asm/tlb.h>
@@ -76,6 +77,24 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		    " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8);
 	SEQ_PUT_DEC(" kB\nVmSwap:\t", swap);
 	seq_puts(m, " kB\n");
+#ifdef CONFIG_NUMA_BALANCING
+	seq_puts(m, "NumaB_enabled:\t");
+	switch (mm->numab_enabled) {
+	case PROCESS_NUMAB_DEFAULT:
+		seq_puts(m, "default");
+		break;
+	case PROCESS_NUMAB_DISABLED:
+		seq_puts(m, "disabled");
+		break;
+	case PROCESS_NUMAB_ENABLED:
+		seq_puts(m, "enabled");
+		break;
+	default:
+		seq_puts(m, "unknown");
+		break;
+	}
+	seq_putc(m, '\n');
+#endif
 	hugetlb_report_usage(m, mm);
 }
 #undef SEQ_PUT_DEC
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 9f05ffa12265..5a42aba3b17f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -626,6 +626,9 @@ struct mm_struct {
 
 		/* numa_scan_seq prevents two threads setting pte_numa */
 		int numa_scan_seq;
+
+		/* Controls whether NUMA balancing is active for this mm. */
+		int numab_enabled;
 #endif
 		/*
 		 * An operation with batched TLB flushing is going on. Anything
diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index 3988762efe15..c7dc08d6ba6a 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -16,6 +16,12 @@
 #define TNF_MIGRATE_FAIL 0x10
 
 #ifdef CONFIG_NUMA_BALANCING
+enum {
+	PROCESS_NUMAB_DISABLED,
+	PROCESS_NUMAB_ENABLED,
+	PROCESS_NUMAB_DEFAULT
+};
+DECLARE_STATIC_KEY_FALSE(sched_numa_balancing);
 extern void task_numa_fault(int last_node, int node, int pages, int flags);
 extern pid_t task_numa_group_id(struct task_struct *p);
 extern void set_numabalancing_state(bool enabled);
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index c1076b5e17fb..77d010942481 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -3,6 +3,7 @@
 #define _LINUX_SCHED_SYSCTL_H
 
 #include <linux/types.h>
+#include <linux/sched/numa_balancing.h>
 
 struct ctl_table;
 
@@ -29,8 +30,26 @@ enum sched_tunable_scaling {
 
 #ifdef CONFIG_NUMA_BALANCING
 extern int sysctl_numa_balancing_mode;
+static inline int process_sysctl_numab_mode(struct mm_struct *mm)
+{
+	int numab = mm->numab_enabled;
+
+	switch (numab) {
+	case PROCESS_NUMAB_ENABLED:
+		return NUMA_BALANCING_NORMAL;
+	case PROCESS_NUMAB_DISABLED:
+		return NUMA_BALANCING_DISABLED;
+	case PROCESS_NUMAB_DEFAULT:
+	default:
+		return sysctl_numa_balancing_mode;
+	}
+}
 #else
 #define sysctl_numa_balancing_mode	0
+static inline int process_sysctl_numab_mode(struct mm_struct *mm)
+{
+	return NUMA_BALANCING_DISABLED;
+}
 #endif
 
 /*
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index e998764f0262..d06a904c35c1 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -275,4 +275,11 @@ struct prctl_mm_map {
 #define PR_SET_VMA		0x53564d41
 # define PR_SET_VMA_ANON_NAME		0
 
+/* Set/get per-process numa_balancing state */
+#define PR_PROCESS_NUMAB		63
+# define PR_SET_PROCESS_NUMAB_DISABLED	PROCESS_NUMAB_DISABLED
+# define PR_SET_PROCESS_NUMAB_ENABLED	PROCESS_NUMAB_ENABLED
+# define PR_SET_PROCESS_NUMAB_DEFAULT	PROCESS_NUMAB_DEFAULT
+# define PR_GET_PROCESS_NUMAB		3
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 64dbfb9426fd..2f93b240ebab 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1059,6 +1059,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	init_tlb_flush_pending(mm);
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
 	mm->pmd_huge_pte = NULL;
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+	mm->numab_enabled = PROCESS_NUMAB_DEFAULT;
 #endif
 	mm_init_uprobes_state(mm);
 	hugetlb_count_init(mm);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 28b91f11b618..7ff5831c5b33 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2575,6 +2575,23 @@ void task_numa_free(struct task_struct *p, bool final)
 	}
 }
 
+static inline bool process_sched_numab_enabled(struct task_struct *p)
+{
+	if (p->mm) {
+		int numab = p->mm->numab_enabled;
+
+		switch (numab) {
+		case PROCESS_NUMAB_ENABLED:
+			return true;
+		case PROCESS_NUMAB_DISABLED:
+			return false;
+		case PROCESS_NUMAB_DEFAULT:
+			break;
+		}
+	}
+	return static_branch_unlikely(&sched_numa_balancing);
+}
+
 /*
  * Got a PROT_NONE fault for a page on @node.
  */
@@ -2587,13 +2604,13 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 	struct numa_group *ng;
 	int priv;
 
-	if (!static_branch_likely(&sched_numa_balancing))
-		return;
-
 	/* for example, ksmd faulting in a user's mm */
 	if (!p->mm)
 		return;
 
+	if (!process_sched_numab_enabled(p))
+		return;
+
 	/* Allocate buffer to track faults on a per-node basis */
 	if (unlikely(!p->numa_faults)) {
 		int size = sizeof(*p->numa_faults) *
@@ -2894,7 +2911,7 @@ static void update_scan_period(struct task_struct *p, int new_cpu)
 	int src_nid = cpu_to_node(task_cpu(p));
 	int dst_nid = cpu_to_node(new_cpu);
 
-	if (!static_branch_likely(&sched_numa_balancing))
+	if (!process_sched_numab_enabled(p))
 		return;
 
 	if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
@@ -2928,6 +2945,11 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 {
 }
 
+static inline bool process_sched_numab_enabled(struct task_struct *p)
+{
+	return false;
+}
+
 static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
 {
 }
@@ -7687,7 +7709,7 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 	unsigned long src_weight, dst_weight;
 	int src_nid, dst_nid, dist;
 
-	if (!static_branch_likely(&sched_numa_balancing))
+	if (!process_sched_numab_enabled(p))
 		return -1;
 
 	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
@@ -11164,7 +11186,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		entity_tick(cfs_rq, se, queued);
 	}
 
-	if (static_branch_unlikely(&sched_numa_balancing))
+	if (process_sched_numab_enabled(curr))
 		task_tick_numa(rq, curr);
 
 	update_misfit_status(curr, rq);
diff --git a/kernel/sys.c b/kernel/sys.c
index ecc4cf019242..04cb73e39926 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -58,6 +58,7 @@
 #include <linux/sched/coredump.h>
 #include <linux/sched/task.h>
 #include <linux/sched/cputime.h>
+#include <linux/sched/numa_balancing.h>
 #include <linux/rcupdate.h>
 #include <linux/uidgid.h>
 #include <linux/cred.h>
@@ -2081,6 +2082,23 @@ static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
 	return 0;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+static int prctl_pid_numa_balancing_write(int numa_balancing)
+{
+	if (numa_balancing != PR_SET_PROCESS_NUMAB_DEFAULT
+	    && numa_balancing != PR_SET_PROCESS_NUMAB_DISABLED
+	    && numa_balancing != PR_SET_PROCESS_NUMAB_ENABLED)
+		return -EINVAL;
+	current->mm->numab_enabled = numa_balancing;
+	return 0;
+}
+
+static int prctl_pid_numa_balancing_read(void)
+{
+	return current->mm->numab_enabled;
+}
+#endif
+
 static int prctl_set_mm(int opt, unsigned long addr,
 			unsigned long arg4, unsigned long arg5)
 {
@@ -2585,6 +2603,23 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		error = set_syscall_user_dispatch(arg2, arg3, arg4,
 						  (char __user *) arg5);
 		break;
+#ifdef CONFIG_NUMA_BALANCING
+	case PR_PROCESS_NUMAB:
+		switch (arg2) {
+		case PR_SET_PROCESS_NUMAB_DEFAULT:
+		case PR_SET_PROCESS_NUMAB_DISABLED:
+		case PR_SET_PROCESS_NUMAB_ENABLED:
+			error = prctl_pid_numa_balancing_write((int)arg2);
+			break;
+		case PR_GET_PROCESS_NUMAB:
+			error = put_user(prctl_pid_numa_balancing_read(), (int __user *)arg3);
+			break;
+		default:
+			error = -EINVAL;
+			break;
+		}
+		break;
+#endif
 #ifdef CONFIG_SCHED_CORE
 	case PR_SCHED_CORE:
 		error = sched_core_share_pid(arg2, arg3, arg4, arg5);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 09fb65a80e63..25a660065af1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1760,7 +1760,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		 * Skip scanning top tier node if normal numa
 		 * balancing is disabled
 		 */
-		if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
+		if (!(process_sysctl_numab_mode(vma->vm_mm) & NUMA_BALANCING_NORMAL) &&
 		    node_is_toptier(page_to_nid(page)))
 			goto unlock;
 	}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 2fe03e695c81..2ae0127f46e8 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -33,7 +33,7 @@
 #include <asm/cacheflush.h>
 #include <asm/mmu_context.h>
 #include <asm/tlbflush.h>
-
+#include <linux/sched/numa_balancing.h>
 #include "internal.h"
 
 static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -119,8 +119,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				 * Skip scanning top tier node if normal numa
 				 * balancing is disabled
 				 */
-				if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
-				    node_is_toptier(nid))
+				if (!(process_sysctl_numab_mode(vma->vm_mm) & NUMA_BALANCING_NORMAL)
+				    && node_is_toptier(nid))
 					continue;
 			}
 
-- 
2.20.1


Thread overview: 2+ messages
2022-02-24  7:52 Gang Li [this message]
2022-03-03  2:40 ` [PATCH] sched/numa: add per-process numa_balancing Gang Li
