All of lore.kernel.org
 help / color / mirror / Atom feed
From: Gang Li <ligang.bdlg@bytedance.com>
To: akpm@linux-foundation.org
Cc: songmuchun@bytedance.com, hca@linux.ibm.com, gor@linux.ibm.com,
	agordeev@linux.ibm.com, borntraeger@linux.ibm.com,
	svens@linux.ibm.com, ebiederm@xmission.com,
	keescook@chromium.org, viro@zeniv.linux.org.uk,
	rostedt@goodmis.org, mingo@redhat.com, peterz@infradead.org,
	acme@kernel.org, mark.rutland@arm.com,
	alexander.shishkin@linux.intel.com, jolsa@kernel.org,
	namhyung@kernel.org, david@redhat.com, imbrenda@linux.ibm.com,
	apopple@nvidia.com, adobriyan@gmail.com,
	stephen.s.brennan@oracle.com, ohoono.kwon@samsung.com,
	haolee.swjtu@gmail.com, kaleshsingh@google.com,
	zhengqi.arch@bytedance.com, peterx@redhat.com,
	shy828301@gmail.com, surenb@google.com, ccross@google.com,
	vincent.whitchurch@axis.com, tglx@linutronix.de,
	bigeasy@linutronix.de, fenghua.yu@intel.com,
	linux-s390@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, linux-fsdevel@vger.kernel.org,
	linux-perf-users@vger.kernel.org,
	Gang Li <ligang.bdlg@bytedance.com>
Subject: [PATCH 5/5 v1] mm, oom: enable per numa node oom for CONSTRAINT_MEMORY_POLICY
Date: Thu, 12 May 2022 12:46:34 +0800	[thread overview]
Message-ID: <20220512044634.63586-6-ligang.bdlg@bytedance.com> (raw)
In-Reply-To: <20220512044634.63586-1-ligang.bdlg@bytedance.com>

Page allocator will only alloc pages on node indicated by
`nodemask`. But oom will still select bad process by total rss usage
which may reclam nothing on the node indicated by `nodemask`.

This patch let oom only calculate rss on the given node when
oc->constraint equals to CONSTRAINT_MEMORY_POLICY.

If `nodemask` is asigned, the process with the highest memory
consumption on the specific node will be killed. oom_kill dmesg will
looks like this:

```
[ 1471.436027] Tasks state (memory values in pages):
[ 1471.438518] [  pid  ]   uid  tgid total_vm      rss (01)nrss  pgtables_bytes swapents oom_score_adj name
[ 1471.554703] [   1011]     0  1011   220005     8589     1872   823296        0             0 node
[ 1471.707912] [  12399]     0 12399  1311306  1311056   262170 10534912        0             0 a.out
[ 1471.712429] [  13135]     0 13135   787018   674666   674300  5439488        0             0 a.out
[ 1471.721506] [  13295]     0 13295      597      188        0    24576        0             0 sh
[ 1471.734600] oom-kill:constraint=CONSTRAINT_MEMORY_POLICY,nodemask=1,cpuset=/,mems_allowed=0-2,global_oom,task_memcg=/user.slice/user-0.slice/session-3.scope,task=a.out,pid=13135,uid=0
[ 1471.742583] Out of memory: Killed process 13135 (a.out) total-vm:3148072kB, anon-rss:2697304kB, file-rss:1360kB, shmem-rss:0kB, UID:0 pgtables:5312kB oom_score_adj:0
[ 1471.849615] oom_reaper: reaped process 13135 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
```

Signed-off-by: Gang Li <ligang.bdlg@bytedance.com>
---
 fs/proc/base.c      |  6 +++++-
 include/linux/oom.h |  2 +-
 mm/oom_kill.c       | 45 +++++++++++++++++++++++++++++++++++++--------
 3 files changed, 43 insertions(+), 10 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index c1031843cc6a..caf0f51284d0 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -552,8 +552,12 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
 	unsigned long totalpages = totalram_pages() + total_swap_pages;
 	unsigned long points = 0;
 	long badness;
+	struct oom_control oc = {
+		.totalpages =  totalpages,
+		.gfp_mask = 0,
+	};
 
-	badness = oom_badness(task, totalpages);
+	badness = oom_badness(task, &oc);
 	/*
 	 * Special case OOM_SCORE_ADJ_MIN for all others scale the
 	 * badness value into [0, 2000] range which we have been
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 2db9a1432511..0cb6a60be776 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -109,7 +109,7 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
 bool __oom_reap_task_mm(struct mm_struct *mm);
 
 long oom_badness(struct task_struct *p,
-		unsigned long totalpages);
+		struct oom_control *oc);
 
 extern bool out_of_memory(struct oom_control *oc);
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 757f5665ae94..75a80b5a63bf 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -198,7 +198,7 @@ static bool should_dump_unreclaim_slab(void)
  * predictable as possible.  The goal is to return the highest value for the
  * task consuming the most memory to avoid subsequent oom failures.
  */
-long oom_badness(struct task_struct *p, unsigned long totalpages)
+long oom_badness(struct task_struct *p, struct oom_control *oc)
 {
 	long points;
 	long adj;
@@ -227,12 +227,22 @@ long oom_badness(struct task_struct *p, unsigned long totalpages)
 	 * The baseline for the badness score is the proportion of RAM that each
 	 * task's rss, pagetable and swap space use.
 	 */
-	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS, NUMA_NO_NODE) +
-		mm_pgtables_bytes(p->mm) / PAGE_SIZE;
+	if (unlikely(oc->constraint == CONSTRAINT_MEMORY_POLICY)) {
+		struct zoneref *zoneref = first_zones_zonelist(oc->zonelist, gfp_zone(oc->gfp_mask),
+								oc->nodemask);
+		int nid_to_find_victim = zone_to_nid(zoneref->zone);
+
+		points = get_mm_counter(p->mm, -1, nid_to_find_victim) +
+			get_mm_counter(p->mm, MM_SWAPENTS, NUMA_NO_NODE) +
+			mm_pgtables_bytes(p->mm) / PAGE_SIZE;
+	} else {
+		points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS, NUMA_NO_NODE) +
+			mm_pgtables_bytes(p->mm) / PAGE_SIZE;
+	}
 	task_unlock(p);
 
 	/* Normalize to oom_score_adj units */
-	adj *= totalpages / 1000;
+	adj *= oc->totalpages / 1000;
 	points += adj;
 
 	return points;
@@ -338,7 +348,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 		goto select;
 	}
 
-	points = oom_badness(task, oc->totalpages);
+	points = oom_badness(task, oc);
 	if (points == LONG_MIN || points < oc->chosen_points)
 		goto next;
 
@@ -382,6 +392,7 @@ static int dump_task(struct task_struct *p, void *arg)
 {
 	struct oom_control *oc = arg;
 	struct task_struct *task;
+	unsigned long node_mm_rss;
 
 	if (oom_unkillable_task(p))
 		return 0;
@@ -399,9 +410,18 @@ static int dump_task(struct task_struct *p, void *arg)
 		return 0;
 	}
 
-	pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
+	if (unlikely(oc->constraint == CONSTRAINT_MEMORY_POLICY)) {
+		struct zoneref *zoneref = first_zones_zonelist(oc->zonelist, gfp_zone(oc->gfp_mask),
+								oc->nodemask);
+		int nid_to_find_victim = zone_to_nid(zoneref->zone);
+
+		node_mm_rss = get_mm_counter(p->mm, -1, nid_to_find_victim);
+	} else {
+		node_mm_rss = 0;
+	}
+	pr_info("[%7d] %5d %5d %8lu %8lu %8lu %8ld %8lu         %5hd %s\n",
 		task->pid, from_kuid(&init_user_ns, task_uid(task)),
-		task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
+		task->tgid, task->mm->total_vm, get_mm_rss(task->mm), node_mm_rss,
 		mm_pgtables_bytes(task->mm),
 		get_mm_counter(task->mm, MM_SWAPENTS, NUMA_NO_NODE),
 		task->signal->oom_score_adj, task->comm);
@@ -422,8 +442,17 @@ static int dump_task(struct task_struct *p, void *arg)
  */
 static void dump_tasks(struct oom_control *oc)
 {
+	int nid_to_find_victim;
+
+	if (oc->nodemask) {
+		struct zoneref *zoneref = first_zones_zonelist(oc->zonelist, gfp_zone(oc->gfp_mask),
+								oc->nodemask);
+		nid_to_find_victim = zone_to_nid(zoneref->zone);
+	} else {
+		nid_to_find_victim = -1;
+	}
 	pr_info("Tasks state (memory values in pages):\n");
-	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
+	pr_info("[  pid  ]   uid  tgid total_vm      rss (%02d)nrss  pgtables_bytes swapents oom_score_adj name\n", nid_to_find_victim);
 
 	if (is_memcg_oom(oc))
 		mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
-- 
2.20.1


  parent reply	other threads:[~2022-05-12  4:49 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-05-12  4:46 [PATCH 0/5 v1] mm, oom: Introduce per numa node oom for CONSTRAINT_MEMORY_POLICY Gang Li
2022-05-12  4:46 ` [PATCH 1/5 v1] mm: add a new parameter `node` to `get/add/inc/dec_mm_counter` Gang Li
2022-05-12  4:46 ` [PATCH 2/5 v1] mm: add numa_count field for rss_stat Gang Li
2022-05-12 16:31   ` kernel test robot
2022-05-12  4:46 ` [PATCH 3/5 v1] mm: add numa fields for tracepoint rss_stat Gang Li
2022-05-12  4:46 ` [PATCH 4/5 v1] mm: enable per numa node rss_stat count Gang Li
2022-05-17  2:28   ` [mm] c9dc81ef10: BUG:Bad_rss-counter_state_mm:#node:#val kernel test robot
2022-05-17  2:28     ` kernel test robot
2022-05-12  4:46 ` Gang Li [this message]
2022-05-12 22:31 ` [PATCH 0/5 v1] mm, oom: Introduce per numa node oom for CONSTRAINT_MEMORY_POLICY Suren Baghdasaryan
2022-05-16 16:44 ` Michal Hocko
2022-06-15 10:13   ` Gang Li

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220512044634.63586-6-ligang.bdlg@bytedance.com \
    --to=ligang.bdlg@bytedance.com \
    --cc=acme@kernel.org \
    --cc=adobriyan@gmail.com \
    --cc=agordeev@linux.ibm.com \
    --cc=akpm@linux-foundation.org \
    --cc=alexander.shishkin@linux.intel.com \
    --cc=apopple@nvidia.com \
    --cc=bigeasy@linutronix.de \
    --cc=borntraeger@linux.ibm.com \
    --cc=ccross@google.com \
    --cc=david@redhat.com \
    --cc=ebiederm@xmission.com \
    --cc=fenghua.yu@intel.com \
    --cc=gor@linux.ibm.com \
    --cc=haolee.swjtu@gmail.com \
    --cc=hca@linux.ibm.com \
    --cc=imbrenda@linux.ibm.com \
    --cc=jolsa@kernel.org \
    --cc=kaleshsingh@google.com \
    --cc=keescook@chromium.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-perf-users@vger.kernel.org \
    --cc=linux-s390@vger.kernel.org \
    --cc=mark.rutland@arm.com \
    --cc=mingo@redhat.com \
    --cc=namhyung@kernel.org \
    --cc=ohoono.kwon@samsung.com \
    --cc=peterx@redhat.com \
    --cc=peterz@infradead.org \
    --cc=rostedt@goodmis.org \
    --cc=shy828301@gmail.com \
    --cc=songmuchun@bytedance.com \
    --cc=stephen.s.brennan@oracle.com \
    --cc=surenb@google.com \
    --cc=svens@linux.ibm.com \
    --cc=tglx@linutronix.de \
    --cc=vincent.whitchurch@axis.com \
    --cc=viro@zeniv.linux.org.uk \
    --cc=zhengqi.arch@bytedance.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.