All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] oom: choose a more suitable process to kill while all processes are not killable
@ 2019-12-20  6:26 zgpeng.linux
  2019-12-20  7:13 ` Michal Hocko
  0 siblings, 1 reply; 4+ messages in thread
From: zgpeng.linux @ 2019-12-20  6:26 UTC (permalink / raw)
  To: akpm, hannes, mhocko, vdavydov.dev, shakeelb
  Cc: linux-mm, cgroups, linux-kernel, zgpeng

From: zgpeng <zgpeng@tencent.com>

It has been found in multiple business scenarios that when an oom occurs
in a cgroup, the process that consumes the most memory in the cgroup is
not killed first. Analysis showed that the oom_score_adj of each process
in the cgroup was set to -998, and when oom_badness calculates points,
any negative result is uniformly set to 1. For these processes that
should not be killed, the kernel therefore cannot distinguish which
process consumes the most memory. As a result, a process with low memory
consumption may be killed first. In addition, when memory is large,
similar problems occur even if oom_score_adj is not set so small,
e.g. to -1.

Based on the scenario mentioned above, the existing oom killer can be 
optimized. In this patch, oom_badness is optimized so that when points 
are negative, it can also distinguish which process consumes the most 
memory. Therefore, when oom occurs, the process with the largest memory 
consumption can be killed first.

Signed-off-by: zgpeng <zgpeng@tencent.com>
---
 drivers/tty/sysrq.c |  1 +
 fs/proc/base.c      |  6 ++++--
 include/linux/oom.h |  4 ++--
 mm/memcontrol.c     |  1 +
 mm/oom_kill.c       | 21 +++++++++------------
 mm/page_alloc.c     |  1 +
 6 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index 573b205..1fe79d9 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -367,6 +367,7 @@ static void moom_callback(struct work_struct *ignored)
 		.memcg = NULL,
 		.gfp_mask = gfp_mask,
 		.order = -1,
+		.chosen_points = LONG_MIN,
 	};
 
 	mutex_lock(&oom_lock);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ebea950..2c7c4e0 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -548,9 +548,11 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
 			  struct pid *pid, struct task_struct *task)
 {
 	unsigned long totalpages = totalram_pages() + total_swap_pages;
-	unsigned long points = 0;
+	long points = 0;
 
-	points = oom_badness(task, totalpages) * 1000 / totalpages;
+	points = oom_badness(task, totalpages);
+	points = points > 0 ? points : 1;
+	points = points * 1000 / totalpages;
 	seq_printf(m, "%lu\n", points);
 
 	return 0;
diff --git a/include/linux/oom.h b/include/linux/oom.h
index c696c26..2d2b898 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -48,7 +48,7 @@ struct oom_control {
 	/* Used by oom implementation, do not set */
 	unsigned long totalpages;
 	struct task_struct *chosen;
-	unsigned long chosen_points;
+	long chosen_points;
 
 	/* Used to print the constraint info. */
 	enum oom_constraint constraint;
@@ -107,7 +107,7 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
 
 bool __oom_reap_task_mm(struct mm_struct *mm);
 
-extern unsigned long oom_badness(struct task_struct *p,
+extern long oom_badness(struct task_struct *p,
 		unsigned long totalpages);
 
 extern bool out_of_memory(struct oom_control *oc);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c5b5f74..73e2381 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1563,6 +1563,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 		.memcg = memcg,
 		.gfp_mask = gfp_mask,
 		.order = order,
+		.chosen_points = LONG_MIN,
 	};
 	bool ret;
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 71e3ace..160f364 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -195,17 +195,17 @@ static bool is_dump_unreclaim_slabs(void)
  * predictable as possible.  The goal is to return the highest value for the
  * task consuming the most memory to avoid subsequent oom failures.
  */
-unsigned long oom_badness(struct task_struct *p, unsigned long totalpages)
+long oom_badness(struct task_struct *p, unsigned long totalpages)
 {
 	long points;
 	long adj;
 
 	if (oom_unkillable_task(p))
-		return 0;
+		return LONG_MIN;
 
 	p = find_lock_task_mm(p);
 	if (!p)
-		return 0;
+		return LONG_MIN;
 
 	/*
 	 * Do not even consider tasks which are explicitly marked oom
@@ -217,7 +217,7 @@ unsigned long oom_badness(struct task_struct *p, unsigned long totalpages)
 			test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
 			in_vfork(p)) {
 		task_unlock(p);
-		return 0;
+		return LONG_MIN;
 	}
 
 	/*
@@ -232,11 +232,7 @@ unsigned long oom_badness(struct task_struct *p, unsigned long totalpages)
 	adj *= totalpages / 1000;
 	points += adj;
 
-	/*
-	 * Never return 0 for an eligible task regardless of the root bonus and
-	 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
-	 */
-	return points > 0 ? points : 1;
+	return points;
 }
 
 static const char * const oom_constraint_text[] = {
@@ -309,7 +305,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
 static int oom_evaluate_task(struct task_struct *task, void *arg)
 {
 	struct oom_control *oc = arg;
-	unsigned long points;
+	long points;
 
 	if (oom_unkillable_task(task))
 		goto next;
@@ -335,12 +331,12 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 	 * killed first if it triggers an oom, then select it.
 	 */
 	if (oom_task_origin(task)) {
-		points = ULONG_MAX;
+		points = LONG_MAX;
 		goto select;
 	}
 
 	points = oom_badness(task, oc->totalpages);
-	if (!points || points < oc->chosen_points)
+	if (points == LONG_MIN || points < oc->chosen_points)
 		goto next;
 
 select:
@@ -1126,6 +1122,7 @@ void pagefault_out_of_memory(void)
 		.memcg = NULL,
 		.gfp_mask = 0,
 		.order = 0,
+		.chosen_points = LONG_MIN,
 	};
 
 	if (mem_cgroup_oom_synchronize(true))
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4785a8a..63ccb2a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3797,6 +3797,7 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 		.memcg = NULL,
 		.gfp_mask = gfp_mask,
 		.order = order,
+		.chosen_points = LONG_MIN,
 	};
 	struct page *page;
 
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] oom: choose a more suitable process to kill while all processes are not killable
  2019-12-20  6:26 [PATCH] oom: choose a more suitable process to kill while all processes are not killable zgpeng.linux
@ 2019-12-20  7:13 ` Michal Hocko
  2019-12-20  9:56   ` 彭志刚
  0 siblings, 1 reply; 4+ messages in thread
From: Michal Hocko @ 2019-12-20  7:13 UTC (permalink / raw)
  To: zgpeng.linux
  Cc: akpm, hannes, vdavydov.dev, shakeelb, linux-mm, cgroups,
	linux-kernel, zgpeng

On Fri 20-12-19 14:26:12, zgpeng.linux@gmail.com wrote:
> From: zgpeng <zgpeng@tencent.com>
> 
> It has been found in multiple business scenarios that when a oom occurs
> in a cgroup, the process that consumes the most memory in the cgroup is 
> not killed first. Analysis of the reasons found that each process in the
> cgroup oom_score_adj is set to -998, oom_badness in the calculation of 
> points, if points is negative, uniformly set it to 1.

Can you provide an example of the oom report?
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] oom: choose a more suitable process to kill while all processes are not killable
  2019-12-20  7:13 ` Michal Hocko
@ 2019-12-20  9:56   ` 彭志刚
  2019-12-20 11:37     ` Michal Hocko
  0 siblings, 1 reply; 4+ messages in thread
From: 彭志刚 @ 2019-12-20  9:56 UTC (permalink / raw)
  To: Michal Hocko
  Cc: akpm, hannes, vdavydov.dev, Shakeel Butt, linux-mm, cgroups,
	linux-kernel, zgpeng

[-- Attachment #1: Type: text/plain, Size: 3153 bytes --]

certainly.

Steps to reproduce:
(1)Create a mm cgroup and set memory.limit_in_bytes
(2)Move the bash process to the newly created cgroup, and set the
oom_score_adj of the  bash process to -998.
(3)In bash, start multiple processes, each process consumes different
memory until cgroup oom is triggered.

The triggered phenomenon is shown below. We can see that when cgroup oom
happened, process 23777 was killed, but in fact, 23772 consumes more memory;

[  591.000970] Tasks state (memory values in pages):
[  591.000970] [  pid  ]   uid  tgid total_vm      rss pgtables_bytes
swapents oom_score_adj name
[  591.000973] [  23344]     0 23344     2863      923    61440        0
       -998 bash
[  591.000975] [  23714]     0 23714    27522    25935   258048        0
       -998 test
[  591.000976] [  23772]     0 23772   104622   103032   876544        0
       -998 test
[  591.000978] [  23777]     0 23777    78922    77335   667648        0
       -998 test
[  591.000980]
oom-kill:constraint=CONSTRAINT_MEMCG,nodemask=(null),cpuset=/,mems_allowed=0-1,oom_memcg=/test,task_memcg=/test,task=test,pid=23777,uid=0
[  591.000986] Memory cgroup out of memory: Killed process 23777 (test)
total-vm:315688kB, anon-rss:308420kB, file-rss:920kB, shmem-rss:0kB, UID:0
pgtables:667648kB oom_score_adj:-998

The verification process is the same. After applying this fix, we
can see that when a cgroup oom occurs, the process that consumes the
most memory is killed first.  The effect is shown below:

[195118.961767] Tasks state (memory values in pages):
[195118.961768] [  pid  ]   uid  tgid total_vm      rss pgtables_bytes
swapents oom_score_adj name
[195118.961770] [  22283]     0 22283     2862      911    69632        0
       -998 bash
[195118.961771] [  79244]     0 79244    27522    25922   262144        0
       -998 test
[195118.961773] [  79247]     0 79247    53222    51596   462848        0
       -998 test
[195118.961776] [  79263]     0 79263    58362    56744   507904        0
       -998 test
[195118.961777] [  79267]     0 79267    45769    44005   409600        0
       -998 test
[195118.961779]
oom-kill:constraint=CONSTRAINT_MEMCG,nodemask=(null),cpuset=/,mems_allowed=0-1,oom_memcg=/test,task_memcg=/test,task=test,pid=79263,uid=0
[195118.961786] Memory cgroup out of memory: Killed process 79263 (test)
total-vm:233448kB, anon-rss:226048kB, file-rss:928kB, shmem-rss:0kB, UID:0
pgtables:507904kB oom_score_adj:-998

Michal Hocko <mhocko@kernel.org> 于2019年12月20日周五 下午3:13写道:

> On Fri 20-12-19 14:26:12, zgpeng.linux@gmail.com wrote:
> > From: zgpeng <zgpeng@tencent.com>
> >
> > It has been found in multiple business scenarios that when a oom occurs
> > in a cgroup, the process that consumes the most memory in the cgroup is
> > not killed first. Analysis of the reasons found that each process in the
> > cgroup oom_score_adj is set to -998, oom_badness in the calculation of
> > points, if points is negative, uniformly set it to 1.
>
> Can you provide an example of the oom report?
> --
> Michal Hocko
> SUSE Labs
>

[-- Attachment #2: Type: text/html, Size: 3956 bytes --]

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] oom: choose a more suitable process to kill while all processes are not killable
  2019-12-20  9:56   ` 彭志刚
@ 2019-12-20 11:37     ` Michal Hocko
  0 siblings, 0 replies; 4+ messages in thread
From: Michal Hocko @ 2019-12-20 11:37 UTC (permalink / raw)
  To: 彭志刚
  Cc: akpm, hannes, vdavydov.dev, Shakeel Butt, linux-mm, cgroups,
	linux-kernel, zgpeng

[Please do not top post]

On Fri 20-12-19 17:56:20, 彭志刚 wrote:
> certainly.
> 
> Steps to reproduce:
> (1)Create a mm cgroup and set memory.limit_in_bytes
> (2)Move the bash process to the newly created cgroup, and set the
> oom_score_adj of the  bash process to -998.
> (3)In bash, start multiple processes, each process consumes different
> memory until cgroup oom is triggered.
> 
> The triggered phenomenon is shown below. We can see that when cgroup oom
> happened, process 23777 was killed, but in fact, 23772 consumes more memory;
> 
> [  591.000970] Tasks state (memory values in pages):
> [  591.000970] [  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name
> [  591.000973] [  23344]     0 23344     2863      923    61440        0        -998 bash
> [  591.000975] [  23714]     0 23714    27522    25935   258048        0        -998 test
> [  591.000976] [  23772]     0 23772   104622   103032   876544        0        -998 test

points = 103032 + 0 + 876544/4096 = 103246

> [  591.000978] [  23777]     0 23777    78922    77335   667648        0        -998 test

points = 77335 + 0 + 667648/4096 = 77498

It is not clear what is the actual hard limit but let's assume that
rss+page_tables is the only charged memory (or at least the majority of
it). That would be 207680 so the normalized oom_score_adj would be
-206586 which is way too big for both tasks so from the OOM killer
perspective both tasks are equal.

The question is whether this is a bug or a (mis)feature. The
oom_score_adj is documented as follows:
Documentation/filesystems/proc.txt
: Consequently, it is very simple for userspace to define the amount of memory to
: consider for each task.  Setting a /proc/<pid>/oom_score_adj value of +500, for
: example, is roughly equivalent to allowing the remainder of tasks sharing the
: same system, cpuset, mempolicy, or memory controller resources to use at least
: 50% more memory.  A value of -500, on the other hand, would be roughly
: equivalent to discounting 50% of the task's allowed memory from being considered
: as scoring against the task.

Which implies that we are talking about the budget based on a usable
memory (aka hard limit in this case). I do agree that the semantic is
awkward. I know there are usecases which try to use the existing scheme
for oom_score_adj to fine tune oom decisions and I am worried your patch
might break those.

That being said, I am not sure this change is safe with respect to
backward compatibility. I would rather recommend not using oom_score_adj
for anything but OOM_SCORE_ADJ_MIN resp. OOM_SCORE_ADJ_MAX.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2019-12-20 11:37 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-12-20  6:26 [PATCH] oom: choose a more suitable process to kill while all processes are not killable zgpeng.linux
2019-12-20  7:13 ` Michal Hocko
2019-12-20  9:56   ` 彭志刚
2019-12-20 11:37     ` Michal Hocko

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.