All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/4] mm, oom: Remove redundant OOM score normalization at select_bad_process().
@ 2019-05-22 10:08 Tetsuo Handa
  2019-05-22 10:08 ` [PATCH 2/4] mm, oom: Avoid potential RCU stall at dump_tasks() Tetsuo Handa
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: Tetsuo Handa @ 2019-05-22 10:08 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm, Michal Hocko, Tetsuo Handa

Since commit bbbe48029720d2c6 ("mm, oom: remove 'prefer children over
parent' heuristic") removed

  "%s: Kill process %d (%s) score %u or sacrifice child\n"

line, oc->chosen_points is no longer used after select_bad_process().

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
 mm/oom_kill.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 5a58778..7534046 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -381,8 +381,6 @@ static void select_bad_process(struct oom_control *oc)
 				break;
 		rcu_read_unlock();
 	}
-
-	oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages;
 }
 
 /**
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 2/4] mm, oom: Avoid potential RCU stall at dump_tasks().
  2019-05-22 10:08 [PATCH 1/4] mm, oom: Remove redundant OOM score normalization at select_bad_process() Tetsuo Handa
@ 2019-05-22 10:08 ` Tetsuo Handa
  2019-05-22 10:08 ` [PATCH 3/4] mm, oom: Wait for OOM victims even if oom_kill_allocating_task case Tetsuo Handa
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: Tetsuo Handa @ 2019-05-22 10:08 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, Michal Hocko, Tetsuo Handa, Dmitry Vyukov, Petr Mladek,
	Sergey Senozhatsky, Steven Rostedt

dump_tasks() calls printk() on each userspace process under RCU which
might result in an RCU stall warning. I proposed introducing a timeout for
the dump_tasks() operation, but Michal Hocko expects that dump_tasks()
either prints all or suppresses all [1]. Therefore, this patch changes to
cache all candidates at oom_evaluate_task() and then print the cached
candidates at dump_tasks().

With this patch, dump_tasks() no longer needs to call printk() under RCU.
Also, dump_tasks() no longer needs to check oom_unkillable_task() by
traversing all userspace processes, for select_bad_process() already
traversed all possible candidates. Also, the OOM killer no longer needs to
call put_task_struct() from atomic context, and we can simplify refcount
handling for __oom_kill_process().

This patch has a user-visible side effect that the oom_kill_allocating_task
path implies oom_dump_tasks == 0, for oom_evaluate_task() is not called
via select_bad_process(). But since the purpose of enabling
oom_kill_allocating_task is to kill as quickly as possible (i.e. tolerate
killing more than needed) whereas the purpose of dump_tasks() which might
take minutes is to understand how the OOM killer selected an OOM victim,
not printing candidates should be acceptable when the administrator asked
the OOM killer to kill the current thread.

[1] https://lore.kernel.org/linux-mm/20180906115320.GS14951@dhcp22.suse.cz/

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/sched.h |  1 +
 mm/oom_kill.c         | 44 +++++++++++++++++++-------------------------
 2 files changed, 20 insertions(+), 25 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1183741..f1736bf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1180,6 +1180,7 @@ struct task_struct {
 #ifdef CONFIG_MMU
 	struct task_struct		*oom_reaper_list;
 #endif
+	struct list_head                oom_candidate_list;
 #ifdef CONFIG_VMAP_STACK
 	struct vm_struct		*stack_vm_area;
 #endif
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 7534046..00b594c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -63,6 +63,7 @@
  * and mark_oom_victim
  */
 DEFINE_MUTEX(oom_lock);
+static LIST_HEAD(oom_candidate_list);
 
 #ifdef CONFIG_NUMA
 /**
@@ -333,6 +334,9 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 		goto abort;
 	}
 
+	get_task_struct(task);
+	list_add_tail(&task->oom_candidate_list, &oom_candidate_list);
+
 	/*
 	 * If task is allocating a lot of memory and has been marked to be
 	 * killed first if it triggers an oom, then select it.
@@ -350,16 +354,11 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 	if (points == oc->chosen_points && thread_group_leader(oc->chosen))
 		goto next;
 select:
-	if (oc->chosen)
-		put_task_struct(oc->chosen);
-	get_task_struct(task);
 	oc->chosen = task;
 	oc->chosen_points = points;
 next:
 	return 0;
 abort:
-	if (oc->chosen)
-		put_task_struct(oc->chosen);
 	oc->chosen = (void *)-1UL;
 	return 1;
 }
@@ -401,11 +400,8 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 
 	pr_info("Tasks state (memory values in pages):\n");
 	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
-	rcu_read_lock();
-	for_each_process(p) {
-		if (oom_unkillable_task(p, memcg, nodemask))
-			continue;
-
+	list_for_each_entry(p, &oom_candidate_list, oom_candidate_list) {
+		cond_resched();
 		task = find_lock_task_mm(p);
 		if (!task) {
 			/*
@@ -424,7 +420,6 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 			task->signal->oom_score_adj, task->comm);
 		task_unlock(task);
 	}
-	rcu_read_unlock();
 }
 
 static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
@@ -455,7 +450,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p)
 		if (is_dump_unreclaim_slabs())
 			dump_unreclaimable_slab();
 	}
-	if (sysctl_oom_dump_tasks)
+	if (sysctl_oom_dump_tasks && !list_empty(&oom_candidate_list))
 		dump_tasks(oc->memcg, oc->nodemask);
 	if (p)
 		dump_oom_summary(oc, p);
@@ -849,17 +844,11 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
 	struct mm_struct *mm;
 	bool can_oom_reap = true;
 
-	p = find_lock_task_mm(victim);
-	if (!p) {
-		put_task_struct(victim);
-		return;
-	} else if (victim != p) {
-		get_task_struct(p);
-		put_task_struct(victim);
-		victim = p;
-	}
-
 	/* Get a reference to safely compare mm after task_unlock(victim) */
+	victim = find_lock_task_mm(victim);
+	if (!victim)
+		return;
+	get_task_struct(victim);
 	mm = victim->mm;
 	mmgrab(mm);
 
@@ -931,7 +920,6 @@ static int oom_kill_memcg_member(struct task_struct *task, void *message)
 {
 	if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
 	    !is_global_init(task)) {
-		get_task_struct(task);
 		__oom_kill_process(task, message);
 	}
 	return 0;
@@ -954,7 +942,6 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 		mark_oom_victim(victim);
 		wake_oom_reaper(victim);
 		task_unlock(victim);
-		put_task_struct(victim);
 		return;
 	}
 	task_unlock(victim);
@@ -1077,7 +1064,6 @@ bool out_of_memory(struct oom_control *oc)
 	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
 	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
 	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
-		get_task_struct(current);
 		oc->chosen = current;
 		oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
 		return true;
@@ -1099,6 +1085,14 @@ bool out_of_memory(struct oom_control *oc)
 	if (oc->chosen && oc->chosen != (void *)-1UL)
 		oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
 				 "Memory cgroup out of memory");
+	while (!list_empty(&oom_candidate_list)) {
+		struct task_struct *p = list_first_entry(&oom_candidate_list,
+							 struct task_struct,
+							 oom_candidate_list);
+
+		list_del(&p->oom_candidate_list);
+		put_task_struct(p);
+	}
 	return !!oc->chosen;
 }
 
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 3/4] mm, oom: Wait for OOM victims even if oom_kill_allocating_task case.
  2019-05-22 10:08 [PATCH 1/4] mm, oom: Remove redundant OOM score normalization at select_bad_process() Tetsuo Handa
  2019-05-22 10:08 ` [PATCH 2/4] mm, oom: Avoid potential RCU stall at dump_tasks() Tetsuo Handa
@ 2019-05-22 10:08 ` Tetsuo Handa
  2019-05-22 10:08 ` [PATCH 4/4] mm, oom: Respect oom_task_origin() " Tetsuo Handa
  2019-05-23 22:04 ` [PATCH 5/4] mm, oom: Deduplicate memcg candidates at select_bad_process() Tetsuo Handa
  3 siblings, 0 replies; 5+ messages in thread
From: Tetsuo Handa @ 2019-05-22 10:08 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm, Michal Hocko, Tetsuo Handa

"mm, oom: Avoid potential RCU stall at dump_tasks()." changed to imply
oom_dump_tasks == 0 if oom_kill_allocating_task != 0. But since we can
expect the OOM reaper to reclaim memory quickly, and majority of latency
is not for_each_process() from select_bad_process() but printk() from
dump_header(), waiting for in-flight OOM victims until the OOM reaper
completes should generate preferable results (i.e. minimal number of
OOM victims).

As side effects of this patch, oom_kill_allocating_task != 0 no longer
implies oom_dump_tasks == 0, complicated conditions for whether to enter
oom_kill_allocating_task path are simplified, and a theoretical bug that
the OOM killer forever retries oom_kill_allocating_task path even after
the OOM reaper set MMF_OOM_SKIP is fixed.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
 mm/oom_kill.c | 44 +++++++++++++++++++++++---------------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 00b594c..64e582e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -367,19 +367,29 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
  * Simple selection loop. We choose the process with the highest number of
  * 'points'. In case scan was aborted, oc->chosen is set to -1.
  */
-static void select_bad_process(struct oom_control *oc)
+static const char *select_bad_process(struct oom_control *oc)
 {
-	if (is_memcg_oom(oc))
-		mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
-	else {
-		struct task_struct *p;
+	struct task_struct *p;
 
-		rcu_read_lock();
-		for_each_process(p)
-			if (oom_evaluate_task(p, oc))
-				break;
-		rcu_read_unlock();
+	if (is_memcg_oom(oc)) {
+		mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
+		return "Memory cgroup out of memory";
 	}
+	rcu_read_lock();
+	for_each_process(p)
+		if (oom_evaluate_task(p, oc))
+			break;
+	rcu_read_unlock();
+	if (sysctl_oom_kill_allocating_task && oc->chosen != (void *)-1UL) {
+		list_for_each_entry(p, &oom_candidate_list,
+				    oom_candidate_list) {
+			if (!same_thread_group(p, current))
+				continue;
+			oc->chosen = current;
+			return "Out of memory (oom_kill_allocating_task)";
+		}
+	}
+	return "Out of memory";
 }
 
 /**
@@ -1021,6 +1031,7 @@ bool out_of_memory(struct oom_control *oc)
 {
 	unsigned long freed = 0;
 	enum oom_constraint constraint = CONSTRAINT_NONE;
+	const char *message;
 
 	if (oom_killer_disabled)
 		return false;
@@ -1061,15 +1072,7 @@ bool out_of_memory(struct oom_control *oc)
 		oc->nodemask = NULL;
 	check_panic_on_oom(oc, constraint);
 
-	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
-	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
-	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
-		oc->chosen = current;
-		oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
-		return true;
-	}
-
-	select_bad_process(oc);
+	message = select_bad_process(oc);
 	/* Found nothing?!?! */
 	if (!oc->chosen) {
 		dump_header(oc, NULL);
@@ -1083,8 +1086,7 @@ bool out_of_memory(struct oom_control *oc)
 			panic("System is deadlocked on memory\n");
 	}
 	if (oc->chosen && oc->chosen != (void *)-1UL)
-		oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
-				 "Memory cgroup out of memory");
+		oom_kill_process(oc, message);
 	while (!list_empty(&oom_candidate_list)) {
 		struct task_struct *p = list_first_entry(&oom_candidate_list,
 							 struct task_struct,
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 4/4] mm, oom: Respect oom_task_origin() even if oom_kill_allocating_task case.
  2019-05-22 10:08 [PATCH 1/4] mm, oom: Remove redundant OOM score normalization at select_bad_process() Tetsuo Handa
  2019-05-22 10:08 ` [PATCH 2/4] mm, oom: Avoid potential RCU stall at dump_tasks() Tetsuo Handa
  2019-05-22 10:08 ` [PATCH 3/4] mm, oom: Wait for OOM victims even if oom_kill_allocating_task case Tetsuo Handa
@ 2019-05-22 10:08 ` Tetsuo Handa
  2019-05-23 22:04 ` [PATCH 5/4] mm, oom: Deduplicate memcg candidates at select_bad_process() Tetsuo Handa
  3 siblings, 0 replies; 5+ messages in thread
From: Tetsuo Handa @ 2019-05-22 10:08 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm, Michal Hocko, Tetsuo Handa

Currently oom_kill_allocating_task path ignores oom_task_origin() tasks.
But since oom_task_origin() is used for "notify me first using SIGKILL
(because I'm likely the culprit for this situation)", respect it.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
 mm/oom_kill.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 64e582e..bdd90b5 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -380,7 +380,8 @@ static const char *select_bad_process(struct oom_control *oc)
 		if (oom_evaluate_task(p, oc))
 			break;
 	rcu_read_unlock();
-	if (sysctl_oom_kill_allocating_task && oc->chosen != (void *)-1UL) {
+	if (sysctl_oom_kill_allocating_task && oc->chosen != (void *)-1UL &&
+	    oc->chosen_points != ULONG_MAX) {
 		list_for_each_entry(p, &oom_candidate_list,
 				    oom_candidate_list) {
 			if (!same_thread_group(p, current))
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 5/4] mm, oom: Deduplicate memcg candidates at select_bad_process().
  2019-05-22 10:08 [PATCH 1/4] mm, oom: Remove redundant OOM score normalization at select_bad_process() Tetsuo Handa
                   ` (2 preceding siblings ...)
  2019-05-22 10:08 ` [PATCH 4/4] mm, oom: Respect oom_task_origin() " Tetsuo Handa
@ 2019-05-23 22:04 ` Tetsuo Handa
  3 siblings, 0 replies; 5+ messages in thread
From: Tetsuo Handa @ 2019-05-23 22:04 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm, Michal Hocko

Since "mm, oom: Avoid potential RCU stall at dump_tasks()." changed to
cache all candidates at select_bad_process(), dump_tasks() started
printing all threads upon memcg OOM event.

Unfortunately, mem_cgroup_scan_tasks() can't traverse only thread
group leaders because CSS_TASK_ITER_PROCS does not work if the thread
group leader already exit()ed.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
 mm/oom_kill.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index bdd90b53bbd3..a92b2f70d15b 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -334,6 +334,20 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 		goto abort;
 	}
 
+	/*
+	 * Since mem_cgroup_scan_tasks() calls this function on each thread
+	 * while for_each_process() calls this function on each thread group,
+	 * memcg OOM should evaluate only one thread from each thread group.
+	 */
+	if (is_memcg_oom(oc) && get_nr_threads(task) != 1) {
+		struct task_struct *p;
+
+		list_for_each_entry_reverse(p, &oom_candidate_list,
+					    oom_candidate_list)
+			if (same_thread_group(p, task))
+				goto next;
+	}
+
 	get_task_struct(task);
 	list_add_tail(&task->oom_candidate_list, &oom_candidate_list);
 
@@ -350,9 +364,6 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 	if (!points || points < oc->chosen_points)
 		goto next;
 
-	/* Prefer thread group leaders for display purposes */
-	if (points == oc->chosen_points && thread_group_leader(oc->chosen))
-		goto next;
 select:
 	oc->chosen = task;
 	oc->chosen_points = points;
-- 
2.16.5


^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2019-05-23 22:04 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-05-22 10:08 [PATCH 1/4] mm, oom: Remove redundant OOM score normalization at select_bad_process() Tetsuo Handa
2019-05-22 10:08 ` [PATCH 2/4] mm, oom: Avoid potential RCU stall at dump_tasks() Tetsuo Handa
2019-05-22 10:08 ` [PATCH 3/4] mm, oom: Wait for OOM victims even if oom_kill_allocating_task case Tetsuo Handa
2019-05-22 10:08 ` [PATCH 4/4] mm, oom: Respect oom_task_origin() " Tetsuo Handa
2019-05-23 22:04 ` [PATCH 5/4] mm, oom: Deduplicate memcg candidates at select_bad_process() Tetsuo Handa

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.