All of lore.kernel.org
 help / color / mirror / Atom feed
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
To: linux-mm@kvack.org
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>,
	David Rientjes <rientjes@google.com>,
	Michal Hocko <mhocko@kernel.org>, Roman Gushchin <guro@fb.com>
Subject: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
Date: Sat,  4 Aug 2018 22:29:46 +0900	[thread overview]
Message-ID: <1533389386-3501-4-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp> (raw)
In-Reply-To: <1533389386-3501-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp>

David Rientjes is complaining about current behavior that the OOM killer
selects next OOM victim as soon as MMF_OOM_SKIP is set even if
__oom_reap_task_mm() returned without any progress.

To address this problem, this patch adds a timeout with whether the OOM
score of an OOM victim's memory is decreasing over time as a feedback,
after MMF_OOM_SKIP is set by the OOM reaper or exit_mmap().

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Roman Gushchin <guro@fb.com>
---
 include/linux/sched.h |  3 ++
 mm/oom_kill.c         | 81 ++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 63 insertions(+), 21 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 589fe78..70c7dfd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1174,6 +1174,9 @@ struct task_struct {
 #endif
 	int				pagefault_disabled;
 	struct list_head		oom_victim_list;
+	unsigned long			last_oom_compared;
+	unsigned long			last_oom_score;
+	unsigned char			oom_reap_stall_count;
 #ifdef CONFIG_VMAP_STACK
 	struct vm_struct		*stack_vm_area;
 #endif
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 783f04d..7cad886 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -49,6 +49,12 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/oom.h>
 
+static inline unsigned long oom_victim_mm_score(struct mm_struct *mm)
+{
+	return get_mm_rss(mm) + get_mm_counter(mm, MM_SWAPENTS) +
+		mm_pgtables_bytes(mm) / PAGE_SIZE;
+}
+
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks = 1;
@@ -230,8 +236,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	 * The baseline for the badness score is the proportion of RAM that each
 	 * task's rss, pagetable and swap space use.
 	 */
-	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
-		mm_pgtables_bytes(p->mm) / PAGE_SIZE;
+	points = oom_victim_mm_score(p->mm);
 	task_unlock(p);
 
 	/* Normalize to oom_score_adj units */
@@ -571,15 +576,6 @@ static void oom_reap_task(struct task_struct *tsk)
 	while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
 		schedule_timeout_idle(HZ/10);
 
-	if (attempts <= MAX_OOM_REAP_RETRIES ||
-	    test_bit(MMF_OOM_SKIP, &mm->flags))
-		goto done;
-
-	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
-		task_pid_nr(tsk), tsk->comm);
-	debug_show_all_locks();
-
-done:
 	/*
 	 * Hide this mm from OOM killer because it has been either reaped or
 	 * somebody can't call up_write(mmap_sem).
@@ -631,6 +627,9 @@ static void mark_oom_victim(struct task_struct *tsk)
 	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
 		mmgrab(tsk->signal->oom_mm);
 		set_bit(MMF_OOM_VICTIM, &mm->flags);
+		tsk->last_oom_compared = jiffies;
+		tsk->last_oom_score = oom_victim_mm_score(mm);
+		tsk->oom_reap_stall_count = 0;
 		get_task_struct(tsk);
 		list_add(&tsk->oom_victim_list, &oom_victim_list);
 	}
@@ -867,7 +866,6 @@ static void __oom_kill_process(struct task_struct *victim)
 	mmdrop(mm);
 	put_task_struct(victim);
 }
-#undef K
 
 /*
  * Kill provided task unless it's secured by setting
@@ -999,33 +997,74 @@ int unregister_oom_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 
+static bool victim_mm_stalling(struct task_struct *p, struct mm_struct *mm)
+{
+	unsigned long score;
+
+	if (time_before(jiffies, p->last_oom_compared + HZ / 10))
+		return false;
+	score = oom_victim_mm_score(mm);
+	if (score < p->last_oom_score)
+		p->oom_reap_stall_count = 0;
+	else
+		p->oom_reap_stall_count++;
+	p->last_oom_score = oom_victim_mm_score(mm);
+	p->last_oom_compared = jiffies;
+	if (p->oom_reap_stall_count < 30)
+		return false;
+	pr_info("Gave up waiting for process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
+		task_pid_nr(p), p->comm, K(mm->total_vm),
+		K(get_mm_counter(mm, MM_ANONPAGES)),
+		K(get_mm_counter(mm, MM_FILEPAGES)),
+		K(get_mm_counter(mm, MM_SHMEMPAGES)));
+	return true;
+}
+
 static bool oom_has_pending_victims(struct oom_control *oc)
 {
-	struct task_struct *p;
+	struct task_struct *p, *tmp;
+	bool ret = false;
+	bool gaveup = false;
 
 	if (is_sysrq_oom(oc))
 		return false;
 	/*
-	 * Since oom_reap_task()/exit_mmap() will set MMF_OOM_SKIP, let's
-	 * wait for pending victims until MMF_OOM_SKIP is set or __mmput()
-	 * completes.
+	 * Wait for pending victims until __mmput() completes or stalled
+	 * too long.
 	 */
-	list_for_each_entry(p, &oom_victim_list, oom_victim_list) {
+	list_for_each_entry_safe(p, tmp, &oom_victim_list, oom_victim_list) {
+		struct mm_struct *mm = p->signal->oom_mm;
+
 		if (oom_unkillable_task(p, oc->memcg, oc->nodemask))
 			continue;
-		if (!test_bit(MMF_OOM_SKIP, &p->signal->oom_mm->flags)) {
+		ret = true;
 #ifdef CONFIG_MMU
+		/*
+		 * Since the OOM reaper exists, we can safely wait until
+		 * MMF_OOM_SKIP is set.
+		 */
+		if (!test_bit(MMF_OOM_SKIP, &mm->flags)) {
 			if (!oom_reap_target) {
 				get_task_struct(p);
 				oom_reap_target = p;
 				trace_wake_reaper(p->pid);
 				wake_up(&oom_reaper_wait);
 			}
-#endif
-			return true;
+			continue;
 		}
+#endif
+		/* We can wait as long as OOM score is decreasing over time. */
+		if (!victim_mm_stalling(p, mm))
+			continue;
+		gaveup = true;
+		list_del(&p->oom_victim_list);
+		/* Drop a reference taken by mark_oom_victim(). */
+		put_task_struct(p);
 	}
-	return false;
+	if (gaveup)
+		debug_show_all_locks();
+
+	return ret;
 }
 
 /**
-- 
1.8.3.1

  parent reply	other threads:[~2018-08-04 13:30 UTC|newest]

Thread overview: 38+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-08-04 13:29 [PATCH 1/4] mm, oom: Remove wake_oom_reaper() Tetsuo Handa
2018-08-04 13:29 ` [PATCH 2/4] mm, oom: Check pending victims earlier in out_of_memory() Tetsuo Handa
2018-08-04 13:29 ` [PATCH 3/4] mm, oom: Remove unused "abort" path Tetsuo Handa
2018-08-04 13:29 ` Tetsuo Handa [this message]
2018-08-06 13:45   ` [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes Michal Hocko
2018-08-06 20:19     ` David Rientjes
2018-08-06 20:51       ` Michal Hocko
2018-08-09 20:16         ` David Rientjes
2018-08-10  9:07           ` Michal Hocko
2018-08-10 10:54             ` Tetsuo Handa
2018-08-10 11:16               ` Michal Hocko
2018-08-11  3:12                 ` Tetsuo Handa
2018-08-14 11:33                   ` Michal Hocko
2018-08-19 14:23                     ` Tetsuo Handa
2018-08-20  5:54                       ` Michal Hocko
2018-08-20 22:03                         ` Tetsuo Handa
2018-08-21  6:16                           ` Michal Hocko
2018-08-21 13:39                             ` Tetsuo Handa
2018-08-19 23:45             ` David Rientjes
2018-08-20  6:07               ` Michal Hocko
2018-08-20 21:31                 ` David Rientjes
2018-08-21  6:09                   ` Michal Hocko
2018-08-21 17:20                     ` David Rientjes
2018-08-22  8:03                       ` Michal Hocko
2018-08-22 20:54                         ` David Rientjes
2018-09-01 11:48         ` Tetsuo Handa
2018-09-06 11:35           ` Michal Hocko
2018-09-06 11:50             ` Tetsuo Handa
2018-09-06 12:05               ` Michal Hocko
2018-09-06 13:40                 ` Tetsuo Handa
2018-09-06 13:56                   ` Michal Hocko
2018-09-06 14:06                     ` Tetsuo Handa
2018-09-06 14:16                       ` Michal Hocko
2018-09-06 21:13                         ` Tetsuo Handa
2018-09-07 11:10                           ` Michal Hocko
2018-09-07 11:36                             ` Tetsuo Handa
2018-09-07 11:51                               ` Michal Hocko
2018-09-07 13:30                                 ` Tetsuo Handa

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1533389386-3501-4-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp \
    --to=penguin-kernel@i-love.sakura.ne.jp \
    --cc=guro@fb.com \
    --cc=linux-mm@kvack.org \
    --cc=mhocko@kernel.org \
    --cc=rientjes@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.