linux-mm.kvack.org archive mirror
* [PATCH v3 0/8] Change OOM killer to use list of mm_struct.
@ 2016-07-12 13:29 Tetsuo Handa
  2016-07-12 13:29 ` [PATCH 1/8] mm,oom_reaper: Reduce find_lock_task_mm() usage Tetsuo Handa
                   ` (8 more replies)
  0 siblings, 9 replies; 32+ messages in thread
From: Tetsuo Handa @ 2016-07-12 13:29 UTC (permalink / raw)
  To: mhocko, mhocko; +Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst

This series is an update of
http://lkml.kernel.org/r/201607080058.BFI87504.JtFOOFQFVHSLOM@I-love.SAKURA.ne.jp .

This series is based on top of linux-next-20160712 +
http://lkml.kernel.org/r/1467201562-6709-1-git-send-email-mhocko@kernel.org .

 include/linux/mm_types.h |    7
 include/linux/oom.h      |   14 -
 include/linux/sched.h    |    2
 kernel/exit.c            |    2
 kernel/fork.c            |    2
 mm/memcontrol.c          |   14 -
 mm/oom_kill.c            |  362 ++++++++++++++++++++++-------------------------
 7 files changed, 190 insertions(+), 213 deletions(-)

[PATCH 1/8] mm,oom_reaper: Reduce find_lock_task_mm() usage.
[PATCH 2/8] mm,oom_reaper: Do not attempt to reap a task twice.
[PATCH 3/8] mm,oom: Use list of mm_struct used by OOM victims.
[PATCH 4/8] mm,oom: Close oom_has_pending_mm race.
[PATCH 5/8] mm,oom_reaper: Make OOM reaper use list of mm_struct.
[PATCH 6/8] mm,oom: Remove OOM_SCAN_ABORT case and signal_struct->oom_victims.
[PATCH 7/8] mm,oom: Stop clearing TIF_MEMDIE on remote thread.
[PATCH 8/8] oom_reaper: Revert "oom_reaper: close race with exiting task".

At first glance, the diff has grown compared to v2, but v3 is mostly a patch
description update. The actual code change (shown below) is almost the same
as in v2.

 kernel/fork.c |    2
 mm/oom_kill.c |  117 ++++++++++++++++++++++++++++++++++++++--------------------
 2 files changed, 78 insertions(+), 41 deletions(-)

diff -ur v2/kernel/fork.c v3/kernel/fork.c
--- v2/kernel/fork.c
+++ v3/kernel/fork.c
@@ -722,10 +722,8 @@
 	}
 	if (mm->binfmt)
 		module_put(mm->binfmt->module);
-#ifndef CONFIG_MMU
 	if (mm->oom_mm.victim)
 		exit_oom_mm(mm);
-#endif
 	mmdrop(mm);
 }
 
diff -ur v2/mm/oom_kill.c v3/mm/oom_kill.c
--- v2/mm/oom_kill.c
+++ v3/mm/oom_kill.c
@@ -132,6 +132,20 @@
 	return oc->order == -1;
 }
 
+static bool task_in_oom_domain(struct task_struct *p, struct mem_cgroup *memcg,
+			       const nodemask_t *nodemask)
+{
+	/* When mem_cgroup_out_of_memory() and p is not member of the group */
+	if (memcg && !task_in_mem_cgroup(p, memcg))
+		return false;
+
+	/* p may not have freeable memory in nodemask */
+	if (!has_intersects_mems_allowed(p, nodemask))
+		return false;
+
+	return true;
+}
+
 /* return true if the task is not adequate as candidate victim task. */
 static bool oom_unkillable_task(struct task_struct *p,
 		struct mem_cgroup *memcg, const nodemask_t *nodemask)
@@ -141,15 +155,7 @@
 	if (p->flags & PF_KTHREAD)
 		return true;
 
-	/* When mem_cgroup_out_of_memory() and p is not member of the group */
-	if (memcg && !task_in_mem_cgroup(p, memcg))
-		return true;
-
-	/* p may not have freeable memory in nodemask */
-	if (!has_intersects_mems_allowed(p, nodemask))
-		return true;
-
-	return false;
+	return !task_in_oom_domain(p, memcg, nodemask);
 }
 
 /**
@@ -276,25 +282,39 @@
 #endif
 
 static LIST_HEAD(oom_mm_list);
+static DEFINE_SPINLOCK(oom_mm_lock);
 
 void exit_oom_mm(struct mm_struct *mm)
 {
-	mutex_lock(&oom_lock);
-	list_del(&mm->oom_mm.list);
-	put_task_struct(mm->oom_mm.victim);
+	struct task_struct *victim;
+
+	/* __mmput() and oom_reaper() could race. */
+	spin_lock(&oom_mm_lock);
+	victim = mm->oom_mm.victim;
 	mm->oom_mm.victim = NULL;
-	mmdrop(mm);
-	mutex_unlock(&oom_lock);
+	if (victim)
+		list_del(&mm->oom_mm.list);
+	spin_unlock(&oom_mm_lock);
+	if (victim) {
+		put_task_struct(victim);
+		mmdrop(mm);
+	}
 }
 
 bool oom_has_pending_mm(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 {
 	struct mm_struct *mm;
+	bool ret = false;
 
-	list_for_each_entry(mm, &oom_mm_list, oom_mm.list)
-		if (!oom_unkillable_task(mm->oom_mm.victim, memcg, nodemask))
-			return true;
-	return false;
+	spin_lock(&oom_mm_lock);
+	list_for_each_entry(mm, &oom_mm_list, oom_mm.list) {
+		if (task_in_oom_domain(mm->oom_mm.victim, memcg, nodemask)) {
+			ret = true;
+			break;
+		}
+	}
+	spin_unlock(&oom_mm_lock);
+	return ret;
 }
 
 enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
@@ -522,12 +542,16 @@
 static void oom_reap_task(struct task_struct *tsk, struct mm_struct *mm)
 {
 	int attempts = 0;
+	bool ret;
 
 	/*
-	 * Check MMF_OOM_REAPED in case oom_kill_process() found this mm
-	 * pinned.
+	 * Check MMF_OOM_REAPED after holding oom_lock in case
+	 * oom_kill_process() found this mm pinned.
 	 */
-	if (test_bit(MMF_OOM_REAPED, &mm->flags))
+	mutex_lock(&oom_lock);
+	ret = test_bit(MMF_OOM_REAPED, &mm->flags);
+	mutex_unlock(&oom_lock);
+	if (ret)
 		return;
 
 	/* Retry the down_read_trylock(mmap_sem) a few times */
@@ -550,22 +574,26 @@
 	set_freezable();
 
 	while (true) {
-		struct mm_struct *mm;
-		struct task_struct *victim;
+		struct mm_struct *mm = NULL;
+		struct task_struct *victim = NULL;
 
 		wait_event_freezable(oom_reaper_wait,
 				     !list_empty(&oom_mm_list));
-		mutex_lock(&oom_lock);
-		mm = list_first_entry(&oom_mm_list, struct mm_struct,
-				      oom_mm.list);
-		victim = mm->oom_mm.victim;
-		/*
-		 * Take a reference on current victim thread in case
-		 * oom_reap_task() raced with mark_oom_victim() by
-		 * other threads sharing this mm.
-		 */
-		get_task_struct(victim);
-		mutex_unlock(&oom_lock);
+		spin_lock(&oom_mm_lock);
+		if (!list_empty(&oom_mm_list)) {
+			mm = list_first_entry(&oom_mm_list, struct mm_struct,
+					      oom_mm.list);
+			victim = mm->oom_mm.victim;
+			/*
+			 * Take a reference on current victim thread in case
+			 * oom_reap_task() raced with mark_oom_victim() by
+			 * other threads sharing this mm.
+			 */
+			get_task_struct(victim);
+		}
+		spin_unlock(&oom_mm_lock);
+		if (!mm)
+			continue;
 		oom_reap_task(victim, mm);
 		put_task_struct(victim);
 		/* Drop references taken by mark_oom_victim() */
@@ -598,7 +626,7 @@
 void mark_oom_victim(struct task_struct *tsk)
 {
 	struct mm_struct *mm = tsk->mm;
-	struct task_struct *old_tsk = mm->oom_mm.victim;
+	struct task_struct *old_tsk;
 
 	WARN_ON(oom_killer_disabled);
 	/* OOM killer might race with memcg OOM */
@@ -615,18 +643,29 @@
 	/*
 	 * Since mark_oom_victim() is called from multiple threads,
 	 * connect this mm to oom_mm_list only if not yet connected.
+	 *
+	 * But task_in_oom_domain(mm->oom_mm.victim, memcg, nodemask) in
+	 * oom_has_pending_mm() might return false after all threads in one
+	 * thread group (which mm->oom_mm.victim belongs to) reached TASK_DEAD
+	 * state. In that case, the same mm will be selected by another thread
+	 * group (which mm->oom_mm.victim does not belong to). Therefore,
+	 * we need to replace the old task with the new task (at least when
+	 * task_in_oom_domain() returned false).
 	 */
 	get_task_struct(tsk);
+	spin_lock(&oom_mm_lock);
+	old_tsk = mm->oom_mm.victim;
 	mm->oom_mm.victim = tsk;
 	if (!old_tsk) {
 		atomic_inc(&mm->mm_count);
 		list_add_tail(&mm->oom_mm.list, &oom_mm_list);
+	}
+	spin_unlock(&oom_mm_lock);
+	if (old_tsk)
+		put_task_struct(old_tsk);
 #ifdef CONFIG_MMU
-		wake_up(&oom_reaper_wait);
+	wake_up(&oom_reaper_wait);
 #endif
-	} else {
-		put_task_struct(old_tsk);
-	}
 }
 
 /**

This series does not include patches for use_mm() users and wait_event()
in oom_killer_disable(). We can apply
http://lkml.kernel.org/r/1467365190-24640-3-git-send-email-mhocko@kernel.org
on top of this series.

* [PATCH 1/8] mm,oom_reaper: Reduce find_lock_task_mm() usage.
  2016-07-12 13:29 [PATCH v3 0/8] Change OOM killer to use list of mm_struct Tetsuo Handa
@ 2016-07-12 13:29 ` Tetsuo Handa
  2016-07-12 13:29 ` [PATCH 2/8] mm,oom_reaper: Do not attempt to reap a task twice Tetsuo Handa
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 32+ messages in thread
From: Tetsuo Handa @ 2016-07-12 13:29 UTC (permalink / raw)
  To: mhocko, mhocko
  Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst, Tetsuo Handa

__oom_reap_task() can be simplified a bit if it receives a valid mm from
oom_reap_task(), which also uses that mm when __oom_reap_task() fails.
We can drop one find_lock_task_mm() call and also make the
__oom_reap_task() code flow easier to follow. Moreover, this will make a
later patch in the series easier to review. Pinning the mm's mm_count for
a longer time is not really harmful because it does not pin much memory.

This patch doesn't introduce any functional change.
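
As background, the reason the longer pin is cheap (a minimal sketch of the
two mm_struct reference counters; not part of this patch):

  /*
   * mm->mm_users pins the whole address space: the final mmput()
   * runs __mmput(), which frees VMAs, page tables and the pages they
   * map. mm->mm_count pins only struct mm_struct itself: the final
   * mmdrop() frees just the structure.
   */
  atomic_inc(&mm->mm_count); /* cheap: keeps only the struct alive */
  /* ... reap attempts; user memory may be freed underneath us ... */
  mmdrop(mm);                /* done: the struct itself may now go */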

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Michal Hocko <mhocko@suse.com>
---
 mm/oom_kill.c | 79 ++++++++++++++++++++++++++++-------------------------------
 1 file changed, 37 insertions(+), 42 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 7d0a275..951eb1b 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -452,12 +452,10 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
 static struct task_struct *oom_reaper_list;
 static DEFINE_SPINLOCK(oom_reaper_lock);
 
-static bool __oom_reap_task(struct task_struct *tsk)
+static bool __oom_reap_task(struct task_struct *tsk, struct mm_struct *mm)
 {
 	struct mmu_gather tlb;
 	struct vm_area_struct *vma;
-	struct mm_struct *mm = NULL;
-	struct task_struct *p;
 	struct zap_details details = {.check_swap_entries = true,
 				      .ignore_dirty = true};
 	bool ret = true;
@@ -478,22 +476,9 @@ static bool __oom_reap_task(struct task_struct *tsk)
 	 */
 	mutex_lock(&oom_lock);
 
-	/*
-	 * Make sure we find the associated mm_struct even when the particular
-	 * thread has already terminated and cleared its mm.
-	 * We might have race with exit path so consider our work done if there
-	 * is no mm.
-	 */
-	p = find_lock_task_mm(tsk);
-	if (!p)
-		goto unlock_oom;
-	mm = p->mm;
-	atomic_inc(&mm->mm_count);
-	task_unlock(p);
-
 	if (!down_read_trylock(&mm->mmap_sem)) {
 		ret = false;
-		goto mm_drop;
+		goto unlock_oom;
 	}
 
 	/*
@@ -503,7 +488,7 @@ static bool __oom_reap_task(struct task_struct *tsk)
 	 */
 	if (!mmget_not_zero(mm)) {
 		up_read(&mm->mmap_sem);
-		goto mm_drop;
+		goto unlock_oom;
 	}
 
 	tlb_gather_mmu(&tlb, mm, 0, -1);
@@ -551,8 +536,6 @@ static bool __oom_reap_task(struct task_struct *tsk)
 	 * put the oom_reaper out of the way.
 	 */
 	mmput_async(mm);
-mm_drop:
-	mmdrop(mm);
 unlock_oom:
 	mutex_unlock(&oom_lock);
 	return ret;
@@ -562,36 +545,45 @@ unlock_oom:
 static void oom_reap_task(struct task_struct *tsk)
 {
 	int attempts = 0;
+	struct mm_struct *mm = NULL;
+	struct task_struct *p = find_lock_task_mm(tsk);
+
+	/*
+	 * Make sure we find the associated mm_struct even when the particular
+	 * thread has already terminated and cleared its mm.
+	 * We might have race with exit path so consider our work done if there
+	 * is no mm.
+	 */
+	if (!p)
+		goto done;
+	mm = p->mm;
+	atomic_inc(&mm->mm_count);
+	task_unlock(p);
 
 	/* Retry the down_read_trylock(mmap_sem) a few times */
-	while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk))
+	while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk, mm))
 		schedule_timeout_idle(HZ/10);
 
-	if (attempts > MAX_OOM_REAP_RETRIES) {
-		struct task_struct *p;
+	if (attempts <= MAX_OOM_REAP_RETRIES)
+		goto done;
 
-		pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
-				task_pid_nr(tsk), tsk->comm);
+	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
+		task_pid_nr(tsk), tsk->comm);
 
-		/*
-		 * If we've already tried to reap this task in the past and
-		 * failed it probably doesn't make much sense to try yet again
-		 * so hide the mm from the oom killer so that it can move on
-		 * to another task with a different mm struct.
-		 */
-		p = find_lock_task_mm(tsk);
-		if (p) {
-			if (test_and_set_bit(MMF_OOM_NOT_REAPABLE, &p->mm->flags)) {
-				pr_info("oom_reaper: giving up pid:%d (%s)\n",
-						task_pid_nr(tsk), tsk->comm);
-				set_bit(MMF_OOM_REAPED, &p->mm->flags);
-			}
-			task_unlock(p);
-		}
-
-		debug_show_all_locks();
+	/*
+	 * If we've already tried to reap this task in the past and
+	 * failed it probably doesn't make much sense to try yet again
+	 * so hide the mm from the oom killer so that it can move on
+	 * to another task with a different mm struct.
+	 */
+	if (test_and_set_bit(MMF_OOM_NOT_REAPABLE, &mm->flags)) {
+		pr_info("oom_reaper: giving up pid:%d (%s)\n",
+			task_pid_nr(tsk), tsk->comm);
+		set_bit(MMF_OOM_REAPED, &mm->flags);
 	}
+	debug_show_all_locks();
 
+done:
 	/*
 	 * Clear TIF_MEMDIE because the task shouldn't be sitting on a
 	 * reasonably reclaimable memory anymore or it is not a good candidate
@@ -603,6 +595,9 @@ static void oom_reap_task(struct task_struct *tsk)
 
 	/* Drop a reference taken by wake_oom_reaper */
 	put_task_struct(tsk);
+	/* Drop a reference taken above. */
+	if (mm)
+		mmdrop(mm);
 }
 
 static int oom_reaper(void *unused)
-- 
1.8.3.1

* [PATCH 2/8] mm,oom_reaper: Do not attempt to reap a task twice.
  2016-07-12 13:29 [PATCH v3 0/8] Change OOM killer to use list of mm_struct Tetsuo Handa
  2016-07-12 13:29 ` [PATCH 1/8] mm,oom_reaper: Reduce find_lock_task_mm() usage Tetsuo Handa
@ 2016-07-12 13:29 ` Tetsuo Handa
  2016-07-12 14:19   ` Michal Hocko
  2016-07-12 13:29 ` [PATCH 3/8] mm,oom: Use list of mm_struct used by OOM victims Tetsuo Handa
                   ` (6 subsequent siblings)
  8 siblings, 1 reply; 32+ messages in thread
From: Tetsuo Handa @ 2016-07-12 13:29 UTC (permalink / raw)
  To: mhocko, mhocko
  Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst, Tetsuo Handa

"mm, oom_reaper: do not attempt to reap a task twice" tried to give the
OOM reaper one more chance to retry using MMF_OOM_NOT_REAPABLE flag. But
the usefulness of the flag is rather limited and actually never shown
in practice. If the flag is set, it means that the holder of mm->mmap_sem
cannot call up_write() due to presumably being blocked at unkillable wait
waiting for other thread's memory allocation. But since one of threads
sharing that mm will queue that mm immediately via task_will_free_mem()
shortcut (otherwise, oom_badness() will select the same mm again due to
oom_score_adj value unchanged), retrying MMF_OOM_NOT_REAPABLE mm is
unlikely helpful.

Let's always set MMF_OOM_REAPED.
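
For context, the task_will_free_mem() shortcut mentioned above is the
early-exit path in out_of_memory() (a simplified sketch of the existing
code):

  if (task_will_free_mem(current)) {
          /* current is already exiting and will free its memory, */
          mark_oom_victim(current);
          wake_oom_reaper(current); /* queue current's mm for reaping */
          return true;              /* so skip victim selection */
  }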

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
 include/linux/sched.h |  1 -
 mm/oom_kill.c         | 15 +++------------
 2 files changed, 3 insertions(+), 13 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 553af29..c0efd80 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -523,7 +523,6 @@ static inline int get_dumpable(struct mm_struct *mm)
 #define MMF_HAS_UPROBES		19	/* has uprobes */
 #define MMF_RECALC_UPROBES	20	/* MMF_HAS_UPROBES can be wrong */
 #define MMF_OOM_REAPED		21	/* mm has been already reaped */
-#define MMF_OOM_NOT_REAPABLE	22	/* mm couldn't be reaped */
 
 #define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 951eb1b..9f0022e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -567,20 +567,11 @@ static void oom_reap_task(struct task_struct *tsk)
 	if (attempts <= MAX_OOM_REAP_RETRIES)
 		goto done;
 
+	/* Ignore this mm because somebody can't call up_write(mmap_sem). */
+	set_bit(MMF_OOM_REAPED, &mm->flags);
+
 	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
 		task_pid_nr(tsk), tsk->comm);
-
-	/*
-	 * If we've already tried to reap this task in the past and
-	 * failed it probably doesn't make much sense to try yet again
-	 * so hide the mm from the oom killer so that it can move on
-	 * to another task with a different mm struct.
-	 */
-	if (test_and_set_bit(MMF_OOM_NOT_REAPABLE, &mm->flags)) {
-		pr_info("oom_reaper: giving up pid:%d (%s)\n",
-			task_pid_nr(tsk), tsk->comm);
-		set_bit(MMF_OOM_REAPED, &mm->flags);
-	}
 	debug_show_all_locks();
 
 done:
-- 
1.8.3.1

* [PATCH 3/8] mm,oom: Use list of mm_struct used by OOM victims.
  2016-07-12 13:29 [PATCH v3 0/8] Change OOM killer to use list of mm_struct Tetsuo Handa
  2016-07-12 13:29 ` [PATCH 1/8] mm,oom_reaper: Reduce find_lock_task_mm() usage Tetsuo Handa
  2016-07-12 13:29 ` [PATCH 2/8] mm,oom_reaper: Do not attempt to reap a task twice Tetsuo Handa
@ 2016-07-12 13:29 ` Tetsuo Handa
  2016-07-12 14:28   ` Michal Hocko
  2016-07-12 13:29 ` [PATCH 4/8] mm,oom: Close oom_has_pending_mm race Tetsuo Handa
                   ` (5 subsequent siblings)
  8 siblings, 1 reply; 32+ messages in thread
From: Tetsuo Handa @ 2016-07-12 13:29 UTC (permalink / raw)
  To: mhocko, mhocko
  Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst, Tetsuo Handa

Currently, we walk the process list in order to find existing TIF_MEMDIE
threads. But if we remember the list of mm_struct used by TIF_MEMDIE
threads, we can avoid walking the process list. A later patch in the
series will change the OOM reaper to use the list of mm_struct introduced
by this patch.

Also, we can start using TIF_MEMDIE only to grant access to memory
reserves to OOM victims which actually need to allocate, and decouple its
current double meaning. A later patch in the series will eliminate the
OOM_SCAN_ABORT case and "struct signal_struct"->oom_victims because
oom_has_pending_mm() introduced by this patch can take over that role.

It is theoretically possible that the number of elements on this list
grows as large as the number of configured memcg groups and
mempolicy/cpuset patterns. But in most cases, the OOM reaper will remove
almost all elements immediately, on the first OOM reap attempt. Moreover,
many OOM events are resolved before the OOM reaper even tries to reap
that memory. Therefore, the average speed of removing elements will be
higher than the average speed of adding them. If the delay caused by
retrying a specific element for one second before trying other elements
matters, we can do parallel OOM reaping.
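
As a roadmap for the diff below, the lifecycle of an mm on the new list is
(a simplified sketch; locking elided):

  /*
   * mark_oom_victim(tsk):  first TIF_MEMDIE thread for this mm
   *     atomic_inc(&mm->mm_count);  pin struct mm_struct
   *     get_task_struct(tsk);       pin the victim thread
   *     mm->oom_mm.victim = tsk;
   *     list_add_tail(&mm->oom_mm.list, &oom_mm_list);
   *
   * __mmput(mm):           all user memory has been freed
   *     if (mm->oom_mm.victim)
   *         exit_oom_mm(mm);        unlink and drop both pins
   */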

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
 include/linux/mm_types.h |  7 +++++
 include/linux/oom.h      |  3 +++
 kernel/fork.c            |  2 ++
 mm/memcontrol.c          |  5 ++++
 mm/oom_kill.c            | 69 +++++++++++++++++++++++++++++++++++++++++-------
 5 files changed, 77 insertions(+), 9 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 79472b2..96a8709 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -392,6 +392,12 @@ struct mm_rss_stat {
 	atomic_long_t count[NR_MM_COUNTERS];
 };
 
+struct oom_mm {
+	struct list_head list; /* Linked to oom_mm_list list. */
+	/* Thread which was passed to mark_oom_victim() for the last time. */
+	struct task_struct *victim;
+};
+
 struct kioctx_table;
 struct mm_struct {
 	struct vm_area_struct *mmap;		/* list of VMAs */
@@ -515,6 +521,7 @@ struct mm_struct {
 #ifdef CONFIG_HUGETLB_PAGE
 	atomic_long_t hugetlb_usage;
 #endif
+	struct oom_mm oom_mm;
 #ifdef CONFIG_MMU
 	struct work_struct async_put_work;
 #endif
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 5bc0457..bdcb331 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -91,6 +91,9 @@ extern void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 extern void check_panic_on_oom(struct oom_control *oc,
 			       enum oom_constraint constraint);
 
+extern void exit_oom_mm(struct mm_struct *mm);
+extern bool oom_has_pending_mm(struct mem_cgroup *memcg,
+			       const nodemask_t *nodemask);
 extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
 					       struct task_struct *task);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 7926993..d83163a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -722,6 +722,8 @@ static inline void __mmput(struct mm_struct *mm)
 	}
 	if (mm->binfmt)
 		module_put(mm->binfmt->module);
+	if (mm->oom_mm.victim)
+		exit_oom_mm(mm);
 	mmdrop(mm);
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 40dfca3..8f7a5b7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1241,6 +1241,11 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	}
 
 	check_panic_on_oom(&oc, CONSTRAINT_MEMCG);
+	if (oom_has_pending_mm(memcg, NULL)) {
+		/* Set a dummy value to return "true". */
+		chosen = (void *) 1;
+		goto unlock;
+	}
 	totalpages = mem_cgroup_get_limit(memcg) ? : 1;
 	for_each_mem_cgroup_tree(iter, memcg) {
 		struct css_task_iter it;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 9f0022e..07e8c1a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -132,6 +132,20 @@ static inline bool is_sysrq_oom(struct oom_control *oc)
 	return oc->order == -1;
 }
 
+static bool task_in_oom_domain(struct task_struct *p, struct mem_cgroup *memcg,
+			       const nodemask_t *nodemask)
+{
+	/* When mem_cgroup_out_of_memory() and p is not member of the group */
+	if (memcg && !task_in_mem_cgroup(p, memcg))
+		return false;
+
+	/* p may not have freeable memory in nodemask */
+	if (!has_intersects_mems_allowed(p, nodemask))
+		return false;
+
+	return true;
+}
+
 /* return true if the task is not adequate as candidate victim task. */
 static bool oom_unkillable_task(struct task_struct *p,
 		struct mem_cgroup *memcg, const nodemask_t *nodemask)
@@ -141,15 +155,7 @@ static bool oom_unkillable_task(struct task_struct *p,
 	if (p->flags & PF_KTHREAD)
 		return true;
 
-	/* When mem_cgroup_out_of_memory() and p is not member of the group */
-	if (memcg && !task_in_mem_cgroup(p, memcg))
-		return true;
-
-	/* p may not have freeable memory in nodemask */
-	if (!has_intersects_mems_allowed(p, nodemask))
-		return true;
-
-	return false;
+	return !task_in_oom_domain(p, memcg, nodemask);
 }
 
 /**
@@ -275,6 +281,34 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc,
 }
 #endif
 
+static LIST_HEAD(oom_mm_list);
+static DEFINE_SPINLOCK(oom_mm_lock);
+
+void exit_oom_mm(struct mm_struct *mm)
+{
+	spin_lock(&oom_mm_lock);
+	list_del(&mm->oom_mm.list);
+	spin_unlock(&oom_mm_lock);
+	put_task_struct(mm->oom_mm.victim);
+	mmdrop(mm);
+}
+
+bool oom_has_pending_mm(struct mem_cgroup *memcg, const nodemask_t *nodemask)
+{
+	struct mm_struct *mm;
+	bool ret = false;
+
+	spin_lock(&oom_mm_lock);
+	list_for_each_entry(mm, &oom_mm_list, oom_mm.list) {
+		if (task_in_oom_domain(mm->oom_mm.victim, memcg, nodemask)) {
+			ret = true;
+			break;
+		}
+	}
+	spin_unlock(&oom_mm_lock);
+	return ret;
+}
+
 enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
 					struct task_struct *task)
 {
@@ -653,6 +687,8 @@ subsys_initcall(oom_init)
  */
 void mark_oom_victim(struct task_struct *tsk)
 {
+	struct mm_struct *mm = tsk->mm;
+
 	WARN_ON(oom_killer_disabled);
 	/* OOM killer might race with memcg OOM */
 	if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
@@ -666,6 +702,18 @@ void mark_oom_victim(struct task_struct *tsk)
 	 */
 	__thaw_task(tsk);
 	atomic_inc(&oom_victims);
+	/*
+	 * Since mark_oom_victim() is called from multiple threads,
+	 * connect this mm to oom_mm_list only if not yet connected.
+	 */
+	if (!mm->oom_mm.victim) {
+		atomic_inc(&mm->mm_count);
+		get_task_struct(tsk);
+		mm->oom_mm.victim = tsk;
+		spin_lock(&oom_mm_lock);
+		list_add_tail(&mm->oom_mm.list, &oom_mm_list);
+		spin_unlock(&oom_mm_lock);
+	}
 }
 
 /**
@@ -1026,6 +1074,9 @@ bool out_of_memory(struct oom_control *oc)
 		return true;
 	}
 
+	if (!is_sysrq_oom(oc) && oom_has_pending_mm(oc->memcg, oc->nodemask))
+		return true;
+
 	p = select_bad_process(oc, &points, totalpages);
 	/* Found nothing?!?! Either we hang forever, or we panic. */
 	if (!p && !is_sysrq_oom(oc)) {
-- 
1.8.3.1

* [PATCH 4/8] mm,oom: Close oom_has_pending_mm race.
  2016-07-12 13:29 [PATCH v3 0/8] Change OOM killer to use list of mm_struct Tetsuo Handa
                   ` (2 preceding siblings ...)
  2016-07-12 13:29 ` [PATCH 3/8] mm,oom: Use list of mm_struct used by OOM victims Tetsuo Handa
@ 2016-07-12 13:29 ` Tetsuo Handa
  2016-07-12 14:36   ` Michal Hocko
  2016-07-12 13:29 ` [PATCH 5/8] mm,oom_reaper: Make OOM reaper use list of mm_struct Tetsuo Handa
                   ` (4 subsequent siblings)
  8 siblings, 1 reply; 32+ messages in thread
From: Tetsuo Handa @ 2016-07-12 13:29 UTC (permalink / raw)
  To: mhocko, mhocko
  Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst, Tetsuo Handa

The previous patch ignored a situation where oom_has_pending_mm() returns
false because all threads in the thread group that mm->oom_mm.victim
belongs to have reached TASK_DEAD state, even though other thread groups
might still be sharing that mm.

This patch handles that situation by always updating mm->oom_mm.victim.
With this patch applied, the comm/pid pair printed by oom_kill_process()
and by oom_reap_task() might differ, but that is not a critical problem.
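
The window being closed looks like this (a sketch; A and B are thread
groups sharing one mm):

  /*
   *  thread group A (= mm->oom_mm.victim)   thread group B
   *
   *  all threads reach TASK_DEAD
   *    task_in_oom_domain(A) == false
   *    oom_has_pending_mm() == false
   *                                         out_of_memory() selects B
   *                                         mark_oom_victim(B's thread)
   *                                           replaces mm->oom_mm.victim,
   *                                           so the pending mm is visible
   *                                           to oom_has_pending_mm() again
   */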

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
 mm/oom_kill.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 07e8c1a..0b78133 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -688,6 +688,7 @@ subsys_initcall(oom_init)
 void mark_oom_victim(struct task_struct *tsk)
 {
 	struct mm_struct *mm = tsk->mm;
+	struct task_struct *old_tsk;
 
 	WARN_ON(oom_killer_disabled);
 	/* OOM killer might race with memcg OOM */
@@ -705,15 +706,26 @@ void mark_oom_victim(struct task_struct *tsk)
 	/*
 	 * Since mark_oom_victim() is called from multiple threads,
 	 * connect this mm to oom_mm_list only if not yet connected.
+	 *
+	 * But task_in_oom_domain(mm->oom_mm.victim, memcg, nodemask) in
+	 * oom_has_pending_mm() might return false after all threads in one
+	 * thread group (which mm->oom_mm.victim belongs to) reached TASK_DEAD
+	 * state. In that case, the same mm will be selected by another thread
+	 * group (which mm->oom_mm.victim does not belong to). Therefore,
+	 * we need to replace the old task with the new task (at least when
+	 * task_in_oom_domain() returned false).
 	 */
-	if (!mm->oom_mm.victim) {
+	get_task_struct(tsk);
+	spin_lock(&oom_mm_lock);
+	old_tsk = mm->oom_mm.victim;
+	mm->oom_mm.victim = tsk;
+	if (!old_tsk) {
 		atomic_inc(&mm->mm_count);
-		get_task_struct(tsk);
-		mm->oom_mm.victim = tsk;
-		spin_lock(&oom_mm_lock);
 		list_add_tail(&mm->oom_mm.list, &oom_mm_list);
-		spin_unlock(&oom_mm_lock);
 	}
+	spin_unlock(&oom_mm_lock);
+	if (old_tsk)
+		put_task_struct(old_tsk);
 }
 
 /**
-- 
1.8.3.1

* [PATCH 5/8] mm,oom_reaper: Make OOM reaper use list of mm_struct.
  2016-07-12 13:29 [PATCH v3 0/8] Change OOM killer to use list of mm_struct Tetsuo Handa
                   ` (3 preceding siblings ...)
  2016-07-12 13:29 ` [PATCH 4/8] mm,oom: Close oom_has_pending_mm race Tetsuo Handa
@ 2016-07-12 13:29 ` Tetsuo Handa
  2016-07-12 14:51   ` Michal Hocko
  2016-07-12 13:29 ` [PATCH 6/8] mm,oom: Remove OOM_SCAN_ABORT case and signal_struct->oom_victims Tetsuo Handa
                   ` (3 subsequent siblings)
  8 siblings, 1 reply; 32+ messages in thread
From: Tetsuo Handa @ 2016-07-12 13:29 UTC (permalink / raw)
  To: mhocko, mhocko
  Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst, Tetsuo Handa

find_lock_task_mm() is racy when used to find an mm because it returns
NULL once all threads in that thread group have passed the point in
exit_mm() where current->mm is set to NULL, while that mm might still be
waiting for __mmput() (or the OOM reaper) to reclaim the memory it uses.
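
For reference, find_lock_task_mm() looks roughly like this (paraphrased
from mm/oom_kill.c), which shows why it finds nothing once every thread
has cleared its ->mm:

  struct task_struct *find_lock_task_mm(struct task_struct *p)
  {
          struct task_struct *t;

          rcu_read_lock();
          for_each_thread(p, t) {
                  task_lock(t);
                  if (likely(t->mm))      /* NULL for every thread past   */
                          goto found;     /* exit_mm(), even though the   */
                  task_unlock(t);         /* mm itself may still be alive */
          }
          t = NULL;
  found:
          rcu_read_unlock();
          return t;
  }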

Since OOM reaping is a per-mm_struct operation, it is natural to use the
list of mm_struct used by OOM victims. By using the list of mm_struct, we
can eliminate the find_lock_task_mm() usage from the OOM reaper.

We still have a racy find_lock_task_mm() usage in
oom_scan_process_thread(), which can theoretically cause an OOM livelock
when MMF_OOM_REAPED was set on an OOM victim's mm without putting that mm
under the OOM reaper's supervision. We must not depend on
find_lock_task_mm() not returning NULL.

Since a later patch in the series will change oom_scan_process_thread()
not to depend on atomic_read(&task->signal->oom_victims) != 0 &&
find_lock_task_mm(task) != NULL, this patch removes the exit_oom_victim()
call on a remote thread.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
 include/linux/oom.h |   8 ----
 mm/memcontrol.c     |   1 -
 mm/oom_kill.c       | 117 +++++++++++++++++++++-------------------------------
 3 files changed, 47 insertions(+), 79 deletions(-)

diff --git a/include/linux/oom.h b/include/linux/oom.h
index bdcb331..cb3f041 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -72,14 +72,6 @@ static inline bool oom_task_origin(const struct task_struct *p)
 
 extern void mark_oom_victim(struct task_struct *tsk);
 
-#ifdef CONFIG_MMU
-extern void wake_oom_reaper(struct task_struct *tsk);
-#else
-static inline void wake_oom_reaper(struct task_struct *tsk)
-{
-}
-#endif
-
 extern unsigned long oom_badness(struct task_struct *p,
 		struct mem_cgroup *memcg, const nodemask_t *nodemask,
 		unsigned long totalpages);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8f7a5b7..5043324 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1236,7 +1236,6 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	 */
 	if (task_will_free_mem(current)) {
 		mark_oom_victim(current);
-		wake_oom_reaper(current);
 		goto unlock;
 	}
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 0b78133..715f77d 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -286,11 +286,19 @@ static DEFINE_SPINLOCK(oom_mm_lock);
 
 void exit_oom_mm(struct mm_struct *mm)
 {
+	struct task_struct *victim;
+
+	/* __mmput() and oom_reaper() could race. */
 	spin_lock(&oom_mm_lock);
-	list_del(&mm->oom_mm.list);
+	victim = mm->oom_mm.victim;
+	mm->oom_mm.victim = NULL;
+	if (victim)
+		list_del(&mm->oom_mm.list);
 	spin_unlock(&oom_mm_lock);
-	put_task_struct(mm->oom_mm.victim);
-	mmdrop(mm);
+	if (victim) {
+		put_task_struct(victim);
+		mmdrop(mm);
+	}
 }
 
 bool oom_has_pending_mm(struct mem_cgroup *memcg, const nodemask_t *nodemask)
@@ -483,8 +491,6 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
  */
 static struct task_struct *oom_reaper_th;
 static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
-static struct task_struct *oom_reaper_list;
-static DEFINE_SPINLOCK(oom_reaper_lock);
 
 static bool __oom_reap_task(struct task_struct *tsk, struct mm_struct *mm)
 {
@@ -576,30 +582,27 @@ unlock_oom:
 }
 
 #define MAX_OOM_REAP_RETRIES 10
-static void oom_reap_task(struct task_struct *tsk)
+static void oom_reap_task(struct task_struct *tsk, struct mm_struct *mm)
 {
 	int attempts = 0;
-	struct mm_struct *mm = NULL;
-	struct task_struct *p = find_lock_task_mm(tsk);
+	bool ret;
 
 	/*
-	 * Make sure we find the associated mm_struct even when the particular
-	 * thread has already terminated and cleared its mm.
-	 * We might have race with exit path so consider our work done if there
-	 * is no mm.
+	 * Check MMF_OOM_REAPED after holding oom_lock in case
+	 * oom_kill_process() found this mm pinned.
 	 */
-	if (!p)
-		goto done;
-	mm = p->mm;
-	atomic_inc(&mm->mm_count);
-	task_unlock(p);
+	mutex_lock(&oom_lock);
+	ret = test_bit(MMF_OOM_REAPED, &mm->flags);
+	mutex_unlock(&oom_lock);
+	if (ret)
+		return;
 
 	/* Retry the down_read_trylock(mmap_sem) a few times */
 	while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk, mm))
 		schedule_timeout_idle(HZ/10);
 
 	if (attempts <= MAX_OOM_REAP_RETRIES)
-		goto done;
+		return;
 
 	/* Ignore this mm because somebody can't call up_write(mmap_sem). */
 	set_bit(MMF_OOM_REAPED, &mm->flags);
@@ -607,22 +610,6 @@ static void oom_reap_task(struct task_struct *tsk)
 	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
 		task_pid_nr(tsk), tsk->comm);
 	debug_show_all_locks();
-
-done:
-	/*
-	 * Clear TIF_MEMDIE because the task shouldn't be sitting on a
-	 * reasonably reclaimable memory anymore or it is not a good candidate
-	 * for the oom victim right now because it cannot release its memory
-	 * itself nor by the oom reaper.
-	 */
-	tsk->oom_reaper_list = NULL;
-	exit_oom_victim(tsk);
-
-	/* Drop a reference taken by wake_oom_reaper */
-	put_task_struct(tsk);
-	/* Drop a reference taken above. */
-	if (mm)
-		mmdrop(mm);
 }
 
 static int oom_reaper(void *unused)
@@ -630,41 +617,35 @@ static int oom_reaper(void *unused)
 	set_freezable();
 
 	while (true) {
-		struct task_struct *tsk = NULL;
-
-		wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
-		spin_lock(&oom_reaper_lock);
-		if (oom_reaper_list != NULL) {
-			tsk = oom_reaper_list;
-			oom_reaper_list = tsk->oom_reaper_list;
+		struct mm_struct *mm = NULL;
+		struct task_struct *victim = NULL;
+
+		wait_event_freezable(oom_reaper_wait,
+				     !list_empty(&oom_mm_list));
+		spin_lock(&oom_mm_lock);
+		if (!list_empty(&oom_mm_list)) {
+			mm = list_first_entry(&oom_mm_list, struct mm_struct,
+					      oom_mm.list);
+			victim = mm->oom_mm.victim;
+			/*
+			 * Take a reference on current victim thread in case
+			 * oom_reap_task() raced with mark_oom_victim() by
+			 * other threads sharing this mm.
+			 */
+			get_task_struct(victim);
 		}
-		spin_unlock(&oom_reaper_lock);
-
-		if (tsk)
-			oom_reap_task(tsk);
+		spin_unlock(&oom_mm_lock);
+		if (!mm)
+			continue;
+		oom_reap_task(victim, mm);
+		put_task_struct(victim);
+		/* Drop references taken by mark_oom_victim() */
+		exit_oom_mm(mm);
 	}
 
 	return 0;
 }
 
-void wake_oom_reaper(struct task_struct *tsk)
-{
-	if (!oom_reaper_th)
-		return;
-
-	/* tsk is already queued? */
-	if (tsk == oom_reaper_list || tsk->oom_reaper_list)
-		return;
-
-	get_task_struct(tsk);
-
-	spin_lock(&oom_reaper_lock);
-	tsk->oom_reaper_list = oom_reaper_list;
-	oom_reaper_list = tsk;
-	spin_unlock(&oom_reaper_lock);
-	wake_up(&oom_reaper_wait);
-}
-
 static int __init oom_init(void)
 {
 	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
@@ -726,6 +707,9 @@ void mark_oom_victim(struct task_struct *tsk)
 	spin_unlock(&oom_mm_lock);
 	if (old_tsk)
 		put_task_struct(old_tsk);
+#ifdef CONFIG_MMU
+	wake_up(&oom_reaper_wait);
+#endif
 }
 
 /**
@@ -867,7 +851,6 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 	unsigned int victim_points = 0;
 	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
 					      DEFAULT_RATELIMIT_BURST);
-	bool can_oom_reap = true;
 
 	/*
 	 * If the task is already exiting, don't alarm the sysadmin or kill
@@ -876,7 +859,6 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 	task_lock(p);
 	if (task_will_free_mem(p)) {
 		mark_oom_victim(p);
-		wake_oom_reaper(p);
 		task_unlock(p);
 		put_task_struct(p);
 		return;
@@ -966,7 +948,6 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 			 * memory might be still used. Hide the mm from the oom
 			 * killer to guarantee OOM forward progress.
 			 */
-			can_oom_reap = false;
 			set_bit(MMF_OOM_REAPED, &mm->flags);
 			pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
 					task_pid_nr(victim), victim->comm,
@@ -977,9 +958,6 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 	}
 	rcu_read_unlock();
 
-	if (can_oom_reap)
-		wake_oom_reaper(victim);
-
 	mmdrop(mm);
 	put_task_struct(victim);
 }
@@ -1055,7 +1033,6 @@ bool out_of_memory(struct oom_control *oc)
 	 */
 	if (task_will_free_mem(current)) {
 		mark_oom_victim(current);
-		wake_oom_reaper(current);
 		return true;
 	}
 
-- 
1.8.3.1

* [PATCH 6/8] mm,oom: Remove OOM_SCAN_ABORT case and signal_struct->oom_victims.
  2016-07-12 13:29 [PATCH v3 0/8] Change OOM killer to use list of mm_struct Tetsuo Handa
                   ` (4 preceding siblings ...)
  2016-07-12 13:29 ` [PATCH 5/8] mm,oom_reaper: Make OOM reaper use list of mm_struct Tetsuo Handa
@ 2016-07-12 13:29 ` Tetsuo Handa
  2016-07-12 13:29 ` [PATCH 7/8] mm,oom: Stop clearing TIF_MEMDIE on remote thread Tetsuo Handa
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 32+ messages in thread
From: Tetsuo Handa @ 2016-07-12 13:29 UTC (permalink / raw)
  To: mhocko, mhocko
  Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst, Tetsuo Handa

Since oom_has_pending_mm() now controls whether the next OOM victim is
selected, we no longer need to abort the OOM victim selection loop via
the OOM_SCAN_ABORT case. Also, since signal_struct->oom_victims was used
only to allow oom_scan_process_thread() to return OOM_SCAN_ABORT, we no
longer need signal_struct->oom_victims either.
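
The role of OOM_SCAN_ABORT is taken over by the check that an earlier
patch in this series added in front of victim selection (sketch):

  /* In out_of_memory(), before select_bad_process(): */
  if (!is_sysrq_oom(oc) && oom_has_pending_mm(oc->memcg, oc->nodemask))
          return true;    /* a victim's mm is still on oom_mm_list */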

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Michal Hocko <mhocko@suse.com>
---
 include/linux/oom.h   |  1 -
 include/linux/sched.h |  1 -
 mm/memcontrol.c       |  8 --------
 mm/oom_kill.c         | 26 +-------------------------
 4 files changed, 1 insertion(+), 35 deletions(-)

diff --git a/include/linux/oom.h b/include/linux/oom.h
index cb3f041..27be4ba 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -49,7 +49,6 @@ enum oom_constraint {
 enum oom_scan_t {
 	OOM_SCAN_OK,		/* scan thread and find its badness */
 	OOM_SCAN_CONTINUE,	/* do not consider thread for oom kill */
-	OOM_SCAN_ABORT,		/* abort the iteration and return */
 	OOM_SCAN_SELECT,	/* always select this thread first */
 };
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c0efd80..2f57cf1c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -670,7 +670,6 @@ struct signal_struct {
 	atomic_t		sigcnt;
 	atomic_t		live;
 	int			nr_threads;
-	atomic_t oom_victims; /* # of TIF_MEDIE threads in this thread group */
 	struct list_head	thread_head;
 
 	wait_queue_head_t	wait_chldexit;	/* for wait4() */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5043324..6afe1c5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1262,14 +1262,6 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 				/* fall through */
 			case OOM_SCAN_CONTINUE:
 				continue;
-			case OOM_SCAN_ABORT:
-				css_task_iter_end(&it);
-				mem_cgroup_iter_break(memcg, iter);
-				if (chosen)
-					put_task_struct(chosen);
-				/* Set a dummy value to return "true". */
-				chosen = (void *) 1;
-				goto unlock;
 			case OOM_SCAN_OK:
 				break;
 			};
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 715f77d..7ee6b70 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -324,25 +324,6 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
 		return OOM_SCAN_CONTINUE;
 
 	/*
-	 * This task already has access to memory reserves and is being killed.
-	 * Don't allow any other task to have access to the reserves unless
-	 * the task has MMF_OOM_REAPED because chances that it would release
-	 * any memory is quite low.
-	 */
-	if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) {
-		struct task_struct *p = find_lock_task_mm(task);
-		enum oom_scan_t ret = OOM_SCAN_ABORT;
-
-		if (p) {
-			if (test_bit(MMF_OOM_REAPED, &p->mm->flags))
-				ret = OOM_SCAN_CONTINUE;
-			task_unlock(p);
-		}
-
-		return ret;
-	}
-
-	/*
 	 * If task is allocating a lot of memory and has been marked to be
 	 * killed first if it triggers an oom, then select it.
 	 */
@@ -374,9 +355,6 @@ static struct task_struct *select_bad_process(struct oom_control *oc,
 			/* fall through */
 		case OOM_SCAN_CONTINUE:
 			continue;
-		case OOM_SCAN_ABORT:
-			rcu_read_unlock();
-			return (struct task_struct *)(-1UL);
 		case OOM_SCAN_OK:
 			break;
 		};
@@ -675,7 +653,6 @@ void mark_oom_victim(struct task_struct *tsk)
 	/* OOM killer might race with memcg OOM */
 	if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
 		return;
-	atomic_inc(&tsk->signal->oom_victims);
 	/*
 	 * Make sure that the task is woken up from uninterruptible sleep
 	 * if it is frozen because OOM killer wouldn't be able to free
@@ -719,7 +696,6 @@ void exit_oom_victim(struct task_struct *tsk)
 {
 	if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE))
 		return;
-	atomic_dec(&tsk->signal->oom_victims);
 
 	if (!atomic_dec_return(&oom_victims))
 		wake_up_all(&oom_victims_wait);
@@ -1072,7 +1048,7 @@ bool out_of_memory(struct oom_control *oc)
 		dump_header(oc, NULL);
 		panic("Out of memory and no killable processes...\n");
 	}
-	if (p && p != (void *)-1UL) {
+	if (p) {
 		oom_kill_process(oc, p, points, totalpages, "Out of memory");
 		/*
 		 * Give the killed process a good chance to exit before trying
-- 
1.8.3.1

* [PATCH 7/8] mm,oom: Stop clearing TIF_MEMDIE on remote thread.
  2016-07-12 13:29 [PATCH v3 0/8] Change OOM killer to use list of mm_struct Tetsuo Handa
                   ` (5 preceding siblings ...)
  2016-07-12 13:29 ` [PATCH 6/8] mm,oom: Remove OOM_SCAN_ABORT case and signal_struct->oom_victims Tetsuo Handa
@ 2016-07-12 13:29 ` Tetsuo Handa
  2016-07-12 14:53   ` Michal Hocko
  2016-07-12 13:29 ` [PATCH 8/8] oom_reaper: Revert "oom_reaper: close race with exiting task" Tetsuo Handa
  2016-07-21 11:21 ` [PATCH v3 0/8] Change OOM killer to use list of mm_struct Michal Hocko
  8 siblings, 1 reply; 32+ messages in thread
From: Tetsuo Handa @ 2016-07-12 13:29 UTC (permalink / raw)
  To: mhocko, mhocko
  Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst, Tetsuo Handa

Since no kernel code path needs to clear the TIF_MEMDIE flag on a remote
thread, we can drop the task parameter and actually enforce that.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Michal Hocko <mhocko@suse.com>
---
 include/linux/oom.h | 2 +-
 kernel/exit.c       | 2 +-
 mm/oom_kill.c       | 5 ++---
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 27be4ba..90e98ea 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -90,7 +90,7 @@ extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
 
 extern bool out_of_memory(struct oom_control *oc);
 
-extern void exit_oom_victim(struct task_struct *tsk);
+extern void exit_oom_victim(void);
 
 extern int register_oom_notifier(struct notifier_block *nb);
 extern int unregister_oom_notifier(struct notifier_block *nb);
diff --git a/kernel/exit.c b/kernel/exit.c
index 84ae830..1b1dada 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -511,7 +511,7 @@ static void exit_mm(struct task_struct *tsk)
 	mm_update_next_owner(mm);
 	mmput(mm);
 	if (test_thread_flag(TIF_MEMDIE))
-		exit_oom_victim(tsk);
+		exit_oom_victim();
 }
 
 static struct task_struct *find_alive_thread(struct task_struct *p)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 7ee6b70..fab0bec 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -692,10 +692,9 @@ void mark_oom_victim(struct task_struct *tsk)
 /**
  * exit_oom_victim - note the exit of an OOM victim
  */
-void exit_oom_victim(struct task_struct *tsk)
+void exit_oom_victim(void)
 {
-	if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE))
-		return;
+	clear_thread_flag(TIF_MEMDIE);
 
 	if (!atomic_dec_return(&oom_victims))
 		wake_up_all(&oom_victims_wait);
-- 
1.8.3.1

* [PATCH 8/8] oom_reaper: Revert "oom_reaper: close race with exiting task".
  2016-07-12 13:29 [PATCH v3 0/8] Change OOM killer to use list of mm_struct Tetsuo Handa
                   ` (6 preceding siblings ...)
  2016-07-12 13:29 ` [PATCH 7/8] mm,oom: Stop clearing TIF_MEMDIE on remote thread Tetsuo Handa
@ 2016-07-12 13:29 ` Tetsuo Handa
  2016-07-12 14:56   ` Michal Hocko
  2016-07-21 11:21 ` [PATCH v3 0/8] Change OOM killer to use list of mm_struct Michal Hocko
  8 siblings, 1 reply; 32+ messages in thread
From: Tetsuo Handa @ 2016-07-12 13:29 UTC (permalink / raw)
  To: mhocko, mhocko
  Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst, Tetsuo Handa

We can revert commit e2fe14564d3316d1 ("oom_reaper: close race with
exiting task") because that race is now closed by oom_has_pending_mm(),
which keeps returning true until exit_oom_mm() is called after the OOM
victim's mm has been reclaimed by __mmput() or by oom_reap_task().
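
With the list in place, the race diagram from the reverted commit
resolves as follows (a sketch):

  /*
   *  __oom_reap_task              exit path
   *    mmget_not_zero
   *                                 mmput
   *                                   atomic_dec_and_test
   *                               [...]
   *                               out_of_memory
   *                                 oom_has_pending_mm() == true
   *                                   # no premature new victim
   *    unmap_page_range           # frees some memory
   *                               __mmput() -> exit_oom_mm(mm)
   *                                 # only now may a new victim
   *                                 # be selected
   */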

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
 mm/oom_kill.c | 29 ++++-------------------------
 1 file changed, 4 insertions(+), 25 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index fab0bec..232c1ce 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -476,28 +476,9 @@ static bool __oom_reap_task(struct task_struct *tsk, struct mm_struct *mm)
 	struct vm_area_struct *vma;
 	struct zap_details details = {.check_swap_entries = true,
 				      .ignore_dirty = true};
-	bool ret = true;
 
-	/*
-	 * We have to make sure to not race with the victim exit path
-	 * and cause premature new oom victim selection:
-	 * __oom_reap_task		exit_mm
-	 *   mmget_not_zero
-	 *				  mmput
-	 *				    atomic_dec_and_test
-	 *				  exit_oom_victim
-	 *				[...]
-	 *				out_of_memory
-	 *				  select_bad_process
-	 *				    # no TIF_MEMDIE task selects new victim
-	 *  unmap_page_range # frees some memory
-	 */
-	mutex_lock(&oom_lock);
-
-	if (!down_read_trylock(&mm->mmap_sem)) {
-		ret = false;
-		goto unlock_oom;
-	}
+	if (!down_read_trylock(&mm->mmap_sem))
+		return false;
 
 	/*
 	 * increase mm_users only after we know we will reap something so
@@ -506,7 +487,7 @@ static bool __oom_reap_task(struct task_struct *tsk, struct mm_struct *mm)
 	 */
 	if (!mmget_not_zero(mm)) {
 		up_read(&mm->mmap_sem);
-		goto unlock_oom;
+		return true;
 	}
 
 	tlb_gather_mmu(&tlb, mm, 0, -1);
@@ -554,9 +535,7 @@ static bool __oom_reap_task(struct task_struct *tsk, struct mm_struct *mm)
 	 * put the oom_reaper out of the way.
 	 */
 	mmput_async(mm);
-unlock_oom:
-	mutex_unlock(&oom_lock);
-	return ret;
+	return true;
 }
 
 #define MAX_OOM_REAP_RETRIES 10
-- 
1.8.3.1

* Re: [PATCH 2/8] mm,oom_reaper: Do not attempt to reap a task twice.
  2016-07-12 13:29 ` [PATCH 2/8] mm,oom_reaper: Do not attempt to reap a task twice Tetsuo Handa
@ 2016-07-12 14:19   ` Michal Hocko
  0 siblings, 0 replies; 32+ messages in thread
From: Michal Hocko @ 2016-07-12 14:19 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst

On Tue 12-07-16 22:29:17, Tetsuo Handa wrote:
> "mm, oom_reaper: do not attempt to reap a task twice" tried to give the
> OOM reaper one more chance to retry using MMF_OOM_NOT_REAPABLE flag. But
> the usefulness of the flag is rather limited and actually never shown
> in practice. If the flag is set, it means that the holder of mm->mmap_sem
> cannot call up_write() due to presumably being blocked at unkillable wait
> waiting for other thread's memory allocation. But since one of threads
> sharing that mm will queue that mm immediately via task_will_free_mem()
> shortcut (otherwise, oom_badness() will select the same mm again due to
> oom_score_adj value unchanged), retrying MMF_OOM_NOT_REAPABLE mm is
> unlikely helpful.
> 
> Let's always set MMF_OOM_REAPED.
> 
> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>

Acked-by: Michal Hocko <mhocko@suse.com>

> ---
>  include/linux/sched.h |  1 -
>  mm/oom_kill.c         | 15 +++------------
>  2 files changed, 3 insertions(+), 13 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 553af29..c0efd80 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -523,7 +523,6 @@ static inline int get_dumpable(struct mm_struct *mm)
>  #define MMF_HAS_UPROBES		19	/* has uprobes */
>  #define MMF_RECALC_UPROBES	20	/* MMF_HAS_UPROBES can be wrong */
>  #define MMF_OOM_REAPED		21	/* mm has been already reaped */
> -#define MMF_OOM_NOT_REAPABLE	22	/* mm couldn't be reaped */
>  
>  #define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
>  
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 951eb1b..9f0022e 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -567,20 +567,11 @@ static void oom_reap_task(struct task_struct *tsk)
>  	if (attempts <= MAX_OOM_REAP_RETRIES)
>  		goto done;
>  
> +	/* Ignore this mm because somebody can't call up_write(mmap_sem). */
> +	set_bit(MMF_OOM_REAPED, &mm->flags);
> +
>  	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
>  		task_pid_nr(tsk), tsk->comm);
> -
> -	/*
> -	 * If we've already tried to reap this task in the past and
> -	 * failed it probably doesn't make much sense to try yet again
> -	 * so hide the mm from the oom killer so that it can move on
> -	 * to another task with a different mm struct.
> -	 */
> -	if (test_and_set_bit(MMF_OOM_NOT_REAPABLE, &mm->flags)) {
> -		pr_info("oom_reaper: giving up pid:%d (%s)\n",
> -			task_pid_nr(tsk), tsk->comm);
> -		set_bit(MMF_OOM_REAPED, &mm->flags);
> -	}
>  	debug_show_all_locks();
>  
>  done:
> -- 
> 1.8.3.1
> 

-- 
Michal Hocko
SUSE Labs

* Re: [PATCH 3/8] mm,oom: Use list of mm_struct used by OOM victims.
  2016-07-12 13:29 ` [PATCH 3/8] mm,oom: Use list of mm_struct used by OOM victims Tetsuo Handa
@ 2016-07-12 14:28   ` Michal Hocko
  0 siblings, 0 replies; 32+ messages in thread
From: Michal Hocko @ 2016-07-12 14:28 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst

On Tue 12-07-16 22:29:18, Tetsuo Handa wrote:
> Currently, we walk the process list in order to find existing TIF_MEMDIE
> threads. But if we remember the list of mm_struct used by TIF_MEMDIE
> threads, we can avoid walking the process list. A later patch in the
> series will change the OOM reaper to use the list of mm_struct introduced
> by this patch.
> 
> Also, we can start using TIF_MEMDIE only to grant access to memory
> reserves to OOM victims which actually need to allocate, and decouple its
> current double meaning. A later patch in the series will eliminate the
> OOM_SCAN_ABORT case and "struct signal_struct"->oom_victims because
> oom_has_pending_mm() introduced by this patch can take over that role.
> 
> It is theoretically possible that the number of elements on this list
> grows as large as the number of configured memcg groups and
> mempolicy/cpuset patterns. But in most cases, the OOM reaper will remove
> almost all elements immediately, on the first OOM reap attempt. Moreover,
> many OOM events are resolved before the OOM reaper even tries to reap
> that memory. Therefore, the average speed of removing elements will be
> higher than the average speed of adding them. If the delay caused by
> retrying a specific element for one second before trying other elements
> matters, we can do parallel OOM reaping.
> 
> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>

OK, this looks better. The amount of added code seems quite large, but
follow-up patches will remove quite some code, so it looks worthwhile.

I am still not 100% decided whether this approach is better than the
signal_struct->oom_mm one. Both have some pros and cons, but this
approach is definitely sane and reasonable, so

Acked-by: Michal Hocko <mhocko@suse.com>

> ---
>  include/linux/mm_types.h |  7 +++++
>  include/linux/oom.h      |  3 +++
>  kernel/fork.c            |  2 ++
>  mm/memcontrol.c          |  5 ++++
>  mm/oom_kill.c            | 69 +++++++++++++++++++++++++++++++++++++++++-------
>  5 files changed, 77 insertions(+), 9 deletions(-)
> 
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 79472b2..96a8709 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -392,6 +392,12 @@ struct mm_rss_stat {
>  	atomic_long_t count[NR_MM_COUNTERS];
>  };
>  
> +struct oom_mm {
> +	struct list_head list; /* Linked to oom_mm_list list. */
> +	/* Thread which was passed to mark_oom_victim() for the last time. */
> +	struct task_struct *victim;
> +};
> +
>  struct kioctx_table;
>  struct mm_struct {
>  	struct vm_area_struct *mmap;		/* list of VMAs */
> @@ -515,6 +521,7 @@ struct mm_struct {
>  #ifdef CONFIG_HUGETLB_PAGE
>  	atomic_long_t hugetlb_usage;
>  #endif
> +	struct oom_mm oom_mm;
>  #ifdef CONFIG_MMU
>  	struct work_struct async_put_work;
>  #endif
> diff --git a/include/linux/oom.h b/include/linux/oom.h
> index 5bc0457..bdcb331 100644
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -91,6 +91,9 @@ extern void oom_kill_process(struct oom_control *oc, struct task_struct *p,
>  extern void check_panic_on_oom(struct oom_control *oc,
>  			       enum oom_constraint constraint);
>  
> +extern void exit_oom_mm(struct mm_struct *mm);
> +extern bool oom_has_pending_mm(struct mem_cgroup *memcg,
> +			       const nodemask_t *nodemask);
>  extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
>  					       struct task_struct *task);
>  
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 7926993..d83163a 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -722,6 +722,8 @@ static inline void __mmput(struct mm_struct *mm)
>  	}
>  	if (mm->binfmt)
>  		module_put(mm->binfmt->module);
> +	if (mm->oom_mm.victim)
> +		exit_oom_mm(mm);
>  	mmdrop(mm);
>  }
>  
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 40dfca3..8f7a5b7 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1241,6 +1241,11 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
>  	}
>  
>  	check_panic_on_oom(&oc, CONSTRAINT_MEMCG);
> +	if (oom_has_pending_mm(memcg, NULL)) {
> +		/* Set a dummy value to return "true". */
> +		chosen = (void *) 1;
> +		goto unlock;
> +	}
>  	totalpages = mem_cgroup_get_limit(memcg) ? : 1;
>  	for_each_mem_cgroup_tree(iter, memcg) {
>  		struct css_task_iter it;
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 9f0022e..07e8c1a 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -132,6 +132,20 @@ static inline bool is_sysrq_oom(struct oom_control *oc)
>  	return oc->order == -1;
>  }
>  
> +static bool task_in_oom_domain(struct task_struct *p, struct mem_cgroup *memcg,
> +			       const nodemask_t *nodemask)
> +{
> +	/* When mem_cgroup_out_of_memory() and p is not member of the group */
> +	if (memcg && !task_in_mem_cgroup(p, memcg))
> +		return false;
> +
> +	/* p may not have freeable memory in nodemask */
> +	if (!has_intersects_mems_allowed(p, nodemask))
> +		return false;
> +
> +	return true;
> +}
> +
>  /* return true if the task is not adequate as candidate victim task. */
>  static bool oom_unkillable_task(struct task_struct *p,
>  		struct mem_cgroup *memcg, const nodemask_t *nodemask)
> @@ -141,15 +155,7 @@ static bool oom_unkillable_task(struct task_struct *p,
>  	if (p->flags & PF_KTHREAD)
>  		return true;
>  
> -	/* When mem_cgroup_out_of_memory() and p is not member of the group */
> -	if (memcg && !task_in_mem_cgroup(p, memcg))
> -		return true;
> -
> -	/* p may not have freeable memory in nodemask */
> -	if (!has_intersects_mems_allowed(p, nodemask))
> -		return true;
> -
> -	return false;
> +	return !task_in_oom_domain(p, memcg, nodemask);
>  }
>  
>  /**
> @@ -275,6 +281,34 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc,
>  }
>  #endif
>  
> +static LIST_HEAD(oom_mm_list);
> +static DEFINE_SPINLOCK(oom_mm_lock);
> +
> +void exit_oom_mm(struct mm_struct *mm)
> +{
> +	spin_lock(&oom_mm_lock);
> +	list_del(&mm->oom_mm.list);
> +	spin_unlock(&oom_mm_lock);
> +	put_task_struct(mm->oom_mm.victim);
> +	mmdrop(mm);
> +}
> +
> +bool oom_has_pending_mm(struct mem_cgroup *memcg, const nodemask_t *nodemask)
> +{
> +	struct mm_struct *mm;
> +	bool ret = false;
> +
> +	spin_lock(&oom_mm_lock);
> +	list_for_each_entry(mm, &oom_mm_list, oom_mm.list) {
> +		if (task_in_oom_domain(mm->oom_mm.victim, memcg, nodemask)) {
> +			ret = true;
> +			break;
> +		}
> +	}
> +	spin_unlock(&oom_mm_lock);
> +	return ret;
> +}
> +
>  enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
>  					struct task_struct *task)
>  {
> @@ -653,6 +687,8 @@ subsys_initcall(oom_init)
>   */
>  void mark_oom_victim(struct task_struct *tsk)
>  {
> +	struct mm_struct *mm = tsk->mm;
> +
>  	WARN_ON(oom_killer_disabled);
>  	/* OOM killer might race with memcg OOM */
>  	if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
> @@ -666,6 +702,18 @@ void mark_oom_victim(struct task_struct *tsk)
>  	 */
>  	__thaw_task(tsk);
>  	atomic_inc(&oom_victims);
> +	/*
> +	 * Since mark_oom_victim() is called from multiple threads,
> +	 * connect this mm to oom_mm_list only if not yet connected.
> +	 */
> +	if (!mm->oom_mm.victim) {
> +		atomic_inc(&mm->mm_count);
> +		get_task_struct(tsk);
> +		mm->oom_mm.victim = tsk;
> +		spin_lock(&oom_mm_lock);
> +		list_add_tail(&mm->oom_mm.list, &oom_mm_list);
> +		spin_unlock(&oom_mm_lock);
> +	}
>  }
>  
>  /**
> @@ -1026,6 +1074,9 @@ bool out_of_memory(struct oom_control *oc)
>  		return true;
>  	}
>  
> +	if (!is_sysrq_oom(oc) && oom_has_pending_mm(oc->memcg, oc->nodemask))
> +		return true;
> +
>  	p = select_bad_process(oc, &points, totalpages);
>  	/* Found nothing?!?! Either we hang forever, or we panic. */
>  	if (!p && !is_sysrq_oom(oc)) {
> -- 
> 1.8.3.1
> 

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 4/8] mm,oom: Close oom_has_pending_mm race.
  2016-07-12 13:29 ` [PATCH 4/8] mm,oom: Close oom_has_pending_mm race Tetsuo Handa
@ 2016-07-12 14:36   ` Michal Hocko
  0 siblings, 0 replies; 32+ messages in thread
From: Michal Hocko @ 2016-07-12 14:36 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst

On Tue 12-07-16 22:29:19, Tetsuo Handa wrote:
> The previous patch ignored a situation where oom_has_pending_mm() returns
> false because all threads in the thread group which mm->oom_mm.victim
> belongs to have reached TASK_DEAD state, even though other thread groups
> might still be sharing that mm.
> 
> This patch handles such a situation by always updating mm->oom_mm.victim.
> With this patch applied, the comm/pid pair printed by oom_kill_process()
> and by oom_reap_task() might differ, but that will not be a critical
> problem.

I am not really sure this is worth it or that the approach actually
works reliably. The given tsk might drop, or might have already dropped,
its memcg association or mempolicy by the time we replace the original
one which still holds the relevant data. So you are replacing one corner
case with another. It is hard to tell which one is more likely.

This patch is not really needed for correctness so I would rather wait
for reports than over-engineer at this stage.

> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> ---
>  mm/oom_kill.c | 22 +++++++++++++++++-----
>  1 file changed, 17 insertions(+), 5 deletions(-)
> 
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 07e8c1a..0b78133 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -688,6 +688,7 @@ subsys_initcall(oom_init)
>  void mark_oom_victim(struct task_struct *tsk)
>  {
>  	struct mm_struct *mm = tsk->mm;
> +	struct task_struct *old_tsk;
>  
>  	WARN_ON(oom_killer_disabled);
>  	/* OOM killer might race with memcg OOM */
> @@ -705,15 +706,26 @@ void mark_oom_victim(struct task_struct *tsk)
>  	/*
>  	 * Since mark_oom_victim() is called from multiple threads,
>  	 * connect this mm to oom_mm_list only if not yet connected.
> +	 *
> +	 * But task_in_oom_domain(mm->oom_mm.victim, memcg, nodemask) in
> +	 * oom_has_pending_mm() might return false after all threads in one
> +	 * thread group (which mm->oom_mm.victim belongs to) reached TASK_DEAD
> +	 * state. In that case, the same mm will be selected by another thread
> +	 * group (which mm->oom_mm.victim does not belongs to). Therefore,
> +	 * we need to replace the old task with the new task (at least when
> +	 * task_in_oom_domain() returned false).
>  	 */
> -	if (!mm->oom_mm.victim) {
> +	get_task_struct(tsk);
> +	spin_lock(&oom_mm_lock);
> +	old_tsk = mm->oom_mm.victim;
> +	mm->oom_mm.victim = tsk;
> +	if (!old_tsk) {
>  		atomic_inc(&mm->mm_count);
> -		get_task_struct(tsk);
> -		mm->oom_mm.victim = tsk;
> -		spin_lock(&oom_mm_lock);
>  		list_add_tail(&mm->oom_mm.list, &oom_mm_list);
> -		spin_unlock(&oom_mm_lock);
>  	}
> +	spin_unlock(&oom_mm_lock);
> +	if (old_tsk)
> +		put_task_struct(old_tsk);
>  }
>  
>  /**
> -- 
> 1.8.3.1
> 

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 5/8] mm,oom_reaper: Make OOM reaper use list of mm_struct.
  2016-07-12 13:29 ` [PATCH 5/8] mm,oom_reaper: Make OOM reaper use list of mm_struct Tetsuo Handa
@ 2016-07-12 14:51   ` Michal Hocko
  2016-07-12 15:42     ` Tetsuo Handa
  0 siblings, 1 reply; 32+ messages in thread
From: Michal Hocko @ 2016-07-12 14:51 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst

On Tue 12-07-16 22:29:20, Tetsuo Handa wrote:
> find_lock_task_mm() is racy when finding an mm because it returns NULL

I would rather s@racy@unreliable@

> when all threads in that thread group passed current->mm == NULL line
> in exit_mm() while that mm might be still waiting for __mmput() (or the
> OOM reaper) to reclaim memory used by that mm.
> 
> Since OOM reaping is per mm_struct operation, it is natural to use
> list of mm_struct used by OOM victims. By using list of mm_struct,
> we can eliminate find_lock_task_mm() usage from the OOM reaper.

Good. This will reduce the code size, simplify the code and make it more
reliable.

> We still have racy find_lock_task_mm() usage in oom_scan_process_thread()
> which can theoretically cause OOM livelock situation when MMF_OOM_REAPED
> was set to OOM victim's mm without putting that mm under the OOM reaper's
> supervision. We must not depend on find_lock_task_mm() not returning NULL.

But I guess this just makes the changelog confusing without adding much
value.

> Since later patch in the series will change oom_scan_process_thread() not
> to depend on atomic_read(&task->signal->oom_victims) != 0 &&
> find_lock_task_mm(task) != NULL, this patch removes exit_oom_victim()
> on remote thread.

I have already suggested doing this in a separate patch, because
dropping exit_oom_victim has other side effects (namely for the
oom_killer_disable convergence guarantee).

Also I would suggest doing set_bit(MMF_OOM_REAPED) from exit_oom_mm and
(in a follow-up patch) renaming it to MMF_SKIP_OOM_MM, as sketched below.
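
Something like the following would do (an untested sketch on top of this
patch, just to illustrate the idea):

	void exit_oom_mm(struct mm_struct *mm)
	{
		struct task_struct *victim;

		/* __mmput() and oom_reaper() could race. */
		spin_lock(&oom_mm_lock);
		victim = mm->oom_mm.victim;
		mm->oom_mm.victim = NULL;
		if (victim)
			list_del(&mm->oom_mm.list);
		spin_unlock(&oom_mm_lock);
		/*
		 * Nobody will reap this mm anymore, so hide it from further
		 * OOM killer invocations. MMF_SKIP_OOM_MM would describe
		 * that better than MMF_OOM_REAPED.
		 */
		set_bit(MMF_OOM_REAPED, &mm->flags);
		if (victim) {
			put_task_struct(victim);
			mmdrop(mm);
		}
	}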

I haven't spotted any other issues.

> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> ---
>  include/linux/oom.h |   8 ----
>  mm/memcontrol.c     |   1 -
>  mm/oom_kill.c       | 117 +++++++++++++++++++++-------------------------------
>  3 files changed, 47 insertions(+), 79 deletions(-)
> 
> diff --git a/include/linux/oom.h b/include/linux/oom.h
> index bdcb331..cb3f041 100644
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -72,14 +72,6 @@ static inline bool oom_task_origin(const struct task_struct *p)
>  
>  extern void mark_oom_victim(struct task_struct *tsk);
>  
> -#ifdef CONFIG_MMU
> -extern void wake_oom_reaper(struct task_struct *tsk);
> -#else
> -static inline void wake_oom_reaper(struct task_struct *tsk)
> -{
> -}
> -#endif
> -
>  extern unsigned long oom_badness(struct task_struct *p,
>  		struct mem_cgroup *memcg, const nodemask_t *nodemask,
>  		unsigned long totalpages);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 8f7a5b7..5043324 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1236,7 +1236,6 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
>  	 */
>  	if (task_will_free_mem(current)) {
>  		mark_oom_victim(current);
> -		wake_oom_reaper(current);
>  		goto unlock;
>  	}
>  
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 0b78133..715f77d 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -286,11 +286,19 @@ static DEFINE_SPINLOCK(oom_mm_lock);
>  
>  void exit_oom_mm(struct mm_struct *mm)
>  {
> +	struct task_struct *victim;
> +
> +	/* __mmput() and oom_reaper() could race. */
>  	spin_lock(&oom_mm_lock);
> -	list_del(&mm->oom_mm.list);
> +	victim = mm->oom_mm.victim;
> +	mm->oom_mm.victim = NULL;
> +	if (victim)
> +		list_del(&mm->oom_mm.list);
>  	spin_unlock(&oom_mm_lock);
> -	put_task_struct(mm->oom_mm.victim);
> -	mmdrop(mm);
> +	if (victim) {
> +		put_task_struct(victim);
> +		mmdrop(mm);
> +	}
>  }
>  
>  bool oom_has_pending_mm(struct mem_cgroup *memcg, const nodemask_t *nodemask)
> @@ -483,8 +491,6 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
>   */
>  static struct task_struct *oom_reaper_th;
>  static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
> -static struct task_struct *oom_reaper_list;
> -static DEFINE_SPINLOCK(oom_reaper_lock);
>  
>  static bool __oom_reap_task(struct task_struct *tsk, struct mm_struct *mm)
>  {
> @@ -576,30 +582,27 @@ unlock_oom:
>  }
>  
>  #define MAX_OOM_REAP_RETRIES 10
> -static void oom_reap_task(struct task_struct *tsk)
> +static void oom_reap_task(struct task_struct *tsk, struct mm_struct *mm)
>  {
>  	int attempts = 0;
> -	struct mm_struct *mm = NULL;
> -	struct task_struct *p = find_lock_task_mm(tsk);
> +	bool ret;
>  
>  	/*
> -	 * Make sure we find the associated mm_struct even when the particular
> -	 * thread has already terminated and cleared its mm.
> -	 * We might have race with exit path so consider our work done if there
> -	 * is no mm.
> +	 * Check MMF_OOM_REAPED after holding oom_lock in case
> +	 * oom_kill_process() found this mm pinned.
>  	 */
> -	if (!p)
> -		goto done;
> -	mm = p->mm;
> -	atomic_inc(&mm->mm_count);
> -	task_unlock(p);
> +	mutex_lock(&oom_lock);
> +	ret = test_bit(MMF_OOM_REAPED, &mm->flags);
> +	mutex_unlock(&oom_lock);
> +	if (ret)
> +		return;
>  
>  	/* Retry the down_read_trylock(mmap_sem) a few times */
>  	while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk, mm))
>  		schedule_timeout_idle(HZ/10);
>  
>  	if (attempts <= MAX_OOM_REAP_RETRIES)
> -		goto done;
> +		return;
>  
>  	/* Ignore this mm because somebody can't call up_write(mmap_sem). */
>  	set_bit(MMF_OOM_REAPED, &mm->flags);
> @@ -607,22 +610,6 @@ static void oom_reap_task(struct task_struct *tsk)
>  	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
>  		task_pid_nr(tsk), tsk->comm);
>  	debug_show_all_locks();
> -
> -done:
> -	/*
> -	 * Clear TIF_MEMDIE because the task shouldn't be sitting on a
> -	 * reasonably reclaimable memory anymore or it is not a good candidate
> -	 * for the oom victim right now because it cannot release its memory
> -	 * itself nor by the oom reaper.
> -	 */
> -	tsk->oom_reaper_list = NULL;
> -	exit_oom_victim(tsk);
> -
> -	/* Drop a reference taken by wake_oom_reaper */
> -	put_task_struct(tsk);
> -	/* Drop a reference taken above. */
> -	if (mm)
> -		mmdrop(mm);
>  }
>  
>  static int oom_reaper(void *unused)
> @@ -630,41 +617,35 @@ static int oom_reaper(void *unused)
>  	set_freezable();
>  
>  	while (true) {
> -		struct task_struct *tsk = NULL;
> -
> -		wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
> -		spin_lock(&oom_reaper_lock);
> -		if (oom_reaper_list != NULL) {
> -			tsk = oom_reaper_list;
> -			oom_reaper_list = tsk->oom_reaper_list;
> +		struct mm_struct *mm = NULL;
> +		struct task_struct *victim = NULL;
> +
> +		wait_event_freezable(oom_reaper_wait,
> +				     !list_empty(&oom_mm_list));
> +		spin_lock(&oom_mm_lock);
> +		if (!list_empty(&oom_mm_list)) {
> +			mm = list_first_entry(&oom_mm_list, struct mm_struct,
> +					      oom_mm.list);
> +			victim = mm->oom_mm.victim;
> +			/*
> +			 * Take a reference on current victim thread in case
> +			 * oom_reap_task() raced with mark_oom_victim() by
> +			 * other threads sharing this mm.
> +			 */
> +			get_task_struct(victim);
>  		}
> -		spin_unlock(&oom_reaper_lock);
> -
> -		if (tsk)
> -			oom_reap_task(tsk);
> +		spin_unlock(&oom_mm_lock);
> +		if (!mm)
> +			continue;
> +		oom_reap_task(victim, mm);
> +		put_task_struct(victim);
> +		/* Drop references taken by mark_oom_victim() */
> +		exit_oom_mm(mm);
>  	}
>  
>  	return 0;
>  }
>  
> -void wake_oom_reaper(struct task_struct *tsk)
> -{
> -	if (!oom_reaper_th)
> -		return;
> -
> -	/* tsk is already queued? */
> -	if (tsk == oom_reaper_list || tsk->oom_reaper_list)
> -		return;
> -
> -	get_task_struct(tsk);
> -
> -	spin_lock(&oom_reaper_lock);
> -	tsk->oom_reaper_list = oom_reaper_list;
> -	oom_reaper_list = tsk;
> -	spin_unlock(&oom_reaper_lock);
> -	wake_up(&oom_reaper_wait);
> -}
> -
>  static int __init oom_init(void)
>  {
>  	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
> @@ -726,6 +707,9 @@ void mark_oom_victim(struct task_struct *tsk)
>  	spin_unlock(&oom_mm_lock);
>  	if (old_tsk)
>  		put_task_struct(old_tsk);
> +#ifdef CONFIG_MMU
> +	wake_up(&oom_reaper_wait);
> +#endif
>  }
>  
>  /**
> @@ -867,7 +851,6 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
>  	unsigned int victim_points = 0;
>  	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
>  					      DEFAULT_RATELIMIT_BURST);
> -	bool can_oom_reap = true;
>  
>  	/*
>  	 * If the task is already exiting, don't alarm the sysadmin or kill
> @@ -876,7 +859,6 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
>  	task_lock(p);
>  	if (task_will_free_mem(p)) {
>  		mark_oom_victim(p);
> -		wake_oom_reaper(p);
>  		task_unlock(p);
>  		put_task_struct(p);
>  		return;
> @@ -966,7 +948,6 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
>  			 * memory might be still used. Hide the mm from the oom
>  			 * killer to guarantee OOM forward progress.
>  			 */
> -			can_oom_reap = false;
>  			set_bit(MMF_OOM_REAPED, &mm->flags);
>  			pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
>  					task_pid_nr(victim), victim->comm,
> @@ -977,9 +958,6 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
>  	}
>  	rcu_read_unlock();
>  
> -	if (can_oom_reap)
> -		wake_oom_reaper(victim);
> -
>  	mmdrop(mm);
>  	put_task_struct(victim);
>  }
> @@ -1055,7 +1033,6 @@ bool out_of_memory(struct oom_control *oc)
>  	 */
>  	if (task_will_free_mem(current)) {
>  		mark_oom_victim(current);
> -		wake_oom_reaper(current);
>  		return true;
>  	}
>  
> -- 
> 1.8.3.1
> 

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 7/8] mm,oom: Stop clearing TIF_MEMDIE on remote thread.
  2016-07-12 13:29 ` [PATCH 7/8] mm,oom: Stop clearing TIF_MEMDIE on remote thread Tetsuo Handa
@ 2016-07-12 14:53   ` Michal Hocko
  2016-07-12 15:45     ` Tetsuo Handa
  0 siblings, 1 reply; 32+ messages in thread
From: Michal Hocko @ 2016-07-12 14:53 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst

On Tue 12-07-16 22:29:22, Tetsuo Handa wrote:
> Since no kernel code path needs to clear TIF_MEMDIE flag on a remote
> thread we can drop the task parameter and enforce that actually.
> 
> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> Acked-by: Michal Hocko <mhocko@suse.com>

Please hold this one, along with removing exit_oom_victim from the
oom reaper, until we settle the rest of the series. I believe we
really need to handle oom_killer_disable in the same batch and that
sounds outside of the scope of this series.

I can even pick your patch and rebase it along with the rest that I have
posted recently, unless you have objections of course.

> ---
>  include/linux/oom.h | 2 +-
>  kernel/exit.c       | 2 +-
>  mm/oom_kill.c       | 5 ++---
>  3 files changed, 4 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/oom.h b/include/linux/oom.h
> index 27be4ba..90e98ea 100644
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -90,7 +90,7 @@ extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
>  
>  extern bool out_of_memory(struct oom_control *oc);
>  
> -extern void exit_oom_victim(struct task_struct *tsk);
> +extern void exit_oom_victim(void);
>  
>  extern int register_oom_notifier(struct notifier_block *nb);
>  extern int unregister_oom_notifier(struct notifier_block *nb);
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 84ae830..1b1dada 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -511,7 +511,7 @@ static void exit_mm(struct task_struct *tsk)
>  	mm_update_next_owner(mm);
>  	mmput(mm);
>  	if (test_thread_flag(TIF_MEMDIE))
> -		exit_oom_victim(tsk);
> +		exit_oom_victim();
>  }
>  
>  static struct task_struct *find_alive_thread(struct task_struct *p)
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 7ee6b70..fab0bec 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -692,10 +692,9 @@ void mark_oom_victim(struct task_struct *tsk)
>  /**
>   * exit_oom_victim - note the exit of an OOM victim
>   */
> -void exit_oom_victim(struct task_struct *tsk)
> +void exit_oom_victim(void)
>  {
> -	if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE))
> -		return;
> +	clear_thread_flag(TIF_MEMDIE);
>  
>  	if (!atomic_dec_return(&oom_victims))
>  		wake_up_all(&oom_victims_wait);
> -- 
> 1.8.3.1
> 

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 8/8] oom_reaper: Revert "oom_reaper: close race with exiting task".
  2016-07-12 13:29 ` [PATCH 8/8] oom_reaper: Revert "oom_reaper: close race with exiting task" Tetsuo Handa
@ 2016-07-12 14:56   ` Michal Hocko
  0 siblings, 0 replies; 32+ messages in thread
From: Michal Hocko @ 2016-07-12 14:56 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst

On Tue 12-07-16 22:29:23, Tetsuo Handa wrote:
> We can revert commit e2fe14564d3316d1 ("oom_reaper: close race with
> exiting task") because oom_has_pending_mm(), which will return true until
> exit_oom_mm() is called after the OOM victim's mm is reclaimed by
> __mmput() or oom_reap_task(), can close that race.
> 
> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>

Acked-by: Michal Hocko <mhocko@suse.com>

> ---
>  mm/oom_kill.c | 29 ++++-------------------------
>  1 file changed, 4 insertions(+), 25 deletions(-)
> 
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index fab0bec..232c1ce 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -476,28 +476,9 @@ static bool __oom_reap_task(struct task_struct *tsk, struct mm_struct *mm)
>  	struct vm_area_struct *vma;
>  	struct zap_details details = {.check_swap_entries = true,
>  				      .ignore_dirty = true};
> -	bool ret = true;
>  
> -	/*
> -	 * We have to make sure to not race with the victim exit path
> -	 * and cause premature new oom victim selection:
> -	 * __oom_reap_task		exit_mm
> -	 *   mmget_not_zero
> -	 *				  mmput
> -	 *				    atomic_dec_and_test
> -	 *				  exit_oom_victim
> -	 *				[...]
> -	 *				out_of_memory
> -	 *				  select_bad_process
> -	 *				    # no TIF_MEMDIE task selects new victim
> -	 *  unmap_page_range # frees some memory
> -	 */
> -	mutex_lock(&oom_lock);
> -
> -	if (!down_read_trylock(&mm->mmap_sem)) {
> -		ret = false;
> -		goto unlock_oom;
> -	}
> +	if (!down_read_trylock(&mm->mmap_sem))
> +		return false;
>  
>  	/*
>  	 * increase mm_users only after we know we will reap something so
> @@ -506,7 +487,7 @@ static bool __oom_reap_task(struct task_struct *tsk, struct mm_struct *mm)
>  	 */
>  	if (!mmget_not_zero(mm)) {
>  		up_read(&mm->mmap_sem);
> -		goto unlock_oom;
> +		return true;
>  	}
>  
>  	tlb_gather_mmu(&tlb, mm, 0, -1);
> @@ -554,9 +535,7 @@ static bool __oom_reap_task(struct task_struct *tsk, struct mm_struct *mm)
>  	 * put the oom_reaper out of the way.
>  	 */
>  	mmput_async(mm);
> -unlock_oom:
> -	mutex_unlock(&oom_lock);
> -	return ret;
> +	return true;
>  }
>  
>  #define MAX_OOM_REAP_RETRIES 10
> -- 
> 1.8.3.1
> 

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 5/8] mm,oom_reaper: Make OOM reaper use list of mm_struct.
  2016-07-12 14:51   ` Michal Hocko
@ 2016-07-12 15:42     ` Tetsuo Handa
  2016-07-13  7:48       ` Michal Hocko
  0 siblings, 1 reply; 32+ messages in thread
From: Tetsuo Handa @ 2016-07-12 15:42 UTC (permalink / raw)
  To: mhocko; +Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst

Michal Hocko wrote:
> On Tue 12-07-16 22:29:20, Tetsuo Handa wrote:
> > find_lock_task_mm() is racy when finding an mm because it returns NULL
> 
> I would rather s@racy@unreliable@
> 
> > when all threads in that thread group passed current->mm == NULL line
> > in exit_mm() while that mm might be still waiting for __mmput() (or the
> > OOM reaper) to reclaim memory used by that mm.
> > 
> > Since OOM reaping is per mm_struct operation, it is natural to use
> > list of mm_struct used by OOM victims. By using list of mm_struct,
> > we can eliminate find_lock_task_mm() usage from the OOM reaper.
> 
> Good. This will reduce the code size, simplify the code and make it more
> reliable.
> 
> > We still have racy find_lock_task_mm() usage in oom_scan_process_thread()
> > which can theoretically cause OOM livelock situation when MMF_OOM_REAPED
> > was set to OOM victim's mm without putting that mm under the OOM reaper's
> > supervision. We must not depend on find_lock_task_mm() not returning NULL.
> 
> But I guess this just makes the changelog confusing without adding a
> large value.
> 
> > Since later patch in the series will change oom_scan_process_thread() not
> > to depend on atomic_read(&task->signal->oom_victims) != 0 &&
> > find_lock_task_mm(task) != NULL, this patch removes exit_oom_victim()
> > on remote thread.
> 
> I have already suggested doing this in a separate patch. Because
> dropping exit_oom_victim has other side effectes (namely for
> oom_killer_disable convergence guarantee).

You can apply
http://lkml.kernel.org/r/1467365190-24640-3-git-send-email-mhocko@kernel.org
at this point.

> 
> Also I would suggest doing set_bit(MMF_OOM_REAPED) from exit_oom_mm and
> (in a follow up patch) rename it to MMF_SKIP_OOM_MM.
> 
> I haven't spotted any other issues.
> 
Oops. Please fold the fix below into
"[PATCH 5/8] mm,oom_reaper: Make OOM reaper use list of mm_struct.".

>From ae051fb92b285c0dc4ebc4953fadc755b1ae8a31 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Wed, 13 Jul 2016 00:24:32 +0900
Subject: [PATCH] mm,oom_reaper: Close race on exit_oom_mm().

The previous patch forgot to take a reference on the mm: __mmput() from
mmput() from exit_mm() can drop mm->mm_count to 0 before the OOM
reaper calls exit_oom_mm().

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
 mm/oom_kill.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 715f77d..4c8b686 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -626,21 +626,24 @@ static int oom_reaper(void *unused)
 		if (!list_empty(&oom_mm_list)) {
 			mm = list_first_entry(&oom_mm_list, struct mm_struct,
 					      oom_mm.list);
-			victim = mm->oom_mm.victim;
 			/*
-			 * Take a reference on current victim thread in case
-			 * oom_reap_task() raced with mark_oom_victim() by
-			 * other threads sharing this mm.
+			 * Take references on mm and victim in case
+			 * oom_reap_task() raced with mark_oom_victim() or
+			 * __mmput().
 			 */
+			atomic_inc(&mm->mm_count);
+			victim = mm->oom_mm.victim;
 			get_task_struct(victim);
 		}
 		spin_unlock(&oom_mm_lock);
 		if (!mm)
 			continue;
 		oom_reap_task(victim, mm);
-		put_task_struct(victim);
 		/* Drop references taken by mark_oom_victim() */
 		exit_oom_mm(mm);
+		/* Drop references taken above. */
+		put_task_struct(victim);
+		mmdrop(mm);
 	}
 
 	return 0;
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: [PATCH 7/8] mm,oom: Stop clearing TIF_MEMDIE on remote thread.
  2016-07-12 14:53   ` Michal Hocko
@ 2016-07-12 15:45     ` Tetsuo Handa
  2016-07-13  8:13       ` Michal Hocko
  0 siblings, 1 reply; 32+ messages in thread
From: Tetsuo Handa @ 2016-07-12 15:45 UTC (permalink / raw)
  To: mhocko; +Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst

Michal Hocko wrote:
> On Tue 12-07-16 22:29:22, Tetsuo Handa wrote:
> > Since no kernel code path needs to clear TIF_MEMDIE flag on a remote
> > thread we can drop the task parameter and enforce that actually.
> > 
> > Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> > Acked-by: Michal Hocko <mhocko@suse.com>
> 
> Please hold this one, along with removing exit_oom_victim from the
> oom reaper, until we settle the rest of the series. I believe we
> really need to handle oom_killer_disable in the same batch and that
> sounds outside of the scope of this series.
> 
> I can even pick your patch and rebase it along with the rest that I have
> posted recently, unless you have objections of course.

I have no objections. Please insert my patches into your series.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 5/8] mm,oom_reaper: Make OOM reaper use list of mm_struct.
  2016-07-12 15:42     ` Tetsuo Handa
@ 2016-07-13  7:48       ` Michal Hocko
  0 siblings, 0 replies; 32+ messages in thread
From: Michal Hocko @ 2016-07-13  7:48 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst

On Wed 13-07-16 00:42:51, Tetsuo Handa wrote:
> Michal Hocko wrote:
> > On Tue 12-07-16 22:29:20, Tetsuo Handa wrote:
[...]
> > > Since later patch in the series will change oom_scan_process_thread() not
> > > to depend on atomic_read(&task->signal->oom_victims) != 0 &&
> > > find_lock_task_mm(task) != NULL, this patch removes exit_oom_victim()
> > > on remote thread.
> > 
> > I have already suggested doing this in a separate patch. Because
> > dropping exit_oom_victim has other side effectes (namely for
> > oom_killer_disable convergence guarantee).
> 
> You can apply
> http://lkml.kernel.org/r/1467365190-24640-3-git-send-email-mhocko@kernel.org
> at this point.

I would still prefer if exit_oom_victim was done in a separate patch.
Unless you have a strong reason to do it in this patch. I plan to rework
the above to remove exit_oom_victim along with the oom_killer_disable
change.

> > Also I would suggest doing set_bit(MMF_OOM_REAPED) from exit_oom_mm and
> > (in a follow up patch) rename it to MMF_SKIP_OOM_MM.
> > 
> > I haven't spotted any other issues.
> > 
> Oops. Please fold below fix into
> "[PATCH 5/8] mm,oom_reaper: Make OOM reaper use list of mm_struct.".
> 
> >From ae051fb92b285c0dc4ebc4953fadc755b1ae8a31 Mon Sep 17 00:00:00 2001
> From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> Date: Wed, 13 Jul 2016 00:24:32 +0900
> Subject: [PATCH] mm,oom_reaper: Close race on exit_oom_mm().
> 
> The previous patch forgot to take a reference on the mm: __mmput() from
> mmput() from exit_mm() can drop mm->mm_count to 0 before the OOM
> reaper calls exit_oom_mm().

I have missed that as well. Please fold this in.

> 
> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> ---
>  mm/oom_kill.c | 13 ++++++++-----
>  1 file changed, 8 insertions(+), 5 deletions(-)
> 
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 715f77d..4c8b686 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -626,21 +626,24 @@ static int oom_reaper(void *unused)
>  		if (!list_empty(&oom_mm_list)) {
>  			mm = list_first_entry(&oom_mm_list, struct mm_struct,
>  					      oom_mm.list);
> -			victim = mm->oom_mm.victim;
>  			/*
> -			 * Take a reference on current victim thread in case
> -			 * oom_reap_task() raced with mark_oom_victim() by
> -			 * other threads sharing this mm.
> +			 * Take references on mm and victim in case
> +			 * oom_reap_task() raced with mark_oom_victim() or
> +			 * __mmput().
>  			 */
> +			atomic_inc(&mm->mm_count);
> +			victim = mm->oom_mm.victim;
>  			get_task_struct(victim);
>  		}
>  		spin_unlock(&oom_mm_lock);
>  		if (!mm)
>  			continue;
>  		oom_reap_task(victim, mm);
> -		put_task_struct(victim);
>  		/* Drop references taken by mark_oom_victim() */
>  		exit_oom_mm(mm);
> +		/* Drop references taken above. */
> +		put_task_struct(victim);
> +		mmdrop(mm);
>  	}
>  
>  	return 0;
> -- 
> 1.8.3.1
> 

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 7/8] mm,oom: Stop clearing TIF_MEMDIE on remote thread.
  2016-07-12 15:45     ` Tetsuo Handa
@ 2016-07-13  8:13       ` Michal Hocko
  0 siblings, 0 replies; 32+ messages in thread
From: Michal Hocko @ 2016-07-13  8:13 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst

On Wed 13-07-16 00:45:59, Tetsuo Handa wrote:
> Michal Hocko wrote:
> > On Tue 12-07-16 22:29:22, Tetsuo Handa wrote:
> > > Since no kernel code path needs to clear TIF_MEMDIE flag on a remote
> > > thread we can drop the task parameter and enforce that actually.
> > > 
> > > Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> > > Acked-by: Michal Hocko <mhocko@suse.com>
> > 
> > Please hold this one, along with removing exit_oom_victim from the
> > oom reaper, until we settle the rest of the series. I believe we
> > really need to handle oom_killer_disable in the same batch and that
> > sounds outside of the scope of this series.
> > 
> > I can even pick your patch and rebase it along with the rest that I have
> > posted recently, unless you have objections of course.
> 
> I have no objections. Please insert my patches into your series.

Let's wait until the merge window closes and things calm down. In the
meantime it would be great to summarize the pros and cons of the two
approaches so that we can decide properly and have this for future
reference.

Thanks!
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v3 0/8] Change OOM killer to use list of mm_struct.
  2016-07-12 13:29 [PATCH v3 0/8] Change OOM killer to use list of mm_struct Tetsuo Handa
                   ` (7 preceding siblings ...)
  2016-07-12 13:29 ` [PATCH 8/8] oom_reaper: Revert "oom_reaper: close race with exiting task" Tetsuo Handa
@ 2016-07-21 11:21 ` Michal Hocko
  2016-07-22 11:09   ` Tetsuo Handa
  8 siblings, 1 reply; 32+ messages in thread
From: Michal Hocko @ 2016-07-21 11:21 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst

On Tue 12-07-16 22:29:15, Tetsuo Handa wrote:
> This series is an update of
> http://lkml.kernel.org/r/201607080058.BFI87504.JtFOOFQFVHSLOM@I-love.SAKURA.ne.jp .
> 
> This series is based on top of linux-next-20160712 +
> http://lkml.kernel.org/r/1467201562-6709-1-git-send-email-mhocko@kernel.org .

I was thinking about this vs. signal_struct::oom_mm [1] and came to the
conclusion that as of now they are mostly equivalent wrt. oom livelock
detection and coping with it. So for now either of them should be good
to go. Good!

Now what about future plans? I would like to get rid of TIF_MEMDIE
altogether and give access to memory reserves to oom victims when they
allocate memory. Something like:
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 788e4f22e0bb..34446f49c2e1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3358,7 +3358,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 			alloc_flags |= ALLOC_NO_WATERMARKS;
 		else if (!in_interrupt() &&
 				((current->flags & PF_MEMALLOC) ||
-				 unlikely(test_thread_flag(TIF_MEMDIE))))
+				 tsk_is_oom_victim(current))
 			alloc_flags |= ALLOC_NO_WATERMARKS;
 	}
 #ifdef CONFIG_CMA

where tsk_is_oom_victim wouldn't require the given task to go via
out_of_memory. This would solve some of the problems we have right now
when a thread doesn't get access to memory reserves because it never
reaches out_of_memory (e.g. the recently mentioned mempool_alloc doing
__GFP_NORETRY). It would also make the code easier to follow. If we want
to implement that we obviously need an easy-to-implement
tsk_is_oom_victim. With the signal_struct::oom_mm this is a really
trivial thing (see the sketch below). I am not sure we can do that with
the mm list though, because we are losing the task->mm at a certain
point in time. The only way I can see this flying would be preserving
TIF_MEMDIE and setting it for all threads, but I am not sure this is
very much better and it puts the mm list approach into a worse position
from my POV.
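
Something as trivial as this would do (only a sketch; it assumes the
signal_struct::oom_mm pointer from [1] is set in mark_oom_victim() and
never cleared for the rest of the task's lifetime):

	static inline bool tsk_is_oom_victim(struct task_struct *tsk)
	{
		/* survives exit_mm() clearing tsk->mm */
		return tsk->signal->oom_mm != NULL;
	}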

What do you think Tetsuo?

[1] http://lkml.kernel.org/r/1467365190-24640-1-git-send-email-mhocko@kernel.org
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: [PATCH v3 0/8] Change OOM killer to use list of mm_struct.
  2016-07-21 11:21 ` [PATCH v3 0/8] Change OOM killer to use list of mm_struct Michal Hocko
@ 2016-07-22 11:09   ` Tetsuo Handa
  2016-07-22 12:05     ` Michal Hocko
  0 siblings, 1 reply; 32+ messages in thread
From: Tetsuo Handa @ 2016-07-22 11:09 UTC (permalink / raw)
  To: mhocko; +Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst

Michal Hocko wrote:
> On Tue 12-07-16 22:29:15, Tetsuo Handa wrote:
> > This series is an update of
> > http://lkml.kernel.org/r/201607080058.BFI87504.JtFOOFQFVHSLOM@I-love.SAKURA.ne.jp .
> > 
> > This series is based on top of linux-next-20160712 +
> > http://lkml.kernel.org/r/1467201562-6709-1-git-send-email-mhocko@kernel.org .
> 
> I was thinking about this vs. signal_struct::oom_mm [1] and came to the
> conclusion that as of now they are mostly equivalent wrt. oom livelock
> detection and coping with it. So for now any of them should be good to
> go. Good!
> 
> Now what about future plans? I would like to get rid of TIF_MEMDIE
> altogether and give access to memory reserves to oom victim when they
> allocate the memory. Something like:

Before doing so, can we handle a silent hang up caused by lowmem livelock
at http://lkml.kernel.org/r/20160211225929.GU14668@dastard ? It is a nearly
7-year-old bug (since commit 35cd78156c499ef8 "vmscan: throttle direct
reclaim when too many pages are isolated already") which has seen no
progress so far.
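
For reference, the throttling in question is roughly this loop in
shrink_inactive_list() (simplified):

	while (unlikely(too_many_isolated(zone, file, sc))) {
		congestion_wait(BLK_RW_ASYNC, HZ/10);

		/* We are about to die and free our memory. Return now. */
		if (fatal_signal_pending(current))
			return SWAP_CLUSTER_MAX;
	}

Note that nothing bounds the number of retries here unless a fatal
signal is pending.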

Also, can we apply "[RFC PATCH 2/6] oom, suspend: fix oom_killer_disable vs.
pm suspend properly" at
http://lkml.kernel.org/r/1467365190-24640-3-git-send-email-mhocko@kernel.org
regardless of oom_mm_list vs. signal_struct::oom_mm ? While it would be
preferable to wait for a few seconds to see whether a !__GFP_NOFAIL
allocation requester can terminate and release memory after the OOM killer
is disabled, letting a __GFP_NOFAIL allocation requester loop forever,
until the battery runs out or thermal runaway occurs, when the suspend
event was triggered by a low battery or overheating in the first place is
a silly choice.

> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 788e4f22e0bb..34446f49c2e1 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3358,7 +3358,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
>  			alloc_flags |= ALLOC_NO_WATERMARKS;
>  		else if (!in_interrupt() &&
>  				((current->flags & PF_MEMALLOC) ||
> -				 unlikely(test_thread_flag(TIF_MEMDIE))))
> +				 tsk_is_oom_victim(current))
>  			alloc_flags |= ALLOC_NO_WATERMARKS;
>  	}
>  #ifdef CONFIG_CMA
> 
> where tsk_is_oom_victim wouldn't require the given task to go via
> out_of_memory. This would solve some of the problems we have right now
> when a thread doesn't get access to memory reserves because it never
> reaches out_of_memory (e.g. recently mentioned mempool_alloc doing
> __GFP_NORETRY). It would also make the code easier to follow. If we want
> to implement that we need an easy to implement tsk_is_oom_victim
> obviously. With the signal_struct::oom_mm this is really trivial thing.
> I am not sure we can do that with the mm list though because we are
> losing the task->mm at a certain point in time.

bool tsk_is_oom_victim(void)
{
	return current->mm && test_bit(MMF_OOM_KILLED, &current->mm->flags) &&
		 (fatal_signal_pending(current) || (current->flags & PF_EXITING));
}

>                                                The only way I can see
> this would fly would be preserving TIF_MEMDIE and setting it for all
> threads but I am not sure this is very much better and puts the mm list
> approach to a worse possition from my POV.
> 

But do we still need ALLOC_NO_WATERMARKS for OOM victims? I didn't have a
chance to post the series below, but I suspect that we need to distinguish
"threads killed by the OOM killer", "threads killed by SIGKILL" and
"threads normally exiting via exit()".


>From 23a73b9a1243460e7cfc30638ac88a38f93677fa Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Fri, 19 Feb 2016 13:22:20 +0900
Subject: [PATCH 1/3] mm,page_alloc: do not loop __GFP_NOFAIL allocation after
 the OOM killer is disabled.

Currently, we call panic() from out_of_memory() if the OOM killer
failed to find a process to OOM-kill. But the caller of a __GFP_NOFAIL
allocation may not want to call panic() at that point because
ALLOC_NO_WATERMARKS has not been tried yet.

This patch changes out_of_memory() to return false in order to allow the
caller to decide whether to call panic(), and allows a __GFP_NOFAIL
allocation requester to try ALLOC_NO_WATERMARKS before calling panic().

This patch also changes a __GFP_NOFAIL allocation requester to call
panic() when the OOM killer is disabled. While it would be preferable to
wait for a few seconds to see whether a !__GFP_NOFAIL allocation
requester can terminate and release memory, looping until the battery
runs out or thermal runaway occurs is bad.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
 include/linux/oom.h |  1 +
 mm/oom_kill.c       | 14 +++++++++-----
 mm/page_alloc.c     | 37 +++++++++++++++++++++++--------------
 3 files changed, 33 insertions(+), 19 deletions(-)

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 45993b8..7b0e476 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -90,6 +90,7 @@ extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
 		struct task_struct *task, unsigned long totalpages);

 extern bool out_of_memory(struct oom_control *oc);
+extern void out_of_memory_panic(struct oom_control *oc);

 extern void exit_oom_victim(struct task_struct *tsk);

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d7bb9c1..0ed12ce 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -406,6 +406,12 @@ static void dump_header(struct oom_control *oc, struct task_struct *p,
 		dump_tasks(memcg, oc->nodemask);
 }

+void out_of_memory_panic(struct oom_control *oc)
+{
+	dump_header(oc, NULL, NULL);
+	panic("Out of memory and no killable processes...\n");
+}
+
 /*
  * Number of OOM victims in flight
  */
@@ -892,11 +898,9 @@ bool out_of_memory(struct oom_control *oc)
 	}

 	p = select_bad_process(oc, &points, totalpages);
-	/* Found nothing?!?! Either we hang forever, or we panic. */
-	if (!p && !is_sysrq_oom(oc)) {
-		dump_header(oc, NULL, NULL);
-		panic("Out of memory and no killable processes...\n");
-	}
+	/* Found nothing?!?! Let the caller fail the allocation or panic. */
+	if (!p && !is_sysrq_oom(oc))
+		return false;
 	if (p && p != (void *)-1UL) {
 		oom_kill_process(oc, p, points, totalpages, NULL,
 				 "Out of memory");
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 85e7588..a2da66c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2870,21 +2870,30 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 			goto out;
 	}
 	/* Exhausted what can be done so it's blamo time */
-	if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
+	if (out_of_memory(&oc))
 		*did_some_progress = 1;
-
-		if (gfp_mask & __GFP_NOFAIL) {
-			page = get_page_from_freelist(gfp_mask, order,
-					ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac);
-			/*
-			 * fallback to ignore cpuset restriction if our nodes
-			 * are depleted
-			 */
-			if (!page)
-				page = get_page_from_freelist(gfp_mask, order,
-					ALLOC_NO_WATERMARKS, ac);
-		}
-	}
+	/*
+	 * The OOM killer is disabled or there are no OOM-killable processes.
+	 * This is the last chance for __GFP_NOFAIL allocations to retry.
+	 */
+	else if (gfp_mask & __GFP_NOFAIL) {
+		*did_some_progress = 1;
+		page = get_page_from_freelist(gfp_mask, order,
+					      ALLOC_NO_WATERMARKS|ALLOC_CPUSET,
+					      ac);
+		if (page)
+			goto out;
+		/*
+		 * fallback to ignore cpuset restriction if our nodes
+		 * are depleted
+		 */
+		page = get_page_from_freelist(gfp_mask, order,
+					      ALLOC_NO_WATERMARKS, ac);
+		/* Exhausted what can be done so it's time to panic. */
+		if (!page)
+			out_of_memory_panic(&oc);
+	} else if (!oom_killer_disabled)
+		out_of_memory_panic(&oc);
 out:
 	mutex_unlock(&oom_lock);
 	return page;
-- 
1.8.3.1

>From 45f9ae8ab7efd5a6c3770afaccd2eaed58cb1d45 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Fri, 19 Feb 2016 13:29:25 +0900
Subject: [PATCH 2/3] mm,page_alloc: favor exiting tasks over normal tasks.

While the OOM reaper can likely reap memory used by OOM victims before
they terminate, it is possible that the OOM reaper cannot reap their
memory because it is shared with OOM-unkillable processes, or that the
memory reaped by the OOM reaper is preferentially consumed by them.

This patch allows exiting tasks (either fatal_signal_pending() or
PF_EXITING) to access some of the memory reserves, in order to make
sure that they can try harder before the OOM killer decides to choose
the next OOM victim.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
 mm/page_alloc.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a2da66c..e29e596 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3078,6 +3078,9 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 				((current->flags & PF_MEMALLOC) ||
 				 unlikely(test_thread_flag(TIF_MEMDIE))))
 			alloc_flags |= ALLOC_NO_WATERMARKS;
+		else if (!in_interrupt() && (fatal_signal_pending(current) ||
+					     (current->flags & PF_EXITING)))
+			alloc_flags |= ALLOC_HARDER;
 	}
 #ifdef CONFIG_CMA
 	if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
-- 
1.8.3.1

>From bb2e5cfa41901b88f711b0e17bf273e491b911bb Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Fri, 19 Feb 2016 13:42:38 +0900
Subject: [PATCH 3/3] mm,page_alloc: do not give up looping

Since the OOM reaper (and timeout-based next-OOM-victim selection) can
guarantee that memory reserves are refilled as long as the OOM killer
is invoked, we don't need to fail !__GFP_NOFAIL allocations as soon as
the requester is chosen as an OOM victim. Also, we don't need to call
panic() as soon as all OOM-killable processes have been OOM-killed.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
 mm/page_alloc.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e29e596..7b194b4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2893,7 +2893,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 		if (!page)
 			out_of_memory_panic(&oc);
 	} else if (!oom_killer_disabled)
-		out_of_memory_panic(&oc);
+		*did_some_progress = 1;
 out:
 	mutex_unlock(&oom_lock);
 	return page;
@@ -3074,9 +3074,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 			alloc_flags |= ALLOC_NO_WATERMARKS;
 		else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
 			alloc_flags |= ALLOC_NO_WATERMARKS;
-		else if (!in_interrupt() &&
-				((current->flags & PF_MEMALLOC) ||
-				 unlikely(test_thread_flag(TIF_MEMDIE))))
+		else if (!in_interrupt() && (current->flags & PF_MEMALLOC))
 			alloc_flags |= ALLOC_NO_WATERMARKS;
 		else if (!in_interrupt() && (fatal_signal_pending(current) ||
 					     (current->flags & PF_EXITING)))
@@ -3303,10 +3301,6 @@ retry:
 		goto nopage;
 	}

-	/* Avoid allocations with no watermarks from looping endlessly */
-	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
-		goto nopage;
-
 	/*
 	 * Try direct compaction. The first pass is asynchronous. Subsequent
 	 * attempts after direct reclaim are synchronous
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: [PATCH v3 0/8] Change OOM killer to use list of mm_struct.
  2016-07-22 11:09   ` Tetsuo Handa
@ 2016-07-22 12:05     ` Michal Hocko
  2016-07-23  2:59       ` Tetsuo Handa
  0 siblings, 1 reply; 32+ messages in thread
From: Michal Hocko @ 2016-07-22 12:05 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst

On Fri 22-07-16 20:09:42, Tetsuo Handa wrote:
> Michal Hocko wrote:
> > On Tue 12-07-16 22:29:15, Tetsuo Handa wrote:
> > > This series is an update of
> > > http://lkml.kernel.org/r/201607080058.BFI87504.JtFOOFQFVHSLOM@I-love.SAKURA.ne.jp .
> > > 
> > > This series is based on top of linux-next-20160712 +
> > > http://lkml.kernel.org/r/1467201562-6709-1-git-send-email-mhocko@kernel.org .
> > 
> > I was thinking about this vs. signal_struct::oom_mm [1] and came to the
> > conclusion that as of now they are mostly equivalent wrt. oom livelock
> > detection and coping with it. So for now any of them should be good to
> > go. Good!
> > 
> > Now what about future plans? I would like to get rid of TIF_MEMDIE
> > altogether and give access to memory reserves to oom victim when they
> > allocate the memory. Something like:
> 
> Before doing so, can we handle a silent hang up caused by lowmem livelock
> at http://lkml.kernel.org/r/20160211225929.GU14668@dastard ? It is a nearly
> 7 years old bug (since commit 35cd78156c499ef8 "vmscan: throttle direct
> reclaim when too many pages are isolated already") which got no progress
> so far.

I do not see any dependency/relation on/to the OOM work. I am even not
sure why you are bringing that up here.

> Also, can we apply "[RFC PATCH 2/6] oom, suspend: fix oom_killer_disable vs.
> pm suspend properly" at
> http://lkml.kernel.org/r/1467365190-24640-3-git-send-email-mhocko@kernel.org
> regardless of oom_mm_list vs. signal_struct::oom_mm ?

Why would we want to hurry? The current workaround should work just fine
for such an unlikely event as oom during suspend. Besides that I would
like to have the stable mm patches (whichever approach we decide on) in
mmotm after the merge window closes, targeting 4.9. That would include
the above as well.

[...]
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 788e4f22e0bb..34446f49c2e1 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -3358,7 +3358,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
> >  			alloc_flags |= ALLOC_NO_WATERMARKS;
> >  		else if (!in_interrupt() &&
> >  				((current->flags & PF_MEMALLOC) ||
> > -				 unlikely(test_thread_flag(TIF_MEMDIE))))
> > +				 tsk_is_oom_victim(current))
> >  			alloc_flags |= ALLOC_NO_WATERMARKS;
> >  	}
> >  #ifdef CONFIG_CMA
> > 
> > where tsk_is_oom_victim wouldn't require the given task to go via
> > out_of_memory. This would solve some of the problems we have right now
> > when a thread doesn't get access to memory reserves because it never
> > reaches out_of_memory (e.g. recently mentioned mempool_alloc doing
> > __GFP_NORETRY). It would also make the code easier to follow. If we want
> > to implement that we need an easy to implement tsk_is_oom_victim
> > obviously. With the signal_struct::oom_mm this is really trivial thing.
> > I am not sure we can do that with the mm list though because we are
> > losing the task->mm at a certain point in time.
> 
> bool tsk_is_oom_victim(void)
> {
> 	return current->mm && test_bit(MMF_OOM_KILLED, &current->mm->flags) &&
> 		 (fatal_signal_pending(current) || (current->flags & PF_EXITING));
> }

which doesn't work as soon as exit_mm clears the mm, which is exactly
the concern I raised above.
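
As a reminder, exit_mm() does (slightly simplified):

	task_lock(tsk);
	tsk->mm = NULL;
	up_read(&mm->mmap_sem);
	task_unlock(tsk);
	mm_update_next_owner(mm);
	mmput(mm);

so tsk->mm is gone well before the task stops allocating (e.g. it can
still allocate from exit_task_work() later in do_exit()).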

> 
> >                                                The only way I can see
> > this would fly would be preserving TIF_MEMDIE and setting it for all
> > threads but I am not sure this is very much better and puts the mm list
> > approach to a worse possition from my POV.
> > 
> 
> But do we still need ALLOC_NO_WATERMARKS for OOM victims?

Yes, as a safety net for cases when the oom_reaper cannot reclaim enough
to get us out of OOM. Maybe one day we can make the oom_reaper
completely bulletproof so that granting access to memory reserves would
be pointless. One reason I want to get rid of TIF_MEMDIE is that all we
would need to do at that point is a single line dropping
tsk_is_oom_victim from gfp_to_alloc_flags.

> I didn't have a
> chance to post the series below, but I suspect that we need to distinguish
> "threads killed by the OOM killer", "threads killed by SIGKILL" and
> "threads normally exiting via exit()".

Let's stick to discussing the two approaches for now before proposing
even further changes, please. I have a serious interest in collecting all
the arguments speaking for the two solutions so we can make an educated
decision. Can we stay on track and get to some conclusion, please?

[...]
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v3 0/8] Change OOM killer to use list of mm_struct.
  2016-07-22 12:05     ` Michal Hocko
@ 2016-07-23  2:59       ` Tetsuo Handa
  2016-07-25  8:48         ` Michal Hocko
  0 siblings, 1 reply; 32+ messages in thread
From: Tetsuo Handa @ 2016-07-23  2:59 UTC (permalink / raw)
  To: mhocko; +Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst

Michal Hocko wrote:
> > > Now what about future plans? I would like to get rid of TIF_MEMDIE
> > > altogether and give access to memory reserves to oom victim when they
> > > allocate the memory. Something like:
> > 
> > Before doing so, can we handle a silent hang up caused by lowmem livelock
> > at http://lkml.kernel.org/r/20160211225929.GU14668@dastard ? It is a nearly
> > 7-year-old bug (since commit 35cd78156c499ef8 "vmscan: throttle direct
> > reclaim when too many pages are isolated already") which has seen no progress
> > so far.
> 
> I do not see any dependency/relation on/to the OOM work. I am even not
> sure why you are bringing that up here.

This is an ABBA deadlock bug which disables the OOM killer: kswapd is
waiting for GFP_NOIO allocations while GFP_NOIO allocations are waiting
for kswapd. A flag like GFP_TRANSIENT suggested at
http://lkml.kernel.org/r/878twt5i1j.fsf@notabene.neil.brown.name which
prevents the allocating task from being throttled is needed if we want to
avoid escaping from the too_many_isolated() loop in shrink_inactive_list()
using a timeout.
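
The idea would be something like the following (only a rough sketch;
__GFP_TRANSIENT does not exist, it is the flag suggested in the thread
above):

	/* in shrink_inactive_list() */
	while (unlikely(too_many_isolated(zone, file, sc))) {
		/* short-lived allocations must not be throttled here */
		if (sc->gfp_mask & __GFP_TRANSIENT)
			break;
		congestion_wait(BLK_RW_ASYNC, HZ/10);

		/* We are about to die and free our memory. Return now. */
		if (fatal_signal_pending(current))
			return SWAP_CLUSTER_MAX;
	}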

> [...]
> > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > > index 788e4f22e0bb..34446f49c2e1 100644
> > > --- a/mm/page_alloc.c
> > > +++ b/mm/page_alloc.c
> > > @@ -3358,7 +3358,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
> > >  			alloc_flags |= ALLOC_NO_WATERMARKS;
> > >  		else if (!in_interrupt() &&
> > >  				((current->flags & PF_MEMALLOC) ||
> > > -				 unlikely(test_thread_flag(TIF_MEMDIE))))
> > > +				 tsk_is_oom_victim(current))
> > >  			alloc_flags |= ALLOC_NO_WATERMARKS;
> > >  	}
> > >  #ifdef CONFIG_CMA
> > > 
> > > where tsk_is_oom_victim wouldn't require the given task to go via
> > > out_of_memory. This would solve some of the problems we have right now
> > > when a thread doesn't get access to memory reserves because it never
> > > reaches out_of_memory (e.g. recently mentioned mempool_alloc doing
> > > __GFP_NORETRY). It would also make the code easier to follow. If we want
> > > to implement that we need an easy to implement tsk_is_oom_victim
> > > obviously. With the signal_struct::oom_mm this is really trivial thing.
> > > I am not sure we can do that with the mm list though because we are
> > > losing the task->mm at a certain point in time.
> > 
> > bool tsk_is_oom_victim(void)
> > {
> > 	return current->mm && test_bit(MMF_OOM_KILLED, &current->mm->flags) &&
> > 		 (fatal_signal_pending(current) || (current->flags & PF_EXITING));
> > }
> 
> which doesn't work as soon as exit_mm clears the mm which is exactly
> the concern I have raised above.

Are you planning to change the scope where the OOM victims can access memory
reserves?

(1) If you plan to allow the OOM victims to access memory reserves until
    TASK_DEAD, tsk_is_oom_victim() will be as trivial as

bool tsk_is_oom_victim(struct task_struct *task)
{
	return task->signal->oom_mm;
}

    because you won't prevent the OOM victims from accessing memory reserves
    at e.g. exit_task_work() from do_exit(). In that case, I would suggest

bool tsk_is_oom_victim(struct task_struct *task)
{
	return (fatal_signal_pending(task) || (task->flags & PF_EXITING));
}

    like "[PATCH 2/3] mm,page_alloc: favor exiting tasks over normal tasks."
    does.

(2) If you plan to allow the OOM victims to access memory reserves only until
    mmput() is called from exit_mm() from do_exit(), tsk_is_oom_victim()
    will be

bool tsk_is_oom_victim(struct task_struct *task)
{
	return task->signal->oom_mm && task->mm;
}

    because you don't allow the OOM victims to access memory reserves at
    __mmput() from mmput() from exit_mm() from do_exit(). In that case, I think

bool tsk_is_oom_victim(void)
{
	return current->mm && test_bit(MMF_OOM_KILLED, &current->mm->flags) &&
		(fatal_signal_pending(current) || (current->flags & PF_EXITING));
}

    should work. But as you think it does not work, you are not planning to
    allow the OOM victims to access memory reserves only until just before
    calling mmput() from exit_mm() from do_exit(), are you?

(3) If you are not planning to change the scope where the OOM victims can access
    memory reserves (i.e. neither (1) nor (2) above), how can we control it
    without using per task_struct flags like TIF_MEMDIE?

> 
> > 
> > >                                                The only way I can see
> > > this would fly would be preserving TIF_MEMDIE and setting it for all
> > > threads, but I am not sure this is very much better and puts the mm list
> > > approach in a worse position from my POV.
> > > 
> > 
> > But do we still need ALLOC_NO_WATERMARKS for OOM victims?
> 
> Yes as a safety net for cases when the oom_reaper cannot reclaim enough
> to get us out of OOM. Maybe one day we can make the oom_reaper
> completely bullet proof and granting access to memory reserves would be
> pointless. One reason I want to get rid of TIF_MEMDIE is that all we would
> need to do at that time would be a single line dropping
> tsk_is_oom_victim from gfp_to_alloc_flags.

I didn't mean to forbid access to memory reserves completely. I meant to ask
whether we need to allow access to all of the memory reserves (via
ALLOC_NO_WATERMARKS) rather than to a portion of them (via ALLOC_HARDER, as
[PATCH 2/3] does). I'm thinking that we can treat "threads killed by the OOM
killer", "threads killed by SIGKILL" and "threads normally exiting via exit()"
evenly by allowing them access to a portion of memory reserves.
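
A minimal sketch of what I mean in gfp_to_alloc_flags() (illustrative only;
this is not the actual [PATCH 2/3] hunk and the exact placement is assumed):

	/*
	 * Grant only a portion of the reserves (ALLOC_HARDER) to any dying
	 * task, whether OOM-killed, SIGKILLed or normally exiting.
	 */
	if (!in_interrupt() &&
	    (fatal_signal_pending(current) || (current->flags & PF_EXITING)))
		alloc_flags |= ALLOC_HARDER;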


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v3 0/8] Change OOM killer to use list of mm_struct.
  2016-07-23  2:59       ` Tetsuo Handa
@ 2016-07-25  8:48         ` Michal Hocko
  2016-07-25 11:07           ` Tetsuo Handa
  0 siblings, 1 reply; 32+ messages in thread
From: Michal Hocko @ 2016-07-25  8:48 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst

On Sat 23-07-16 11:59:25, Tetsuo Handa wrote:
> Michal Hocko wrote:
> > > > Now what about future plans? I would like to get rid of TIF_MEMDIE
> > > > altogether and give access to memory reserves to oom victims when they
> > > > allocate memory. Something like:
> > > 
> > > Before doing so, can we handle a silent hangup caused by a lowmem livelock
> > > at http://lkml.kernel.org/r/20160211225929.GU14668@dastard ? It is a nearly
> > > 7-year-old bug (since commit 35cd78156c499ef8 "vmscan: throttle direct
> > > reclaim when too many pages are isolated already") which has seen no
> > > progress so far.
> > 
> > I do not see any dependency on or relation to the OOM work. I am not even
> > sure why you are bringing that up here.
> 
> This is an ABBA deadlock bug which disables the OOM killer: kswapd waits
> for GFP_NOIO allocations while GFP_NOIO allocations wait for kswapd. A flag
> like GFP_TRANSIENT, suggested at
> http://lkml.kernel.org/r/878twt5i1j.fsf@notabene.neil.brown.name , which
> prevents the allocating task from being throttled would be needed if we
> want to avoid escaping from the too_many_isolated() loop in
> shrink_inactive_list() via a timeout.

But this is completely irrelevant to _this_ particular discussion, so I
really fail to see why you keep bringing it up. It is definitely not
helping us move on...

> > [...]
> > > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > > > index 788e4f22e0bb..34446f49c2e1 100644
> > > > --- a/mm/page_alloc.c
> > > > +++ b/mm/page_alloc.c
> > > > @@ -3358,7 +3358,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
> > > >  			alloc_flags |= ALLOC_NO_WATERMARKS;
> > > >  		else if (!in_interrupt() &&
> > > >  				((current->flags & PF_MEMALLOC) ||
> > > > -				 unlikely(test_thread_flag(TIF_MEMDIE))))
> > > > +				 tsk_is_oom_victim(current))
> > > >  			alloc_flags |= ALLOC_NO_WATERMARKS;
> > > >  	}
> > > >  #ifdef CONFIG_CMA
> > > > 
> > > > where tsk_is_oom_victim wouldn't require the given task to go via
> > > > out_of_memory. This would solve some of the problems we have right now
> > > > when a thread doesn't get access to memory reserves because it never
> > > > reaches out_of_memory (e.g. recently mentioned mempool_alloc doing
> > > > __GFP_NORETRY). It would also make the code easier to follow. If we want
> > > > to implement that we need an easy to implement tsk_is_oom_victim
> > > > obviously. With the signal_struct::oom_mm this is a really trivial thing.
> > > > I am not sure we can do that with the mm list though, because we are
> > > > losing the task->mm at a certain point in time.
> > > 
> > > bool tsk_is_oom_victim(void)
> > > {
> > > 	return current->mm && test_bit(MMF_OOM_KILLED, &current->mm->flags) &&
> > > 		 (fatal_signal_pending(current) || (current->flags & PF_EXITING));
> > > }
> > 
> > which doesn't work as soon as exit_mm clears the mm which is exactly
> > the concern I have raised above.
> 
> Are you planning to change the scope where the OOM victims can access memory
> reserves?

Yes. Because we know that there are some post exit_mm allocations and I
do not want to get back to PF_EXITING and other tricks...

> (1) If you plan to allow the OOM victims to access memory reserves until
>     TASK_DEAD, tsk_is_oom_victim() will be as trivial as
> 
> bool tsk_is_oom_victim(struct task_struct *task)
> {
> 	return task->signal->oom_mm;
> }

Yes, exactly. That's what I tried to say above. With the oom_mm this is
trivial to implement, while mm lists will not help us much due to their
lifetime. This also means that we know about the oom victim until it is
unhashed and becomes invisible to the oom killer.
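
To illustrate, a sketch of the field I have in mind (the exact form is not
final):

	struct signal_struct {
		[...]
		/*
		 * Set once when the thread group is oom killed and never
		 * cleared; holds an mm_count reference so that the victim
		 * stays recognizable even after exit_mm() clears tsk->mm.
		 */
		struct mm_struct *oom_mm;
	};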

>     because you won't prevent the OOM victims from accessing memory reserves
>     at e.g. exit_task_work() from do_exit(). In that case, I would suggest
> 
> bool tsk_is_oom_victim(struct task_struct *task)
> {
> 	return (fatal_signal_pending(task) || (task->flags & PF_EXITING));
> }

No. This would cover any SIGKILLed or exiting task. I really do not think
we can safely give exiting tasks access to memory reserves in general.
That would require many more changes.

>     like "[PATCH 2/3] mm,page_alloc: favor exiting tasks over normal tasks."
>     does.
> 
> (2) If you plan to allow the OOM victims to access memory reserves only
>     until just before calling mmput() from exit_mm() from do_exit(),
>     tsk_is_oom_victim() will be
> 
> bool tsk_is_oom_victim(struct task_struct *task)
> {
> 	return task->signal->oom_mm && task->mm;
> }
> 
>     because you don't allow the OOM victims to access memory reserves at
>     __mmput() from mmput() from exit_mm() from do_exit(). In that case, I think
> 
> bool tsk_is_oom_victim(void)
> {
> 	return current->mm && test_bit(MMF_OOM_KILLED, &current->mm->flags) &&
> 		(fatal_signal_pending(current) || (current->flags & PF_EXITING));
> }
> 
>     should work. But as you think it does not work, you are not planning to
>     allow the OOM victims to access memory reserves only until just before
>     calling mmput() from exit_mm() from do_exit(), are you?

Yes. exit_mm is not really a suitable place to cut off access to memory
reserves: a) the mmput might not be the last one, and b) even if it is,
we shouldn't really rely on it having cleared the memory. It will in 99%
of cases, but we have seen that the code had to play PF_EXITING tricks in
the past to cover post-exit_mm allocations. I think the code flow would
get greatly simplified if we just did not rely on tsk->mm for anything
but the oom victim selection.
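
A sketch of how the victim's mm would get pinned there (assuming an mm_count
reference; not a final patch):

	static void mark_oom_victim(struct task_struct *tsk)
	{
		struct mm_struct *mm = tsk->mm;

		if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
			return;
		/* oom_mm is bound to the signal_struct life time */
		if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
			atomic_inc(&mm->mm_count);
		[...]
	}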

> (3) If you are not planning to change the scope where the OOM victims can access
>     memory reserves (i.e. neither (1) nor (2) above), how can we control it
>     without using per task_struct flags like TIF_MEMDIE?
> 
> > 
> > > 
> > > >                                                The only way I can see
> > > > this would fly would be preserving TIF_MEMDIE and setting it for all
> > > > threads, but I am not sure this is very much better and puts the mm list
> > > > approach in a worse position from my POV.
> > > > 
> > > 
> > > But do we still need ALLOC_NO_WATERMARKS for OOM victims?
> > 
> > Yes as a safety net for cases when the oom_reaper cannot reclaim enough
> > to get us out of OOM. Maybe one day we can make the oom_reaper
> > completely bullet proof and granting access to memory reserves would be
> > pointless. One reason I want to get rid of TIF_MEMDIE is that all we would
> > need to do at that time would be a single line dropping
> > tsk_is_oom_victim from gfp_to_alloc_flags.
> 
> I didn't mean to forbid access to memory reserves completely. I meant to ask
> whether we need to allow access to all of the memory reserves (via
> ALLOC_NO_WATERMARKS) rather than to a portion of them (via ALLOC_HARDER, as
> [PATCH 2/3] does). I'm thinking that we can treat "threads killed by the OOM
> killer", "threads killed by SIGKILL" and "threads normally exiting via exit()"
> evenly by allowing them access to a portion of memory reserves.

I didn't plan to change how much of the memory reserves the victim can
consume, and I believe this is not really necessary at this stage. Maybe
we want to do something about it, but I would postpone that for later. At
this stage I would really like to make access to memory reserves
independent of any other oom decisions. So either the mm list or the
signal::oom_mm approach. Can we come to a decision on which one to go with?

-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v3 0/8] Change OOM killer to use list of mm_struct.
  2016-07-25  8:48         ` Michal Hocko
@ 2016-07-25 11:07           ` Tetsuo Handa
  2016-07-25 11:21             ` Michal Hocko
  0 siblings, 1 reply; 32+ messages in thread
From: Tetsuo Handa @ 2016-07-25 11:07 UTC (permalink / raw)
  To: mhocko; +Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst

Michal Hocko wrote:
> > > > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > > > > index 788e4f22e0bb..34446f49c2e1 100644
> > > > > --- a/mm/page_alloc.c
> > > > > +++ b/mm/page_alloc.c
> > > > > @@ -3358,7 +3358,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
> > > > >  			alloc_flags |= ALLOC_NO_WATERMARKS;
> > > > >  		else if (!in_interrupt() &&
> > > > >  				((current->flags & PF_MEMALLOC) ||
> > > > > -				 unlikely(test_thread_flag(TIF_MEMDIE))))
> > > > > +				 tsk_is_oom_victim(current))
> > > > >  			alloc_flags |= ALLOC_NO_WATERMARKS;
> > > > >  	}
> > > > >  #ifdef CONFIG_CMA
> > > > > 
> > > > > where tsk_is_oom_victim wouldn't require the given task to go via
> > > > > out_of_memory. This would solve some of the problems we have right now
> > > > > when a thread doesn't get access to memory reserves because it never
> > > > > reaches out_of_memory (e.g. recently mentioned mempool_alloc doing
> > > > > __GFP_NORETRY). It would also make the code easier to follow. If we want
> > > > > to implement that we need an easy to implement tsk_is_oom_victim
> > > > > obviously. With the signal_struct::oom_mm this is a really trivial thing.
> > > > > I am not sure we can do that with the mm list though, because we are
> > > > > losing the task->mm at a certain point in time.
> > > > 
> > > > bool tsk_is_oom_victim(void)
> > > > {
> > > > 	return current->mm && test_bit(MMF_OOM_KILLED, &current->mm->flags) &&
> > > > 		 (fatal_signal_pending(current) || (current->flags & PF_EXITING));
> > > > }
> > > 
> > > which doesn't work as soon as exit_mm clears the mm which is exactly
> > > the concern I have raised above.
> > 
> > Are you planning to change the scope where the OOM victims can access memory
> > reserves?
> 
> Yes. Because we know that there are some post exit_mm allocations and I
> do not want to get back to PF_EXITING and other tricks...
> 
> > (1) If you plan to allow the OOM victims to access memory reserves until
> >     TASK_DEAD, tsk_is_oom_victim() will be as trivial as
> > 
> > bool tsk_is_oom_victim(struct task_struct *task)
> > {
> > 	return task->signal->oom_mm;
> > }
> 
> Yes, exactly. That's what I tried to say above. With the oom_mm this is
> trivial to implement, while mm lists will not help us much due to their
> lifetime. This also means that we know about the oom victim until it is
> unhashed and becomes invisible to the oom killer.

Then, what are the advantages of allowing only OOM victims access to memory
reserves after they have left exit_mm()? OOM victims might be waiting at
e.g. exit_task_work() for locks held by non-OOM victims which are themselves
waiting for memory allocation. If you change the OOM killer to wait until
existing OOM victims are removed from the task list, we might hit an OOM
livelock, mightn't we? I think what we should do is make the OOM killer
wait until MMF_OOM_REAPED is set rather than until existing OOM victims
are removed from the task list.
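
A minimal sketch of that check (assuming the oom_mm_list from this series,
ignoring the memcg/nodemask filtering for brevity; callers would hold
oom_lock):

	static bool oom_has_pending_mm(void)
	{
		struct mm_struct *mm;

		/* defer selecting a new victim while some victim's mm
		 * still has memory which the OOM reaper can reclaim */
		list_for_each_entry(mm, &oom_mm_list, oom_mm.list)
			if (!test_bit(MMF_OOM_REAPED, &mm->flags))
				return true;
		return false;
	}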

Since we assume that mm_struct is the primary source of memory consumption,
we don't select threads which have already left exit_mm(). Since we assume
that mm_struct is the primary source of memory consumption, why should we
distinguish OOM victims from non-OOM victims after they have left exit_mm()?

> Yes. exit_mm is not really a suitable place to cut off access to memory
> reserves: a) the mmput might not be the last one, and b) even if it is,
> we shouldn't really rely on it having cleared the memory. It will in 99%
> of cases, but we have seen that the code had to play PF_EXITING tricks in
> the past to cover post-exit_mm allocations. I think the code flow would
> get greatly simplified if we just did not rely on tsk->mm for anything
> but the oom victim selection.

Even if exit_mm() is not a suitable place to cut off access to memory
reserves, I don't see the advantage of allowing only OOM victims access to
memory reserves after they have left exit_mm().


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v3 0/8] Change OOM killer to use list of mm_struct.
  2016-07-25 11:07           ` Tetsuo Handa
@ 2016-07-25 11:21             ` Michal Hocko
  2016-07-25 11:47               ` Tetsuo Handa
  0 siblings, 1 reply; 32+ messages in thread
From: Michal Hocko @ 2016-07-25 11:21 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst

On Mon 25-07-16 20:07:11, Tetsuo Handa wrote:
> Michal Hocko wrote:
> > > > > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > > > > > index 788e4f22e0bb..34446f49c2e1 100644
> > > > > > --- a/mm/page_alloc.c
> > > > > > +++ b/mm/page_alloc.c
> > > > > > @@ -3358,7 +3358,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
> > > > > >  			alloc_flags |= ALLOC_NO_WATERMARKS;
> > > > > >  		else if (!in_interrupt() &&
> > > > > >  				((current->flags & PF_MEMALLOC) ||
> > > > > > -				 unlikely(test_thread_flag(TIF_MEMDIE))))
> > > > > > +				 tsk_is_oom_victim(current))
> > > > > >  			alloc_flags |= ALLOC_NO_WATERMARKS;
> > > > > >  	}
> > > > > >  #ifdef CONFIG_CMA
> > > > > > 
> > > > > > where tsk_is_oom_victim wouldn't require the given task to go via
> > > > > > out_of_memory. This would solve some of the problems we have right now
> > > > > > when a thread doesn't get access to memory reserves because it never
> > > > > > reaches out_of_memory (e.g. recently mentioned mempool_alloc doing
> > > > > > __GFP_NORETRY). It would also make the code easier to follow. If we want
> > > > > > to implement that we need an easy to implement tsk_is_oom_victim
> > > > > > obviously. With the signal_struct::oom_mm this is a really trivial thing.
> > > > > > I am not sure we can do that with the mm list though, because we are
> > > > > > losing the task->mm at a certain point in time.
> > > > > 
> > > > > bool tsk_is_oom_victim(void)
> > > > > {
> > > > > 	return current->mm && test_bit(MMF_OOM_KILLED, &current->mm->flags) &&
> > > > > 		 (fatal_signal_pending(current) || (current->flags & PF_EXITING));
> > > > > }
> > > > 
> > > > which doesn't work as soon as exit_mm clears the mm which is exactly
> > > > the concern I have raised above.
> > > 
> > > Are you planning to change the scope where the OOM victims can access memory
> > > reserves?
> > 
> > Yes. Because we know that there are some post exit_mm allocations and I
> > do not want to get back to PF_EXITING and other tricks...
> > 
> > > (1) If you plan to allow the OOM victims to access memory reserves until
> > >     TASK_DEAD, tsk_is_oom_victim() will be as trivial as
> > > 
> > > bool tsk_is_oom_victim(struct task_struct *task)
> > > {
> > > 	return task->signal->oom_mm;
> > > }
> > 
> > Yes, exactly. That's what I tried to say above. With the oom_mm this is
> > trivial to implement, while mm lists will not help us much due to their
> > lifetime. This also means that we know about the oom victim until it is
> > unhashed and becomes invisible to the oom killer.
> 
> Then, what are the advantages of allowing only OOM victims access to memory
> reserves after they have left exit_mm()?

Because they might need it in order to move on... Say you want to close
all the files, which might release a considerable amount of memory, or free
any other post-exit_mm() resources.

> OOM victims might be waiting at e.g. exit_task_work() for locks held by
> non-OOM victims which are themselves waiting for memory allocation. If you
> change the OOM killer to wait until existing OOM victims are removed from
> the task list, we might hit an OOM livelock, mightn't we?

I didn't say the oom killer would wait for those victims to finish. We
have a per-mm flag to tell the oom killer to skip over such a task.
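
Roughly like this in the victim scan (a sketch assuming the signal::oom_mm
approach; the flag is the existing MMF_OOM_REAPED):

	/* skip over a victim whose mm has already been reaped */
	if (task->signal->oom_mm &&
	    test_bit(MMF_OOM_REAPED, &task->signal->oom_mm->flags))
		return OOM_SCAN_CONTINUE;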

> I think what we should do is make the OOM killer wait until MMF_OOM_REAPED
> is set rather than until existing OOM victims are removed from the task
> list.

Yes.

> Since we assume that mm_struct is the primary source of memory consumption,
> we don't select threads which have already left exit_mm(). Since we assume
> that mm_struct is the primary source of memory consumption, why should we
> distinguish OOM victims from non-OOM victims after they have left exit_mm()?

Because we might prevent pointless OOM killer selection that way.
If we know that the currently allocating task is an OOM victim then
giving it access to memory reserves is preferable to selecting another
oom victim.
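
In allocator terms, something like this in __alloc_pages_may_oom() (a sketch
only, not a patch):

	/*
	 * A known victim should consume its reserves and exit rather
	 * than trigger the selection of yet another victim.
	 */
	if (tsk_is_oom_victim(current)) {
		*did_some_progress = 1;	/* retry with reserves instead */
		goto out;
	}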

> > Yes. exit_mm is not really a suitable place to cut off access to memory
> > reserves: a) the mmput might not be the last one, and b) even if it is,
> > we shouldn't really rely on it having cleared the memory. It will in 99%
> > of cases, but we have seen that the code had to play PF_EXITING tricks in
> > the past to cover post-exit_mm allocations. I think the code flow would
> > get greatly simplified if we just did not rely on tsk->mm for anything
> > but the oom victim selection.
> 
> Even if exit_mm() is not a suitable place to cut off access to memory
> reserves, I don't see the advantage of allowing only OOM victims access to
> memory reserves after they have left exit_mm().

-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v3 0/8] Change OOM killer to use list of mm_struct.
  2016-07-25 11:21             ` Michal Hocko
@ 2016-07-25 11:47               ` Tetsuo Handa
  2016-07-25 11:59                 ` Michal Hocko
  0 siblings, 1 reply; 32+ messages in thread
From: Tetsuo Handa @ 2016-07-25 11:47 UTC (permalink / raw)
  To: mhocko; +Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst

Michal Hocko wrote:
> On Mon 25-07-16 20:07:11, Tetsuo Handa wrote:
> > Michal Hocko wrote:
> > > > Are you planning to change the scope where the OOM victims can access memory
> > > > reserves?
> > > 
> > > Yes. Because we know that there are some post exit_mm allocations and I
> > > do not want to get back to PF_EXITING and other tricks...
> > > 
> > > > (1) If you plan to allow the OOM victims to access memory reserves until
> > > >     TASK_DEAD, tsk_is_oom_victim() will be as trivial as
> > > > 
> > > > bool tsk_is_oom_victim(struct task_struct *task)
> > > > {
> > > > 	return task->signal->oom_mm;
> > > > }
> > > 
> > > Yes, exactly. That's what I tried to say above. With the oom_mm this is
> > > trivial to implement, while mm lists will not help us much due to their
> > > lifetime. This also means that we know about the oom victim until it is
> > > unhashed and becomes invisible to the oom killer.
> > 
> > Then, what are the advantages of allowing only OOM victims access to memory
> > reserves after they have left exit_mm()?
> 
> Because they might need it in order to move on... Say you want to close
> all the files, which might release a considerable amount of memory, or free
> any other post-exit_mm() resources.

OOM victims might need memory reserves in order to move on, but non-OOM
victims might also need memory reserves in order to move on. And non-OOM
victims might be blocking OOM victims via locks.

> > Since we assume that mm_struct is the primary source of memory consumption,
> > we don't select threads which have already left exit_mm(). Since we assume
> > that mm_struct is the primary source of memory consumption, why should we
> > distinguish OOM victims from non-OOM victims after they have left exit_mm()?
> 
> Because we might prevent pointless OOM killer selection that way.

That "might" sounds obscure to me.

If the currently allocating task is not an OOM victim, then not giving it
access to memory reserves will cause OOM victim selection.
We might prevent pointless OOM victim selection by giving
killed/exiting tasks access to memory reserves.

> If we know that the currently allocating task is an OOM victim then
> giving it access to memory reserves is preferable to selecting another
> oom victim.

If we know that the currently allocating task is killed/exiting then
giving it access to memory reserves is preferable to selecting another
OOM victim.


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v3 0/8] Change OOM killer to use list of mm_struct.
  2016-07-25 11:47               ` Tetsuo Handa
@ 2016-07-25 11:59                 ` Michal Hocko
  2016-07-25 14:02                   ` Tetsuo Handa
  0 siblings, 1 reply; 32+ messages in thread
From: Michal Hocko @ 2016-07-25 11:59 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst

On Mon 25-07-16 20:47:03, Tetsuo Handa wrote:
> Michal Hocko wrote:
> > On Mon 25-07-16 20:07:11, Tetsuo Handa wrote:
> > > Michal Hocko wrote:
> > > > > Are you planning to change the scope where the OOM victims can access memory
> > > > > reserves?
> > > > 
> > > > Yes. Because we know that there are some post exit_mm allocations and I
> > > > do not want to get back to PF_EXITING and other tricks...
> > > > 
> > > > > (1) If you plan to allow the OOM victims to access memory reserves until
> > > > >     TASK_DEAD, tsk_is_oom_victim() will be as trivial as
> > > > > 
> > > > > bool tsk_is_oom_victim(struct task_struct *task)
> > > > > {
> > > > > 	return task->signal->oom_mm;
> > > > > }
> > > > 
> > > > Yes, exactly. That's what I tried to say above. With the oom_mm this is
> > > > trivial to implement, while mm lists will not help us much due to their
> > > > lifetime. This also means that we know about the oom victim until it is
> > > > unhashed and becomes invisible to the oom killer.
> > > 
> > > Then, what are the advantages of allowing only OOM victims access to memory
> > > reserves after they have left exit_mm()?
> > 
> > Because they might need it in order to move on... Say you want to close
> > all the files, which might release a considerable amount of memory, or free
> > any other post-exit_mm() resources.
> 
> OOM victims might need memory reserves in order to move on, but non-OOM
> victims might also need memory reserves in order to move on. And non-OOM
> victims might be blocking OOM victims via locks.

Yes, that might be true, but OOM situations are rare events and quite
limited in scope. Considering all exiting tasks is more dangerous
because they might deplete those memory reserves easily.

> > > Since we assume that mm_struct is the primary source of memory consumption,
> > > we don't select threads which have already left exit_mm(). Since we assume
> > > that mm_struct is the primary source of memory consumption, why should we
> > > distinguish OOM victims from non-OOM victims after they have left exit_mm()?
> > 
> > Because we might prevent pointless OOM killer selection that way.
> 
> That "might" sounds obscure to me.
> 
> If the currently allocating task is not an OOM victim, then not giving it
> access to memory reserves will cause OOM victim selection.

Sure, that is true. I am talking about the case where the current victim
tries to get out and exit, and it needs memory for that.

> We might prevent pointless OOM victim selection by giving
> killed/exiting tasks access to memory reserves.

This will open the door to other problems, I am afraid. Please note that
we are only trying to reduce the damage as much as possible. There is no
100% correct thing to do.

> > If we know that the currently allocating task is an OOM victim then
> > giving it access to memory reserves is preferable to selecting another
> > oom victim.
> 
> If we know that the currently allocating task is killed/exiting then
> giving it access to memory reserves is preferable to selecting another
> OOM victim.

I believe this is getting off topic. Can we get back to the mm list
vs signal::oom_mm decision? I have expressed one aspect that would speak
for oom_mm, as it provides a persistent and easy-to-detect oom victim,
which would be tricky with the mm list approach. Could you name some
arguments which would speak for the mm list and would be a problem with
the other approach?
-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v3 0/8] Change OOM killer to use list of mm_struct.
  2016-07-25 11:59                 ` Michal Hocko
@ 2016-07-25 14:02                   ` Tetsuo Handa
  2016-07-25 14:17                     ` Michal Hocko
  0 siblings, 1 reply; 32+ messages in thread
From: Tetsuo Handa @ 2016-07-25 14:02 UTC (permalink / raw)
  To: mhocko; +Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst

Michal Hocko wrote:
> On Mon 25-07-16 20:47:03, Tetsuo Handa wrote:
> > Michal Hocko wrote:
> > > On Mon 25-07-16 20:07:11, Tetsuo Handa wrote:
> > > > Michal Hocko wrote:
> > > > > > Are you planning to change the scope where the OOM victims can access memory
> > > > > > reserves?
> > > > > 
> > > > > Yes. Because we know that there are some post exit_mm allocations and I
> > > > > do not want to get back to PF_EXITING and other tricks...
> > > > > 
> > > > > > (1) If you plan to allow the OOM victims to access memory reserves until
> > > > > >     TASK_DEAD, tsk_is_oom_victim() will be as trivial as
> > > > > > 
> > > > > > bool tsk_is_oom_victim(struct task_struct *task)
> > > > > > {
> > > > > > 	return task->signal->oom_mm;
> > > > > > }
> > > > > 
> > > > > Yes, exactly. That's what I tried to say above. With the oom_mm this is
> > > > > trivial to implement, while mm lists will not help us much due to their
> > > > > lifetime. This also means that we know about the oom victim until it is
> > > > > unhashed and becomes invisible to the oom killer.
> > > > 
> > > > Then, what are the advantages of allowing only OOM victims access to memory
> > > > reserves after they have left exit_mm()?
> > > 
> > > Because they might need it in order to move on... Say you want to close
> > > all the files, which might release a considerable amount of memory, or free
> > > any other post-exit_mm() resources.
> > 
> > OOM victims might need memory reserves in order to move on, but non-OOM
> > victims might also need memory reserves in order to move on. And non-OOM
> > victims might be blocking OOM victims via locks.
> 
> Yes, that might be true, but OOM situations are rare events and quite
> limited in scope. Considering all exiting tasks is more dangerous
> because they might deplete those memory reserves easily.

Why do you assume that we grant all of the memory reserves?
I'm suggesting that we grant a portion of the memory reserves.
Killed/exiting tasks then cannot deplete the memory reserves.

> 
> > > > Since we assume that mm_struct is the primary source of memory consumption,
> > > > we don't select threads which have already left exit_mm(). Since we assume
> > > > that mm_struct is the primary source of memory consumption, why should we
> > > > distinguish OOM victims from non-OOM victims after they have left exit_mm()?
> > > 
> > > Because we might prevent pointless OOM killer selection that way.
> > 
> > That "might" sounds obscure to me.
> > 
> > If the currently allocating task is not an OOM victim, then not giving it
> > access to memory reserves will cause OOM victim selection.
> 
> Sure, that is true. I am talking about the case where the current victim
> tries to get out and exit, and it needs memory for that.
> 
> > We might prevent pointless OOM victim selection by giving
> > killed/exiting tasks access to memory reserves.
> 
> This will open the door to other problems, I am afraid. Please note that
> we are only trying to reduce the damage as much as possible. There is no
> 100% correct thing to do.

My suggestion (allowing only a portion of memory reserves) includes not
letting memory allocations done by killed/exiting tasks give up. That is,
try to guarantee that memory allocations for commit/cleanup operations do
not fail, without resorting to ALLOC_NO_WATERMARKS, for there is no means
for killed/exiting tasks to handle problems caused by memory allocation
failures.

> 
> > > If we know that the currently allocating task is an OOM victim then
> > > giving it access to memory reserves is preferable to selecting another
> > > oom victim.
> > 
> > If we know that the currently allocating task is killed/exiting then
> > giving it access to memory reserves is preferable to selecting another
> > OOM victim.
> 
> I believe this is getting off topic. Can we get back to the mm list
> vs signal::oom_mm decision? I have expressed one aspect that would speak
> for oom_mm, as it provides a persistent and easy-to-detect oom victim,
> which would be tricky with the mm list approach. Could you name some
> arguments which would speak for the mm list and would be a problem with
> the other approach?

I thought we were talking about future plans. I didn't know you were asking
for arguments which would speak for the mm list.

Since the mm list approach turned out to still need the victim's task_struct
in order to test the eligibility of the victim's mm, the signal::oom_mm
approach will make it easier to access both the victim's task_struct and the
victim's mm than the mm list approach would. I'm fine with the signal::oom_mm
approach regarding the oom_scan_process_thread() part.

But I don't like the use of ALLOC_NO_WATERMARKS by signal::oom_mm != NULL
tasks after they have passed exit_mm(). Such behavior may cause
post-exit_mm() allocation requests which might be doing commit/cleanup
operations to start failing once the reserves are depleted. I'm trying to
reduce the damage as much as possible by not giving up memory allocations
by OOM victims or by killed/exiting tasks (unless __GFP_KILLABLE is used
and the task is killed by SIGKILL). My approach will select the next OOM
victim when killed/exiting tasks cannot satisfy their allocation requests
even though some portion of memory reserves is granted, because my approach
does not use ALLOC_NO_WATERMARKS.
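
Roughly, the retry policy I have in mind in the allocator slow path
(__GFP_KILLABLE is my proposed flag and does not exist yet; the labels are
illustrative):

	if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
		/* callers able to handle failure opt in via __GFP_KILLABLE */
		if ((gfp_mask & __GFP_KILLABLE) && fatal_signal_pending(current))
			goto nopage;
		/* otherwise keep retrying with a portion of the reserves */
		alloc_flags |= ALLOC_HARDER;
		goto retry;
	}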


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v3 0/8] Change OOM killer to use list of mm_struct.
  2016-07-25 14:02                   ` Tetsuo Handa
@ 2016-07-25 14:17                     ` Michal Hocko
  2016-07-25 21:40                       ` Tetsuo Handa
  0 siblings, 1 reply; 32+ messages in thread
From: Michal Hocko @ 2016-07-25 14:17 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst

On Mon 25-07-16 23:02:35, Tetsuo Handa wrote:
> Michal Hocko wrote:
> > On Mon 25-07-16 20:47:03, Tetsuo Handa wrote:
> > > Michal Hocko wrote:
> > > > On Mon 25-07-16 20:07:11, Tetsuo Handa wrote:
> > > > > Michal Hocko wrote:
> > > > > > > Are you planning to change the scope where the OOM victims can access memory
> > > > > > > reserves?
> > > > > > 
> > > > > > Yes. Because we know that there are some post exit_mm allocations and I
> > > > > > do not want to get back to PF_EXITING and other tricks...
> > > > > > 
> > > > > > > (1) If you plan to allow the OOM victims to access memory reserves until
> > > > > > >     TASK_DEAD, tsk_is_oom_victim() will be as trivial as
> > > > > > > 
> > > > > > > bool tsk_is_oom_victim(struct task_struct *task)
> > > > > > > {
> > > > > > > 	return task->signal->oom_mm;
> > > > > > > }
> > > > > > 
> > > > > > Yes, exactly. That's what I tried to say above. With the oom_mm this is
> > > > > > trivial to implement, while mm lists will not help us much due to their
> > > > > > lifetime. This also means that we know about the oom victim until it is
> > > > > > unhashed and becomes invisible to the oom killer.
> > > > > 
> > > > > Then, what are the advantages of allowing only OOM victims access to memory
> > > > > reserves after they have left exit_mm()?
> > > > 
> > > > Because they might need it in order to move on... Say you want to close
> > > > all the files, which might release a considerable amount of memory, or free
> > > > any other post-exit_mm() resources.
> > > 
> > > OOM victims might need memory reserves in order to move on, but non-OOM
> > > victims might also need memory reserves in order to move on. And non-OOM
> > > victims might be blocking OOM victims via locks.
> > 
> > Yes, that might be true, but OOM situations are rare events and quite
> > limited in scope. Considering all exiting tasks is more dangerous
> > because they might deplete those memory reserves easily.
> 
> Why do you assume that we grant all of the memory reserves?

I've said deplete "those memory reserves". It would be just too easy to
exit many tasks at once and use up that memory.

> I'm suggesting that we grant a portion of the memory reserves.

Which doesn't solve anything, because it will always be a finite resource
which can get depleted. This is basically the same as the oom victim
(ab)using reserves, except that OOM is much less likely and it is under
the kernel's control which task gets killed.

[...]
> > > > If we know that the currently allocating task is an OOM victim then
> > > > giving it access to memory reserves is preferable to selecting another
> > > > oom victim.
> > > 
> > > If we know that the currently allocating task is killed/exiting then
> > > giving it access to memory reserves is preferable to selecting another
> > > OOM victim.
> > 
> > I believe this is getting off topic. Can we get back to the mm list
> > vs signal::oom_mm decision? I have expressed one aspect that would speak
> > for oom_mm, as it provides a persistent and easy-to-detect oom victim,
> > which would be tricky with the mm list approach. Could you name some
> > arguments which would speak for the mm list and would be a problem with
> > the other approach?
> 
> I thought we were talking about future plans. I didn't know you were asking
> for arguments which would speak for the mm list.

I brought up the future plans just because one part of them might be
much easier to implement if we go with the signal_struct-based approach.
As there was no other tie-breaker, I felt it could help us with the
decision.

> Since the mm list approach turned out to still need the victim's task_struct
> in order to test the eligibility of the victim's mm, the signal::oom_mm
> approach will make it easier to access both the victim's task_struct and the
> victim's mm than the mm list approach would. I'm fine with the signal::oom_mm
> approach regarding the oom_scan_process_thread() part.

OK
 
> But I don't like the use of ALLOC_NO_WATERMARKS by signal::oom_mm != NULL
> tasks after they have passed exit_mm().

I am not proposing that now, and we can discuss it later when an actual
patch exists. All I wanted to achieve now is to agree on the first step
and direction. If you are ok with the oom_mm approach and do not see any
strong reasons to prefer the mm list based one, then I will post my current
pile later this week. Then I would like to handle kthreads gracefully. I
guess this would be more than enough for this cycle before actually
meddling with TIF_MEMDIE.

Thanks!
-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v3 0/8] Change OOM killer to use list of mm_struct.
  2016-07-25 14:17                     ` Michal Hocko
@ 2016-07-25 21:40                       ` Tetsuo Handa
  2016-07-26  7:52                         ` Michal Hocko
  0 siblings, 1 reply; 32+ messages in thread
From: Tetsuo Handa @ 2016-07-25 21:40 UTC (permalink / raw)
  To: mhocko; +Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst

Michal Hocko wrote:
> On Mon 25-07-16 23:02:35, Tetsuo Handa wrote:
> > Michal Hocko wrote:
> > > On Mon 25-07-16 20:47:03, Tetsuo Handa wrote:
> > > > Michal Hocko wrote:
> > > > > On Mon 25-07-16 20:07:11, Tetsuo Handa wrote:
> > > > > > Michal Hocko wrote:
> > > > > > > > Are you planning to change the scope where the OOM victims can access memory
> > > > > > > > reserves?
> > > > > > > 
> > > > > > > Yes. Because we know that there are some post exit_mm allocations and I
> > > > > > > do not want to get back to PF_EXITING and other tricks...
> > > > > > > 
> > > > > > > > (1) If you plan to allow the OOM victims to access memory reserves until
> > > > > > > >     TASK_DEAD, tsk_is_oom_victim() will be as trivial as
> > > > > > > > 
> > > > > > > > bool tsk_is_oom_victim(struct task_struct *task)
> > > > > > > > {
> > > > > > > > 	return task->signal->oom_mm;
> > > > > > > > }
> > > > > > > 
> > > > > > > Yes, exactly. That's what I tried to say above. With the oom_mm this is
> > > > > > > trivial to implement, while mm lists will not help us much due to their
> > > > > > > lifetime. This also means that we know about the oom victim until it is
> > > > > > > unhashed and becomes invisible to the oom killer.
> > > > > > 
> > > > > > Then, what are the advantages of allowing only OOM victims access to memory
> > > > > > reserves after they have left exit_mm()?
> > > > > 
> > > > > Because they might need it in order to move on... Say you want to close
> > > > > all the files, which might release a considerable amount of memory, or free
> > > > > any other post-exit_mm() resources.
> > > > 
> > > > OOM victims might need memory reserves in order to move on, but non-OOM
> > > > victims might also need memory reserves in order to move on. And non-OOM
> > > > victims might be blocking OOM victims via locks.
> > > 
> > > Yes, that might be true, but OOM situations are rare events and quite
> > > limited in scope. Considering all exiting tasks is more dangerous
> > > because they might deplete those memory reserves easily.
> > 
> > Why do you assume that we grant all of the memory reserves?
> 
> I've said deplete "those memory reserves". It would be just too easy to
> exit many tasks at once and use up that memory.

But that will not be a problem unless an OOM event occurs. Even if some
portion of memory reserves is granted, killed/exiting tasks are unlikely
to access the memory reserves. If killed/exiting tasks do deplete that
portion of the memory reserves, it is reasonable to select an OOM victim.

> 
> > I'm suggesting that we grant a portion of the memory reserves.
> 
> Which doesn't solve anything, because it will always be a finite resource
> which can get depleted. This is basically the same as the oom victim
> (ab)using reserves, except that OOM is much less likely and it is under
> the kernel's control which task gets killed.

Given that OOM is a much less likely event, maybe we do not even need to use
task_struct->oom_reaper_list, and instead we can use a global variable

  static struct mm_struct *current_oom_mm;

and wait for current_oom_mm to become NULL regardless of the domain in which
an OOM event occurred (just as we changed to use the global oom_lock for
preventing concurrent OOM killer invocations)? Then we can determine
OOM_SCAN_ABORT by inspecting that variable. This change may defer invocation
of the OOM killer in different domains, but concurrent OOM events in
different domains will also be much less likely?
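
For example, the lifecycle could look like this (illustrative only; locking
details and the actual wait are omitted):

	/* when the OOM killer selects a victim (under oom_lock) */
	atomic_inc(&mm->mm_count);
	current_oom_mm = mm;

	/* when the OOM reaper or __mmput() is done with the mm */
	current_oom_mm = NULL;
	mmdrop(mm);

Any victim scan would then return OOM_SCAN_ABORT while current_oom_mm is
not NULL.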


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH v3 0/8] Change OOM killer to use list of mm_struct.
  2016-07-25 21:40                       ` Tetsuo Handa
@ 2016-07-26  7:52                         ` Michal Hocko
  0 siblings, 0 replies; 32+ messages in thread
From: Michal Hocko @ 2016-07-26  7:52 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: akpm, linux-mm, oleg, rientjes, vdavydov, mst

On Tue 26-07-16 06:40:54, Tetsuo Handa wrote:
> Michal Hocko wrote:
> > On Mon 25-07-16 23:02:35, Tetsuo Handa wrote:
> > > Michal Hocko wrote:
> > > > On Mon 25-07-16 20:47:03, Tetsuo Handa wrote:
> > > > > Michal Hocko wrote:
> > > > > > On Mon 25-07-16 20:07:11, Tetsuo Handa wrote:
[...]
> > > > > > > Then, what are the advantages of allowing only OOM victims access to memory
> > > > > > > reserves after they have left exit_mm()?
> > > > > > 
> > > > > > Because they might need it in order to move on... Say you want to close
> > > > > > all the files, which might release a considerable amount of memory, or free
> > > > > > any other post-exit_mm() resources.
> > > > > 
> > > > > OOM victims might need memory reserves in order to move on, but non-OOM
> > > > > victims might also need memory reserves in order to move on. And non-OOM
> > > > > victims might be blocking OOM victims via locks.
> > > > 
> > > > Yes, that might be true, but OOM situations are rare events and quite
> > > > limited in scope. Considering all exiting tasks is more dangerous
> > > > because they might deplete those memory reserves easily.
> > > 
> > > Why do you assume that we grant all of the memory reserves?
> > 
> > I've said deplete "those memory reserves". It would be just too easy to
> > exit many tasks at once and use up that memory.
> 
> But that will not be a problem unless an OOM event occurs.

And then it might make the problem just worse. I do not want to
speculate about adversarial workloads, but this just sounds like a bad
idea in general...

> Even if some
> portion of memory reserves is granted, killed/exiting tasks are unlikely
> to access the memory reserves. If killed/exiting tasks do deplete that
> portion of the memory reserves, it is reasonable to select an OOM victim.
> 
> > 
> > > I'm suggesting that we grant a portion of the memory reserves.
> > 
> > Which doesn't solve anything, because it will always be a finite resource
> > which can get depleted. This is basically the same as the oom victim
> > (ab)using reserves, except that OOM is much less likely and it is under
> > the kernel's control which task gets killed.
> 
> Given that OOM is a much less likely event, maybe we do not even need to use
> task_struct->oom_reaper_list, and instead we can use a global variable
> 
>   static struct mm_struct *current_oom_mm;
> 
> and wait for current_oom_mm to become NULL regardless of the domain in which
> an OOM event occurred (just as we changed to use the global oom_lock for
> preventing concurrent OOM killer invocations)?

Heh, this is very similar to what I had there in the beginning,
which you pushed to make into a list.

> Then we can determine OOM_SCAN_ABORT by
> inspecting that variable. This change may defer invocation of the OOM
> killer in different domains, but concurrent OOM events in different
> domains will also be much less likely?

Considering that there may be hundreds of memory cgroups configured,
I expect we will be pushed towards more parallelism in the future.

Anyway I think we went largely off topic.

-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2016-07-26  7:52 UTC | newest]

Thread overview: 32+ messages
2016-07-12 13:29 [PATCH v3 0/8] Change OOM killer to use list of mm_struct Tetsuo Handa
2016-07-12 13:29 ` [PATCH 1/8] mm,oom_reaper: Reduce find_lock_task_mm() usage Tetsuo Handa
2016-07-12 13:29 ` [PATCH 2/8] mm,oom_reaper: Do not attempt to reap a task twice Tetsuo Handa
2016-07-12 14:19   ` Michal Hocko
2016-07-12 13:29 ` [PATCH 3/8] mm,oom: Use list of mm_struct used by OOM victims Tetsuo Handa
2016-07-12 14:28   ` Michal Hocko
2016-07-12 13:29 ` [PATCH 4/8] mm,oom: Close oom_has_pending_mm race Tetsuo Handa
2016-07-12 14:36   ` Michal Hocko
2016-07-12 13:29 ` [PATCH 5/8] mm,oom_reaper: Make OOM reaper use list of mm_struct Tetsuo Handa
2016-07-12 14:51   ` Michal Hocko
2016-07-12 15:42     ` Tetsuo Handa
2016-07-13  7:48       ` Michal Hocko
2016-07-12 13:29 ` [PATCH 6/8] mm,oom: Remove OOM_SCAN_ABORT case and signal_struct->oom_victims Tetsuo Handa
2016-07-12 13:29 ` [PATCH 7/8] mm,oom: Stop clearing TIF_MEMDIE on remote thread Tetsuo Handa
2016-07-12 14:53   ` Michal Hocko
2016-07-12 15:45     ` Tetsuo Handa
2016-07-13  8:13       ` Michal Hocko
2016-07-12 13:29 ` [PATCH 8/8] oom_reaper: Revert "oom_reaper: close race with exiting task" Tetsuo Handa
2016-07-12 14:56   ` Michal Hocko
2016-07-21 11:21 ` [PATCH v3 0/8] Change OOM killer to use list of mm_struct Michal Hocko
2016-07-22 11:09   ` Tetsuo Handa
2016-07-22 12:05     ` Michal Hocko
2016-07-23  2:59       ` Tetsuo Handa
2016-07-25  8:48         ` Michal Hocko
2016-07-25 11:07           ` Tetsuo Handa
2016-07-25 11:21             ` Michal Hocko
2016-07-25 11:47               ` Tetsuo Handa
2016-07-25 11:59                 ` Michal Hocko
2016-07-25 14:02                   ` Tetsuo Handa
2016-07-25 14:17                     ` Michal Hocko
2016-07-25 21:40                       ` Tetsuo Handa
2016-07-26  7:52                         ` Michal Hocko
