* [PATCH 1/4] mm, oom: Remove wake_oom_reaper().
@ 2018-08-04 13:29 Tetsuo Handa
  2018-08-04 13:29 ` [PATCH 2/4] mm, oom: Check pending victims earlier in out_of_memory() Tetsuo Handa
                   ` (2 more replies)
  0 siblings, 3 replies; 38+ messages in thread
From: Tetsuo Handa @ 2018-08-04 13:29 UTC (permalink / raw)
  To: linux-mm; +Cc: Tetsuo Handa, David Rientjes, Michal Hocko, Roman Gushchin

Currently, wake_oom_reaper() checks whether an OOM victim thread is
already chained to the OOM reaper list. But chaining one thread from each
OOM victim process is sufficient.

Since mark_oom_victim() sets signal->oom_mm for only one thread of each
OOM victim process, we can chain that OOM victim thread there. Since
oom_lock is held during __oom_kill_process(), replacing oom_reaper_lock
with oom_lock makes it safe to do the chaining in mark_oom_victim() even
if MMF_OOM_SKIP is later set due to the is_global_init() case.
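
For illustration, the queueing side after this patch amounts to the
following (a condensed sketch of the mark_oom_victim() hunk below, with
tracing and the CONFIG_MMU guard omitted):

	/* Called with oom_lock held, which now also serializes the list. */
	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
		/* Only the first thread of each victim process gets here. */
		mmgrab(tsk->signal->oom_mm);
		get_task_struct(tsk);	/* dropped in oom_reap_task() */
		tsk->oom_reaper_list = oom_reaper_list;
		oom_reaper_list = tsk;
		wake_up(&oom_reaper_wait);
	}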

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Roman Gushchin <guro@fb.com>
---
 mm/oom_kill.c | 41 ++++++++++-------------------------------
 1 file changed, 10 insertions(+), 31 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 0e10b86..dad0409 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -486,7 +486,6 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
 static struct task_struct *oom_reaper_th;
 static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
 static struct task_struct *oom_reaper_list;
-static DEFINE_SPINLOCK(oom_reaper_lock);
 
 bool __oom_reap_task_mm(struct mm_struct *mm)
 {
@@ -607,7 +606,7 @@ static void oom_reap_task(struct task_struct *tsk)
 	 */
 	set_bit(MMF_OOM_SKIP, &mm->flags);
 
-	/* Drop a reference taken by wake_oom_reaper */
+	/* Drop a reference taken by mark_oom_victim(). */
 	put_task_struct(tsk);
 }
 
@@ -617,12 +616,12 @@ static int oom_reaper(void *unused)
 		struct task_struct *tsk = NULL;
 
 		wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
-		spin_lock(&oom_reaper_lock);
+		mutex_lock(&oom_lock);
 		if (oom_reaper_list != NULL) {
 			tsk = oom_reaper_list;
 			oom_reaper_list = tsk->oom_reaper_list;
 		}
-		spin_unlock(&oom_reaper_lock);
+		mutex_unlock(&oom_lock);
 
 		if (tsk)
 			oom_reap_task(tsk);
@@ -631,32 +630,12 @@ static int oom_reaper(void *unused)
 	return 0;
 }
 
-static void wake_oom_reaper(struct task_struct *tsk)
-{
-	/* tsk is already queued? */
-	if (tsk == oom_reaper_list || tsk->oom_reaper_list)
-		return;
-
-	get_task_struct(tsk);
-
-	spin_lock(&oom_reaper_lock);
-	tsk->oom_reaper_list = oom_reaper_list;
-	oom_reaper_list = tsk;
-	spin_unlock(&oom_reaper_lock);
-	trace_wake_reaper(tsk->pid);
-	wake_up(&oom_reaper_wait);
-}
-
 static int __init oom_init(void)
 {
 	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
 	return 0;
 }
 subsys_initcall(oom_init)
-#else
-static inline void wake_oom_reaper(struct task_struct *tsk)
-{
-}
 #endif /* CONFIG_MMU */
 
 /**
@@ -682,6 +661,13 @@ static void mark_oom_victim(struct task_struct *tsk)
 	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
 		mmgrab(tsk->signal->oom_mm);
 		set_bit(MMF_OOM_VICTIM, &mm->flags);
+#ifdef CONFIG_MMU
+		get_task_struct(tsk);
+		tsk->oom_reaper_list = oom_reaper_list;
+		oom_reaper_list = tsk;
+		trace_wake_reaper(tsk->pid);
+		wake_up(&oom_reaper_wait);
+#endif
 	}
 
 	/*
@@ -833,7 +819,6 @@ static void __oom_kill_process(struct task_struct *victim)
 {
 	struct task_struct *p;
 	struct mm_struct *mm;
-	bool can_oom_reap = true;
 
 	p = find_lock_task_mm(victim);
 	if (!p) {
@@ -883,7 +868,6 @@ static void __oom_kill_process(struct task_struct *victim)
 		if (same_thread_group(p, victim))
 			continue;
 		if (is_global_init(p)) {
-			can_oom_reap = false;
 			set_bit(MMF_OOM_SKIP, &mm->flags);
 			pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
 					task_pid_nr(victim), victim->comm,
@@ -900,9 +884,6 @@ static void __oom_kill_process(struct task_struct *victim)
 	}
 	rcu_read_unlock();
 
-	if (can_oom_reap)
-		wake_oom_reaper(victim);
-
 	mmdrop(mm);
 	put_task_struct(victim);
 }
@@ -941,7 +922,6 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 	task_lock(p);
 	if (task_will_free_mem(p)) {
 		mark_oom_victim(p);
-		wake_oom_reaper(p);
 		task_unlock(p);
 		put_task_struct(p);
 		return;
@@ -1071,7 +1051,6 @@ bool out_of_memory(struct oom_control *oc)
 	 */
 	if (task_will_free_mem(current)) {
 		mark_oom_victim(current);
-		wake_oom_reaper(current);
 		return true;
 	}
 
-- 
1.8.3.1

* [PATCH 2/4] mm, oom: Check pending victims earlier in out_of_memory().
  2018-08-04 13:29 [PATCH 1/4] mm, oom: Remove wake_oom_reaper() Tetsuo Handa
@ 2018-08-04 13:29 ` Tetsuo Handa
  2018-08-04 13:29 ` [PATCH 3/4] mm, oom: Remove unused "abort" path Tetsuo Handa
  2018-08-04 13:29 ` [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes Tetsuo Handa
  2 siblings, 0 replies; 38+ messages in thread
From: Tetsuo Handa @ 2018-08-04 13:29 UTC (permalink / raw)
  To: linux-mm; +Cc: Tetsuo Handa, David Rientjes, Michal Hocko, Roman Gushchin

In the CONFIG_MMU=y case, we have a list of inflight OOM victim threads
chained to oom_reaper_list. Therefore, by doing the same thing in the
CONFIG_MMU=n case, we can check whether there are inflight OOM victims
before starting the process/memcg list traversal. Since it is likely
that only a few threads are chained to this list at any time, checking
every victim's OOM domain will not matter.

To make this check work, we need to keep OOM victims chained until
MMF_OOM_SKIP is set. Thus, this patch changes the OOM reaper to wait for
a request from the OOM killer via the oom_reap_target variable. This
change also allows the OOM reaper to preferentially reclaim from the mm
which the OOM killer is waiting for the OOM reaper to reclaim.
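
The resulting flow in out_of_memory() can be summarized as follows (a
simplified sketch of the hunks below, not the literal code):

	check_panic_on_oom(oc, constraint);
	/*
	 * Is there an inflight victim in our OOM domain that has not been
	 * reaped yet? Then wait for it instead of selecting a new victim.
	 */
	if (oom_has_pending_victims(oc))
		return true;
	/* ... otherwise fall through to victim selection as before. */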

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Roman Gushchin <guro@fb.com>
---
 include/linux/oom.h   |  1 +
 include/linux/sched.h |  4 +--
 kernel/fork.c         |  2 ++
 mm/oom_kill.c         | 97 +++++++++++++++++++++++++++++----------------------
 4 files changed, 60 insertions(+), 44 deletions(-)

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 69864a5..4a147871 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -104,6 +104,7 @@ extern unsigned long oom_badness(struct task_struct *p,
 extern bool out_of_memory(struct oom_control *oc);
 
 extern void exit_oom_victim(void);
+extern void exit_oom_mm(struct mm_struct *mm);
 
 extern int register_oom_notifier(struct notifier_block *nb);
 extern int unregister_oom_notifier(struct notifier_block *nb);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9e686dc..589fe78 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1173,9 +1173,7 @@ struct task_struct {
 	unsigned long			task_state_change;
 #endif
 	int				pagefault_disabled;
-#ifdef CONFIG_MMU
-	struct task_struct		*oom_reaper_list;
-#endif
+	struct list_head		oom_victim_list;
 #ifdef CONFIG_VMAP_STACK
 	struct vm_struct		*stack_vm_area;
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 276fdc6..ba1260d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1010,6 +1010,8 @@ static inline void __mmput(struct mm_struct *mm)
 	}
 	if (mm->binfmt)
 		module_put(mm->binfmt->module);
+	if (unlikely(mm_is_oom_victim(mm)))
+		exit_oom_mm(mm);
 	mmdrop(mm);
 }
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dad0409..a743a8e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -321,18 +321,6 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 		goto next;
 
 	/*
-	 * This task already has access to memory reserves and is being killed.
-	 * Don't allow any other task to have access to the reserves unless
-	 * the task has MMF_OOM_SKIP because chances that it would release
-	 * any memory is quite low.
-	 */
-	if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
-		if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
-			goto next;
-		goto abort;
-	}
-
-	/*
 	 * If task is allocating a lot of memory and has been marked to be
 	 * killed first if it triggers an oom, then select it.
 	 */
@@ -356,11 +344,6 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 	oc->chosen_points = points;
 next:
 	return 0;
-abort:
-	if (oc->chosen)
-		put_task_struct(oc->chosen);
-	oc->chosen = (void *)-1UL;
-	return 1;
 }
 
 /*
@@ -478,6 +461,8 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
 	return false;
 }
 
+static LIST_HEAD(oom_victim_list);
+
 #ifdef CONFIG_MMU
 /*
  * OOM Reaper kernel thread which tries to reap the memory used by the OOM
@@ -485,7 +470,7 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
  */
 static struct task_struct *oom_reaper_th;
 static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
-static struct task_struct *oom_reaper_list;
+static struct task_struct *oom_reap_target;
 
 bool __oom_reap_task_mm(struct mm_struct *mm)
 {
@@ -598,33 +583,21 @@ static void oom_reap_task(struct task_struct *tsk)
 	debug_show_all_locks();
 
 done:
-	tsk->oom_reaper_list = NULL;
-
 	/*
 	 * Hide this mm from OOM killer because it has been either reaped or
 	 * somebody can't call up_write(mmap_sem).
 	 */
 	set_bit(MMF_OOM_SKIP, &mm->flags);
-
-	/* Drop a reference taken by mark_oom_victim(). */
-	put_task_struct(tsk);
 }
 
 static int oom_reaper(void *unused)
 {
 	while (true) {
-		struct task_struct *tsk = NULL;
-
-		wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
-		mutex_lock(&oom_lock);
-		if (oom_reaper_list != NULL) {
-			tsk = oom_reaper_list;
-			oom_reaper_list = tsk->oom_reaper_list;
-		}
-		mutex_unlock(&oom_lock);
-
-		if (tsk)
-			oom_reap_task(tsk);
+		wait_event_freezable(oom_reaper_wait, oom_reap_target != NULL);
+		oom_reap_task(oom_reap_target);
+		/* Drop a reference taken by oom_has_pending_victims(). */
+		put_task_struct(oom_reap_target);
+		oom_reap_target = NULL;
 	}
 
 	return 0;
@@ -661,13 +634,8 @@ static void mark_oom_victim(struct task_struct *tsk)
 	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
 		mmgrab(tsk->signal->oom_mm);
 		set_bit(MMF_OOM_VICTIM, &mm->flags);
-#ifdef CONFIG_MMU
 		get_task_struct(tsk);
-		tsk->oom_reaper_list = oom_reaper_list;
-		oom_reaper_list = tsk;
-		trace_wake_reaper(tsk->pid);
-		wake_up(&oom_reaper_wait);
-#endif
+		list_add(&tsk->oom_victim_list, &oom_victim_list);
 	}
 
 	/*
@@ -681,6 +649,21 @@ static void mark_oom_victim(struct task_struct *tsk)
 	trace_mark_victim(tsk->pid);
 }
 
+void exit_oom_mm(struct mm_struct *mm)
+{
+	struct task_struct *p, *tmp;
+
+	mutex_lock(&oom_lock);
+	list_for_each_entry_safe(p, tmp, &oom_victim_list, oom_victim_list) {
+		if (mm != p->signal->oom_mm)
+			continue;
+		list_del(&p->oom_victim_list);
+		/* Drop a reference taken by mark_oom_victim(). */
+		put_task_struct(p);
+	}
+	mutex_unlock(&oom_lock);
+}
+
 /**
  * exit_oom_victim - note the exit of an OOM victim
  */
@@ -1020,6 +1003,35 @@ int unregister_oom_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 
+static bool oom_has_pending_victims(struct oom_control *oc)
+{
+	struct task_struct *p;
+
+	if (is_sysrq_oom(oc))
+		return false;
+	/*
+	 * Since oom_reap_task()/exit_mmap() will set MMF_OOM_SKIP, let's
+	 * wait for pending victims until MMF_OOM_SKIP is set or __mmput()
+	 * completes.
+	 */
+	list_for_each_entry(p, &oom_victim_list, oom_victim_list) {
+		if (oom_unkillable_task(p, oc->memcg, oc->nodemask))
+			continue;
+		if (!test_bit(MMF_OOM_SKIP, &p->signal->oom_mm->flags)) {
+#ifdef CONFIG_MMU
+			if (!oom_reap_target) {
+				get_task_struct(p);
+				oom_reap_target = p;
+				trace_wake_reaper(p->pid);
+				wake_up(&oom_reaper_wait);
+			}
+#endif
+			return true;
+		}
+	}
+	return false;
+}
+
 /**
  * out_of_memory - kill the "best" process when we run out of memory
  * @oc: pointer to struct oom_control
@@ -1072,6 +1084,9 @@ bool out_of_memory(struct oom_control *oc)
 		oc->nodemask = NULL;
 	check_panic_on_oom(oc, constraint);
 
+	if (oom_has_pending_victims(oc))
+		return true;
+
 	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
 	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
 	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
-- 
1.8.3.1

* [PATCH 3/4] mm, oom: Remove unused "abort" path.
  2018-08-04 13:29 [PATCH 1/4] mm, oom: Remove wake_oom_reaper() Tetsuo Handa
  2018-08-04 13:29 ` [PATCH 2/4] mm, oom: Check pending victims earlier in out_of_memory() Tetsuo Handa
@ 2018-08-04 13:29 ` Tetsuo Handa
  2018-08-04 13:29 ` [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes Tetsuo Handa
  2 siblings, 0 replies; 38+ messages in thread
From: Tetsuo Handa @ 2018-08-04 13:29 UTC (permalink / raw)
  To: linux-mm; +Cc: Tetsuo Handa, David Rientjes, Michal Hocko, Roman Gushchin

Since oom_evaluate_task() no longer aborts the iteration, we can remove
the no-longer-used "abort" path from its callers.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Roman Gushchin <guro@fb.com>
---
 include/linux/memcontrol.h |  9 ++++-----
 mm/memcontrol.c            | 18 +++++-------------
 mm/oom_kill.c              | 34 ++++++++++++++++------------------
 3 files changed, 25 insertions(+), 36 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 652f602..396b01d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -417,8 +417,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
 				   struct mem_cgroup *,
 				   struct mem_cgroup_reclaim_cookie *);
 void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
-int mem_cgroup_scan_tasks(struct mem_cgroup *,
-			  int (*)(struct task_struct *, void *), void *);
+void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
+			   void (*fn)(struct task_struct *, void *), void *arg);
 
 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
 {
@@ -917,10 +917,9 @@ static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
 {
 }
 
-static inline int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
-		int (*fn)(struct task_struct *, void *), void *arg)
+static inline void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
+		void (*fn)(struct task_struct *, void *), void *arg)
 {
-	return 0;
 }
 
 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4e3c131..f743778 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1058,17 +1058,14 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
  * @arg: argument passed to @fn
  *
  * This function iterates over tasks attached to @memcg or to any of its
- * descendants and calls @fn for each task. If @fn returns a non-zero
- * value, the function breaks the iteration loop and returns the value.
- * Otherwise, it will iterate over all tasks and return 0.
+ * descendants and calls @fn for each task.
  *
  * This function must not be called for the root memory cgroup.
  */
-int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
-			  int (*fn)(struct task_struct *, void *), void *arg)
+void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
+			   void (*fn)(struct task_struct *, void *), void *arg)
 {
 	struct mem_cgroup *iter;
-	int ret = 0;
 
 	BUG_ON(memcg == root_mem_cgroup);
 
@@ -1077,15 +1074,10 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
 		struct task_struct *task;
 
 		css_task_iter_start(&iter->css, 0, &it);
-		while (!ret && (task = css_task_iter_next(&it)))
-			ret = fn(task, arg);
+		while ((task = css_task_iter_next(&it)))
+			fn(task, arg);
 		css_task_iter_end(&it);
-		if (ret) {
-			mem_cgroup_iter_break(memcg, iter);
-			break;
-		}
 	}
-	return ret;
 }
 
 /**
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index a743a8e..783f04d 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -312,13 +312,13 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
 	return CONSTRAINT_NONE;
 }
 
-static int oom_evaluate_task(struct task_struct *task, void *arg)
+static void oom_evaluate_task(struct task_struct *task, void *arg)
 {
 	struct oom_control *oc = arg;
 	unsigned long points;
 
 	if (oom_unkillable_task(task, NULL, oc->nodemask))
-		goto next;
+		return;
 
 	/*
 	 * If task is allocating a lot of memory and has been marked to be
@@ -331,24 +331,22 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 
 	points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
 	if (!points || points < oc->chosen_points)
-		goto next;
+		return;
 
 	/* Prefer thread group leaders for display purposes */
 	if (points == oc->chosen_points && thread_group_leader(oc->chosen))
-		goto next;
+		return;
 select:
 	if (oc->chosen)
 		put_task_struct(oc->chosen);
 	get_task_struct(task);
 	oc->chosen = task;
 	oc->chosen_points = points;
-next:
-	return 0;
 }
 
 /*
  * Simple selection loop. We choose the process with the highest number of
- * 'points'. In case scan was aborted, oc->chosen is set to -1.
+ * 'points'.
  */
 static void select_bad_process(struct oom_control *oc)
 {
@@ -359,8 +357,7 @@ static void select_bad_process(struct oom_control *oc)
 
 		rcu_read_lock();
 		for_each_process(p)
-			if (oom_evaluate_task(p, oc))
-				break;
+			oom_evaluate_task(p, oc);
 		rcu_read_unlock();
 	}
 
@@ -876,13 +873,12 @@ static void __oom_kill_process(struct task_struct *victim)
  * Kill provided task unless it's secured by setting
  * oom_score_adj to OOM_SCORE_ADJ_MIN.
  */
-static int oom_kill_memcg_member(struct task_struct *task, void *unused)
+static void oom_kill_memcg_member(struct task_struct *task, void *unused)
 {
 	if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
 		get_task_struct(task);
 		__oom_kill_process(task);
 	}
-	return 0;
 }
 
 static void oom_kill_process(struct oom_control *oc, const char *message)
@@ -1098,14 +1094,16 @@ bool out_of_memory(struct oom_control *oc)
 
 	select_bad_process(oc);
 	/* Found nothing?!?! Either we hang forever, or we panic. */
-	if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
-		dump_header(oc, NULL);
-		panic("Out of memory and no killable processes...\n");
+	if (!oc->chosen) {
+		if (!is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
+			dump_header(oc, NULL);
+			panic("Out of memory and no killable processes...\n");
+		}
+		return false;
 	}
-	if (oc->chosen && oc->chosen != (void *)-1UL)
-		oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
-				 "Memory cgroup out of memory");
-	return !!oc->chosen;
+	oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
+			 "Memory cgroup out of memory");
+	return true;
 }
 
 /*
-- 
1.8.3.1

* [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-08-04 13:29 [PATCH 1/4] mm, oom: Remove wake_oom_reaper() Tetsuo Handa
  2018-08-04 13:29 ` [PATCH 2/4] mm, oom: Check pending victims earlier in out_of_memory() Tetsuo Handa
  2018-08-04 13:29 ` [PATCH 3/4] mm, oom: Remove unused "abort" path Tetsuo Handa
@ 2018-08-04 13:29 ` Tetsuo Handa
  2018-08-06 13:45   ` Michal Hocko
  2 siblings, 1 reply; 38+ messages in thread
From: Tetsuo Handa @ 2018-08-04 13:29 UTC (permalink / raw)
  To: linux-mm; +Cc: Tetsuo Handa, David Rientjes, Michal Hocko, Roman Gushchin

David Rientjes is complaining about the current behavior where the OOM
killer selects the next OOM victim as soon as MMF_OOM_SKIP is set, even
if __oom_reap_task_mm() returned without making any progress.

To address this problem, this patch adds a timeout after MMF_OOM_SKIP is
set by the OOM reaper or exit_mmap(), using whether the OOM score of an
OOM victim's memory is decreasing over time as feedback.
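
With the constants used below, the victim's score (rss + swap + page
tables, see oom_victim_mm_score()) is re-sampled at most once per HZ/10
jiffies, i.e. every 100ms, and 30 consecutive samples without a decrease
are needed before giving up. Thus a completely stalled victim keeps the
OOM killer waiting for roughly 30 * 100ms = 3 seconds at a minimum, and
longer if out_of_memory() is invoked less often, since the comparison
only runs from the OOM killer path.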

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Roman Gushchin <guro@fb.com>
---
 include/linux/sched.h |  3 ++
 mm/oom_kill.c         | 81 ++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 63 insertions(+), 21 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 589fe78..70c7dfd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1174,6 +1174,9 @@ struct task_struct {
 #endif
 	int				pagefault_disabled;
 	struct list_head		oom_victim_list;
+	unsigned long			last_oom_compared;
+	unsigned long			last_oom_score;
+	unsigned char			oom_reap_stall_count;
 #ifdef CONFIG_VMAP_STACK
 	struct vm_struct		*stack_vm_area;
 #endif
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 783f04d..7cad886 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -49,6 +49,12 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/oom.h>
 
+static inline unsigned long oom_victim_mm_score(struct mm_struct *mm)
+{
+	return get_mm_rss(mm) + get_mm_counter(mm, MM_SWAPENTS) +
+		mm_pgtables_bytes(mm) / PAGE_SIZE;
+}
+
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks = 1;
@@ -230,8 +236,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	 * The baseline for the badness score is the proportion of RAM that each
 	 * task's rss, pagetable and swap space use.
 	 */
-	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
-		mm_pgtables_bytes(p->mm) / PAGE_SIZE;
+	points = oom_victim_mm_score(p->mm);
 	task_unlock(p);
 
 	/* Normalize to oom_score_adj units */
@@ -571,15 +576,6 @@ static void oom_reap_task(struct task_struct *tsk)
 	while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
 		schedule_timeout_idle(HZ/10);
 
-	if (attempts <= MAX_OOM_REAP_RETRIES ||
-	    test_bit(MMF_OOM_SKIP, &mm->flags))
-		goto done;
-
-	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
-		task_pid_nr(tsk), tsk->comm);
-	debug_show_all_locks();
-
-done:
 	/*
 	 * Hide this mm from OOM killer because it has been either reaped or
 	 * somebody can't call up_write(mmap_sem).
@@ -631,6 +627,9 @@ static void mark_oom_victim(struct task_struct *tsk)
 	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
 		mmgrab(tsk->signal->oom_mm);
 		set_bit(MMF_OOM_VICTIM, &mm->flags);
+		tsk->last_oom_compared = jiffies;
+		tsk->last_oom_score = oom_victim_mm_score(mm);
+		tsk->oom_reap_stall_count = 0;
 		get_task_struct(tsk);
 		list_add(&tsk->oom_victim_list, &oom_victim_list);
 	}
@@ -867,7 +866,6 @@ static void __oom_kill_process(struct task_struct *victim)
 	mmdrop(mm);
 	put_task_struct(victim);
 }
-#undef K
 
 /*
  * Kill provided task unless it's secured by setting
@@ -999,33 +997,74 @@ int unregister_oom_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 
+static bool victim_mm_stalling(struct task_struct *p, struct mm_struct *mm)
+{
+	unsigned long score;
+
+	if (time_before(jiffies, p->last_oom_compared + HZ / 10))
+		return false;
+	score = oom_victim_mm_score(mm);
+	if (score < p->last_oom_score)
+		p->oom_reap_stall_count = 0;
+	else
+		p->oom_reap_stall_count++;
+	p->last_oom_score = score;
+	p->last_oom_compared = jiffies;
+	if (p->oom_reap_stall_count < 30)
+		return false;
+	pr_info("Gave up waiting for process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
+		task_pid_nr(p), p->comm, K(mm->total_vm),
+		K(get_mm_counter(mm, MM_ANONPAGES)),
+		K(get_mm_counter(mm, MM_FILEPAGES)),
+		K(get_mm_counter(mm, MM_SHMEMPAGES)));
+	return true;
+}
+
 static bool oom_has_pending_victims(struct oom_control *oc)
 {
-	struct task_struct *p;
+	struct task_struct *p, *tmp;
+	bool ret = false;
+	bool gaveup = false;
 
 	if (is_sysrq_oom(oc))
 		return false;
 	/*
-	 * Since oom_reap_task()/exit_mmap() will set MMF_OOM_SKIP, let's
-	 * wait for pending victims until MMF_OOM_SKIP is set or __mmput()
-	 * completes.
+	 * Wait for pending victims until __mmput() completes or they have
+	 * stalled for too long.
 	 */
-	list_for_each_entry(p, &oom_victim_list, oom_victim_list) {
+	list_for_each_entry_safe(p, tmp, &oom_victim_list, oom_victim_list) {
+		struct mm_struct *mm = p->signal->oom_mm;
+
 		if (oom_unkillable_task(p, oc->memcg, oc->nodemask))
 			continue;
-		if (!test_bit(MMF_OOM_SKIP, &p->signal->oom_mm->flags)) {
+		ret = true;
 #ifdef CONFIG_MMU
+		/*
+		 * Since the OOM reaper exists, we can safely wait until
+		 * MMF_OOM_SKIP is set.
+		 */
+		if (!test_bit(MMF_OOM_SKIP, &mm->flags)) {
 			if (!oom_reap_target) {
 				get_task_struct(p);
 				oom_reap_target = p;
 				trace_wake_reaper(p->pid);
 				wake_up(&oom_reaper_wait);
 			}
-#endif
-			return true;
+			continue;
 		}
+#endif
+		/* We can wait as long as OOM score is decreasing over time. */
+		if (!victim_mm_stalling(p, mm))
+			continue;
+		gaveup = true;
+		list_del(&p->oom_victim_list);
+		/* Drop a reference taken by mark_oom_victim(). */
+		put_task_struct(p);
 	}
-	return false;
+	if (gaveup)
+		debug_show_all_locks();
+
+	return ret;
 }
 
 /**
-- 
1.8.3.1

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-08-04 13:29 ` [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes Tetsuo Handa
@ 2018-08-06 13:45   ` Michal Hocko
  2018-08-06 20:19     ` David Rientjes
  0 siblings, 1 reply; 38+ messages in thread
From: Michal Hocko @ 2018-08-06 13:45 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: linux-mm, David Rientjes, Roman Gushchin

On Sat 04-08-18 22:29:46, Tetsuo Handa wrote:
> David Rientjes is complaining about the current behavior where the OOM
> killer selects the next OOM victim as soon as MMF_OOM_SKIP is set, even
> if __oom_reap_task_mm() returned without making any progress.
> 
> To address this problem, this patch adds a timeout after MMF_OOM_SKIP is
> set by the OOM reaper or exit_mmap(), using whether the OOM score of an
> OOM victim's memory is decreasing over time as feedback.

I still hate any feedback mechanism based on time. We have seen that
these paths are completely non-deterministic time-wise, so building
any heuristic on top of them just sounds wrong.

Yes, we have problems in that the oom reaper doesn't handle all types of
memory yet. We should cover most reasonably large memory types by now.
There is still mlock to take care of, and that would be much preferable
to work on regardless of the retry mechanism, because this work will
simply not handle that case either.

So I do not really see how this would be an improvement. I still stand
by my argument that any retry mechanism should be based on direct
feedback from the oom reaper rather than some magic "this took that long
without any progress".

[...]

-- 
Michal Hocko
SUSE Labs

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-08-06 13:45   ` Michal Hocko
@ 2018-08-06 20:19     ` David Rientjes
  2018-08-06 20:51       ` Michal Hocko
  0 siblings, 1 reply; 38+ messages in thread
From: David Rientjes @ 2018-08-06 20:19 UTC (permalink / raw)
  To: Michal Hocko; +Cc: Tetsuo Handa, linux-mm, Roman Gushchin

On Mon, 6 Aug 2018, Michal Hocko wrote:

> On Sat 04-08-18 22:29:46, Tetsuo Handa wrote:
> > David Rientjes is complaining about the current behavior where the OOM
> > killer selects the next OOM victim as soon as MMF_OOM_SKIP is set, even
> > if __oom_reap_task_mm() returned without making any progress.
> > 
> > To address this problem, this patch adds a timeout after MMF_OOM_SKIP is
> > set by the OOM reaper or exit_mmap(), using whether the OOM score of an
> > OOM victim's memory is decreasing over time as feedback.
> 
> I still hate any feedback mechanism based on time. We have seen that
> these paths are completely non-deterministic time-wise, so building
> any heuristic on top of them just sounds wrong.
> 
> Yes, we have problems in that the oom reaper doesn't handle all types of
> memory yet. We should cover most reasonably large memory types by now.
> There is still mlock to take care of, and that would be much preferable
> to work on regardless of the retry mechanism, because this work will
> simply not handle that case either.
> 
> So I do not really see how this would be an improvement. I still stand
> by my argument that any retry mechanism should be based on direct
> feedback from the oom reaper rather than some magic "this took that long
> without any progress".
> 

At the risk of continually repeating the same statement, the oom reaper 
cannot provide the direct feedback for all possible memory freeing.  
Waking up periodically and finding mm->mmap_sem contended is one problem, 
but the other problem that I've already shown is the unnecessary oom 
killing of additional processes while a thread has already reached 
exit_mmap().  The oom reaper cannot free page tables which is problematic 
for malloc implementations such as tcmalloc that do not release virtual 
memory.  For binaries with heaps that are very large, sometimes over 
100GB, this is a substantial amount of memory and we have seen unnecessary 
oom killing before and during free_pgtables() of the victim.  This is long 
after the oom reaper would operate on any mm.
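
A back-of-the-envelope figure (illustrative numbers, assuming x86_64
with 4 KB pages, where one 4 KB PTE page maps 2 MB of address space):

	100 GB mapped / 2 MB per PTE page = 51,200 PTE pages
	51,200 PTE pages * 4 KB           = ~200 MB of page tables

None of that ~200 MB is reclaimable by the oom reaper; it is only freed
by free_pgtables() in exit_mmap().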

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-08-06 20:19     ` David Rientjes
@ 2018-08-06 20:51       ` Michal Hocko
  2018-08-09 20:16         ` David Rientjes
  2018-09-01 11:48         ` Tetsuo Handa
  0 siblings, 2 replies; 38+ messages in thread
From: Michal Hocko @ 2018-08-06 20:51 UTC (permalink / raw)
  To: David Rientjes; +Cc: Tetsuo Handa, linux-mm, Roman Gushchin

On Mon 06-08-18 13:19:18, David Rientjes wrote:
> On Mon, 6 Aug 2018, Michal Hocko wrote:
> 
> > On Sat 04-08-18 22:29:46, Tetsuo Handa wrote:
> > > David Rientjes is complaining about the current behavior where the OOM
> > > killer selects the next OOM victim as soon as MMF_OOM_SKIP is set, even
> > > if __oom_reap_task_mm() returned without making any progress.
> > > 
> > > To address this problem, this patch adds a timeout after MMF_OOM_SKIP is
> > > set by the OOM reaper or exit_mmap(), using whether the OOM score of an
> > > OOM victim's memory is decreasing over time as feedback.
> > 
> > I still hate any feedback mechanism based on time. We have seen that
> > these paths are completely non-deterministic time-wise, so building
> > any heuristic on top of them just sounds wrong.
> > 
> > Yes, we have problems in that the oom reaper doesn't handle all types of
> > memory yet. We should cover most reasonably large memory types by now.
> > There is still mlock to take care of, and that would be much preferable
> > to work on regardless of the retry mechanism, because this work will
> > simply not handle that case either.
> > 
> > So I do not really see how this would be an improvement. I still stand
> > by my argument that any retry mechanism should be based on direct
> > feedback from the oom reaper rather than some magic "this took that long
> > without any progress".
> > 
> 
> At the risk of continually repeating the same statement, the oom reaper 
> cannot provide the direct feedback for all possible memory freeing.  
> Waking up periodically and finding mm->mmap_sem contended is one problem, 
> but the other problem that I've already shown is the unnecessary oom 
> killing of additional processes while a thread has already reached 
> exit_mmap().  The oom reaper cannot free page tables which is problematic 
> for malloc implementations such as tcmalloc that do not release virtual 
> memory. 

But once we know that the exit path is past the point of blocking, we can
have an MMF_OOM_SKIP handover from the oom_reaper to the exit path. That
way the oom_reaper doesn't hide the current victim too early and we can
safely wait for the exit path to reclaim the rest. So there is a feedback
channel. I would not even mind polling for that state a few times -
similar to polling for the mmap_sem. But it would still be some feedback
rather than "a certain amount of time has passed since the last check".
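
A minimal sketch of that handover (MMF_OOM_MMAP is the hypothetical flag
name that comes up later in this thread):

	/* exit path, once it is past the last point that can block: */
	set_bit(MMF_OOM_MMAP, &mm->flags);

	/* oom_reaper, when it is done retrying: */
	if (!test_bit(MMF_OOM_MMAP, &mm->flags))
		set_bit(MMF_OOM_SKIP, &mm->flags);
	/* else the exit path finishes the teardown and reclaims the rest */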

> For binaries with heaps that are very large, sometimes over 
> 100GB, this is a substantial amount of memory and we have seen unnecessary 
> oom killing before and during free_pgtables() of the victim.  This is long 
> after the oom reaper would operate on any mm.

Well, a specific example would be really helpful. I have to admit I
haven't seen many oom victims without any memory mapped to the address
space. I can construct pathological corner cases of course, but well, is
this a reasonable usecase to base the implementation on? A malicious user
can usually find other ways to hurt the system, and that's why it
should be contained.
-- 
Michal Hocko
SUSE Labs

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-08-06 20:51       ` Michal Hocko
@ 2018-08-09 20:16         ` David Rientjes
  2018-08-10  9:07           ` Michal Hocko
  2018-09-01 11:48         ` Tetsuo Handa
  1 sibling, 1 reply; 38+ messages in thread
From: David Rientjes @ 2018-08-09 20:16 UTC (permalink / raw)
  To: Michal Hocko; +Cc: Tetsuo Handa, linux-mm, Roman Gushchin

On Mon, 6 Aug 2018, Michal Hocko wrote:

> > At the risk of continually repeating the same statement, the oom reaper 
> > cannot provide the direct feedback for all possible memory freeing.  
> > Waking up periodically and finding mm->mmap_sem contended is one problem, 
> > but the other problem that I've already shown is the unnecessary oom 
> > killing of additional processes while a thread has already reached 
> > exit_mmap().  The oom reaper cannot free page tables which is problematic 
> > for malloc implementations such as tcmalloc that do not release virtual 
> > memory. 
> 
> But once we know that the exit path is past the point of blocking, we can
> have an MMF_OOM_SKIP handover from the oom_reaper to the exit path. That
> way the oom_reaper doesn't hide the current victim too early and we can
> safely wait for the exit path to reclaim the rest. So there is a feedback
> channel. I would not even mind polling for that state a few times -
> similar to polling for the mmap_sem. But it would still be some feedback
> rather than "a certain amount of time has passed since the last check".
> 

Yes, of course, it would be easy to rely on exit_mmap() to set 
MMF_OOM_SKIP itself and have the oom reaper drop the task from its list 
when we are assured of forward progress.  What polling are you proposing 
other than a timeout based mechanism to do this?

We could set a MMF_EXIT_MMAP in exit_mmap() to specify that it will 
complete free_pgtables() for that mm.  The problem is the same: when does 
the oom reaper decide to set MMF_OOM_SKIP because MMF_EXIT_MMAP has not 
been set in a timely manner?

If this is an argument that the oom reaper should loop checking for 
MMF_EXIT_MMAP and doing schedule_timeout(1) a set number of times rather 
than just setting the jiffies in the mm itself, that's just implementing 
the same thing and doing so in a way where the oom reaper stalls operating 
on a single mm rather than round-robin iterating over mm's in my patch.

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-08-09 20:16         ` David Rientjes
@ 2018-08-10  9:07           ` Michal Hocko
  2018-08-10 10:54             ` Tetsuo Handa
  2018-08-19 23:45             ` David Rientjes
  0 siblings, 2 replies; 38+ messages in thread
From: Michal Hocko @ 2018-08-10  9:07 UTC (permalink / raw)
  To: David Rientjes; +Cc: Tetsuo Handa, linux-mm, Roman Gushchin

On Thu 09-08-18 13:16:25, David Rientjes wrote:
> On Mon, 6 Aug 2018, Michal Hocko wrote:
> 
> > > At the risk of continually repeating the same statement, the oom reaper 
> > > cannot provide the direct feedback for all possible memory freeing.  
> > > Waking up periodically and finding mm->mmap_sem contended is one problem, 
> > > but the other problem that I've already shown is the unnecessary oom 
> > > killing of additional processes while a thread has already reached 
> > > exit_mmap().  The oom reaper cannot free page tables which is problematic 
> > > for malloc implementations such as tcmalloc that do not release virtual 
> > > memory. 
> > 
> > But once we know that the exit path is past the point of blocking, we can
> > have an MMF_OOM_SKIP handover from the oom_reaper to the exit path. That
> > way the oom_reaper doesn't hide the current victim too early and we can
> > safely wait for the exit path to reclaim the rest. So there is a feedback
> > channel. I would not even mind polling for that state a few times -
> > similar to polling for the mmap_sem. But it would still be some feedback
> > rather than "a certain amount of time has passed since the last check".
> > 
> 
> Yes, of course, it would be easy to rely on exit_mmap() to set 
> MMF_OOM_SKIP itself and have the oom reaper drop the task from its list 
> when we are assured of forward progress.  What polling are you proposing 
> other than a timeout based mechanism to do this?

I was thinking about doing something like the following (a rough sketch
follows the list):
- oom_reaper checks the amount of the victim's memory after it is done
  with reaping (e.g. by calling oom_badness before and after). If it
  wasn't able to reclaim much, then return false and keep retrying with
  the existing mechanism
- once a flag (e.g. MMF_OOM_MMAP) is set, it bails out and won't set the
  MMF_OOM_SKIP flag.
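
Roughly (a sketch only; FEW_KB_IN_PAGES is a made-up threshold and
totalpages is assumed to be available here; oom_badness() arguments as
in the current tree):

	/* in oom_reap_task(), sketch of the progress check: */
	unsigned long before, after;

	before = oom_badness(tsk, NULL, NULL, totalpages);
	oom_reap_task_mm(tsk, mm);
	after = oom_badness(tsk, NULL, NULL, totalpages);

	if (test_bit(MMF_OOM_MMAP, &mm->flags))
		return;		/* the exit path owns MMF_OOM_SKIP now */
	if (after + FEW_KB_IN_PAGES > before)
		return;		/* not much progress; keep retrying */
	set_bit(MMF_OOM_SKIP, &mm->flags);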

> We could set a MMF_EXIT_MMAP in exit_mmap() to specify that it will 
> complete free_pgtables() for that mm.  The problem is the same: when does 
> the oom reaper decide to set MMF_OOM_SKIP because MMF_EXIT_MMAP has not 
> been set in a timely manner?

reuse the current retry policy which is the number of attempts rather
than any timeout.

> If this is an argument that the oom reaper should loop checking for 
> MMF_EXIT_MMAP and doing schedule_timeout(1) a set number of times rather 
> than just setting the jiffies in the mm itself, that's just implementing 
> the same thing and doing so in a way where the oom reaper stalls operating 
> on a single mm rather than round-robin iterating over mm's in my patch.

I've said earlier that I do not mind doing round robin in the oom reaper
but this is certainly more complex than what we do now and I haven't
seen any actual example where it would matter. The OOM reaper is a safety
measure. Nothing should fall apart if it is slow. The primary work
should be happening from the exit path anyway.
-- 
Michal Hocko
SUSE Labs

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-08-10  9:07           ` Michal Hocko
@ 2018-08-10 10:54             ` Tetsuo Handa
  2018-08-10 11:16               ` Michal Hocko
  2018-08-19 23:45             ` David Rientjes
  1 sibling, 1 reply; 38+ messages in thread
From: Tetsuo Handa @ 2018-08-10 10:54 UTC (permalink / raw)
  To: Michal Hocko, David Rientjes; +Cc: linux-mm, Roman Gushchin

On 2018/08/10 18:07, Michal Hocko wrote:
>> Yes, of course, it would be easy to rely on exit_mmap() to set 
>> MMF_OOM_SKIP itself and have the oom reaper drop the task from its list 
>> when we are assured of forward progress.  What polling are you proposing 
>> other than a timeout based mechanism to do this?
> 
> I was thinking about doing something like the following
> - oom_reaper checks the amount of victim's memory after it is done with
>   reaping (e.g. by calling oom_badness before and after). 

OK. We can apply

+static inline unsigned long oom_victim_mm_score(struct mm_struct *mm)
+{
+	return get_mm_rss(mm) + get_mm_counter(mm, MM_SWAPENTS) +
+		mm_pgtables_bytes(mm) / PAGE_SIZE;
+}

and

-	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
-		mm_pgtables_bytes(p->mm) / PAGE_SIZE;
+	points = oom_victim_mm_score(p->mm);

part, can't we?

>                                                           If it wasn't able to
>   reclaim much then return false and keep retrying with the existing
>   mechanism

How do you decide whether oom_reaper() was not able to reclaim much?

> - once a flag (e.g. MMF_OOM_MMAP) is set it bails out and won't set the
>   MMF_OOM_SKIP flag.

Unless oom_victim_mm_score() becomes close to 0, setting MMF_OOM_SKIP is
considered premature. oom_reaper() will have to keep retrying....

> 
>> We could set a MMF_EXIT_MMAP in exit_mmap() to specify that it will 
>> complete free_pgtables() for that mm.  The problem is the same: when does 
>> the oom reaper decide to set MMF_OOM_SKIP because MMF_EXIT_MMAP has not 
>> been set in a timely manner?
> 
> reuse the current retry policy which is the number of attempts rather
> than any timeout.

And this is something I really can't understand. The number of attempts
multiplied by the retry interval _is_ nothing but a timeout.

We are already using a timeout-based decision, with some attempt to
reclaim memory if conditions are met.

> 
>> If this is an argument that the oom reaper should loop checking for 
>> MMF_EXIT_MMAP and doing schedule_timeout(1) a set number of times rather 
>> than just setting the jiffies in the mm itself, that's just implementing 
>> the same thing and doing so in a way where the oom reaper stalls operating 
>> on a single mm rather than round-robin iterating over mm's in my patch.
> 
> I've said earlier that I do not mind doing round robin in the oom reaper
> but this is certainly more complex than what we do now and I haven't
> seen any actual example where it would matter. The OOM reaper is a safety
> measure. Nothing should fall apart if it is slow.

The OOM reaper can fail if allocating threads have high priority. You seem to
assume that realtime threads won't trigger OOM path. But since !PF_WQ_WORKER
threads do only cond_resched() due to your "the cargo cult programming" refusal,
and like Andrew Morton commented

  cond_resched() is a no-op in the presence of realtime policy threads
  and using to attempt to yield to a different thread it in this fashion
  is broken.

at "mm: disable preemption before swapcache_free" thread, we can't guarantee
that allocating threads shall give the OOM reaper enough CPU resource for
making forward progress. And my direct OOM reaping proposal was also refused
by you. I really dislike counting OOM reaper as a safety measure.

>                                                   The primary work
> should be happening from the exit path anyway.
> 

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-08-10 10:54             ` Tetsuo Handa
@ 2018-08-10 11:16               ` Michal Hocko
  2018-08-11  3:12                 ` Tetsuo Handa
  0 siblings, 1 reply; 38+ messages in thread
From: Michal Hocko @ 2018-08-10 11:16 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: David Rientjes, linux-mm, Roman Gushchin

On Fri 10-08-18 19:54:39, Tetsuo Handa wrote:
> On 2018/08/10 18:07, Michal Hocko wrote:
> >> Yes, of course, it would be easy to rely on exit_mmap() to set 
> >> MMF_OOM_SKIP itself and have the oom reaper drop the task from its list 
> >> when we are assured of forward progress.  What polling are you proposing 
> >> other than a timeout based mechanism to do this?
> > 
> > I was thinking about doing something like the following
> > - oom_reaper checks the amount of victim's memory after it is done with
> >   reaping (e.g. by calling oom_badness before and after). 
> 
> OK. We can apply
> 
> +static inline unsigned long oom_victim_mm_score(struct mm_struct *mm)
> +{
> +	return get_mm_rss(mm) + get_mm_counter(mm, MM_SWAPENTS) +
> +		mm_pgtables_bytes(mm) / PAGE_SIZE;
> +}
> 
> and
> 
> -	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
> -		mm_pgtables_bytes(p->mm) / PAGE_SIZE;
> +	points = oom_victim_mm_score(p->mm);
> 
> part, can't we?
> 
> >                                                           If it wasn't able to
> >   reclaim much then return false and keep retrying with the existing
> >   mechanism
> 
> How do you decide whether oom_reaper() was not able to reclaim much?

Just a rule of thumb. If it freed at least a few kBs then we should be
good to set MMF_OOM_SKIP.
 
> > - once a flag (e.g. MMF_OOM_MMAP) is set it bails out and won't set the
> >   MMF_OOM_SKIP flag.
> 
> Unless oom_victim_mm_score() becomes close to 0, setting MMF_OOM_SKIP is
> considered premature. oom_reaper() will have to keep retrying....

There absolutely has to be a cap for retrying. Otherwise you have
lockup scenarios back when the memory is mostly consumed by page tables.

> >> We could set a MMF_EXIT_MMAP in exit_mmap() to specify that it will 
> >> complete free_pgtables() for that mm.  The problem is the same: when does 
> >> the oom reaper decide to set MMF_OOM_SKIP because MMF_EXIT_MMAP has not 
> >> been set in a timely manner?
> > 
> > reuse the current retry policy which is the number of attempts rather
> > than any timeout.
> 
> And this is something I really can't understand. The number of attempts
> multiplied by the retry interval _is_ nothing but a timeout.

Yes, it is a timeout, but it is not the time that matters. It is that we
have tried a sufficient number of times. Look at it this way: you can
retry 5 times in 10s or just once, depending on what is going on in the
system. I would really prefer the behavior to be deterministic.

> We are already using a timeout-based decision, with some attempt to
> reclaim memory if conditions are met.

A timeout-based decision is when you, well, make a decision after a
certain time passes. And we do not do that.
 
> >> If this is an argument that the oom reaper should loop checking for 
> >> MMF_EXIT_MMAP and doing schedule_timeout(1) a set number of times rather 
> >> than just setting the jiffies in the mm itself, that's just implementing 
> >> the same thing and doing so in a way where the oom reaper stalls operating 
> >> on a single mm rather than round-robin iterating over mm's in my patch.
> > 
> > I've said earlier that I do not mind doing round robin in the oom reaper
> > but this is certainly more complex than what we do now and I haven't
> > seen any actual example where it would matter. The OOM reaper is a safety
> > measure. Nothing should fall apart if it is slow.
> 
> The OOM reaper can fail if allocating threads have high priority. You seem to
> assume that realtime threads won't trigger OOM path. But since !PF_WQ_WORKER
> threads do only cond_resched() due to your "the cargo cult programming" refusal,
> and like Andrew Morton commented
> 
>   cond_resched() is a no-op in the presence of realtime policy threads
>   and using to attempt to yield to a different thread it in this fashion
>   is broken.
> 
> at "mm: disable preemption before swapcache_free" thread, we can't guarantee
> that allocating threads shall give the OOM reaper enough CPU resource for
> making forward progress. And my direct OOM reaping proposal was also refused
> by you. I really dislike counting OOM reaper as a safety measure.

Well, yeah, you can screw up your system with real time priority tasks
all you want. I really fail to see why you are bringing that up now
though. Yet another offtopic?

-- 
Michal Hocko
SUSE Labs

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-08-10 11:16               ` Michal Hocko
@ 2018-08-11  3:12                 ` Tetsuo Handa
  2018-08-14 11:33                   ` Michal Hocko
  0 siblings, 1 reply; 38+ messages in thread
From: Tetsuo Handa @ 2018-08-11  3:12 UTC (permalink / raw)
  To: Michal Hocko; +Cc: David Rientjes, linux-mm, Roman Gushchin

On 2018/08/10 20:16, Michal Hocko wrote:
>> How do you decide whether oom_reaper() was not able to reclaim much?
> 
> Just a rule of thumb. If it freed at least a few kBs then we should be
> good to set MMF_OOM_SKIP.

I don't think so. We are talking about situations where MMF_OOM_SKIP is
set before enough memory was reclaimed to prevent the OOM killer from
selecting the next OOM victim.

>> Unless oom_victim_mm_score() becomes close to 0, setting MMF_OOM_SKIP is
>> considered premature. oom_reaper() will have to keep retrying....
> 
> There absolutely has to be a cap for retrying. Otherwise you have
> lockup scenarios back when the memory is mostly consumed by page tables.

Right, we absolutely need a cap for retrying.

>>>> We could set a MMF_EXIT_MMAP in exit_mmap() to specify that it will 
>>>> complete free_pgtables() for that mm.  The problem is the same: when does 
>>>> the oom reaper decide to set MMF_OOM_SKIP because MMF_EXIT_MMAP has not 
>>>> been set in a timely manner?
>>>
>>> reuse the current retry policy which is the number of attempts rather
>>> than any timeout.
>>
>> And this is really I can't understand. The number of attempts multiplied
>> by retry interval _is_ nothing but timeout.
> 
> Yes, it is a timeout, but it is not the time that matters. It is that we
> have tried a sufficient number of times. Look at it this way: you can
> retry 5 times in 10s or just once, depending on what is going on in the
> system. I would really prefer the behavior to be deterministic.

What is the difference between

// Reclaim attempt by the OOM reaper
	for_each_OOM_victim_mm_without_MMF_OOM_SKIP {
		for (attempts = 0; attempts < MAX_OOM_REAP_RETRIES &&
		     !test_bit(MMF_EXIT_MMAP, &mm->flags); attempts++) {
			oom_reap_task_mm(tsk, mm);
			schedule_timeout_idle(HZ/10);
		}
		// It is time to make final decision
		if (test_bit(MMF_EXIT_MMAP, &mm->flags))
			continue;
		pr_info("Gave up waiting for process %d (%s) ...\n", ...);
		set_bit(MMF_OOM_SKIP, &mm->flags); // Allow selecting next OOM victim.
	}

(I assume this is what you call "reuse the current retry policy") and

// Initialization at mark_oom_victim()
	mm->last_reap_attempted = jiffies;
	mm->reap_attempted = 0;

// Reclaim attempt by allocating thread
	// Try allocation while waiting before oom_reap_task_mm()
	page = get_page_from_freelist(...);
	if (page)
		return page;
	for_each_OOM_victim_mm_without_MMF_OOM_SKIP {
		// Check if it is time to try oom_reap_task_mm()
		if (!time_after(jiffies, mm->last_reap_attempted + HZ / 10))
			continue;
		oom_reap_task_mm(tsk, mm);
		mm->last_reap_attempted = jiffies;
		if (mm->reap_attempted++ <= MAX_OOM_REAP_RETRIES)
			continue;
		// It is time to make final decision
		if (test_bit(MMF_EXIT_MMAP, &mm->flags))
			continue;
		pr_info("Gave up waiting for process %d (%s) ...\n", ...);
		set_bit(MMF_OOM_SKIP, &mm->flags); // Allow selecting next OOM victim.
	}

(this is what I call "direct OOM reaping") ?

Apart from the fact that the former is "sequential processing" where "the
OOM reaper pays the cost for reclaiming" while the latter is "parallel
(or round-robin) processing" where "the allocating thread pays the cost
for reclaiming", both are timeout-based back off with a capped number of
retry attempts.

>> We are already using timeout based decision, with some attempt to reclaim
>> memory if conditions are met.
> 
> Timeout based decision is when you, well, make a decision after a
> certain time passes. And we do not do that.

But we are talking about what we can do after oom_reap_task_mm() can no longer
make progress. Both the former and the latter will wait until a time controlled
by the number of attempts and the retry interval elapses.
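
For concreteness: assuming the in-tree values (MAX_OOM_REAP_RETRIES == 10 and
a HZ/10, i.e. 100ms, back-off), both variants give up after roughly
10 * 100ms = 1 second of unsuccessful reaping; only the constants differ,
not the structure.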

>>>> If this is an argument that the oom reaper should loop checking for 
>>>> MMF_EXIT_MMAP and doing schedule_timeout(1) a set number of times rather 
>>>> than just setting the jiffies in the mm itself, that's just implementing 
>>>> the same thing and doing so in a way where the oom reaper stalls operating 
>>>> on a single mm rather than round-robin iterating over mm's in my patch.
>>>
>>> I've said earlier that I do not mind doing round robin in the oom reaper
>>> but this is certainly more complex than what we do now and I haven't
>>> seen any actual example where it would matter. The OOM reaper is a safety
>>> measure. Nothing should fall apart if it is slow.
>>
>> The OOM reaper can fail if allocating threads have high priority. You seem to
>> assume that realtime threads won't trigger the OOM path. But since !PF_WQ_WORKER
>> threads only do cond_resched() due to your "cargo cult programming" refusal,
>> and as Andrew Morton commented
>>
>>   cond_resched() is a no-op in the presence of realtime policy threads
>>   and using it to attempt to yield to a different thread in this fashion
>>   is broken.
>>
>> in the "mm: disable preemption before swapcache_free" thread, we can't guarantee
>> that allocating threads will give the OOM reaper enough CPU resources to
>> make forward progress. And my direct OOM reaping proposal was also refused
>> by you. I really dislike counting on the OOM reaper as a safety measure.
> 
> Well, yeah, you can screw up your system with real time priority tasks
> all you want. I really fail to see why you are bringing that up now
> though. Yet another offtopic?
> 

Not offtopic at all. My direct OOM reaping proposal handles exactly this
situation. And I already suggested how we could avoid forcing some allocating
thread to pay the full cost of reclaiming all reclaimable memory.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-08-11  3:12                 ` Tetsuo Handa
@ 2018-08-14 11:33                   ` Michal Hocko
  2018-08-19 14:23                     ` Tetsuo Handa
  0 siblings, 1 reply; 38+ messages in thread
From: Michal Hocko @ 2018-08-14 11:33 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: David Rientjes, linux-mm, Roman Gushchin

On Sat 11-08-18 12:12:52, Tetsuo Handa wrote:
> On 2018/08/10 20:16, Michal Hocko wrote:
> >> How do you decide whether oom_reaper() was not able to reclaim much?
> > 
> > Just a rule of thumb. If it freed at least a few kBs then we should be good
> > to set MMF_OOM_SKIP.
> 
> I don't think so. We are talking about situations where MMF_OOM_SKIP is set
> before enough memory has been reclaimed to prevent the OOM killer from
> selecting the next OOM victim.

There is nothing like enough memory to prevent a new victim selection.
Just think of a streaming source of allocations without any end. There is
simply no way to tell that we have freed enough. We have to guess and
tune based on reasonable workloads.

[...]
> Apart from the fact that the former is "sequential processing" where "the OOM reaper
> pays the cost of reclaiming" while the latter is "parallel (or round-robin)
> processing" where "the allocating thread pays the cost of reclaiming", both are
> timeout-based back-off with a capped number of retry attempts.

And it is exactly this "who pays the price" concern, which I have already
tried to explain, that bothers me.

I really do not see how making the code more complex by ensuring that
allocators share a fair part of the direct oom reaping will make the
situation any easier. There are basically two issues we should really
be after: improve the oom reaper to tear down a wider range of
memory (namely mlock), and improve the cooperation with the exit path
to handle free_pgtables more gracefully, because it is true that some
processes might really consume a lot of memory in page tables without
mapping a lot of anonymous memory. Neither of the two is addressed by
your proposal. So if you want to help then try to think about those two
issues.
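
For reference, the mlock part comes from the per-vma check which the reaper
applies; a sketch of the current helper in mm/internal.h (quoted from memory,
so details may differ):

	/* vmas that the oom reaper (like MADV_DONTNEED) is allowed to unmap */
	static inline bool can_madv_dontneed_vma(struct vm_area_struct *vma)
	{
		return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
	}

VM_LOCKED vmas are skipped entirely, which is why mlocked memory stays
unreclaimed until exit_mmap() has done the munlock.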

> >> We are already using timeout based decision, with some attempt to reclaim
> >> memory if conditions are met.
> > 
> > Timeout based decision is when you, well, make a decision after a
> > certain time passes. And we do not do that.
> 
> But we are talking about what we can do after oom_reap_task_mm() can no longer
> make progress. Both the former and the latter will wait until a time controlled
> by the number of attempts and retry interval elapses.

Do not confuse a sleep with the number of attempts. The latter is a unit
of work done; the former is a unit of time.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-08-14 11:33                   ` Michal Hocko
@ 2018-08-19 14:23                     ` Tetsuo Handa
  2018-08-20  5:54                       ` Michal Hocko
  0 siblings, 1 reply; 38+ messages in thread
From: Tetsuo Handa @ 2018-08-19 14:23 UTC (permalink / raw)
  To: Michal Hocko
  Cc: David Rientjes, linux-mm, Roman Gushchin, Andrew Morton, Linus Torvalds

On 2018/08/14 20:33, Michal Hocko wrote:
> On Sat 11-08-18 12:12:52, Tetsuo Handa wrote:
>> On 2018/08/10 20:16, Michal Hocko wrote:
>>>> How do you decide whether oom_reaper() was not able to reclaim much?
>>>
>>> Just a rule of thumb. If it freed at least a few kBs then we should be good
>>> to set MMF_OOM_SKIP.
>>
>> I don't think so. We are talking about situations where MMF_OOM_SKIP is set
>> before enough memory has been reclaimed to prevent the OOM killer from
>> selecting the next OOM victim.
> 
> There is nothing like enough memory to prevent a new victim selection.
> Just think of a streaming source of allocations without any end. There is
> simply no way to tell that we have freed enough. We have to guess and
> tune based on reasonable workloads.

I'm not talking about the "allocation without any end" case.
We already inserted fatal_signal_pending(current) checks (except vmalloc(),
where tsk_is_oom_victim(current) would be used instead).

What we are talking about is a situation where we could avoid selecting the
next OOM victim if we waited for some more time after MMF_OOM_SKIP was set.

>> Apart from the fact that the former is "sequential processing" where "the OOM reaper
>> pays the cost of reclaiming" while the latter is "parallel (or round-robin)
>> processing" where "the allocating thread pays the cost of reclaiming", both are
>> timeout-based back-off with a capped number of retry attempts.
> 
> And it is exactly this "who pays the price" concern, which I have already
> tried to explain, that bothers me.

Are you aware that we can fall into a situation where nobody can pay the price
for reclaiming memory?

> 
> I really do not see how making the code more complex by ensuring that
> allocators share a fair part of the direct oom reaping will make the
> situation any easier.

You are completely ignoring/misunderstanding the background of
commit 9bfe5ded054b8e28 ("mm, oom: remove sleep from under oom_lock").

That patch was applied in order to mitigate a lockup problem caused by the fact
that allocators can deprive the OOM reaper of all CPU resources for making
progress, due to the very, very broken assumption at

        /*
         * Acquire the oom lock.  If that fails, somebody else is
         * making progress for us.
         */
        if (!mutex_trylock(&oom_lock)) {
                *did_some_progress = 1;
                schedule_timeout_uninterruptible(1);
                return NULL;
        }

on the allocator side.
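
(That assumption is broken because the oom_lock holder itself can be starved
of CPU time by the very allocators which spin on this trylock, so "somebody
else is making progress for us" is not guaranteed to be true.)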

Direct OOM reaping is a method for ensuring that allocators spend _some_ CPU
resources on making progress. I already showed how to prevent allocators from
trying to reclaim all (e.g. multiple TB of) memory at once, because you worried
about that.

>                       There are basically two issues we should really
> be after: improve the oom reaper to tear down a wider range of
> memory (namely mlock), and improve the cooperation with the exit path
> to handle free_pgtables more gracefully, because it is true that some
> processes might really consume a lot of memory in page tables without
> mapping a lot of anonymous memory. Neither of the two is addressed by
> your proposal. So if you want to help then try to think about those two
> issues.

Your "improvement" is to tear down wider range of memory whereas
my "improvement" is to ensure that CPU resource is spent for reclaiming memory and
David's "improvement" is to mitigate unnecessary killing of additional processes.
Therefore, your "Neither of the two is addressed by your proposal." is pointless.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-08-10  9:07           ` Michal Hocko
  2018-08-10 10:54             ` Tetsuo Handa
@ 2018-08-19 23:45             ` David Rientjes
  2018-08-20  6:07               ` Michal Hocko
  1 sibling, 1 reply; 38+ messages in thread
From: David Rientjes @ 2018-08-19 23:45 UTC (permalink / raw)
  To: Michal Hocko; +Cc: Tetsuo Handa, linux-mm, Roman Gushchin


> > > > At the risk of continually repeating the same statement, the oom reaper 
> > > > cannot provide the direct feedback for all possible memory freeing.  
> > > > Waking up periodically and finding mm->mmap_sem contended is one problem, 
> > > > but the other problem that I've already shown is the unnecessary oom 
> > > > killing of additional processes while a thread has already reached 
> > > > exit_mmap().  The oom reaper cannot free page tables which is problematic 
> > > > for malloc implementations such as tcmalloc that do not release virtual 
> > > > memory. 
> > > 
> > > But once we know that the exit path is past the point of blocking we can
> > > have MMF_OOM_SKIP handover from the oom_reaper to the exit path. So the
> > > oom_reaper doesn't hide the current victim too early and we can safely
> > > wait for the exit path to reclaim the rest. So there is a feedback
> > > channel. I would not even mind polling for that state a few times -
> > > similar to polling for the mmap_sem. But it would still be some feedback
> > > rather than a certain amount of time has passed since the last check.
> > > 
> > 
> > Yes, of course, it would be easy to rely on exit_mmap() to set 
> > MMF_OOM_SKIP itself and have the oom reaper drop the task from its list 
> > when we are assured of forward progress.  What polling are you proposing 
> > other than a timeout based mechanism to do this?
> 
> I was thinking about doing something like the following
> - oom_reaper checks the amount of victim's memory after it is done with
>   reaping (e.g. by calling oom_badness before and after). If it wasn't able to
>   reclaim much then return false and keep retrying with the existing
>   mechanism

I'm not sure how you define the threshold to consider what is substantial 
memory freeing.

> - once a flag (e.g. MMF_OOM_MMAP) is set it bails out and won't set the
>   MMF_OOM_SKIP flag.
> 
> > We could set a MMF_EXIT_MMAP in exit_mmap() to specify that it will 
> > complete free_pgtables() for that mm.  The problem is the same: when does 
> > the oom reaper decide to set MMF_OOM_SKIP because MMF_EXIT_MMAP has not 
> > been set in a timely manner?
> 
> reuse the current retry policy which is the number of attempts rather
> than any timeout.
> 
> > If this is an argument that the oom reaper should loop checking for 
> > MMF_EXIT_MMAP and doing schedule_timeout(1) a set number of times rather 
> > than just setting the jiffies in the mm itself, that's just implementing 
> > the same thing and doing so in a way where the oom reaper stalls operating 
> > on a single mm rather than round-robin iterating over mm's in my patch.
> 
> I've said earlier that I do not mind doing round robin in the oom reaper
> but this is certainly more complex than what we do now and I haven't
> seen any actual example where it would matter. The OOM reaper is a safety
> measure. Nothing should fall apart if it is slow. The primary work
> should be happening from the exit path anyway.

The oom reaper will always be unable to free some memory, such as page 
tables.  If it can't grab mm->mmap_sem in a reasonable amount of time, it 
also can give up early.  The munlock() case is another example.  We 
experience unnecessary oom killing during free_pgtables() where the 
single-threaded exit_mmap() is freeing an enormous amount of page tables 
(usually a malloc implementation such as tcmalloc that does not free 
virtual memory) and other processes are faulting faster than we can free.  
It's a combination of a multiprocessor system and a lot of virtual memory 
from the original victim.  This is the same case as being unable to 
munlock quickly enough in exit_mmap() to free the memory.

We must wait until free_pgtables() completes in exit_mmap() before killing 
additional processes in the large majority (99.96% of cases from my data) 
of instances where oom livelock does not occur.  In the remainder of 
situations, livelock has been prevented by what the oom reaper has been 
able to free.  We can, of course, not do free_pgtables() from the oom 
reaper.  So my approach was to allow for a reasonable amount of time for 
the victim to free a lot of memory before declaring that additional 
processes must be oom killed.  It would be functionally similar to having 
the oom reaper retry many, many more times than 10 and having a linked 
list of mm_structs to reap.  I don't care one way or another if it's a 
timeout based solution or many, many retries that have schedule_timeout() 
that yields the same time period in the end.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-08-19 14:23                     ` Tetsuo Handa
@ 2018-08-20  5:54                       ` Michal Hocko
  2018-08-20 22:03                         ` Tetsuo Handa
  0 siblings, 1 reply; 38+ messages in thread
From: Michal Hocko @ 2018-08-20  5:54 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: David Rientjes, linux-mm, Roman Gushchin, Andrew Morton, Linus Torvalds

On Sun 19-08-18 23:23:41, Tetsuo Handa wrote:
> On 2018/08/14 20:33, Michal Hocko wrote:
> > On Sat 11-08-18 12:12:52, Tetsuo Handa wrote:
> >> On 2018/08/10 20:16, Michal Hocko wrote:
> >>>> How do you decide whether oom_reaper() was not able to reclaim much?
> >>>
> >>> Just a rule of thumb. If it freed at least a few kBs then we should be good
> >>> to set MMF_OOM_SKIP.
> >>
> >> I don't think so. We are talking about situations where MMF_OOM_SKIP is set
> >> before enough memory has been reclaimed to prevent the OOM killer from
> >> selecting the next OOM victim.
> > 
> > There is nothing like enough memory to prevent a new victim selection.
> > Just think of a streaming source of allocations without any end. There is
> > simply no way to tell that we have freed enough. We have to guess and
> > tune based on reasonable workloads.
> 
> I'm not talking about the "allocation without any end" case.
> We already inserted fatal_signal_pending(current) checks (except vmalloc(),
> where tsk_is_oom_victim(current) would be used instead).
> 
> What we are talking about is a situation where we could avoid selecting the
> next OOM victim if we waited for some more time after MMF_OOM_SKIP was set.

And that "some more time" is undefined without a crystal ball. And we have
a desperate shortage of those.
 
> >> Apart from the fact that the former is "sequential processing" where "the OOM reaper
> >> pays the cost of reclaiming" while the latter is "parallel (or round-robin)
> >> processing" where "the allocating thread pays the cost of reclaiming", both are
> >> timeout-based back-off with a capped number of retry attempts.
> > 
> > And it is exactly this "who pays the price" concern, which I have already
> > tried to explain, that bothers me.
> 
> Are you aware that we can fall into a situation where nobody can pay the price
> for reclaiming memory?

I fail to see how this is related to direct vs. kthread oom reaping
though. Unless the kthread is starved by other means, it can always
jump in and handle the situation.

> > I really do not see how making the code more complex by ensuring that
> > allocators share a fair part of the direct oom reaping will make the
> > situation any easier.
> 
> You are completely ignoring/misunderstanding the background of
> commit 9bfe5ded054b8e28 ("mm, oom: remove sleep from under oom_lock").
> 
> That patch was applied in order to mitigate a lockup problem caused by the fact
> that allocators can deprive the OOM reaper of all CPU resources for making
> progress, due to the very, very broken assumption at
> 
>         /*
>          * Acquire the oom lock.  If that fails, somebody else is
>          * making progress for us.
>          */
>         if (!mutex_trylock(&oom_lock)) {
>                 *did_some_progress = 1;
>                 schedule_timeout_uninterruptible(1);
>                 return NULL;
>         }
> 
> on the allocator side.
> 
> Direct OOM reaping is a method for ensuring that allocators spend _some_ CPU
> resources on making progress. I already showed how to prevent allocators from
> trying to reclaim all (e.g. multiple TB of) memory at once, because you worried
> about that.
> 
> >                       There are basically two issues we should really
> > be after: improve the oom reaper to tear down a wider range of
> > memory (namely mlock), and improve the cooperation with the exit path
> > to handle free_pgtables more gracefully, because it is true that some
> > processes might really consume a lot of memory in page tables without
> > mapping a lot of anonymous memory. Neither of the two is addressed by
> > your proposal. So if you want to help then try to think about those two
> > issues.
> 
> Your "improvement" is to tear down wider range of memory whereas
> my "improvement" is to ensure that CPU resource is spent for reclaiming memory and
> David's "improvement" is to mitigate unnecessary killing of additional processes.
> Therefore, your "Neither of the two is addressed by your proposal." is pointless.

OK, then we really have to agree to disagree.

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-08-19 23:45             ` David Rientjes
@ 2018-08-20  6:07               ` Michal Hocko
  2018-08-20 21:31                 ` David Rientjes
  0 siblings, 1 reply; 38+ messages in thread
From: Michal Hocko @ 2018-08-20  6:07 UTC (permalink / raw)
  To: David Rientjes; +Cc: Tetsuo Handa, linux-mm, Roman Gushchin

On Sun 19-08-18 16:45:36, David Rientjes wrote:
> 
> > > > > At the risk of continually repeating the same statement, the oom reaper 
> > > > > cannot provide the direct feedback for all possible memory freeing.  
> > > > > Waking up periodically and finding mm->mmap_sem contended is one problem, 
> > > > > but the other problem that I've already shown is the unnecessary oom 
> > > > > killing of additional processes while a thread has already reached 
> > > > > exit_mmap().  The oom reaper cannot free page tables which is problematic 
> > > > > for malloc implementations such as tcmalloc that do not release virtual 
> > > > > memory. 
> > > > 
> > > > But once we know that the exit path is past the point of blocking we can
> > > > have MMF_OOM_SKIP handover from the oom_reaper to the exit path. So the
> > > > oom_reaper doesn't hide the current victim too early and we can safely
> > > > wait for the exit path to reclaim the rest. So there is a feedback
> > > > channel. I would not even mind polling for that state a few times -
> > > > similar to polling for the mmap_sem. But it would still be some feedback
> > > > rather than a certain amount of time has passed since the last check.
> > > > 
> > > 
> > > Yes, of course, it would be easy to rely on exit_mmap() to set 
> > > MMF_OOM_SKIP itself and have the oom reaper drop the task from its list 
> > > when we are assured of forward progress.  What polling are you proposing 
> > > other than a timeout based mechanism to do this?
> > 
> > I was thinking about doing something like the following
> > - oom_reaper checks the amount of victim's memory after it is done with
> >   reaping (e.g. by calling oom_badness before and after). If it wasn't able to
> >   reclaim much then return false and keep retrying with the existing
> >   mechanism
> 
> I'm not sure how you define the threshold to consider what is substantial 
> memory freeing.

If a rule of thumb (a few Megs freed or X% of oom_badness reduced) doesn't
really turn out to be working well then we can try to be more clever,
e.g. detect that there are too many ptes to free and wait for those.
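
A minimal sketch of that before/after check (the helper name is made up, the
threshold is left as a bare comparison, and this is untested):

	static bool oom_reap_made_progress(struct task_struct *tsk,
					   unsigned long totalpages)
	{
		unsigned long before, after;

		before = oom_badness(tsk, NULL, NULL, totalpages);
		oom_reap_task_mm(tsk, tsk->signal->oom_mm);
		after = oom_badness(tsk, NULL, NULL, totalpages);

		/* a "few Megs freed" threshold would go here instead */
		return after < before;
	}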

> > - once a flag (e.g. MMF_OOM_MMAP) is set it bails out and won't set the
> >   MMF_OOM_SKIP flag.
> > 
> > > We could set a MMF_EXIT_MMAP in exit_mmap() to specify that it will 
> > > complete free_pgtables() for that mm.  The problem is the same: when does 
> > > the oom reaper decide to set MMF_OOM_SKIP because MMF_EXIT_MMAP has not 
> > > been set in a timely manner?
> > 
> > reuse the current retry policy which is the number of attempts rather
> > than any timeout.
> > 
> > > If this is an argument that the oom reaper should loop checking for 
> > > MMF_EXIT_MMAP and doing schedule_timeout(1) a set number of times rather 
> > > than just setting the jiffies in the mm itself, that's just implementing 
> > > the same thing and doing so in a way where the oom reaper stalls operating 
> > > on a single mm rather than round-robin iterating over mm's in my patch.
> > 
> > I've said earlier that I do not mind doing round robin in the oom reaper
> > but this is certainly more complex than what we do now and I haven't
> > seen any actual example where it would matter. The OOM reaper is a safety
> > measure. Nothing should fall apart if it is slow. The primary work
> > should be happening from the exit path anyway.
> 
> The oom reaper will always be unable to free some memory, such as page 
> tables.  If it can't grab mm->mmap_sem in a reasonable amount of time, it 
> also can give up early.  The munlock() case is another example.  We 
> experience unnecessary oom killing during free_pgtables() where the 
> single-threaded exit_mmap() is freeing an enormous amount of page tables 
> (usually a malloc implementation such as tcmalloc that does not free 
> virtual memory) and other processes are faulting faster than we can free.  
> It's a combination of a multiprocessor system and a lot of virtual memory 
> from the original victim.  This is the same case as being unable to 
> munlock quickly enough in exit_mmap() to free the memory.
> 
> We must wait until free_pgtables() completes in exit_mmap() before killing 
> additional processes in the large majority (99.96% of cases from my data) 
> of instances where oom livelock does not occur.  In the remainder of 
> situations, livelock has been prevented by what the oom reaper has been 
> able to free.  We can, of course, not do free_pgtables() from the oom 
> reaper.  So my approach was to allow for a reasonable amount of time for 
> the victim to free a lot of memory before declaring that additional 
> processes must be oom killed.  It would be functionally similar to having 
> the oom reaper retry many, many more times than 10 and having a linked 
> list of mm_structs to reap.  I don't care one way or another if it's a 
> timeout based solution or many, many retries that have schedule_timeout() 
> that yields the same time period in the end.

I would really keep the current retry logic, with an extension that allows
it to keep retrying or to hand over to exit_mmap() when we know it is past
the last moment of blocking.
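
A rough sketch of that extension (MMF_EXIT_MMAP is the hypothetical flag
discussed earlier, set by exit_mmap() once it is past its last blocking
point; locking and trace points are omitted; this is an illustration, not
a patch):

	static void oom_reap_task(struct task_struct *tsk)
	{
		int attempts = 0;
		struct mm_struct *mm = tsk->signal->oom_mm;

		/* the current retry logic */
		while (attempts++ < MAX_OOM_REAP_RETRIES &&
		       !oom_reap_task_mm(tsk, mm))
			schedule_timeout_idle(HZ/10);

		/*
		 * The extension: if exit_mmap() signalled that it can no
		 * longer block, let it set MMF_OOM_SKIP itself when it is
		 * done; otherwise give up here as we do today.
		 */
		if (!test_bit(MMF_EXIT_MMAP, &mm->flags))
			set_bit(MMF_OOM_SKIP, &mm->flags);

		put_task_struct(tsk);
	}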

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-08-20  6:07               ` Michal Hocko
@ 2018-08-20 21:31                 ` David Rientjes
  2018-08-21  6:09                   ` Michal Hocko
  0 siblings, 1 reply; 38+ messages in thread
From: David Rientjes @ 2018-08-20 21:31 UTC (permalink / raw)
  To: Michal Hocko; +Cc: Tetsuo Handa, linux-mm, Roman Gushchin

On Mon, 20 Aug 2018, Michal Hocko wrote:

> > The oom reaper will always be unable to free some memory, such as page 
> > tables.  If it can't grab mm->mmap_sem in a reasonable amount of time, it 
> > also can give up early.  The munlock() case is another example.  We 
> > experience unnecessary oom killing during free_pgtables() where the 
> > single-threaded exit_mmap() is freeing an enormous amount of page tables 
> > (usually a malloc implementation such as tcmalloc that does not free 
> > virtual memory) and other processes are faulting faster than we can free.  
> > It's a combination of a multiprocessor system and a lot of virtual memory 
> > from the original victim.  This is the same case as being unable to 
> > munlock quickly enough in exit_mmap() to free the memory.
> > 
> > We must wait until free_pgtables() completes in exit_mmap() before killing 
> > additional processes in the large majority (99.96% of cases from my data) 
> > of instances where oom livelock does not occur.  In the remainder of 
> > situations, livelock has been prevented by what the oom reaper has been 
> > able to free.  We can, of course, not do free_pgtables() from the oom 
> > reaper.  So my approach was to allow for a reasonable amount of time for 
> > the victim to free a lot of memory before declaring that additional 
> > processes must be oom killed.  It would be functionally similar to having 
> > the oom reaper retry many, many more times than 10 and having a linked 
> > list of mm_structs to reap.  I don't care one way or another if it's a 
> > timeout based solution or many, many retries that have schedule_timeout() 
> > that yields the same time period in the end.
> 
> I would really keep the current retry logic, with an extension that allows
> it to keep retrying or to hand over to exit_mmap() when we know it is past
> the last moment of blocking.
> 

Ok, so it appears you're suggesting a per-mm counter of oom reaper retries 
and once it reaches a certain threshold, either give up and set 
MMF_OOM_SKIP or declare that exit_mmap() is responsible for it.  That's 
fine, but obviously I'll be suggesting that the threshold is rather large.  
So if I adjust my patch to be a retry counter rather than timestamp, do 
you have any other reservations?

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-08-20  5:54                       ` Michal Hocko
@ 2018-08-20 22:03                         ` Tetsuo Handa
  2018-08-21  6:16                           ` Michal Hocko
  0 siblings, 1 reply; 38+ messages in thread
From: Tetsuo Handa @ 2018-08-20 22:03 UTC (permalink / raw)
  To: Michal Hocko
  Cc: David Rientjes, linux-mm, Roman Gushchin, Andrew Morton, Linus Torvalds

On 2018/08/20 14:54, Michal Hocko wrote:
>>>> Apart from the fact that the former is "sequential processing" where "the OOM reaper
>>>> pays the cost of reclaiming" while the latter is "parallel (or round-robin)
>>>> processing" where "the allocating thread pays the cost of reclaiming", both are
>>>> timeout-based back-off with a capped number of retry attempts.
>>>
>>> And it is exactly this "who pays the price" concern, which I have already
>>> tried to explain, that bothers me.
>>
>> Are you aware that we can fall into a situation where nobody can pay the price
>> for reclaiming memory?
> 
> I fail to see how this is related to direct vs. kthread oom reaping
> though. Unless the kthread is starved by other means, it can always
> jump in and handle the situation.

I'm saying that concurrent allocators can starve the OOM reaper kernel thread.
I don't care if the OOM reaper kernel thread is starved by something other than
concurrent allocators, as long as that something is doing useful things.

Allocators waiting for progress using an (almost) busy loop are prone to lockup;
they are not doing useful things. But direct OOM reaping allows allocators to
avoid lockup and do useful things.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-08-20 21:31                 ` David Rientjes
@ 2018-08-21  6:09                   ` Michal Hocko
  2018-08-21 17:20                     ` David Rientjes
  0 siblings, 1 reply; 38+ messages in thread
From: Michal Hocko @ 2018-08-21  6:09 UTC (permalink / raw)
  To: David Rientjes; +Cc: Tetsuo Handa, linux-mm, Roman Gushchin

On Mon 20-08-18 14:31:04, David Rientjes wrote:
> On Mon, 20 Aug 2018, Michal Hocko wrote:
> 
> > > The oom reaper will always be unable to free some memory, such as page 
> > > tables.  If it can't grab mm->mmap_sem in a reasonable amount of time, it 
> > > also can give up early.  The munlock() case is another example.  We 
> > > experience unnecessary oom killing during free_pgtables() where the 
> > > single-threaded exit_mmap() is freeing an enormous amount of page tables 
> > > (usually a malloc implementation such as tcmalloc that does not free 
> > > virtual memory) and other processes are faulting faster than we can free.  
> > > It's a combination of a multiprocessor system and a lot of virtual memory 
> > > from the original victim.  This is the same case as being unable to 
> > > munlock quickly enough in exit_mmap() to free the memory.
> > > 
> > > We must wait until free_pgtables() completes in exit_mmap() before killing 
> > > additional processes in the large majority (99.96% of cases from my data) 
> > > of instances where oom livelock does not occur.  In the remainder of 
> > > situations, livelock has been prevented by what the oom reaper has been 
> > > able to free.  We can, of course, not do free_pgtables() from the oom 
> > > reaper.  So my approach was to allow for a reasonable amount of time for 
> > > the victim to free a lot of memory before declaring that additional 
> > > processes must be oom killed.  It would be functionally similar to having 
> > > the oom reaper retry many, many more times than 10 and having a linked 
> > > list of mm_structs to reap.  I don't care one way or another if it's a 
> > > timeout based solution or many, many retries that have schedule_timeout() 
> > > that yields the same time period in the end.
> > 
> > I would really keep the current retry logic, with an extension that allows
> > it to keep retrying or to hand over to exit_mmap() when we know it is past
> > the last moment of blocking.
> > 
> 
> Ok, so it appears you're suggesting a per-mm counter of oom reaper retries 
> and once it reaches a certain threshold, either give up and set 
> MMF_OOM_SKIP or declare that exit_mmap() is responsible for it.  That's 
> fine, but obviously I'll be suggesting that the threshold is rather large.  
> So if I adjust my patch to be a retry counter rather than timestamp, do 
> you have any other reservations?

It absolutely has to be an internal thing without any user API to be
set. Also I still haven't heard any specific argument why the oom
reaper would need to do a per-task attempt and loop over all victims on
the list. Maybe you have some examples though.

I believe that we really need to think about the hand over between the
two paths first and only build a more elaborate retry logic on top of
it.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-08-20 22:03                         ` Tetsuo Handa
@ 2018-08-21  6:16                           ` Michal Hocko
  2018-08-21 13:39                             ` Tetsuo Handa
  0 siblings, 1 reply; 38+ messages in thread
From: Michal Hocko @ 2018-08-21  6:16 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: David Rientjes, linux-mm, Roman Gushchin, Andrew Morton, Linus Torvalds

On Tue 21-08-18 07:03:10, Tetsuo Handa wrote:
> On 2018/08/20 14:54, Michal Hocko wrote:
> >>>> Apart from the fact that the former is "sequential processing" where "the OOM reaper
> >>>> pays the cost of reclaiming" while the latter is "parallel (or round-robin)
> >>>> processing" where "the allocating thread pays the cost of reclaiming", both are
> >>>> timeout-based back-off with a capped number of retry attempts.
> >>>
> >>> And it is exactly this "who pays the price" concern, which I have already
> >>> tried to explain, that bothers me.
> >>
> >> Are you aware that we can fall into a situation where nobody can pay the price
> >> for reclaiming memory?
> > 
> > I fail to see how this is related to direct vs. kthread oom reaping
> > though. Unless the kthread is starved by other means, it can always
> > jump in and handle the situation.
> 
> I'm saying that concurrent allocators can starve the OOM reaper kernel thread.
> I don't care if the OOM reaper kernel thread is starved by something other than
> concurrent allocators, as long as that something is doing useful things.
> 
> Allocators waiting for progress using an (almost) busy loop are prone to lockup;
> they are not doing useful things. But direct OOM reaping allows allocators to
> avoid lockup and do useful things.

As long as those allocators are making _some_ progress and they are not
preempted themselves. Those might be low priority as well. To make it
more fun, those high priority ones might easily preempt those which try to
do the direct reaping. And if you really want to achieve at least some
fairness there you will quickly grow this into a complex scheme. Really, our
direct reclaim is already quite fragile when it comes to fairness and
now you want to extend it to be even more fragile. Really, I think you
are not really appreciating what kind of complex beast you are going to
create.

If we have priority inversion problems during oom then we can always go
back to a high priority oom reaper. This would be so much simpler.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-08-21  6:16                           ` Michal Hocko
@ 2018-08-21 13:39                             ` Tetsuo Handa
  0 siblings, 0 replies; 38+ messages in thread
From: Tetsuo Handa @ 2018-08-21 13:39 UTC (permalink / raw)
  To: Michal Hocko
  Cc: David Rientjes, linux-mm, Roman Gushchin, Andrew Morton, Linus Torvalds

On 2018/08/21 15:16, Michal Hocko wrote:
> On Tue 21-08-18 07:03:10, Tetsuo Handa wrote:
>> On 2018/08/20 14:54, Michal Hocko wrote:
>>>>>> Apart from the fact that the former is "sequential processing" where "the OOM reaper
>>>>>> pays the cost of reclaiming" while the latter is "parallel (or round-robin)
>>>>>> processing" where "the allocating thread pays the cost of reclaiming", both are
>>>>>> timeout-based back-off with a capped number of retry attempts.
>>>>>
>>>>> And it is exactly this "who pays the price" concern, which I have already
>>>>> tried to explain, that bothers me.
>>>>
>>>> Are you aware that we can fall into a situation where nobody can pay the price
>>>> for reclaiming memory?
>>>
>>> I fail to see how this is related to direct vs. kthread oom reaping
>>> though. Unless the kthread is starved by other means, it can always
>>> jump in and handle the situation.
>>
>> I'm saying that concurrent allocators can starve the OOM reaper kernel thread.
>> I don't care if the OOM reaper kernel thread is starved by something other than
>> concurrent allocators, as long as that something is doing useful things.
>>
>> Allocators waiting for progress using an (almost) busy loop are prone to lockup;
>> they are not doing useful things. But direct OOM reaping allows allocators to
>> avoid lockup and do useful things.
> 
> As long as those allocators are making _some_ progress and they are not
> preempted themselves.

Even on linux-next-20180820, where neither the OOM reaper nor exit_mmap() waits for
oom_lock, a cluster of concurrently allocating realtime threads can delay the OOM
victim getting MMF_OOM_SKIP until about 1800 milliseconds after the kill

[  122.291910] Out of memory: Kill process 1097 (a.out) score 868 or sacrifice child
[  122.296431] Killed process 1117 (a.out) total-vm:5244kB, anon-rss:84kB, file-rss:0kB, shmem-rss:0kB
[  125.186061] oom_reaper: reaped process 1117 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[  125.191487] crond invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), nodemask=(null), order=0, oom_score_adj=0

[  131.405754] Out of memory: Kill process 1097 (a.out) score 868 or sacrifice child
[  131.409970] Killed process 1121 (a.out) total-vm:5244kB, anon-rss:84kB, file-rss:0kB, shmem-rss:0kB
[  132.185982] oom_reaper: reaped process 1121 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[  132.234704] a.out invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), nodemask=(null), order=0, oom_score_adj=0

[  141.396004] Out of memory: Kill process 1097 (a.out) score 868 or sacrifice child
[  141.400194] Killed process 1128 (a.out) total-vm:5244kB, anon-rss:84kB, file-rss:0kB, shmem-rss:0kB
[  143.184631] oom_reaper: reaped process 1128 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[  143.193077] in:imjournal invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), nodemask=(null), order=0, oom_score_adj=0

[  143.721617] Out of memory: Kill process 1097 (a.out) score 868 or sacrifice child
[  143.725965] Killed process 1858 (a.out) total-vm:5244kB, anon-rss:1040kB, file-rss:28kB, shmem-rss:0kB
[  145.218808] oom_reaper: reaped process 1858 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[  145.223753] systemd-journal invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), nodemask=(null), order=0, oom_score_adj=0

whereas applying

@@ -440,17 +440,11 @@
 +		ret = true;
 +#ifdef CONFIG_MMU
 +		/*
-+		 * Since the OOM reaper exists, we can safely wait until
-+		 * MMF_OOM_SKIP is set.
++		 * We can safely try to reclaim until MMF_OOM_SKIP is set.
 +		 */
 +		if (!test_bit(MMF_OOM_SKIP, &mm->flags)) {
-+			if (!oom_reap_target) {
-+				get_task_struct(p);
-+				oom_reap_target = p;
-+				trace_wake_reaper(p->pid);
-+				wake_up(&oom_reaper_wait);
-+			}
-+			continue;
++			if (oom_reap_task_mm(p, mm))
++				set_bit(MMF_OOM_SKIP, &mm->flags);
 +		}
 +#endif
 +		/* We can wait as long as OOM score is decreasing over time. */

on top of this series lets the OOM victim get MMF_OOM_SKIP within about 10 milliseconds of the kill.

[   43.407032] Out of memory: Kill process 1071 (a.out) score 865 or sacrifice child
[   43.411134] Killed process 1816 (a.out) total-vm:5244kB, anon-rss:1040kB, file-rss:0kB, shmem-rss:0kB
[   43.416427] oom_reaper: reaped process 1816 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[   44.689731] a.out invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), nodemask=(null), order=0, oom_score_adj=0

[  159.572877] Out of memory: Kill process 2157 (a.out) score 891 or sacrifice child
[  159.576924] Killed process 2158 (first-victim) total-vm:5244kB, anon-rss:88kB, file-rss:0kB, shmem-rss:0kB
[  159.580933] oom_reaper: reaped process 2158 (first-victim), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[  160.602346] systemd-journal invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), nodemask=(null), order=0, oom_score_adj=0

[  160.774149] Out of memory: Kill process 2157 (a.out) score 891 or sacrifice child
[  160.778139] Killed process 2159 (a.out) total-vm:5244kB, anon-rss:88kB, file-rss:0kB, shmem-rss:0kB
[  160.781779] oom_reaper: reaped process 2159 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[  162.745425] a.out invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), nodemask=(null), order=0, oom_score_adj=0

[  162.916173] Out of memory: Kill process 2157 (a.out) score 891 or sacrifice child
[  162.920239] Killed process 2160 (a.out) total-vm:5244kB, anon-rss:88kB, file-rss:0kB, shmem-rss:0kB
[  162.924396] oom_reaper: reaped process 2160 (a.out), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[  166.034599] Gave up waiting for process 2160 (a.out) total-vm:5244kB, anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[  166.038446] INFO: lockdep is turned off.
[  166.041780] a.out invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), nodemask=(null), order=0, oom_score_adj=0

This number shows how many CPU resources we would otherwise waste on memory reclaim
activities without making any progress. (I'm OK with doing both async reclaim by the
OOM reaper kernel thread and sync reclaim by allocating threads.)

>                       Those might be low priority as well.

It will not cause problems as long as we don't reclaim memory with oom_lock held.

>                                                            To make it
> more fun, those high priority ones might easily preempt those which try to
> do the direct reaping.

Ditto.

>                          And if you really want to achieve at least some
> fairness there you will quickly grow this into a complex scheme. Really, our
> direct reclaim is already quite fragile when it comes to fairness

You can propose removing direct reclaim.
Then, we will get reliable __GFP_KILLABLE as a bonus.

>                                                                   and
> now you want to extend it to be even more fragile.

Direct OOM reaping is different from direct reclaim in that direct OOM reaping
is never blocked on memory allocation. Thus, it will not make things more fragile.

>                                                    Really, I think you
> are not really appreciating what kind of complex beast you are going to
> create.

Saying "dragon" or "beast" does not defeat me. Rather, such words drive me
to more and more direct OOM reaping (because you don't like waiting for
oom_lock at __alloc_pages_may_oom() which is a simple way for making sure
that CPU resource is spent for memory reclaim activities).

> 
> If we have priority inversion problems during oom then we can always go
> back to a high priority oom reaper. This would be so much simpler.

We could utilize a higher scheduling priority for memory reclaim activities
by the OOM reaper kernel thread until MMF_OOM_SKIP is set. But what we need
to think about is how we can wait for memory reclaim activities after
MMF_OOM_SKIP is set. A thread doing exit_mmap() might be running at idle
scheduling priority. Even if allocating threads found that exit_mmap() has
already reached the point of "no more being blocked on memory allocation",
allocating threads might keep exit_mmap() unable to make progress (for many
minutes, effectively forever) due to that idle scheduling priority.

You want to preserve "the fairness destroyer" just because you fear creating
"a new monster". But the point of "no more being blocked on memory allocation"
cannot exist without making sure that CPU resources are spent on memory reclaim
activities. Without seriously considering how we can make sure that allocating
threads give enough CPU resources to memory reclaim activities (both "what the
OOM reaper can do" and "what will be done after the OOM reaper gave up"), your
"hand over" plan will fail. Having allocating threads pay the cost of memory
reclaim activities is a much simpler way.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-08-21  6:09                   ` Michal Hocko
@ 2018-08-21 17:20                     ` David Rientjes
  2018-08-22  8:03                       ` Michal Hocko
  0 siblings, 1 reply; 38+ messages in thread
From: David Rientjes @ 2018-08-21 17:20 UTC (permalink / raw)
  To: Michal Hocko; +Cc: Tetsuo Handa, linux-mm, Roman Gushchin

On Tue, 21 Aug 2018, Michal Hocko wrote:

> > Ok, so it appears you're suggesting a per-mm counter of oom reaper retries 
> > and once it reaches a certain threshold, either give up and set 
> > MMF_OOM_SKIP or declare that exit_mmap() is responsible for it.  That's 
> > fine, but obviously I'll be suggesting that the threshold is rather large.  
> > So if I adjust my patch to be a retry counter rather than timestamp, do 
> > you have any other reservations?
> 
> It absolutely has to be an internal thing without any user API to be
> set. Also I still haven't heard any specific argument why the oom
> reaper would need to do a per-task attempt and loop over all victims on
> the list. Maybe you have some examples though.
> 

It would be per-mm in this case; the task itself is no longer important 
other than printing to the kernel log.  I think we could simply print that 
the oom reaper has reaped mm->owner.

The oom reaper would need to loop over the per-mm list because the retry 
counter is going to have a high threshold so that processes have the 
ability to free their memory before the oom reaper declares it can no 
longer make forward progress.  We cannot stall trying to reap a single mm 
with a high retry threshold from a memcg hierarchy when another memcg 
hierarchy is also oom.  The ability for one victim to make forward 
progress can depend on a lock held by another oom memcg hierarchy where 
reaping would allow it to be dropped.
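
(A concrete, hypothetical example: victim A in memcg X waits for a mutex held
by task B, which is failing charges in another oom memcg Y; reaping Y's victim
would let B's charge succeed and the mutex be dropped, but a reaper stalled
on A's mm alone never gets there.)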

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-08-21 17:20                     ` David Rientjes
@ 2018-08-22  8:03                       ` Michal Hocko
  2018-08-22 20:54                         ` David Rientjes
  0 siblings, 1 reply; 38+ messages in thread
From: Michal Hocko @ 2018-08-22  8:03 UTC (permalink / raw)
  To: David Rientjes; +Cc: Tetsuo Handa, linux-mm, Roman Gushchin

On Tue 21-08-18 10:20:00, David Rientjes wrote:
> On Tue, 21 Aug 2018, Michal Hocko wrote:
> 
> > > Ok, so it appears you're suggesting a per-mm counter of oom reaper retries 
> > > and once it reaches a certain threshold, either give up and set 
> > > MMF_OOM_SKIP or declare that exit_mmap() is responsible for it.  That's 
> > > fine, but obviously I'll be suggesting that the threshold is rather large.  
> > > So if I adjust my patch to be a retry counter rather than timestamp, do 
> > > you have any other reservations?
> > 
> > It absolutely has to be an internal thing without any user API to be
> > set. Also I still haven't heard any specific argument why the oom
> > reaper would need to do a per-task attempt and loop over all victims on
> > the list. Maybe you have some examples though.
> > 
> 
> It would be per-mm in this case; the task itself is no longer important 
> other than printing to the kernel log.  I think we could simply print that 
> the oom reaper has reaped mm->owner.
> 
> The oom reaper would need to loop over the per-mm list because the retry 
> counter is going to have a high threshold so that processes have the 
> ability to free their memory before the oom reaper declares it can no 
> longer make forward progress.

What do you actually mean by a high threshold?

> We cannot stall trying to reap a single mm 
> with a high retry threshold from a memcg hierarchy when another memcg 
> hierarchy is also oom.  The ability for one victim to make forward 
> progress can depend on a lock held by another oom memcg hierarchy where 
> reaping would allow it to be dropped.

Could you be more specific please?

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-08-22  8:03                       ` Michal Hocko
@ 2018-08-22 20:54                         ` David Rientjes
  0 siblings, 0 replies; 38+ messages in thread
From: David Rientjes @ 2018-08-22 20:54 UTC (permalink / raw)
  To: Michal Hocko; +Cc: Tetsuo Handa, linux-mm, Roman Gushchin

On Wed, 22 Aug 2018, Michal Hocko wrote:

> > > > Ok, so it appears you're suggesting a per-mm counter of oom reaper retries 
> > > > and once it reaches a certain threshold, either give up and set 
> > > > MMF_OOM_SKIP or declare that exit_mmap() is responsible for it.  That's 
> > > > fine, but obviously I'll be suggesting that the threshold is rather large.  
> > > > So if I adjust my patch to be a retry counter rather than timestamp, do 
> > > > you have any other reservations?
> > > 
> > > It absolutely has to be an internal thing without any user API to be
> > > set. Also I still haven't heard any specific argument why the oom
> > > reaper would need to do a per-task attempt and loop over all victims on
> > > the list. Maybe you have some examples though.
> > > 
> > 
> > It would be per-mm in this case; the task itself is no longer important 
> > other than printing to the kernel log.  I think we could simply print that 
> > the oom reaper has reaped mm->owner.
> > 
> > The oom reaper would need to loop over the per-mm list because the retry 
> > counter is going to have a high threshold so that processes have the 
> > ability to free their memory before the oom reaper declares it can no 
> > longer make forward progress.
> 
> What do you actually mean by a high threshold?
> 

As suggested in the timeout based approach of my patchset, 10s seems to 
work well for current server memory capacities, so if combined with a 
schedule_timeout(HZ/10) after iterating through mm_struct's to reap, the 
threshold would be best defined so it can allow at least 10s.
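
(With a HZ/10, i.e. 100ms, sleep between passes, that works out to a retry
threshold on the order of 10s / 100ms = 100 iterations per mm; the figure is
my arithmetic from the numbers above, not something taken from the patchset.)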

> > We cannot stall trying to reap a single mm 
> > with a high retry threshold from a memcg hierarchy when another memcg 
> > hierarchy is also oom.  The ability for one victim to make forward 
> > progress can depend on a lock held by another oom memcg hierarchy where 
> > reaping would allow it to be dropped.
> 
> Could you be more specific please?
> 

It's problematic to stall for 10s trying to oom reap (or free through 
exit_mmap()) a single mm while not trying to free memory from others: if 
you are reaping memory from a memcg subtree's victim and it takes a long 
time, either for a single try with a lot of memory or many tries with 
little or no memory, it increases the likelihood of livelocks in other 
memcg hierarchies because the oom reaper is not attempting to reap its 
memory.  The victim may depend on a lock that a memory charger is holding 
but the oom reaper is not able to help yet.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-08-06 20:51       ` Michal Hocko
  2018-08-09 20:16         ` David Rientjes
@ 2018-09-01 11:48         ` Tetsuo Handa
  2018-09-06 11:35           ` Michal Hocko
  1 sibling, 1 reply; 38+ messages in thread
From: Tetsuo Handa @ 2018-09-01 11:48 UTC (permalink / raw)
  To: Michal Hocko, David Rientjes; +Cc: linux-mm, Roman Gushchin

On 2018/08/07 5:51, Michal Hocko wrote:
>> At the risk of continually repeating the same statement, the oom reaper 
>> cannot provide the direct feedback for all possible memory freeing.  
>> Waking up periodically and finding mm->mmap_sem contended is one problem, 
>> but the other problem that I've already shown is the unnecessary oom 
>> killing of additional processes while a thread has already reached 
>> exit_mmap().  The oom reaper cannot free page tables which is problematic 
>> for malloc implementations such as tcmalloc that do not release virtual 
>> memory. 
> 
> But once we know that the exit path is past the point of blocking we can
> have MMF_OOM_SKIP handover from the oom_reaper to the exit path. So the
> oom_reaper doesn't hide the current victim too early and we can safely
> wait for the exit path to reclaim the rest. So there is a feedback
> channel. I would not even mind polling for that state a few times -
> similar to polling for the mmap_sem. But it would still be some feedback
> rather than a certain amount of time has passed since the last check.

Michal, will you show us how we can do this handover as an actual patch? I'm not
happy with prolonging the current situation on just your wish to hand over.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-09-01 11:48         ` Tetsuo Handa
@ 2018-09-06 11:35           ` Michal Hocko
  2018-09-06 11:50             ` Tetsuo Handa
  0 siblings, 1 reply; 38+ messages in thread
From: Michal Hocko @ 2018-09-06 11:35 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: David Rientjes, linux-mm, Roman Gushchin

On Sat 01-09-18 20:48:57, Tetsuo Handa wrote:
> On 2018/08/07 5:51, Michal Hocko wrote:
> >> At the risk of continually repeating the same statement, the oom reaper 
> >> cannot provide the direct feedback for all possible memory freeing.  
> >> Waking up periodically and finding mm->mmap_sem contended is one problem, 
> >> but the other problem that I've already shown is the unnecessary oom 
> >> killing of additional processes while a thread has already reached 
> >> exit_mmap().  The oom reaper cannot free page tables which is problematic 
> >> for malloc implementations such as tcmalloc that do not release virtual 
> >> memory. 
> > 
> > But once we know that the exit path is past the point of blocking we can
> > have MMF_OOM_SKIP handover from the oom_reaper to the exit path. So the
> > oom_reaper doesn't hide the current victim too early and we can safely
> > wait for the exit path to reclaim the rest. So there is a feedback
> > channel. I would not even mind polling for that state a few times -
> > similar to polling for the mmap_sem. But it would still be some feedback
> > rather than a certain amount of time has passed since the last check.
> 
> Michal, will you show us how we can do this handover as an actual patch? I'm not
> happy with prolonging the current situation on just your wish to hand over.

I am sorry but I am busy with other higher priority issues. I believe I
have outlined the scheme that might work (see above). All it takes is to
look into that closer and play with it.

I haven't seen bug reports except for David's very vaguely argued
report. I have asked about details several times but haven't received
any. So I didn't really give it top priority and consider it a
corner case which is nice to solve rather than something we absolutely
have to do right away, because many users would put a spell on us because
their workloads are eaten by that evil OOM killer.

If I've misinterpreted the priority then I can certainly reconsider and
reprioritize, or somebody else might look into it.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-09-06 11:35           ` Michal Hocko
@ 2018-09-06 11:50             ` Tetsuo Handa
  2018-09-06 12:05               ` Michal Hocko
  0 siblings, 1 reply; 38+ messages in thread
From: Tetsuo Handa @ 2018-09-06 11:50 UTC (permalink / raw)
  To: Michal Hocko; +Cc: David Rientjes, linux-mm, Roman Gushchin

On 2018/09/06 20:35, Michal Hocko wrote:
> On Sat 01-09-18 20:48:57, Tetsuo Handa wrote:
>> On 2018/08/07 5:51, Michal Hocko wrote:
>>>> At the risk of continually repeating the same statement, the oom reaper 
>>>> cannot provide the direct feedback for all possible memory freeing.  
>>>> Waking up periodically and finding mm->mmap_sem contended is one problem, 
>>>> but the other problem that I've already shown is the unnecessary oom 
>>>> killing of additional processes while a thread has already reached 
>>>> exit_mmap().  The oom reaper cannot free page tables which is problematic 
>>>> for malloc implementations such as tcmalloc that do not release virtual 
>>>> memory. 
>>>
>>> But once we know that the exit path is past the point of blocking we can
>>> have MMF_OOM_SKIP handover from the oom_reaper to the exit path. So the
>>> oom_reaper doesn't hide the current victim too early and we can safely
>>> wait for the exit path to reclaim the rest. So there is a feedback
>>> channel. I would not even mind polling for that state a few times -
>>> similar to polling for the mmap_sem. But it would still be some feedback
>>> rather than a certain amount of time has passed since the last check.
>>
>> Michal, will you show us how we can do this handover as an actual patch? I'm not
>> happy with prolonging the current situation on just your wish to hand over.
> 
> I am sorry but I am busy with other higher priority issues. I believe I
> have outlined the scheme that might work (see above). All it takes is to
> look into that closer and play with it.

If you are too busy, please show the "point of no blocking" in the source code
instead. If such a "point of no blocking" really exists, it could just as well
be executed by the allocating threads themselves. I think that such a point
comes at so late a stage of freeing that it makes no difference compared with
a timeout-based back off.
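(For reference, the mmap_sem polling mentioned above is itself already
time-based; 4.18-era mm/oom_kill.c, abbreviated:

	static void oom_reap_task(struct task_struct *tsk)
	{
		int attempts = 0;
		struct mm_struct *mm = tsk->signal->oom_mm;

		/* Retry the down_read_trylock(mmap_sem) a few times */
		while (attempts++ < MAX_OOM_REAP_RETRIES &&
		       !oom_reap_task_mm(tsk, mm))
			schedule_timeout_idle(HZ/10);
		...
		/* once the retries are exhausted, hide the victim */
		set_bit(MMF_OOM_SKIP, &mm->flags);
	}

so a timeout-based back off would only generalize what is already there.)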

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-09-06 11:50             ` Tetsuo Handa
@ 2018-09-06 12:05               ` Michal Hocko
  2018-09-06 13:40                 ` Tetsuo Handa
  0 siblings, 1 reply; 38+ messages in thread
From: Michal Hocko @ 2018-09-06 12:05 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: David Rientjes, linux-mm, Roman Gushchin

On Thu 06-09-18 20:50:53, Tetsuo Handa wrote:
> On 2018/09/06 20:35, Michal Hocko wrote:
> > On Sat 01-09-18 20:48:57, Tetsuo Handa wrote:
> >> On 2018/08/07 5:51, Michal Hocko wrote:
> >>>> At the risk of continually repeating the same statement, the oom reaper
> >>>> cannot provide direct feedback for all possible memory freeing.
> >>>> Waking up periodically and finding mm->mmap_sem contended is one problem,
> >>>> but the other problem that I've already shown is the unnecessary oom
> >>>> killing of additional processes while a thread has already reached
> >>>> exit_mmap(). The oom reaper cannot free page tables, which is problematic
> >>>> for malloc implementations such as tcmalloc that do not release virtual
> >>>> memory.
> >>>
> >>> But once we know that the exit path is past the point of blocking we can
> >>> have MMF_OOM_SKIP handover from the oom_reaper to the exit path. So the
> >>> oom_reaper doesn't hide the current victim too early and we can safely
> >>> wait for the exit path to reclaim the rest. So there is a feedback
> >>> channel. I would not even mind polling for that state a few times -
> >>> similar to polling for the mmap_sem. But it would still be some feedback
> >>> rather than just "a certain amount of time has passed since the last check".
> >>
> >> Michal, will you show us how to do that handover as an actual patch? I'm not
> >> happy with prolonging the current situation on just your wish for a handover.
> > 
> > I am sorry but I am busy with other higher priority issues. I believe I
> > have outlined a scheme that might work (see above). All it takes is to
> > look into it more closely and play with it.
> 
> >> If you are too busy, please show the "point of no blocking" in the source code
> >> instead. If such a "point of no blocking" really exists, it could just as well
> >> be executed by the allocating threads themselves.

I would have to study this much deeper, but I _suspect_ that we are not
taking any blocking locks right after we return from unmap_vmas - in
other words, past the place where we used to synchronize with the
oom_reaper.

> I think that such a point comes at so late a stage of freeing that it makes
> no difference compared with a timeout-based back off.

It does! It is feedback driven rather than a "random amount of time has
passed" approach. And more importantly, this syncing with exit_mmap matters
only when there is going to be far more memory in page tables than in
mappings, which is a _rare_ case.
-- 
Michal Hocko
SUSE Labs

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-09-06 12:05               ` Michal Hocko
@ 2018-09-06 13:40                 ` Tetsuo Handa
  2018-09-06 13:56                   ` Michal Hocko
  0 siblings, 1 reply; 38+ messages in thread
From: Tetsuo Handa @ 2018-09-06 13:40 UTC (permalink / raw)
  To: Michal Hocko; +Cc: David Rientjes, linux-mm, Roman Gushchin

On 2018/09/06 21:05, Michal Hocko wrote:
>> If you are too busy, please show the "point of no blocking" in the source code
>> instead. If such a "point of no blocking" really exists, it could just as well
>> be executed by the allocating threads themselves.
> 
> I would have to study this much deeper, but I _suspect_ that we are not
> taking any blocking locks right after we return from unmap_vmas - in
> other words, past the place where we used to synchronize with the
> oom_reaper.

See commit 97b1255cb27c551d ("mm,oom_reaper: check for MMF_OOM_SKIP before
complaining"). Since this lock dependency is inode-based (i.e. unrelated to
OOM victims), waiting for this lock can livelock.
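The blocking point in question, for reference (4.18-era mm/mmap.c,
abbreviated; free_pgtables() calls this for every file-backed vma):

	void unlink_file_vma(struct vm_area_struct *vma)
	{
		struct file *file = vma->vm_file;

		if (file) {
			struct address_space *mapping = file->f_mapping;

			/* sleeping lock, shared per inode rather than
			 * per OOM victim */
			i_mmap_lock_write(mapping);
			__remove_shared_vm_struct(vma, file, mapping);
			i_mmap_unlock_write(mapping);
		}
	}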

So, where is a safe "point of no blocking"?

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-09-06 13:40                 ` Tetsuo Handa
@ 2018-09-06 13:56                   ` Michal Hocko
  2018-09-06 14:06                     ` Tetsuo Handa
  0 siblings, 1 reply; 38+ messages in thread
From: Michal Hocko @ 2018-09-06 13:56 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: David Rientjes, linux-mm, Roman Gushchin

On Thu 06-09-18 22:40:24, Tetsuo Handa wrote:
> On 2018/09/06 21:05, Michal Hocko wrote:
> >> If you are too busy, please show the "point of no blocking" in the source code
> >> instead. If such a "point of no blocking" really exists, it could just as well
> >> be executed by the allocating threads themselves.
> > 
> > I would have to study this much deeper, but I _suspect_ that we are not
> > taking any blocking locks right after we return from unmap_vmas - in
> > other words, past the place where we used to synchronize with the
> > oom_reaper.
> 
> See commit 97b1255cb27c551d ("mm,oom_reaper: check for MMF_OOM_SKIP before
> complaining"). Since this lock dependency is inode-based (i.e. unrelated to
> OOM victims), waiting for this lock can livelock.
> 
> So, where is a safe "point of no blocking"?

Ohh, right: unlink_file_vma() and its i_mmap_rwsem lock. As I've said, I
have to think about that some more. Maybe we can split those into two parts.

-- 
Michal Hocko
SUSE Labs

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-09-06 13:56                   ` Michal Hocko
@ 2018-09-06 14:06                     ` Tetsuo Handa
  2018-09-06 14:16                       ` Michal Hocko
  0 siblings, 1 reply; 38+ messages in thread
From: Tetsuo Handa @ 2018-09-06 14:06 UTC (permalink / raw)
  To: Michal Hocko; +Cc: David Rientjes, linux-mm, Roman Gushchin

On 2018/09/06 22:56, Michal Hocko wrote:
> On Thu 06-09-18 22:40:24, Tetsuo Handa wrote:
>> On 2018/09/06 21:05, Michal Hocko wrote:
>>>> If you are too busy, please show the "point of no blocking" in the source code
>>>> instead. If such a "point of no blocking" really exists, it could just as well
>>>> be executed by the allocating threads themselves.
>>>
>>> I would have to study this much deeper, but I _suspect_ that we are not
>>> taking any blocking locks right after we return from unmap_vmas - in
>>> other words, past the place where we used to synchronize with the
>>> oom_reaper.
>>
>> See commit 97b1255cb27c551d ("mm,oom_reaper: check for MMF_OOM_SKIP before
>> complaining"). Since this lock dependency is inode-based (i.e. unrelated to
>> OOM victims), waiting for this lock can livelock.
>>
>> So, where is a safe "point of no blocking"?
> 
> Ohh, right: unlink_file_vma() and its i_mmap_rwsem lock. As I've said, I
> have to think about that some more. Maybe we can split those into two parts.
> 

Meanwhile, I'd really like to use a timeout-based back off. As I wrote at
http://lkml.kernel.org/r/201809060703.w8673Kbs076435@www262.sakura.ne.jp ,
we need to wait for some period after all.

We can replace the timeout-based back off once we have a safe "point of no blocking".

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-09-06 14:06                     ` Tetsuo Handa
@ 2018-09-06 14:16                       ` Michal Hocko
  2018-09-06 21:13                         ` Tetsuo Handa
  0 siblings, 1 reply; 38+ messages in thread
From: Michal Hocko @ 2018-09-06 14:16 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: David Rientjes, linux-mm, Roman Gushchin

On Thu 06-09-18 23:06:40, Tetsuo Handa wrote:
> On 2018/09/06 22:56, Michal Hocko wrote:
> > On Thu 06-09-18 22:40:24, Tetsuo Handa wrote:
> >> On 2018/09/06 21:05, Michal Hocko wrote:
> >>>> If you are too busy, please show the "point of no blocking" in the source code
> >>>> instead. If such a "point of no blocking" really exists, it could just as well
> >>>> be executed by the allocating threads themselves.
> >>>
> >>> I would have to study this much deeper, but I _suspect_ that we are not
> >>> taking any blocking locks right after we return from unmap_vmas - in
> >>> other words, past the place where we used to synchronize with the
> >>> oom_reaper.
> >>
> >> See commit 97b1255cb27c551d ("mm,oom_reaper: check for MMF_OOM_SKIP before
> >> complaining"). Since this lock dependency is inode-based (i.e. unrelated to
> >> OOM victims), waiting for this lock can livelock.
> >>
> >> So, where is a safe "point of no blocking"?
> > 
> > Ohh, right: unlink_file_vma() and its i_mmap_rwsem lock. As I've said, I
> > have to think about that some more. Maybe we can split those into two parts.
> > 
> 
> Meanwhile, I'd really like to use a timeout-based back off. As I wrote at
> http://lkml.kernel.org/r/201809060703.w8673Kbs076435@www262.sakura.ne.jp ,
> we need to wait for some period after all.
>
> We can replace the timeout-based back off once we have a safe "point of no blocking".

Why don't you invest your time in the long term solution rather than
playing with something that doesn't solve anything and just papers over
the issue?
-- 
Michal Hocko
SUSE Labs

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-09-06 14:16                       ` Michal Hocko
@ 2018-09-06 21:13                         ` Tetsuo Handa
  2018-09-07 11:10                           ` Michal Hocko
  0 siblings, 1 reply; 38+ messages in thread
From: Tetsuo Handa @ 2018-09-06 21:13 UTC (permalink / raw)
  To: Michal Hocko; +Cc: David Rientjes, linux-mm, Roman Gushchin

On 2018/09/06 23:16, Michal Hocko wrote:
> On Thu 06-09-18 23:06:40, Tetsuo Handa wrote:
>> On 2018/09/06 22:56, Michal Hocko wrote:
>>> On Thu 06-09-18 22:40:24, Tetsuo Handa wrote:
>>>> On 2018/09/06 21:05, Michal Hocko wrote:
>>>>>> If you are too busy, please show the "point of no blocking" in the source code
>>>>>> instead. If such a "point of no blocking" really exists, it could just as well
>>>>>> be executed by the allocating threads themselves.
>>>>>
>>>>> I would have to study this much deeper, but I _suspect_ that we are not
>>>>> taking any blocking locks right after we return from unmap_vmas - in
>>>>> other words, past the place where we used to synchronize with the
>>>>> oom_reaper.
>>>>
>>>> See commit 97b1255cb27c551d ("mm,oom_reaper: check for MMF_OOM_SKIP before
>>>> complaining"). Since this lock dependency is inode-based (i.e. unrelated to
>>>> OOM victims), waiting for this lock can livelock.
>>>>
>>>> So, where is a safe "point of no blocking"?
>>>
>>> Ohh, right: unlink_file_vma() and its i_mmap_rwsem lock. As I've said, I
>>> have to think about that some more. Maybe we can split those into two parts.
>>>
>>
>> Meanwhile, I'd really like to use a timeout-based back off. As I wrote at
>> http://lkml.kernel.org/r/201809060703.w8673Kbs076435@www262.sakura.ne.jp ,
>> we need to wait for some period after all.
>>
>> We can replace the timeout-based back off once we have a safe "point of no blocking".
> 
> Why don't you invest your time in the long term solution rather than
> playing with something that doesn't solve anything and just papers over
> the issue?
> 

I am not an MM person. I am a security programmer from the security
subsystem. You are almost always introducing bugs (what you call dragons)
rather than starting from safe changes. The OOM killer _is_ always racy.
Even what you consider the long term solution _will be_ racy. I can't
waste my time on what you think the long term solution is. Please don't
refuse/ignore my (or David's) patches without counter patches of your own.

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-09-06 21:13                         ` Tetsuo Handa
@ 2018-09-07 11:10                           ` Michal Hocko
  2018-09-07 11:36                             ` Tetsuo Handa
  0 siblings, 1 reply; 38+ messages in thread
From: Michal Hocko @ 2018-09-07 11:10 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: David Rientjes, linux-mm, Roman Gushchin

On Fri 07-09-18 06:13:13, Tetsuo Handa wrote:
> On 2018/09/06 23:16, Michal Hocko wrote:
> > On Thu 06-09-18 23:06:40, Tetsuo Handa wrote:
> >> On 2018/09/06 22:56, Michal Hocko wrote:
> >>> On Thu 06-09-18 22:40:24, Tetsuo Handa wrote:
> >>>> On 2018/09/06 21:05, Michal Hocko wrote:
> >>>>>> If you are too busy, please show the "point of no blocking" in the source code
> >>>>>> instead. If such a "point of no blocking" really exists, it could just as well
> >>>>>> be executed by the allocating threads themselves.
> >>>>>
> >>>>> I would have to study this much deeper, but I _suspect_ that we are not
> >>>>> taking any blocking locks right after we return from unmap_vmas - in
> >>>>> other words, past the place where we used to synchronize with the
> >>>>> oom_reaper.
> >>>>
> >>>> See commit 97b1255cb27c551d ("mm,oom_reaper: check for MMF_OOM_SKIP before
> >>>> complaining"). Since this lock dependency is inode-based (i.e. unrelated to
> >>>> OOM victims), waiting for this lock can livelock.
> >>>>
> >>>> So, where is a safe "point of no blocking"?
> >>>
> >>> Ohh, right: unlink_file_vma() and its i_mmap_rwsem lock. As I've said, I
> >>> have to think about that some more. Maybe we can split those into two parts.
> >>>
> >>
> >> Meanwhile, I'd really like to use a timeout-based back off. As I wrote at
> >> http://lkml.kernel.org/r/201809060703.w8673Kbs076435@www262.sakura.ne.jp ,
> >> we need to wait for some period after all.
> >>
> >> We can replace the timeout-based back off once we have a safe "point of no blocking".
> > 
> > Why don't you invest your time in the long term solution rather than
> > playing with something that doesn't solve anything and just papers over
> > the issue?
> > 
> 
> I am not an MM person. I am a security programmer from the security subsystem.

OK, so let me be completely honest with you. You make pretty strong
statements about the MM code while not considering yourself an MM person.
You are suggesting hacks which do not solve the real underlying problems,
and I will keep shooting those down.

> You are almost always introducing bugs (what you call dragons) rather
> than starting from safe changes. The OOM killer _is_ always racy.
> Even what you consider the long term solution _will be_ racy.

The reason why this area is so easy to get wrong is basically a lack of
comprehensible design. We have historical hacks here and there. I really
do not want to follow that direction, and as long as my word has some
weight (which is not my decision, of course) I will keep fighting for
simplifications and overall design refinements. If we are to add
heuristics, they should be backed by well-understood workloads we
actually care about.

You might have your toy workload that hits different corner cases, and
testing those is fine. But I absolutely disagree with basing any
non-trivial changes on that kind of workload unless they are a general
improvement.

If you disagree then we have to agree to disagree, and it doesn't make
much sense to continue the discussion.

> I can't waste my time on what you think the long term solution is. Please
> don't refuse/ignore my (or David's) patches without counter patches of
> your own.

If you do not care about the long term sanity of the code, and if you do
not care about the larger picture, then I am not interested in any patches
from you. MM code is far from trivial and is no playground. This attitude
of yours is just dangerous.
-- 
Michal Hocko
SUSE Labs

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-09-07 11:10                           ` Michal Hocko
@ 2018-09-07 11:36                             ` Tetsuo Handa
  2018-09-07 11:51                               ` Michal Hocko
  0 siblings, 1 reply; 38+ messages in thread
From: Tetsuo Handa @ 2018-09-07 11:36 UTC (permalink / raw)
  To: Michal Hocko; +Cc: David Rientjes, linux-mm, Roman Gushchin

On 2018/09/07 20:10, Michal Hocko wrote:
>> I can't waste my time on what you think the long term solution is. Please
>> don't refuse/ignore my (or David's) patches without counter patches of
>> your own.
> 
> If you do not care about the long term sanity of the code, and if you do
> not care about the larger picture, then I am not interested in any patches
> from you. MM code is far from trivial and is no playground. This attitude
> of yours is just dangerous.
> 

Then, please explain how we guarantee that enough CPU time is spent
between "exit_mmap() sets MMF_OOM_SKIP" and "the OOM killer finds
MMF_OOM_SKIP already set" so that a last second allocation against the
high watermark cannot fail when 50% of available memory has already been
reclaimed.
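(For reference, the window in question is the MMF_OOM_SKIP test in
oom_evaluate_task(); 4.18-era mm/oom_kill.c, abbreviated:

	static int oom_evaluate_task(struct task_struct *task, void *arg)
	{
		...
		if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
			/* an existing victim blocks further kills ... */
			if (test_bit(MMF_OOM_SKIP,
				     &task->signal->oom_mm->flags))
				goto next;	/* ... unless it is skipped */
			goto abort;
		}
		...
	}

once MMF_OOM_SKIP becomes visible here, nothing prevents selecting a new
victim, no matter how much of the old victim's memory is still being freed.)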

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-09-07 11:36                             ` Tetsuo Handa
@ 2018-09-07 11:51                               ` Michal Hocko
  2018-09-07 13:30                                 ` Tetsuo Handa
  0 siblings, 1 reply; 38+ messages in thread
From: Michal Hocko @ 2018-09-07 11:51 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: David Rientjes, linux-mm, Roman Gushchin

On Fri 07-09-18 20:36:31, Tetsuo Handa wrote:
> On 2018/09/07 20:10, Michal Hocko wrote:
> >> I can't waste my time on what you think the long term solution is. Please
> >> don't refuse/ignore my (or David's) patches without counter patches of
> >> your own.
> > 
> > If you do not care about the long term sanity of the code, and if you do
> > not care about the larger picture, then I am not interested in any patches
> > from you. MM code is far from trivial and is no playground. This attitude
> > of yours is just dangerous.
> > 
> 
> Then, please explain how we guarantee that enough CPU time is spent
> between "exit_mmap() sets MMF_OOM_SKIP" and "the OOM killer finds
> MMF_OOM_SKIP already set" so that a last second allocation against the
> high watermark cannot fail when 50% of available memory has already been
> reclaimed.

There is no guarantee. Full stop! This is an inherently racy land. We
can strive to work reasonably well, but this will never be perfect. And
no, no timeout is going to solve it either. We have to live with the
fact that sometimes we hit the race and kill an additional task. As long
as there are no reasonable workloads which hit this race, we are good
enough.

The only guarantee we can talk about is a forward progress guarantee.
If we know that exit_mmap is past the blocking point, then we can hand
over the MMF_OOM_SKIP setting to the exit path rather than the oom_reaper.
A last moment (minute, millisecond, nanosecond for that matter) allocation
is in no way related, nor solvable without strong locking, and we have
learned in the past that this is not a good idea.

This is nothing new though. This discussion is not moving forward; it
just burns time, so this is my last email in this thread.
-- 
Michal Hocko
SUSE Labs

* Re: [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes.
  2018-09-07 11:51                               ` Michal Hocko
@ 2018-09-07 13:30                                 ` Tetsuo Handa
  0 siblings, 0 replies; 38+ messages in thread
From: Tetsuo Handa @ 2018-09-07 13:30 UTC (permalink / raw)
  To: Michal Hocko
  Cc: David Rientjes, linux-mm, Roman Gushchin, Andrew Morton, Linus Torvalds

On 2018/09/07 20:51, Michal Hocko wrote:
> On Fri 07-09-18 20:36:31, Tetsuo Handa wrote:
>> On 2018/09/07 20:10, Michal Hocko wrote:
>>>> I can't waste my time on what you think the long term solution is. Please
>>>> don't refuse/ignore my (or David's) patches without counter patches of
>>>> your own.
>>>
>>> If you do not care about the long term sanity of the code, and if you do
>>> not care about the larger picture, then I am not interested in any patches
>>> from you. MM code is far from trivial and is no playground. This attitude
>>> of yours is just dangerous.
>>>
>>
>> Then, please explain how we guarantee that enough CPU time is spent
>> between "exit_mmap() sets MMF_OOM_SKIP" and "the OOM killer finds
>> MMF_OOM_SKIP already set" so that a last second allocation against the
>> high watermark cannot fail when 50% of available memory has already been
>> reclaimed.
> 
> There is no guarantee. Full stop! This is an inherently racy land. We
> can strive to work reasonably well, but this will never be perfect.

That is explanation enough: we have no choice but to mitigate it using
heuristics. No feedback-based approach is possible. My patch (or David's)
has been justified. Thank you!

end of thread

Thread overview: 38+ messages
2018-08-04 13:29 [PATCH 1/4] mm, oom: Remove wake_oom_reaper() Tetsuo Handa
2018-08-04 13:29 ` [PATCH 2/4] mm, oom: Check pending victims earlier in out_of_memory() Tetsuo Handa
2018-08-04 13:29 ` [PATCH 3/4] mm, oom: Remove unused "abort" path Tetsuo Handa
2018-08-04 13:29 ` [PATCH 4/4] mm, oom: Fix unnecessary killing of additional processes Tetsuo Handa
2018-08-06 13:45   ` Michal Hocko
2018-08-06 20:19     ` David Rientjes
2018-08-06 20:51       ` Michal Hocko
2018-08-09 20:16         ` David Rientjes
2018-08-10  9:07           ` Michal Hocko
2018-08-10 10:54             ` Tetsuo Handa
2018-08-10 11:16               ` Michal Hocko
2018-08-11  3:12                 ` Tetsuo Handa
2018-08-14 11:33                   ` Michal Hocko
2018-08-19 14:23                     ` Tetsuo Handa
2018-08-20  5:54                       ` Michal Hocko
2018-08-20 22:03                         ` Tetsuo Handa
2018-08-21  6:16                           ` Michal Hocko
2018-08-21 13:39                             ` Tetsuo Handa
2018-08-19 23:45             ` David Rientjes
2018-08-20  6:07               ` Michal Hocko
2018-08-20 21:31                 ` David Rientjes
2018-08-21  6:09                   ` Michal Hocko
2018-08-21 17:20                     ` David Rientjes
2018-08-22  8:03                       ` Michal Hocko
2018-08-22 20:54                         ` David Rientjes
2018-09-01 11:48         ` Tetsuo Handa
2018-09-06 11:35           ` Michal Hocko
2018-09-06 11:50             ` Tetsuo Handa
2018-09-06 12:05               ` Michal Hocko
2018-09-06 13:40                 ` Tetsuo Handa
2018-09-06 13:56                   ` Michal Hocko
2018-09-06 14:06                     ` Tetsuo Handa
2018-09-06 14:16                       ` Michal Hocko
2018-09-06 21:13                         ` Tetsuo Handa
2018-09-07 11:10                           ` Michal Hocko
2018-09-07 11:36                             ` Tetsuo Handa
2018-09-07 11:51                               ` Michal Hocko
2018-09-07 13:30                                 ` Tetsuo Handa
