* [PATCH v2 1/2] mm, oom: refactor dump_tasks for memcg OOMs
@ 2019-06-17 23:12 ` Shakeel Butt
  0 siblings, 0 replies; 6+ messages in thread
From: Shakeel Butt @ 2019-06-17 23:12 UTC (permalink / raw)
  To: Johannes Weiner, Tetsuo Handa, Michal Hocko, Andrew Morton,
	Roman Gushchin
  Cc: linux-mm, linux-kernel, Shakeel Butt

dump_tasks() currently goes through all the processes present on the
system even for memcg OOMs. Change dump_tasks() to be similar to
select_bad_process() and use mem_cgroup_scan_tasks() to selectively
traverse the processes of the target memcg hierarchy during a memcg OOM.
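
For reference, a minimal sketch of the callback contract assumed here,
using the mem_cgroup_scan_tasks() signature from mm/memcontrol.c of this
period (the count_task() callback and nr_memcg_tasks() helper are
hypothetical, for illustration only):

	/* Callback invoked for each task charged to the target hierarchy. */
	static int count_task(struct task_struct *p, void *arg)
	{
		unsigned int *nr = arg;	/* illustrative accumulator */

		(*nr)++;
		return 0;	/* a nonzero return stops the scan early */
	}

	static unsigned int nr_memcg_tasks(struct mem_cgroup *memcg)
	{
		unsigned int nr = 0;

		/* Walks only @memcg's hierarchy, not the global task list. */
		mem_cgroup_scan_tasks(memcg, count_task, &nr);
		return nr;
	}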

Signed-off-by: Shakeel Butt <shakeelb@google.com>
---
Changelog since v1:
- Divide the patch into two patches.

 mm/oom_kill.c | 68 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 40 insertions(+), 28 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 05aaa1a5920b..bd80997e0969 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -385,10 +385,38 @@ static void select_bad_process(struct oom_control *oc)
 	oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages;
 }
 
+static int dump_task(struct task_struct *p, void *arg)
+{
+	struct oom_control *oc = arg;
+	struct task_struct *task;
+
+	if (oom_unkillable_task(p, NULL, oc->nodemask))
+		return 0;
+
+	task = find_lock_task_mm(p);
+	if (!task) {
+		/*
+		 * This is a kthread or all of p's threads have already
+		 * detached their mm's.  There's no need to report
+		 * them; they can't be oom killed anyway.
+		 */
+		return 0;
+	}
+
+	pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
+		task->pid, from_kuid(&init_user_ns, task_uid(task)),
+		task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
+		mm_pgtables_bytes(task->mm),
+		get_mm_counter(task->mm, MM_SWAPENTS),
+		task->signal->oom_score_adj, task->comm);
+	task_unlock(task);
+
+	return 0;
+}
+
 /**
  * dump_tasks - dump current memory state of all system tasks
- * @memcg: current's memory controller, if constrained
- * @nodemask: nodemask passed to page allocator for mempolicy ooms
+ * @oc: pointer to struct oom_control
  *
  * Dumps the current memory state of all eligible tasks.  Tasks not in the same
  * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
@@ -396,37 +424,21 @@ static void select_bad_process(struct oom_control *oc)
  * State information includes task's pid, uid, tgid, vm size, rss,
  * pgtables_bytes, swapents, oom_score_adj value, and name.
  */
-static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
+static void dump_tasks(struct oom_control *oc)
 {
-	struct task_struct *p;
-	struct task_struct *task;
-
 	pr_info("Tasks state (memory values in pages):\n");
 	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
-	rcu_read_lock();
-	for_each_process(p) {
-		if (oom_unkillable_task(p, memcg, nodemask))
-			continue;
 
-		task = find_lock_task_mm(p);
-		if (!task) {
-			/*
-			 * This is a kthread or all of p's threads have already
-			 * detached their mm's.  There's no need to report
-			 * them; they can't be oom killed anyway.
-			 */
-			continue;
-		}
+	if (is_memcg_oom(oc))
+		mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
+	else {
+		struct task_struct *p;
 
-		pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
-			task->pid, from_kuid(&init_user_ns, task_uid(task)),
-			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
-			mm_pgtables_bytes(task->mm),
-			get_mm_counter(task->mm, MM_SWAPENTS),
-			task->signal->oom_score_adj, task->comm);
-		task_unlock(task);
+		rcu_read_lock();
+		for_each_process(p)
+			dump_task(p, oc);
+		rcu_read_unlock();
 	}
-	rcu_read_unlock();
 }
 
 static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
@@ -458,7 +470,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p)
 			dump_unreclaimable_slab();
 	}
 	if (sysctl_oom_dump_tasks)
-		dump_tasks(oc->memcg, oc->nodemask);
+		dump_tasks(oc);
 	if (p)
 		dump_oom_summary(oc, p);
 }
-- 
2.22.0.410.gd8fdbe21b5-goog


* [PATCH v2 2/2] mm, oom: fix oom_unkillable_task for memcg OOMs
  2019-06-17 23:12 ` Shakeel Butt
@ 2019-06-17 23:12   ` Shakeel Butt
  0 siblings, 0 replies; 6+ messages in thread
From: Shakeel Butt @ 2019-06-17 23:12 UTC (permalink / raw)
  To: Johannes Weiner, Tetsuo Handa, Michal Hocko, Andrew Morton,
	Roman Gushchin
  Cc: linux-mm, linux-kernel, Shakeel Butt

Currently oom_unkillable_task() checks mems_allowed even for memcg OOMs,
which does not make sense as memcg OOMs cannot be triggered by NUMA
constraints. Fixing that.

This commit also removes the bogus usage of oom_unkillable_task() from
oom_badness(). Currently reading /proc/[pid]/oom_score will do a bogus
cpuset_mems_allowed_intersects() check. Removing that.
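
For context, a sketch of the pre-patch call chain that makes the reported
score depend on the reader (the helper names are real; the chain is
abbreviated):

	/*
	 * proc_oom_score()                  reader of /proc/[pid]/oom_score
	 *   -> oom_badness(task, NULL, NULL, totalpages)
	 *        -> oom_unkillable_task(task, NULL, NULL)
	 *             -> has_intersects_mems_allowed(task, NULL)
	 *                  -> cpuset_mems_allowed_intersects(current, task)
	 *
	 * The last step intersects the reader's cpuset with the target's,
	 * so two readers in disjoint cpusets can see different scores.
	 */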

Signed-off-by: Shakeel Butt <shakeelb@google.com>
---
Changelog since v1:
- Divide the patch into two patches.

 fs/proc/base.c      |  3 +--
 include/linux/oom.h |  1 -
 mm/oom_kill.c       | 28 +++++++++++++++-------------
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index b8d5d100ed4a..57b7a0d75ef5 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -532,8 +532,7 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
 	unsigned long totalpages = totalram_pages() + total_swap_pages;
 	unsigned long points = 0;
 
-	points = oom_badness(task, NULL, NULL, totalpages) *
-					1000 / totalpages;
+	points = oom_badness(task, totalpages) * 1000 / totalpages;
 	seq_printf(m, "%lu\n", points);
 
 	return 0;
diff --git a/include/linux/oom.h b/include/linux/oom.h
index d07992009265..c696c265f019 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -108,7 +108,6 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
 bool __oom_reap_task_mm(struct mm_struct *mm);
 
 extern unsigned long oom_badness(struct task_struct *p,
-		struct mem_cgroup *memcg, const nodemask_t *nodemask,
 		unsigned long totalpages);
 
 extern bool out_of_memory(struct oom_control *oc);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index bd80997e0969..d779d9da1069 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -152,20 +152,23 @@ static inline bool is_memcg_oom(struct oom_control *oc)
 }
 
 /* return true if the task is not adequate as candidate victim task. */
-static bool oom_unkillable_task(struct task_struct *p,
-		struct mem_cgroup *memcg, const nodemask_t *nodemask)
+static bool oom_unkillable_task(struct task_struct *p, struct oom_control *oc)
 {
 	if (is_global_init(p))
 		return true;
 	if (p->flags & PF_KTHREAD)
 		return true;
 
-	/* When mem_cgroup_out_of_memory() and p is not member of the group */
-	if (memcg && !task_in_mem_cgroup(p, memcg))
-		return true;
+	/*
+	 * For memcg OOM, we reach here through mem_cgroup_scan_tasks(), no
+	 * need to check p's memcg membership and the checks after this
+	 * are irrelevant for memcg OOMs.
+	 */
+	if (is_memcg_oom(oc))
+		return false;
 
 	/* p may not have freeable memory in nodemask */
-	if (!has_intersects_mems_allowed(p, nodemask))
+	if (!has_intersects_mems_allowed(p, oc->nodemask))
 		return true;
 
 	return false;
@@ -201,13 +204,12 @@ static bool is_dump_unreclaim_slabs(void)
  * predictable as possible.  The goal is to return the highest value for the
  * task consuming the most memory to avoid subsequent oom failures.
  */
-unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
-			  const nodemask_t *nodemask, unsigned long totalpages)
+unsigned long oom_badness(struct task_struct *p, unsigned long totalpages)
 {
 	long points;
 	long adj;
 
-	if (oom_unkillable_task(p, memcg, nodemask))
+	if (is_global_init(p) || p->flags & PF_KTHREAD)
 		return 0;
 
 	p = find_lock_task_mm(p);
@@ -318,7 +320,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 	struct oom_control *oc = arg;
 	unsigned long points;
 
-	if (oom_unkillable_task(task, NULL, oc->nodemask))
+	if (oom_unkillable_task(task, oc))
 		goto next;
 
 	/*
@@ -342,7 +344,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 		goto select;
 	}
 
-	points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
+	points = oom_badness(task, oc->totalpages);
 	if (!points || points < oc->chosen_points)
 		goto next;
 
@@ -390,7 +392,7 @@ static int dump_task(struct task_struct *p, void *arg)
 	struct oom_control *oc = arg;
 	struct task_struct *task;
 
-	if (oom_unkillable_task(p, NULL, oc->nodemask))
+	if (oom_unkillable_task(p, oc))
 		return 0;
 
 	task = find_lock_task_mm(p);
@@ -1090,7 +1092,7 @@ bool out_of_memory(struct oom_control *oc)
 	check_panic_on_oom(oc, constraint);
 
 	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
-	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
+	    current->mm && !oom_unkillable_task(current, oc) &&
 	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
 		get_task_struct(current);
 		oc->chosen = current;
-- 
2.22.0.410.gd8fdbe21b5-goog


* Re: [PATCH v2 1/2] mm, oom: refactor dump_tasks for memcg OOMs
  2019-06-17 23:12 ` Shakeel Butt
@ 2019-06-18 12:24 ` Michal Hocko
  0 siblings, 0 replies; 6+ messages in thread
From: Michal Hocko @ 2019-06-18 12:24 UTC (permalink / raw)
  To: Shakeel Butt
  Cc: Johannes Weiner, Roman Gushchin, Tetsuo Handa, Andrew Morton,
	linux-mm, linux-kernel

On Mon 17-06-19 16:12:06, Shakeel Butt wrote:
> dump_tasks() currently goes through all the processes present on the
> system even for memcg OOMs. Change dump_tasks() to be similar to
> select_bad_process() and use mem_cgroup_scan_tasks() to selectively
> traverse the processes of the target memcg hierarchy during a memcg OOM.

The changelog is quite modest to be honest. I would go with

"
dump_tasks() traverses all the existing processes even for the memcg OOM
context, which is not only unnecessary but also wasteful. This imposes
a long RCU critical section even from a contained context, which can be
quite disruptive.

Change dump_tasks() to be aligned with select_bad_process and use
mem_cgroup_scan_tasks to selectively traverse only processes of the
target memcg hierarchy during memcg OOM.
"

> Signed-off-by: Shakeel Butt <shakeelb@google.com>

Acked-by: Michal Hocko <mhocko@suse.com>

Thanks!

> ---
> Changelog since v1:
> - Divide the patch into two patches.
> 
>  mm/oom_kill.c | 68 ++++++++++++++++++++++++++++++---------------------
>  1 file changed, 40 insertions(+), 28 deletions(-)
> 
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 05aaa1a5920b..bd80997e0969 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -385,10 +385,38 @@ static void select_bad_process(struct oom_control *oc)
>  	oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages;
>  }
>  
> +static int dump_task(struct task_struct *p, void *arg)
> +{
> +	struct oom_control *oc = arg;
> +	struct task_struct *task;
> +
> +	if (oom_unkillable_task(p, NULL, oc->nodemask))
> +		return 0;
> +
> +	task = find_lock_task_mm(p);
> +	if (!task) {
> +		/*
> +		 * This is a kthread or all of p's threads have already
> +		 * detached their mm's.  There's no need to report
> +		 * them; they can't be oom killed anyway.
> +		 */
> +		return 0;
> +	}
> +
> +	pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
> +		task->pid, from_kuid(&init_user_ns, task_uid(task)),
> +		task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
> +		mm_pgtables_bytes(task->mm),
> +		get_mm_counter(task->mm, MM_SWAPENTS),
> +		task->signal->oom_score_adj, task->comm);
> +	task_unlock(task);
> +
> +	return 0;
> +}
> +
>  /**
>   * dump_tasks - dump current memory state of all system tasks
> - * @memcg: current's memory controller, if constrained
> - * @nodemask: nodemask passed to page allocator for mempolicy ooms
> + * @oc: pointer to struct oom_control
>   *
>   * Dumps the current memory state of all eligible tasks.  Tasks not in the same
>   * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
> @@ -396,37 +424,21 @@ static void select_bad_process(struct oom_control *oc)
>   * State information includes task's pid, uid, tgid, vm size, rss,
>   * pgtables_bytes, swapents, oom_score_adj value, and name.
>   */
> -static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
> +static void dump_tasks(struct oom_control *oc)
>  {
> -	struct task_struct *p;
> -	struct task_struct *task;
> -
>  	pr_info("Tasks state (memory values in pages):\n");
>  	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
> -	rcu_read_lock();
> -	for_each_process(p) {
> -		if (oom_unkillable_task(p, memcg, nodemask))
> -			continue;
>  
> -		task = find_lock_task_mm(p);
> -		if (!task) {
> -			/*
> -			 * This is a kthread or all of p's threads have already
> -			 * detached their mm's.  There's no need to report
> -			 * them; they can't be oom killed anyway.
> -			 */
> -			continue;
> -		}
> +	if (is_memcg_oom(oc))
> +		mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
> +	else {
> +		struct task_struct *p;
>  
> -		pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
> -			task->pid, from_kuid(&init_user_ns, task_uid(task)),
> -			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
> -			mm_pgtables_bytes(task->mm),
> -			get_mm_counter(task->mm, MM_SWAPENTS),
> -			task->signal->oom_score_adj, task->comm);
> -		task_unlock(task);
> +		rcu_read_lock();
> +		for_each_process(p)
> +			dump_task(p, oc);
> +		rcu_read_unlock();
>  	}
> -	rcu_read_unlock();
>  }
>  
>  static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
> @@ -458,7 +470,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p)
>  			dump_unreclaimable_slab();
>  	}
>  	if (sysctl_oom_dump_tasks)
> -		dump_tasks(oc->memcg, oc->nodemask);
> +		dump_tasks(oc);
>  	if (p)
>  		dump_oom_summary(oc, p);
>  }
> -- 
> 2.22.0.410.gd8fdbe21b5-goog
> 

-- 
Michal Hocko
SUSE Labs


* Re: [PATCH v2 2/2] mm, oom: fix oom_unkillable_task for memcg OOMs
  2019-06-17 23:12   ` Shakeel Butt
@ 2019-06-18 12:34   ` Michal Hocko
  -1 siblings, 0 replies; 6+ messages in thread
From: Michal Hocko @ 2019-06-18 12:34 UTC (permalink / raw)
  To: Shakeel Butt
  Cc: Johannes Weiner, Roman Gushchin, Tetsuo Handa, Andrew Morton,
	linux-mm, linux-kernel

On Mon 17-06-19 16:12:07, Shakeel Butt wrote:
> Currently oom_unkillable_task() checks mems_allowed even for memcg OOMs,
> which does not make sense as memcg OOMs cannot be triggered by NUMA
> constraints. Fixing that.

"Fixing that" is a poor description of the fix. Also it is quite useful
to note that it is not only bogus to check mems_allowed. It is also
harmful as per the syzbot test IIRC. Pasting the report here would
be helpful as well.

> This commit also removes the bogus usage of oom_unkillable_task() from
> oom_badness(). Currently reading /proc/[pid]/oom_score will do a bogus
> cpuset_mems_allowed_intersects() check. Removing that.

Again, there shouldn't be any real reason to squash the two things into
a single patch. This is a subtle bug/behavior on its own because the
result of oom_badness depends on the calling process context. This
should be called out in the changelog explicitly.

> Signed-off-by: Shakeel Butt <shakeelb@google.com>

Other than that it looks good to me. I will ack after the split-out and
the changelog improvements.

Thanks!

> ---
> Changelog since v1:
> - Divide the patch into two patches.
> 
>  fs/proc/base.c      |  3 +--
>  include/linux/oom.h |  1 -
>  mm/oom_kill.c       | 28 +++++++++++++++-------------
>  3 files changed, 16 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index b8d5d100ed4a..57b7a0d75ef5 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -532,8 +532,7 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
>  	unsigned long totalpages = totalram_pages() + total_swap_pages;
>  	unsigned long points = 0;
>  
> -	points = oom_badness(task, NULL, NULL, totalpages) *
> -					1000 / totalpages;
> +	points = oom_badness(task, totalpages) * 1000 / totalpages;
>  	seq_printf(m, "%lu\n", points);
>  
>  	return 0;
> diff --git a/include/linux/oom.h b/include/linux/oom.h
> index d07992009265..c696c265f019 100644
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -108,7 +108,6 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
>  bool __oom_reap_task_mm(struct mm_struct *mm);
>  
>  extern unsigned long oom_badness(struct task_struct *p,
> -		struct mem_cgroup *memcg, const nodemask_t *nodemask,
>  		unsigned long totalpages);
>  
>  extern bool out_of_memory(struct oom_control *oc);
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index bd80997e0969..d779d9da1069 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -152,20 +152,23 @@ static inline bool is_memcg_oom(struct oom_control *oc)
>  }
>  
>  /* return true if the task is not adequate as candidate victim task. */
> -static bool oom_unkillable_task(struct task_struct *p,
> -		struct mem_cgroup *memcg, const nodemask_t *nodemask)
> +static bool oom_unkillable_task(struct task_struct *p, struct oom_control *oc)
>  {
>  	if (is_global_init(p))
>  		return true;
>  	if (p->flags & PF_KTHREAD)
>  		return true;
>  
> -	/* When mem_cgroup_out_of_memory() and p is not member of the group */
> -	if (memcg && !task_in_mem_cgroup(p, memcg))
> -		return true;
> +	/*
> +	 * For memcg OOM, we reach here through mem_cgroup_scan_tasks(), no
> +	 * need to check p's memcg membership and the checks after this
> +	 * are irrelevant for memcg OOMs.
> +	 */
> +	if (is_memcg_oom(oc))
> +		return false;
>  
>  	/* p may not have freeable memory in nodemask */
> -	if (!has_intersects_mems_allowed(p, nodemask))
> +	if (!has_intersects_mems_allowed(p, oc->nodemask))
>  		return true;
>  
>  	return false;
> @@ -201,13 +204,12 @@ static bool is_dump_unreclaim_slabs(void)
>   * predictable as possible.  The goal is to return the highest value for the
>   * task consuming the most memory to avoid subsequent oom failures.
>   */
> -unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
> -			  const nodemask_t *nodemask, unsigned long totalpages)
> +unsigned long oom_badness(struct task_struct *p, unsigned long totalpages)
>  {
>  	long points;
>  	long adj;
>  
> -	if (oom_unkillable_task(p, memcg, nodemask))
> +	if (is_global_init(p) || p->flags & PF_KTHREAD)
>  		return 0;
>  
>  	p = find_lock_task_mm(p);
> @@ -318,7 +320,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
>  	struct oom_control *oc = arg;
>  	unsigned long points;
>  
> -	if (oom_unkillable_task(task, NULL, oc->nodemask))
> +	if (oom_unkillable_task(task, oc))
>  		goto next;
>  
>  	/*
> @@ -342,7 +344,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
>  		goto select;
>  	}
>  
> -	points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
> +	points = oom_badness(task, oc->totalpages);
>  	if (!points || points < oc->chosen_points)
>  		goto next;
>  
> @@ -390,7 +392,7 @@ static int dump_task(struct task_struct *p, void *arg)
>  	struct oom_control *oc = arg;
>  	struct task_struct *task;
>  
> -	if (oom_unkillable_task(p, NULL, oc->nodemask))
> +	if (oom_unkillable_task(p, oc))
>  		return 0;
>  
>  	task = find_lock_task_mm(p);
> @@ -1090,7 +1092,7 @@ bool out_of_memory(struct oom_control *oc)
>  	check_panic_on_oom(oc, constraint);
>  
>  	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
> -	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
> +	    current->mm && !oom_unkillable_task(current, oc) &&
>  	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
>  		get_task_struct(current);
>  		oc->chosen = current;
> -- 
> 2.22.0.410.gd8fdbe21b5-goog
> 

-- 
Michal Hocko
SUSE Labs
