linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/2] mm: oom: add memcg to oom_control
@ 2016-05-27 14:17 Vladimir Davydov
  2016-05-27 14:17 ` [PATCH 2/2] mm: oom: deduplicate victim selection code for memcg and global oom Vladimir Davydov
                   ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: Vladimir Davydov @ 2016-05-27 14:17 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Michal Hocko, Tetsuo Handa, David Rientjes, Johannes Weiner,
	linux-mm, linux-kernel

It's a part of oom context just like allocation order and nodemask, so
let's move it to oom_control instead of passing it in the argument list.

Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
---
 drivers/tty/sysrq.c |  1 +
 include/linux/oom.h |  8 +++++---
 mm/memcontrol.c     |  5 +++--
 mm/oom_kill.c       | 32 +++++++++++++++-----------------
 mm/page_alloc.c     |  1 +
 5 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index e5139402e7f8..52bbd27e93ae 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -363,6 +363,7 @@ static void moom_callback(struct work_struct *ignored)
 	struct oom_control oc = {
 		.zonelist = node_zonelist(first_memory_node, gfp_mask),
 		.nodemask = NULL,
+		.memcg = NULL,
 		.gfp_mask = gfp_mask,
 		.order = -1,
 	};
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 83469522690a..cbc24a5fe28d 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -23,6 +23,9 @@ struct oom_control {
 	/* Used to determine mempolicy */
 	nodemask_t *nodemask;
 
+	/* Memory cgroup in which oom is invoked, or NULL for global oom */
+	struct mem_cgroup *memcg;
+
 	/* Used to determine cpuset and node locality requirement */
 	const gfp_t gfp_mask;
 
@@ -83,11 +86,10 @@ extern unsigned long oom_badness(struct task_struct *p,
 
 extern void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 			     unsigned int points, unsigned long totalpages,
-			     struct mem_cgroup *memcg, const char *message);
+			     const char *message);
 
 extern void check_panic_on_oom(struct oom_control *oc,
-			       enum oom_constraint constraint,
-			       struct mem_cgroup *memcg);
+			       enum oom_constraint constraint);
 
 extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
 		struct task_struct *task, unsigned long totalpages);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 37ba604984c9..eeb3b14de01a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1259,6 +1259,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	struct oom_control oc = {
 		.zonelist = NULL,
 		.nodemask = NULL,
+		.memcg = memcg,
 		.gfp_mask = gfp_mask,
 		.order = order,
 	};
@@ -1281,7 +1282,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 		goto unlock;
 	}
 
-	check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg);
+	check_panic_on_oom(&oc, CONSTRAINT_MEMCG);
 	totalpages = mem_cgroup_get_limit(memcg) ? : 1;
 	for_each_mem_cgroup_tree(iter, memcg) {
 		struct css_task_iter it;
@@ -1329,7 +1330,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 
 	if (chosen) {
 		points = chosen_points * 1000 / totalpages;
-		oom_kill_process(&oc, chosen, points, totalpages, memcg,
+		oom_kill_process(&oc, chosen, points, totalpages,
 				 "Memory cgroup out of memory");
 	}
 unlock:
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index b95c4c101b35..b3424199069b 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -383,8 +383,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 	rcu_read_unlock();
 }
 
-static void dump_header(struct oom_control *oc, struct task_struct *p,
-			struct mem_cgroup *memcg)
+static void dump_header(struct oom_control *oc, struct task_struct *p)
 {
 	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
 		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
@@ -392,12 +391,12 @@ static void dump_header(struct oom_control *oc, struct task_struct *p,
 
 	cpuset_print_current_mems_allowed();
 	dump_stack();
-	if (memcg)
-		mem_cgroup_print_oom_info(memcg, p);
+	if (oc->memcg)
+		mem_cgroup_print_oom_info(oc->memcg, p);
 	else
 		show_mem(SHOW_MEM_FILTER_NODES);
 	if (sysctl_oom_dump_tasks)
-		dump_tasks(memcg, oc->nodemask);
+		dump_tasks(oc->memcg, oc->nodemask);
 }
 
 /*
@@ -748,7 +747,7 @@ void oom_killer_enable(void)
  */
 void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 		      unsigned int points, unsigned long totalpages,
-		      struct mem_cgroup *memcg, const char *message)
+		      const char *message)
 {
 	struct task_struct *victim = p;
 	struct task_struct *child;
@@ -774,7 +773,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 	task_unlock(p);
 
 	if (__ratelimit(&oom_rs))
-		dump_header(oc, p, memcg);
+		dump_header(oc, p);
 
 	pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
 		message, task_pid_nr(p), p->comm, points);
@@ -795,8 +794,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 			/*
 			 * oom_badness() returns 0 if the thread is unkillable
 			 */
-			child_points = oom_badness(child, memcg, oc->nodemask,
-								totalpages);
+			child_points = oom_badness(child,
+					oc->memcg, oc->nodemask, totalpages);
 			if (child_points > victim_points) {
 				put_task_struct(victim);
 				victim = child;
@@ -874,8 +873,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 /*
  * Determines whether the kernel must panic because of the panic_on_oom sysctl.
  */
-void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
-			struct mem_cgroup *memcg)
+void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint)
 {
 	if (likely(!sysctl_panic_on_oom))
 		return;
@@ -891,7 +889,7 @@ void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
 	/* Do not panic for oom kills triggered by sysrq */
 	if (is_sysrq_oom(oc))
 		return;
-	dump_header(oc, NULL, memcg);
+	dump_header(oc, NULL);
 	panic("Out of memory: %s panic_on_oom is enabled\n",
 		sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
 }
@@ -966,13 +964,13 @@ bool out_of_memory(struct oom_control *oc)
 	constraint = constrained_alloc(oc, &totalpages);
 	if (constraint != CONSTRAINT_MEMORY_POLICY)
 		oc->nodemask = NULL;
-	check_panic_on_oom(oc, constraint, NULL);
+	check_panic_on_oom(oc, constraint);
 
 	if (sysctl_oom_kill_allocating_task && current->mm &&
 	    !oom_unkillable_task(current, NULL, oc->nodemask) &&
 	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
 		get_task_struct(current);
-		oom_kill_process(oc, current, 0, totalpages, NULL,
+		oom_kill_process(oc, current, 0, totalpages,
 				 "Out of memory (oom_kill_allocating_task)");
 		return true;
 	}
@@ -980,12 +978,11 @@ bool out_of_memory(struct oom_control *oc)
 	p = select_bad_process(oc, &points, totalpages);
 	/* Found nothing?!?! Either we hang forever, or we panic. */
 	if (!p && !is_sysrq_oom(oc)) {
-		dump_header(oc, NULL, NULL);
+		dump_header(oc, NULL);
 		panic("Out of memory and no killable processes...\n");
 	}
 	if (p && p != (void *)-1UL) {
-		oom_kill_process(oc, p, points, totalpages, NULL,
-				 "Out of memory");
+		oom_kill_process(oc, p, points, totalpages, "Out of memory");
 		/*
 		 * Give the killed process a good chance to exit before trying
 		 * to allocate memory again.
@@ -1005,6 +1002,7 @@ void pagefault_out_of_memory(void)
 	struct oom_control oc = {
 		.zonelist = NULL,
 		.nodemask = NULL,
+		.memcg = NULL,
 		.gfp_mask = 0,
 		.order = 0,
 	};
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f21f56f88c8a..7da8310b86e9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3101,6 +3101,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	struct oom_control oc = {
 		.zonelist = ac->zonelist,
 		.nodemask = ac->nodemask,
+		.memcg = NULL,
 		.gfp_mask = gfp_mask,
 		.order = order,
 	};
-- 
2.1.4

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 2/2] mm: oom: deduplicate victim selection code for memcg and global oom
  2016-05-27 14:17 [PATCH 1/2] mm: oom: add memcg to oom_control Vladimir Davydov
@ 2016-05-27 14:17 ` Vladimir Davydov
  2016-05-27 14:26   ` Michal Hocko
                     ` (2 more replies)
  2016-05-27 14:34 ` [PATCH 1/2] mm: oom: add memcg to oom_control Michal Hocko
  2016-05-27 17:20 ` Johannes Weiner
  2 siblings, 3 replies; 14+ messages in thread
From: Vladimir Davydov @ 2016-05-27 14:17 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Michal Hocko, Tetsuo Handa, David Rientjes, Johannes Weiner,
	linux-mm, linux-kernel

When selecting an oom victim, we use the same heuristic for both memory
cgroup and global oom. The only difference is the scope of tasks to
select the victim from. So we could just export an iterator over all
memcg tasks and keep all oom related logic in oom_kill.c, but instead we
duplicate pieces of it in memcontrol.c reusing some initially private
functions of oom_kill.c in order to not duplicate all of it. That looks
ugly and error prone, because any modification of select_bad_process
should also be propagated to mem_cgroup_out_of_memory.

Let's rework this as follows: keep all oom heuristic related code
private to oom_kill.c and make oom_kill.c use exported memcg functions
when it's really necessary (like in case of iterating over memcg tasks).

Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
---
 include/linux/memcontrol.h |  15 ++++
 include/linux/oom.h        |  51 -------------
 mm/memcontrol.c            | 112 ++++++++++-----------------
 mm/oom_kill.c              | 183 +++++++++++++++++++++++++++++----------------
 4 files changed, 176 insertions(+), 185 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index a805474df4ab..021c49ebae21 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -324,6 +324,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
 				   struct mem_cgroup *,
 				   struct mem_cgroup_reclaim_cookie *);
 void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
+int mem_cgroup_scan_tasks(struct mem_cgroup *,
+			  int (*)(struct task_struct *, void *), void *);
 
 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
 {
@@ -417,6 +419,8 @@ unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 
 void mem_cgroup_handle_over_high(void);
 
+unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg);
+
 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
 				struct task_struct *p);
 
@@ -610,6 +614,12 @@ static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
 {
 }
 
+static inline int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
+		int (*fn)(struct task_struct *, void *), void *arg)
+{
+	return 0;
+}
+
 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
 {
 	return 0;
@@ -640,6 +650,11 @@ mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 	return 0;
 }
 
+static inline unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
+{
+	return 0;
+}
+
 static inline void
 mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 {
diff --git a/include/linux/oom.h b/include/linux/oom.h
index cbc24a5fe28d..5a72853c8fc0 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -36,23 +36,6 @@ struct oom_control {
 	const int order;
 };
 
-/*
- * Types of limitations to the nodes from which allocations may occur
- */
-enum oom_constraint {
-	CONSTRAINT_NONE,
-	CONSTRAINT_CPUSET,
-	CONSTRAINT_MEMORY_POLICY,
-	CONSTRAINT_MEMCG,
-};
-
-enum oom_scan_t {
-	OOM_SCAN_OK,		/* scan thread and find its badness */
-	OOM_SCAN_CONTINUE,	/* do not consider thread for oom kill */
-	OOM_SCAN_ABORT,		/* abort the iteration and return */
-	OOM_SCAN_SELECT,	/* always select this thread first */
-};
-
 extern struct mutex oom_lock;
 
 static inline void set_current_oom_origin(void)
@@ -70,8 +53,6 @@ static inline bool oom_task_origin(const struct task_struct *p)
 	return p->signal->oom_flag_origin;
 }
 
-extern void mark_oom_victim(struct task_struct *tsk);
-
 #ifdef CONFIG_MMU
 extern void try_oom_reaper(struct task_struct *tsk);
 #else
@@ -84,16 +65,6 @@ extern unsigned long oom_badness(struct task_struct *p,
 		struct mem_cgroup *memcg, const nodemask_t *nodemask,
 		unsigned long totalpages);
 
-extern void oom_kill_process(struct oom_control *oc, struct task_struct *p,
-			     unsigned int points, unsigned long totalpages,
-			     const char *message);
-
-extern void check_panic_on_oom(struct oom_control *oc,
-			       enum oom_constraint constraint);
-
-extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
-		struct task_struct *task, unsigned long totalpages);
-
 extern bool out_of_memory(struct oom_control *oc);
 
 extern void exit_oom_victim(struct task_struct *tsk);
@@ -107,28 +78,6 @@ extern void oom_killer_enable(void);
 
 extern struct task_struct *find_lock_task_mm(struct task_struct *p);
 
-static inline bool task_will_free_mem(struct task_struct *task)
-{
-	struct signal_struct *sig = task->signal;
-
-	/*
-	 * A coredumping process may sleep for an extended period in exit_mm(),
-	 * so the oom killer cannot assume that the process will promptly exit
-	 * and release memory.
-	 */
-	if (sig->flags & SIGNAL_GROUP_COREDUMP)
-		return false;
-
-	if (!(task->flags & PF_EXITING))
-		return false;
-
-	/* Make sure that the whole thread group is going down */
-	if (!thread_group_empty(task) && !(sig->flags & SIGNAL_GROUP_EXIT))
-		return false;
-
-	return true;
-}
-
 /* sysctls */
 extern int sysctl_oom_dump_tasks;
 extern int sysctl_oom_kill_allocating_task;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index eeb3b14de01a..6ad31795d231 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -944,6 +944,43 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
 	     iter = mem_cgroup_iter(NULL, iter, NULL))
 
 /**
+ * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
+ * @memcg: hierarchy root
+ * @fn: function to call for each task
+ * @arg: argument passed to @fn
+ *
+ * This function iterates over tasks attached to @memcg or to any of its
+ * descendants and calls @fn for each task. If @fn returns a non-zero
+ * value, the function breaks the iteration loop and returns the value.
+ * Otherwise, it will iterate over all tasks and return 0.
+ *
+ * This function must not be called for the root memory cgroup.
+ */
+int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
+			  int (*fn)(struct task_struct *, void *), void *arg)
+{
+	struct mem_cgroup *iter;
+	int ret = 0;
+
+	BUG_ON(memcg == root_mem_cgroup);
+
+	for_each_mem_cgroup_tree(iter, memcg) {
+		struct css_task_iter it;
+		struct task_struct *task;
+
+		css_task_iter_start(&iter->css, &it);
+		while (!ret && (task = css_task_iter_next(&it)))
+			ret = fn(task, arg);
+		css_task_iter_end(&it);
+		if (ret) {
+			mem_cgroup_iter_break(memcg, iter);
+			break;
+		}
+	}
+	return ret;
+}
+
+/**
  * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
  * @zone: zone of the wanted lruvec
  * @memcg: memcg of the wanted lruvec
@@ -1236,7 +1273,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
 /*
  * Return the memory (and swap, if configured) limit for a memcg.
  */
-static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
+unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
 {
 	unsigned long limit;
 
@@ -1263,79 +1300,12 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 		.gfp_mask = gfp_mask,
 		.order = order,
 	};
-	struct mem_cgroup *iter;
-	unsigned long chosen_points = 0;
-	unsigned long totalpages;
-	unsigned int points = 0;
-	struct task_struct *chosen = NULL;
+	bool ret;
 
 	mutex_lock(&oom_lock);
-
-	/*
-	 * If current has a pending SIGKILL or is exiting, then automatically
-	 * select it.  The goal is to allow it to allocate so that it may
-	 * quickly exit and free its memory.
-	 */
-	if (fatal_signal_pending(current) || task_will_free_mem(current)) {
-		mark_oom_victim(current);
-		try_oom_reaper(current);
-		goto unlock;
-	}
-
-	check_panic_on_oom(&oc, CONSTRAINT_MEMCG);
-	totalpages = mem_cgroup_get_limit(memcg) ? : 1;
-	for_each_mem_cgroup_tree(iter, memcg) {
-		struct css_task_iter it;
-		struct task_struct *task;
-
-		css_task_iter_start(&iter->css, &it);
-		while ((task = css_task_iter_next(&it))) {
-			switch (oom_scan_process_thread(&oc, task, totalpages)) {
-			case OOM_SCAN_SELECT:
-				if (chosen)
-					put_task_struct(chosen);
-				chosen = task;
-				chosen_points = ULONG_MAX;
-				get_task_struct(chosen);
-				/* fall through */
-			case OOM_SCAN_CONTINUE:
-				continue;
-			case OOM_SCAN_ABORT:
-				css_task_iter_end(&it);
-				mem_cgroup_iter_break(memcg, iter);
-				if (chosen)
-					put_task_struct(chosen);
-				/* Set a dummy value to return "true". */
-				chosen = (void *) 1;
-				goto unlock;
-			case OOM_SCAN_OK:
-				break;
-			};
-			points = oom_badness(task, memcg, NULL, totalpages);
-			if (!points || points < chosen_points)
-				continue;
-			/* Prefer thread group leaders for display purposes */
-			if (points == chosen_points &&
-			    thread_group_leader(chosen))
-				continue;
-
-			if (chosen)
-				put_task_struct(chosen);
-			chosen = task;
-			chosen_points = points;
-			get_task_struct(chosen);
-		}
-		css_task_iter_end(&it);
-	}
-
-	if (chosen) {
-		points = chosen_points * 1000 / totalpages;
-		oom_kill_process(&oc, chosen, points, totalpages,
-				 "Memory cgroup out of memory");
-	}
-unlock:
+	ret = out_of_memory(&oc);
 	mutex_unlock(&oom_lock);
-	return chosen;
+	return ret;
 }
 
 #if MAX_NUMNODES > 1
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index b3424199069b..a0a490b2c264 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -48,6 +48,16 @@ int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks = 1;
 
+/*
+ * Types of limitations to the nodes from which allocations may occur
+ */
+enum oom_constraint {
+	CONSTRAINT_NONE,
+	CONSTRAINT_CPUSET,
+	CONSTRAINT_MEMORY_POLICY,
+	CONSTRAINT_MEMCG,
+};
+
 DEFINE_MUTEX(oom_lock);
 
 #ifdef CONFIG_NUMA
@@ -98,6 +108,28 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
 }
 #endif /* CONFIG_NUMA */
 
+static bool task_will_free_mem(struct task_struct *task)
+{
+	struct signal_struct *sig = task->signal;
+
+	/*
+	 * A coredumping process may sleep for an extended period in exit_mm(),
+	 * so the oom killer cannot assume that the process will promptly exit
+	 * and release memory.
+	 */
+	if (sig->flags & SIGNAL_GROUP_COREDUMP)
+		return false;
+
+	if (!(task->flags & PF_EXITING))
+		return false;
+
+	/* Make sure that the whole thread group is going down */
+	if (!thread_group_empty(task) && !(sig->flags & SIGNAL_GROUP_EXIT))
+		return false;
+
+	return true;
+}
+
 /*
  * The process p may have detached its own ->mm while exiting or through
  * use_mm(), but one or more of its subthreads may still have a valid
@@ -214,7 +246,6 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 /*
  * Determine the type of allocation constraint.
  */
-#ifdef CONFIG_NUMA
 static enum oom_constraint constrained_alloc(struct oom_control *oc,
 					     unsigned long *totalpages)
 {
@@ -224,9 +255,17 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc,
 	bool cpuset_limited = false;
 	int nid;
 
+	if (oc->memcg) {
+		*totalpages = mem_cgroup_get_limit(oc->memcg) ?: 1;
+		return CONSTRAINT_MEMCG;
+	}
+
 	/* Default to all available memory */
 	*totalpages = totalram_pages + total_swap_pages;
 
+	if (!IS_ENABLED(CONFIG_NUMA))
+		return CONSTRAINT_NONE;
+
 	if (!oc->zonelist)
 		return CONSTRAINT_NONE;
 	/*
@@ -264,36 +303,77 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc,
 	}
 	return CONSTRAINT_NONE;
 }
-#else
-static enum oom_constraint constrained_alloc(struct oom_control *oc,
-					     unsigned long *totalpages)
+
+static void oom_scan_tasks(struct oom_control *oc,
+			   int (*fn)(struct task_struct *, void *), void *arg)
 {
-	*totalpages = totalram_pages + total_swap_pages;
-	return CONSTRAINT_NONE;
+	struct task_struct *p;
+
+	if (oc->memcg) {
+		mem_cgroup_scan_tasks(oc->memcg, fn, arg);
+		return;
+	}
+
+	rcu_read_lock();
+	for_each_process(p) {
+		if (fn(p, arg))
+			break;
+	}
+	rcu_read_unlock();
 }
-#endif
 
-enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
-			struct task_struct *task, unsigned long totalpages)
+struct oom_evaluate_task_arg {
+	struct oom_control *oc;
+	unsigned long totalpages;
+	struct task_struct *chosen;
+	unsigned long chosen_points;
+};
+
+static int oom_evaluate_task(struct task_struct *task, void *_arg)
 {
+	struct oom_evaluate_task_arg *arg = _arg;
+	struct oom_control *oc = arg->oc;
+	unsigned long totalpages = arg->totalpages;
+	unsigned long points;
+
 	if (oom_unkillable_task(task, NULL, oc->nodemask))
-		return OOM_SCAN_CONTINUE;
+		return 0;
 
 	/*
 	 * This task already has access to memory reserves and is being killed.
 	 * Don't allow any other task to have access to the reserves.
 	 */
-	if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims))
-		return OOM_SCAN_ABORT;
+	if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) {
+		if (arg->chosen)
+			put_task_struct(arg->chosen);
+		arg->chosen = (struct task_struct *)(-1UL);
+		return 1;
+	}
 
 	/*
 	 * If task is allocating a lot of memory and has been marked to be
 	 * killed first if it triggers an oom, then select it.
 	 */
-	if (oom_task_origin(task))
-		return OOM_SCAN_SELECT;
+	if (oom_task_origin(task)) {
+		points = ULONG_MAX;
+		goto select;
+	}
+
+	points = oom_badness(task, NULL, oc->nodemask, totalpages);
+	if (!points || points < arg->chosen_points)
+		return 0;
 
-	return OOM_SCAN_OK;
+	/* Prefer thread group leaders for display purposes */
+	if (points == arg->chosen_points &&
+	    thread_group_leader(arg->chosen))
+		return 0;
+select:
+	if (arg->chosen)
+		put_task_struct(arg->chosen);
+	get_task_struct(task);
+	arg->chosen = task;
+	arg->chosen_points = points;
+	return 0;
 }
 
 /*
@@ -303,40 +383,15 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
 static struct task_struct *select_bad_process(struct oom_control *oc,
 		unsigned int *ppoints, unsigned long totalpages)
 {
-	struct task_struct *p;
-	struct task_struct *chosen = NULL;
-	unsigned long chosen_points = 0;
-
-	rcu_read_lock();
-	for_each_process(p) {
-		unsigned int points;
-
-		switch (oom_scan_process_thread(oc, p, totalpages)) {
-		case OOM_SCAN_SELECT:
-			chosen = p;
-			chosen_points = ULONG_MAX;
-			/* fall through */
-		case OOM_SCAN_CONTINUE:
-			continue;
-		case OOM_SCAN_ABORT:
-			rcu_read_unlock();
-			return (struct task_struct *)(-1UL);
-		case OOM_SCAN_OK:
-			break;
-		};
-		points = oom_badness(p, NULL, oc->nodemask, totalpages);
-		if (!points || points < chosen_points)
-			continue;
+	struct oom_evaluate_task_arg arg = {
+		.oc = oc,
+		.totalpages = totalpages,
+	};
 
-		chosen = p;
-		chosen_points = points;
-	}
-	if (chosen)
-		get_task_struct(chosen);
-	rcu_read_unlock();
+	oom_scan_tasks(oc, oom_evaluate_task, &arg);
 
-	*ppoints = chosen_points * 1000 / totalpages;
-	return chosen;
+	*ppoints = arg.chosen_points * 1000 / totalpages;
+	return arg.chosen;
 }
 
 /**
@@ -674,7 +729,7 @@ static void wake_oom_reaper(struct task_struct *tsk)
  * Has to be called with oom_lock held and never after
  * oom has been disabled already.
  */
-void mark_oom_victim(struct task_struct *tsk)
+static void mark_oom_victim(struct task_struct *tsk)
 {
 	WARN_ON(oom_killer_disabled);
 	/* OOM killer might race with memcg OOM */
@@ -745,9 +800,8 @@ void oom_killer_enable(void)
  * Must be called while holding a reference to p, which will be released upon
  * returning.
  */
-void oom_kill_process(struct oom_control *oc, struct task_struct *p,
-		      unsigned int points, unsigned long totalpages,
-		      const char *message)
+static void oom_kill_process(struct oom_control *oc, struct task_struct *p,
+			     unsigned int points, unsigned long totalpages)
 {
 	struct task_struct *victim = p;
 	struct task_struct *child;
@@ -776,7 +830,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 		dump_header(oc, p);
 
 	pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
-		message, task_pid_nr(p), p->comm, points);
+	       oc->memcg ? "Memory cgroup out of memory" : "Out of memory",
+	       task_pid_nr(p), p->comm, points);
 
 	/*
 	 * If any of p's children has a different mm and is eligible for kill,
@@ -873,7 +928,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 /*
  * Determines whether the kernel must panic because of the panic_on_oom sysctl.
  */
-void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint)
+static void check_panic_on_oom(struct oom_control *oc,
+			       enum oom_constraint constraint)
 {
 	if (likely(!sysctl_panic_on_oom))
 		return;
@@ -928,10 +984,12 @@ bool out_of_memory(struct oom_control *oc)
 	if (oom_killer_disabled)
 		return false;
 
-	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
-	if (freed > 0)
-		/* Got some memory back in the last second. */
-		return true;
+	if (!oc->memcg) {
+		blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
+		if (freed > 0)
+			/* Got some memory back in the last second. */
+			return true;
+	}
 
 	/*
 	 * If current has a pending SIGKILL or is exiting, then automatically
@@ -959,7 +1017,7 @@ bool out_of_memory(struct oom_control *oc)
 
 	/*
 	 * Check if there were limitations on the allocation (only relevant for
-	 * NUMA) that may require different handling.
+	 * NUMA and memcg) that may require different handling.
 	 */
 	constraint = constrained_alloc(oc, &totalpages);
 	if (constraint != CONSTRAINT_MEMORY_POLICY)
@@ -970,26 +1028,25 @@ bool out_of_memory(struct oom_control *oc)
 	    !oom_unkillable_task(current, NULL, oc->nodemask) &&
 	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
 		get_task_struct(current);
-		oom_kill_process(oc, current, 0, totalpages,
-				 "Out of memory (oom_kill_allocating_task)");
+		oom_kill_process(oc, current, 0, totalpages);
 		return true;
 	}
 
 	p = select_bad_process(oc, &points, totalpages);
 	/* Found nothing?!?! Either we hang forever, or we panic. */
-	if (!p && !is_sysrq_oom(oc)) {
+	if (!p && !is_sysrq_oom(oc) && !oc->memcg) {
 		dump_header(oc, NULL);
 		panic("Out of memory and no killable processes...\n");
 	}
 	if (p && p != (void *)-1UL) {
-		oom_kill_process(oc, p, points, totalpages, "Out of memory");
+		oom_kill_process(oc, p, points, totalpages);
 		/*
 		 * Give the killed process a good chance to exit before trying
 		 * to allocate memory again.
 		 */
 		schedule_timeout_killable(1);
 	}
-	return true;
+	return !!p;
 }
 
 /*
-- 
2.1.4

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] mm: oom: deduplicate victim selection code for memcg and global oom
  2016-05-27 14:17 ` [PATCH 2/2] mm: oom: deduplicate victim selection code for memcg and global oom Vladimir Davydov
@ 2016-05-27 14:26   ` Michal Hocko
  2016-05-27 14:45     ` Vladimir Davydov
  2016-05-27 17:23   ` Johannes Weiner
  2016-06-08  8:33   ` Michal Hocko
  2 siblings, 1 reply; 14+ messages in thread
From: Michal Hocko @ 2016-05-27 14:26 UTC (permalink / raw)
  To: Vladimir Davydov
  Cc: Andrew Morton, Tetsuo Handa, David Rientjes, Johannes Weiner,
	linux-mm, linux-kernel

On Fri 27-05-16 17:17:42, Vladimir Davydov wrote:
> When selecting an oom victim, we use the same heuristic for both memory
> cgroup and global oom. The only difference is the scope of tasks to
> select the victim from. So we could just export an iterator over all
> memcg tasks and keep all oom related logic in oom_kill.c, but instead we
> duplicate pieces of it in memcontrol.c reusing some initially private
> functions of oom_kill.c in order to not duplicate all of it. That looks
> ugly and error prone, because any modification of select_bad_process
> should also be propagated to mem_cgroup_out_of_memory.
> 
> Let's rework this as follows: keep all oom heuristic related code
> private to oom_kill.c and make oom_kill.c use exported memcg functions
> when it's really necessary (like in case of iterating over memcg tasks).

I am doing quite large changes in this area and this would cause many
conflicts. Do you think you can postpone this after my patchset [1] gets
sorted out please?

I haven't looked at the patch carefully so I cannot tell much about it
right now but just wanted to give a heads up for the conflicts.

[1] http://lkml.kernel.org/r/1464266415-15558-1-git-send-email-mhocko@kernel.org

Thanks!

> 
> Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
> ---
>  include/linux/memcontrol.h |  15 ++++
>  include/linux/oom.h        |  51 -------------
>  mm/memcontrol.c            | 112 ++++++++++-----------------
>  mm/oom_kill.c              | 183 +++++++++++++++++++++++++++++----------------
>  4 files changed, 176 insertions(+), 185 deletions(-)
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] mm: oom: add memcg to oom_control
  2016-05-27 14:17 [PATCH 1/2] mm: oom: add memcg to oom_control Vladimir Davydov
  2016-05-27 14:17 ` [PATCH 2/2] mm: oom: deduplicate victim selection code for memcg and global oom Vladimir Davydov
@ 2016-05-27 14:34 ` Michal Hocko
  2016-05-27 17:20 ` Johannes Weiner
  2 siblings, 0 replies; 14+ messages in thread
From: Michal Hocko @ 2016-05-27 14:34 UTC (permalink / raw)
  To: Vladimir Davydov
  Cc: Andrew Morton, Tetsuo Handa, David Rientjes, Johannes Weiner,
	linux-mm, linux-kernel

On Fri 27-05-16 17:17:41, Vladimir Davydov wrote:
> It's a part of oom context just like allocation order and nodemask, so
> let's move it to oom_control instead of passing it in the argument list.

Don't remember why we haven't done that when the structure was adde.

> Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>

Acked-by: Michal Hocko <mhocko@suse.com>

> ---
>  drivers/tty/sysrq.c |  1 +
>  include/linux/oom.h |  8 +++++---
>  mm/memcontrol.c     |  5 +++--
>  mm/oom_kill.c       | 32 +++++++++++++++-----------------
>  mm/page_alloc.c     |  1 +
>  5 files changed, 25 insertions(+), 22 deletions(-)
> 
> diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
> index e5139402e7f8..52bbd27e93ae 100644
> --- a/drivers/tty/sysrq.c
> +++ b/drivers/tty/sysrq.c
> @@ -363,6 +363,7 @@ static void moom_callback(struct work_struct *ignored)
>  	struct oom_control oc = {
>  		.zonelist = node_zonelist(first_memory_node, gfp_mask),
>  		.nodemask = NULL,
> +		.memcg = NULL,
>  		.gfp_mask = gfp_mask,
>  		.order = -1,
>  	};
> diff --git a/include/linux/oom.h b/include/linux/oom.h
> index 83469522690a..cbc24a5fe28d 100644
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -23,6 +23,9 @@ struct oom_control {
>  	/* Used to determine mempolicy */
>  	nodemask_t *nodemask;
>  
> +	/* Memory cgroup in which oom is invoked, or NULL for global oom */
> +	struct mem_cgroup *memcg;
> +
>  	/* Used to determine cpuset and node locality requirement */
>  	const gfp_t gfp_mask;
>  
> @@ -83,11 +86,10 @@ extern unsigned long oom_badness(struct task_struct *p,
>  
>  extern void oom_kill_process(struct oom_control *oc, struct task_struct *p,
>  			     unsigned int points, unsigned long totalpages,
> -			     struct mem_cgroup *memcg, const char *message);
> +			     const char *message);
>  
>  extern void check_panic_on_oom(struct oom_control *oc,
> -			       enum oom_constraint constraint,
> -			       struct mem_cgroup *memcg);
> +			       enum oom_constraint constraint);
>  
>  extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
>  		struct task_struct *task, unsigned long totalpages);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 37ba604984c9..eeb3b14de01a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1259,6 +1259,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
>  	struct oom_control oc = {
>  		.zonelist = NULL,
>  		.nodemask = NULL,
> +		.memcg = memcg,
>  		.gfp_mask = gfp_mask,
>  		.order = order,
>  	};
> @@ -1281,7 +1282,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
>  		goto unlock;
>  	}
>  
> -	check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg);
> +	check_panic_on_oom(&oc, CONSTRAINT_MEMCG);
>  	totalpages = mem_cgroup_get_limit(memcg) ? : 1;
>  	for_each_mem_cgroup_tree(iter, memcg) {
>  		struct css_task_iter it;
> @@ -1329,7 +1330,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
>  
>  	if (chosen) {
>  		points = chosen_points * 1000 / totalpages;
> -		oom_kill_process(&oc, chosen, points, totalpages, memcg,
> +		oom_kill_process(&oc, chosen, points, totalpages,
>  				 "Memory cgroup out of memory");
>  	}
>  unlock:
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index b95c4c101b35..b3424199069b 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -383,8 +383,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
>  	rcu_read_unlock();
>  }
>  
> -static void dump_header(struct oom_control *oc, struct task_struct *p,
> -			struct mem_cgroup *memcg)
> +static void dump_header(struct oom_control *oc, struct task_struct *p)
>  {
>  	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
>  		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
> @@ -392,12 +391,12 @@ static void dump_header(struct oom_control *oc, struct task_struct *p,
>  
>  	cpuset_print_current_mems_allowed();
>  	dump_stack();
> -	if (memcg)
> -		mem_cgroup_print_oom_info(memcg, p);
> +	if (oc->memcg)
> +		mem_cgroup_print_oom_info(oc->memcg, p);
>  	else
>  		show_mem(SHOW_MEM_FILTER_NODES);
>  	if (sysctl_oom_dump_tasks)
> -		dump_tasks(memcg, oc->nodemask);
> +		dump_tasks(oc->memcg, oc->nodemask);
>  }
>  
>  /*
> @@ -748,7 +747,7 @@ void oom_killer_enable(void)
>   */
>  void oom_kill_process(struct oom_control *oc, struct task_struct *p,
>  		      unsigned int points, unsigned long totalpages,
> -		      struct mem_cgroup *memcg, const char *message)
> +		      const char *message)
>  {
>  	struct task_struct *victim = p;
>  	struct task_struct *child;
> @@ -774,7 +773,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
>  	task_unlock(p);
>  
>  	if (__ratelimit(&oom_rs))
> -		dump_header(oc, p, memcg);
> +		dump_header(oc, p);
>  
>  	pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
>  		message, task_pid_nr(p), p->comm, points);
> @@ -795,8 +794,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
>  			/*
>  			 * oom_badness() returns 0 if the thread is unkillable
>  			 */
> -			child_points = oom_badness(child, memcg, oc->nodemask,
> -								totalpages);
> +			child_points = oom_badness(child,
> +					oc->memcg, oc->nodemask, totalpages);
>  			if (child_points > victim_points) {
>  				put_task_struct(victim);
>  				victim = child;
> @@ -874,8 +873,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
>  /*
>   * Determines whether the kernel must panic because of the panic_on_oom sysctl.
>   */
> -void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
> -			struct mem_cgroup *memcg)
> +void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint)
>  {
>  	if (likely(!sysctl_panic_on_oom))
>  		return;
> @@ -891,7 +889,7 @@ void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
>  	/* Do not panic for oom kills triggered by sysrq */
>  	if (is_sysrq_oom(oc))
>  		return;
> -	dump_header(oc, NULL, memcg);
> +	dump_header(oc, NULL);
>  	panic("Out of memory: %s panic_on_oom is enabled\n",
>  		sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
>  }
> @@ -966,13 +964,13 @@ bool out_of_memory(struct oom_control *oc)
>  	constraint = constrained_alloc(oc, &totalpages);
>  	if (constraint != CONSTRAINT_MEMORY_POLICY)
>  		oc->nodemask = NULL;
> -	check_panic_on_oom(oc, constraint, NULL);
> +	check_panic_on_oom(oc, constraint);
>  
>  	if (sysctl_oom_kill_allocating_task && current->mm &&
>  	    !oom_unkillable_task(current, NULL, oc->nodemask) &&
>  	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
>  		get_task_struct(current);
> -		oom_kill_process(oc, current, 0, totalpages, NULL,
> +		oom_kill_process(oc, current, 0, totalpages,
>  				 "Out of memory (oom_kill_allocating_task)");
>  		return true;
>  	}
> @@ -980,12 +978,11 @@ bool out_of_memory(struct oom_control *oc)
>  	p = select_bad_process(oc, &points, totalpages);
>  	/* Found nothing?!?! Either we hang forever, or we panic. */
>  	if (!p && !is_sysrq_oom(oc)) {
> -		dump_header(oc, NULL, NULL);
> +		dump_header(oc, NULL);
>  		panic("Out of memory and no killable processes...\n");
>  	}
>  	if (p && p != (void *)-1UL) {
> -		oom_kill_process(oc, p, points, totalpages, NULL,
> -				 "Out of memory");
> +		oom_kill_process(oc, p, points, totalpages, "Out of memory");
>  		/*
>  		 * Give the killed process a good chance to exit before trying
>  		 * to allocate memory again.
> @@ -1005,6 +1002,7 @@ void pagefault_out_of_memory(void)
>  	struct oom_control oc = {
>  		.zonelist = NULL,
>  		.nodemask = NULL,
> +		.memcg = NULL,
>  		.gfp_mask = 0,
>  		.order = 0,
>  	};
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index f21f56f88c8a..7da8310b86e9 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3101,6 +3101,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
>  	struct oom_control oc = {
>  		.zonelist = ac->zonelist,
>  		.nodemask = ac->nodemask,
> +		.memcg = NULL,
>  		.gfp_mask = gfp_mask,
>  		.order = order,
>  	};
> -- 
> 2.1.4

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] mm: oom: deduplicate victim selection code for memcg and global oom
  2016-05-27 14:26   ` Michal Hocko
@ 2016-05-27 14:45     ` Vladimir Davydov
  2016-05-27 14:52       ` Michal Hocko
  0 siblings, 1 reply; 14+ messages in thread
From: Vladimir Davydov @ 2016-05-27 14:45 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Andrew Morton, Tetsuo Handa, David Rientjes, Johannes Weiner,
	linux-mm, linux-kernel

On Fri, May 27, 2016 at 04:26:26PM +0200, Michal Hocko wrote:
> On Fri 27-05-16 17:17:42, Vladimir Davydov wrote:
> > When selecting an oom victim, we use the same heuristic for both memory
> > cgroup and global oom. The only difference is the scope of tasks to
> > select the victim from. So we could just export an iterator over all
> > memcg tasks and keep all oom related logic in oom_kill.c, but instead we
> > duplicate pieces of it in memcontrol.c reusing some initially private
> > functions of oom_kill.c in order to not duplicate all of it. That looks
> > ugly and error prone, because any modification of select_bad_process
> > should also be propagated to mem_cgroup_out_of_memory.
> > 
> > Let's rework this as follows: keep all oom heuristic related code
> > private to oom_kill.c and make oom_kill.c use exported memcg functions
> > when it's really necessary (like in case of iterating over memcg tasks).
> 
> I am doing quite large changes in this area and this would cause many
> conflicts. Do you think you can postpone this after my patchset [1] gets
> sorted out please?

I'm fine with it.

> 
> I haven't looked at the patch carefully so I cannot tell much about it
> right now but just wanted to give a heads up for the conflicts.

I'd appreciate if you could take a look at this patch once time permits.

Thanks,
Vladimir

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] mm: oom: deduplicate victim selection code for memcg and global oom
  2016-05-27 14:45     ` Vladimir Davydov
@ 2016-05-27 14:52       ` Michal Hocko
  0 siblings, 0 replies; 14+ messages in thread
From: Michal Hocko @ 2016-05-27 14:52 UTC (permalink / raw)
  To: Vladimir Davydov
  Cc: Andrew Morton, Tetsuo Handa, David Rientjes, Johannes Weiner,
	linux-mm, linux-kernel

On Fri 27-05-16 17:45:49, Vladimir Davydov wrote:
> On Fri, May 27, 2016 at 04:26:26PM +0200, Michal Hocko wrote:
[...]
> > I am doing quite large changes in this area and this would cause many
> > conflicts. Do you think you can postpone this after my patchset [1] gets
> > sorted out please?
> 
> I'm fine with it.

Thanks!
 
> > I haven't looked at the patch carefully so I cannot tell much about it
> > right now but just wanted to give a heads up for the conflicts.
> 
> I'd appreciate if you could take a look at this patch once time permits.

Sure, I will try next week. It's been a long week and the brain is in
the weekend mode already...

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] mm: oom: add memcg to oom_control
  2016-05-27 14:17 [PATCH 1/2] mm: oom: add memcg to oom_control Vladimir Davydov
  2016-05-27 14:17 ` [PATCH 2/2] mm: oom: deduplicate victim selection code for memcg and global oom Vladimir Davydov
  2016-05-27 14:34 ` [PATCH 1/2] mm: oom: add memcg to oom_control Michal Hocko
@ 2016-05-27 17:20 ` Johannes Weiner
  2 siblings, 0 replies; 14+ messages in thread
From: Johannes Weiner @ 2016-05-27 17:20 UTC (permalink / raw)
  To: Vladimir Davydov
  Cc: Andrew Morton, Michal Hocko, Tetsuo Handa, David Rientjes,
	linux-mm, linux-kernel

On Fri, May 27, 2016 at 05:17:41PM +0300, Vladimir Davydov wrote:
> It's a part of oom context just like allocation order and nodemask, so
> let's move it to oom_control instead of passing it in the argument list.
> 
> Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>

Acked-by: Johannes Weiner <hannes@cmpxchg.org>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] mm: oom: deduplicate victim selection code for memcg and global oom
  2016-05-27 14:17 ` [PATCH 2/2] mm: oom: deduplicate victim selection code for memcg and global oom Vladimir Davydov
  2016-05-27 14:26   ` Michal Hocko
@ 2016-05-27 17:23   ` Johannes Weiner
  2016-06-08  8:33   ` Michal Hocko
  2 siblings, 0 replies; 14+ messages in thread
From: Johannes Weiner @ 2016-05-27 17:23 UTC (permalink / raw)
  To: Vladimir Davydov
  Cc: Andrew Morton, Michal Hocko, Tetsuo Handa, David Rientjes,
	linux-mm, linux-kernel

On Fri, May 27, 2016 at 05:17:42PM +0300, Vladimir Davydov wrote:
> When selecting an oom victim, we use the same heuristic for both memory
> cgroup and global oom. The only difference is the scope of tasks to
> select the victim from. So we could just export an iterator over all
> memcg tasks and keep all oom related logic in oom_kill.c, but instead we
> duplicate pieces of it in memcontrol.c reusing some initially private
> functions of oom_kill.c in order to not duplicate all of it. That looks
> ugly and error prone, because any modification of select_bad_process
> should also be propagated to mem_cgroup_out_of_memory.
> 
> Let's rework this as follows: keep all oom heuristic related code
> private to oom_kill.c and make oom_kill.c use exported memcg functions
> when it's really necessary (like in case of iterating over memcg tasks).
> 
> Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>

Cool work!

I'll do a full review after the rebase on top of Michal's stuff.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] mm: oom: deduplicate victim selection code for memcg and global oom
  2016-05-27 14:17 ` [PATCH 2/2] mm: oom: deduplicate victim selection code for memcg and global oom Vladimir Davydov
  2016-05-27 14:26   ` Michal Hocko
  2016-05-27 17:23   ` Johannes Weiner
@ 2016-06-08  8:33   ` Michal Hocko
  2016-06-08 11:18     ` Tetsuo Handa
  2016-06-08 13:52     ` Vladimir Davydov
  2 siblings, 2 replies; 14+ messages in thread
From: Michal Hocko @ 2016-06-08  8:33 UTC (permalink / raw)
  To: Vladimir Davydov
  Cc: Andrew Morton, Tetsuo Handa, David Rientjes, Johannes Weiner,
	linux-mm, linux-kernel

On Fri 27-05-16 17:17:42, Vladimir Davydov wrote:
[...]
> @@ -970,26 +1028,25 @@ bool out_of_memory(struct oom_control *oc)
>  	    !oom_unkillable_task(current, NULL, oc->nodemask) &&
>  	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
>  		get_task_struct(current);
> -		oom_kill_process(oc, current, 0, totalpages,
> -				 "Out of memory (oom_kill_allocating_task)");
> +		oom_kill_process(oc, current, 0, totalpages);
>  		return true;
>  	}

Do we really want to introduce sysctl_oom_kill_allocating_task to memcg
as well? The heuristic is quite dubious even for the global context IMHO
because it leads to a very random behavior.
  
>  	p = select_bad_process(oc, &points, totalpages);
>  	/* Found nothing?!?! Either we hang forever, or we panic. */
> -	if (!p && !is_sysrq_oom(oc)) {
> +	if (!p && !is_sysrq_oom(oc) && !oc->memcg) {
>  		dump_header(oc, NULL);
>  		panic("Out of memory and no killable processes...\n");
>  	}
>  	if (p && p != (void *)-1UL) {
> -		oom_kill_process(oc, p, points, totalpages, "Out of memory");
> +		oom_kill_process(oc, p, points, totalpages);
>  		/*
>  		 * Give the killed process a good chance to exit before trying
>  		 * to allocate memory again.
>  		 */
>  		schedule_timeout_killable(1);
>  	}
> -	return true;
> +	return !!p;
>  }

Now if you look at out_of_memory() the only shared "heuristic" with the
memcg part is the bypass for the exiting tasks. Plus both need the
oom_lock.
You have to special case oom notifiers, panic on no victim handling and
I guess the oom_kill_allocating task is not intentional either. So I
am not really sure this is an improvement. I even hate how we conflate
sysrq vs. regular global oom context together but my cleanup for that
has failed in the past.

The victim selection code can be reduced because it is basically
shared between the two, only the iterator differs. But I guess that
can be eliminated by a simple helper.
---
 include/linux/oom.h |  5 +++++
 mm/memcontrol.c     | 47 ++++++-----------------------------------
 mm/oom_kill.c       | 60 ++++++++++++++++++++++++++++-------------------------
 3 files changed, 43 insertions(+), 69 deletions(-)

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 606137b3b778..7b3eb253ba23 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -34,6 +34,9 @@ struct oom_control {
 	 * for display purposes.
 	 */
 	const int order;
+
+	struct task_struct *chosen;
+	unsigned long chosen_points;
 };
 
 /*
@@ -80,6 +83,8 @@ static inline void try_oom_reaper(struct task_struct *tsk)
 }
 #endif
 
+extern int oom_evaluate_task(struct oom_control *oc, struct task_struct *p,
+		unsigned long totalpages);
 extern unsigned long oom_badness(struct task_struct *p,
 		struct mem_cgroup *memcg, const nodemask_t *nodemask,
 		unsigned long totalpages);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 83a6a2b92301..9c51b4d11691 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1264,10 +1264,8 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 		.order = order,
 	};
 	struct mem_cgroup *iter;
-	unsigned long chosen_points = 0;
 	unsigned long totalpages;
 	unsigned int points = 0;
-	struct task_struct *chosen = NULL;
 
 	mutex_lock(&oom_lock);
 
@@ -1289,53 +1287,20 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 		struct task_struct *task;
 
 		css_task_iter_start(&iter->css, &it);
-		while ((task = css_task_iter_next(&it))) {
-			switch (oom_scan_process_thread(&oc, task)) {
-			case OOM_SCAN_SELECT:
-				if (chosen)
-					put_task_struct(chosen);
-				chosen = task;
-				chosen_points = ULONG_MAX;
-				get_task_struct(chosen);
-				/* fall through */
-			case OOM_SCAN_CONTINUE:
-				continue;
-			case OOM_SCAN_ABORT:
-				css_task_iter_end(&it);
-				mem_cgroup_iter_break(memcg, iter);
-				if (chosen)
-					put_task_struct(chosen);
-				/* Set a dummy value to return "true". */
-				chosen = (void *) 1;
-				goto unlock;
-			case OOM_SCAN_OK:
+		while ((task = css_task_iter_next(&it)))
+			if (!oom_evaluate_task(&oc, task, totalpages))
 				break;
-			};
-			points = oom_badness(task, memcg, NULL, totalpages);
-			if (!points || points < chosen_points)
-				continue;
-			/* Prefer thread group leaders for display purposes */
-			if (points == chosen_points &&
-			    thread_group_leader(chosen))
-				continue;
-
-			if (chosen)
-				put_task_struct(chosen);
-			chosen = task;
-			chosen_points = points;
-			get_task_struct(chosen);
-		}
 		css_task_iter_end(&it);
 	}
 
-	if (chosen) {
-		points = chosen_points * 1000 / totalpages;
-		oom_kill_process(&oc, chosen, points, totalpages,
+	if (oc.chosen) {
+		points = oc.chosen_points * 1000 / totalpages;
+		oom_kill_process(&oc, oc.chosen, points, totalpages,
 				 "Memory cgroup out of memory");
 	}
 unlock:
 	mutex_unlock(&oom_lock);
-	return chosen;
+	return oc.chosen;
 }
 
 #if MAX_NUMNODES > 1
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c11f8bdd0c12..bce3ea262110 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -296,6 +296,34 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
 	return OOM_SCAN_OK;
 }
 
+int oom_evaluate_task(struct oom_control *oc, struct task_struct *p, unsigned long totalpages)
+{
+	unsigned long points;
+
+	switch (oom_scan_process_thread(oc, p)) {
+	case OOM_SCAN_SELECT:
+		points = ULONG_MAX;
+		goto select_task;
+	case OOM_SCAN_CONTINUE:
+		return 1;
+	case OOM_SCAN_ABORT:
+		return 0;
+	case OOM_SCAN_OK:
+		break;
+	};
+	points = oom_badness(p, oc->memcg, oc->nodemask, totalpages);
+	if (points || points < oc->chosen_points)
+		return 1;
+
+select_task:
+	if (oc->chosen)
+		put_task_struct(oc->chosen);
+	get_task_struct(p);
+	oc->chosen = p;
+	oc->chosen_points = points;
+	return 1;
+}
+
 /*
  * Simple selection loop. We chose the process with the highest
  * number of 'points'.  Returns -1 on scan abort.
@@ -304,39 +332,15 @@ static struct task_struct *select_bad_process(struct oom_control *oc,
 		unsigned int *ppoints, unsigned long totalpages)
 {
 	struct task_struct *p;
-	struct task_struct *chosen = NULL;
-	unsigned long chosen_points = 0;
 
 	rcu_read_lock();
-	for_each_process(p) {
-		unsigned int points;
-
-		switch (oom_scan_process_thread(oc, p)) {
-		case OOM_SCAN_SELECT:
-			chosen = p;
-			chosen_points = ULONG_MAX;
-			/* fall through */
-		case OOM_SCAN_CONTINUE:
-			continue;
-		case OOM_SCAN_ABORT:
-			rcu_read_unlock();
-			return (struct task_struct *)(-1UL);
-		case OOM_SCAN_OK:
+	for_each_process(p)
+		if (!oom_evaluate_task(oc, p, totalpages))
 			break;
-		};
-		points = oom_badness(p, NULL, oc->nodemask, totalpages);
-		if (!points || points < chosen_points)
-			continue;
-
-		chosen = p;
-		chosen_points = points;
-	}
-	if (chosen)
-		get_task_struct(chosen);
 	rcu_read_unlock();
 
-	*ppoints = chosen_points * 1000 / totalpages;
-	return chosen;
+	*ppoints = oc->chosen_points * 1000 / totalpages;
+	return oc->chosen;
 }
 
 /**

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] mm: oom: deduplicate victim selection code for memcg and global oom
  2016-06-08  8:33   ` Michal Hocko
@ 2016-06-08 11:18     ` Tetsuo Handa
  2016-06-08 11:24       ` Tetsuo Handa
  2016-06-08 14:21       ` Michal Hocko
  2016-06-08 13:52     ` Vladimir Davydov
  1 sibling, 2 replies; 14+ messages in thread
From: Tetsuo Handa @ 2016-06-08 11:18 UTC (permalink / raw)
  To: mhocko, vdavydov; +Cc: akpm, rientjes, hannes, linux-mm, linux-kernel

Michal Hocko wrote:
> The victim selection code can be reduced because it is basically
> shared between the two, only the iterator differs. But I guess that
> can be eliminated by a simple helper.

Thank you for CC: me. I like this clean up.

> ---
>  include/linux/oom.h |  5 +++++
>  mm/memcontrol.c     | 47 ++++++-----------------------------------
>  mm/oom_kill.c       | 60 ++++++++++++++++++++++++++++-------------------------
>  3 files changed, 43 insertions(+), 69 deletions(-)

I think we can apply your version with below changes folded into your version.
(I think totalpages argument can be passed via oom_control as well. Also, according to
http://lkml.kernel.org/r/201602192336.EJF90671.HMFLFSVOFJOtOQ@I-love.SAKURA.ne.jp ,
we can safely replace oc->memcg in oom_badness() in oom_evaluate_task() with NULL. )

 include/linux/oom.h |   10 ----------
 mm/memcontrol.c     |    7 +++++--
 mm/oom_kill.c       |   14 ++++++++++++--
 3 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 7b3eb25..77e98a0 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -49,13 +49,6 @@ enum oom_constraint {
 	CONSTRAINT_MEMCG,
 };
 
-enum oom_scan_t {
-	OOM_SCAN_OK,		/* scan thread and find its badness */
-	OOM_SCAN_CONTINUE,	/* do not consider thread for oom kill */
-	OOM_SCAN_ABORT,		/* abort the iteration and return */
-	OOM_SCAN_SELECT,	/* always select this thread first */
-};
-
 extern struct mutex oom_lock;
 
 static inline void set_current_oom_origin(void)
@@ -96,9 +89,6 @@ extern void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 extern void check_panic_on_oom(struct oom_control *oc,
 			       enum oom_constraint constraint);
 
-extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
-					       struct task_struct *task);
-
 extern bool out_of_memory(struct oom_control *oc);
 
 extern void exit_oom_victim(struct task_struct *tsk);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9c51b4d..f3482a2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1288,12 +1288,15 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 
 		css_task_iter_start(&iter->css, &it);
 		while ((task = css_task_iter_next(&it)))
-			if (!oom_evaluate_task(&oc, task, totalpages))
+			if (!oom_evaluate_task(&oc, task, totalpages)) {
+				css_task_iter_end(&it);
+				mem_cgroup_iter_break(memcg, iter);
 				break;
+			}
 		css_task_iter_end(&it);
 	}
 
-	if (oc.chosen) {
+	if (oc.chosen && oc.chosen != (void *) -1UL) {
 		points = oc.chosen_points * 1000 / totalpages;
 		oom_kill_process(&oc, oc.chosen, points, totalpages,
 				 "Memory cgroup out of memory");
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index bce3ea2..f634bca 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -273,8 +273,15 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc,
 }
 #endif
 
-enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
-					struct task_struct *task)
+enum oom_scan_t {
+	OOM_SCAN_OK,		/* scan thread and find its badness */
+	OOM_SCAN_CONTINUE,	/* do not consider thread for oom kill */
+	OOM_SCAN_ABORT,		/* abort the iteration and return */
+	OOM_SCAN_SELECT,	/* always select this thread first */
+};
+
+static enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
+					       struct task_struct *task)
 {
 	if (oom_unkillable_task(task, NULL, oc->nodemask))
 		return OOM_SCAN_CONTINUE;
@@ -307,6 +314,9 @@ int oom_evaluate_task(struct oom_control *oc, struct task_struct *p, unsigned lo
 	case OOM_SCAN_CONTINUE:
 		return 1;
 	case OOM_SCAN_ABORT:
+		if (oc->chosen)
+			put_task_struct(oc->chosen);
+		oc->chosen = (void *) -1UL;
 		return 0;
 	case OOM_SCAN_OK:
 		break;

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] mm: oom: deduplicate victim selection code for memcg and global oom
  2016-06-08 11:18     ` Tetsuo Handa
@ 2016-06-08 11:24       ` Tetsuo Handa
  2016-06-08 14:21       ` Michal Hocko
  1 sibling, 0 replies; 14+ messages in thread
From: Tetsuo Handa @ 2016-06-08 11:24 UTC (permalink / raw)
  To: mhocko, vdavydov; +Cc: akpm, rientjes, hannes, linux-mm, linux-kernel

Tetsuo Handa wrote:
> Michal Hocko wrote:
> > The victim selection code can be reduced because it is basically
> > shared between the two, only the iterator differs. But I guess that
> > can be eliminated by a simple helper.
> 
> Thank you for CC: me. I like this clean up.
> 
> > ---
> >  include/linux/oom.h |  5 +++++
> >  mm/memcontrol.c     | 47 ++++++-----------------------------------
> >  mm/oom_kill.c       | 60 ++++++++++++++++++++++++++++-------------------------
> >  3 files changed, 43 insertions(+), 69 deletions(-)
> 
> I think we can apply your version with below changes folded into your version.
> (I think totalpages argument can be passed via oom_control as well. Also, according to
> http://lkml.kernel.org/r/201602192336.EJF90671.HMFLFSVOFJOtOQ@I-love.SAKURA.ne.jp ,
> we can safely replace oc->memcg in oom_badness() in oom_evaluate_task() with NULL. )
> 
>  include/linux/oom.h |   10 ----------
>  mm/memcontrol.c     |    7 +++++--
>  mm/oom_kill.c       |   14 ++++++++++++--
>  3 files changed, 17 insertions(+), 14 deletions(-)
> 
> diff --git a/include/linux/oom.h b/include/linux/oom.h
> index 7b3eb25..77e98a0 100644
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -49,13 +49,6 @@ enum oom_constraint {
>  	CONSTRAINT_MEMCG,
>  };
>  
> -enum oom_scan_t {
> -	OOM_SCAN_OK,		/* scan thread and find its badness */
> -	OOM_SCAN_CONTINUE,	/* do not consider thread for oom kill */
> -	OOM_SCAN_ABORT,		/* abort the iteration and return */
> -	OOM_SCAN_SELECT,	/* always select this thread first */
> -};
> -
>  extern struct mutex oom_lock;
>  
>  static inline void set_current_oom_origin(void)
> @@ -96,9 +89,6 @@ extern void oom_kill_process(struct oom_control *oc, struct task_struct *p,
>  extern void check_panic_on_oom(struct oom_control *oc,
>  			       enum oom_constraint constraint);
>  
> -extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
> -					       struct task_struct *task);
> -
>  extern bool out_of_memory(struct oom_control *oc);
>  
>  extern void exit_oom_victim(struct task_struct *tsk);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 9c51b4d..f3482a2 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1288,12 +1288,15 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
>  
>  		css_task_iter_start(&iter->css, &it);
>  		while ((task = css_task_iter_next(&it)))
> -			if (!oom_evaluate_task(&oc, task, totalpages))
> +			if (!oom_evaluate_task(&oc, task, totalpages)) {
> +				css_task_iter_end(&it);

Oops. Duplicated css_task_iter_end() calls. If it is safe to reverse ordering of
css_task_iter_end(&it) and mem_cgroup_iter_break(memcg, iter), removing this
css_task_iter_end(&it) line is the simplest fix.

> +				mem_cgroup_iter_break(memcg, iter);
>  				break;
> +			}
>  		css_task_iter_end(&it);
>  	}
>  
> -	if (oc.chosen) {
> +	if (oc.chosen && oc.chosen != (void *) -1UL) {
>  		points = oc.chosen_points * 1000 / totalpages;
>  		oom_kill_process(&oc, oc.chosen, points, totalpages,
>  				 "Memory cgroup out of memory");
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index bce3ea2..f634bca 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -273,8 +273,15 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc,
>  }
>  #endif
>  
> -enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
> -					struct task_struct *task)
> +enum oom_scan_t {
> +	OOM_SCAN_OK,		/* scan thread and find its badness */
> +	OOM_SCAN_CONTINUE,	/* do not consider thread for oom kill */
> +	OOM_SCAN_ABORT,		/* abort the iteration and return */
> +	OOM_SCAN_SELECT,	/* always select this thread first */
> +};
> +
> +static enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
> +					       struct task_struct *task)
>  {
>  	if (oom_unkillable_task(task, NULL, oc->nodemask))
>  		return OOM_SCAN_CONTINUE;
> @@ -307,6 +314,9 @@ int oom_evaluate_task(struct oom_control *oc, struct task_struct *p, unsigned lo
>  	case OOM_SCAN_CONTINUE:
>  		return 1;
>  	case OOM_SCAN_ABORT:
> +		if (oc->chosen)
> +			put_task_struct(oc->chosen);
> +		oc->chosen = (void *) -1UL;
>  		return 0;
>  	case OOM_SCAN_OK:
>  		break;
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] mm: oom: deduplicate victim selection code for memcg and global oom
  2016-06-08  8:33   ` Michal Hocko
  2016-06-08 11:18     ` Tetsuo Handa
@ 2016-06-08 13:52     ` Vladimir Davydov
  2016-06-08 14:46       ` Michal Hocko
  1 sibling, 1 reply; 14+ messages in thread
From: Vladimir Davydov @ 2016-06-08 13:52 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Andrew Morton, Tetsuo Handa, David Rientjes, Johannes Weiner,
	linux-mm, linux-kernel

On Wed, Jun 08, 2016 at 10:33:34AM +0200, Michal Hocko wrote:
> On Fri 27-05-16 17:17:42, Vladimir Davydov wrote:
> [...]
> > @@ -970,26 +1028,25 @@ bool out_of_memory(struct oom_control *oc)
> >  	    !oom_unkillable_task(current, NULL, oc->nodemask) &&
> >  	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
> >  		get_task_struct(current);
> > -		oom_kill_process(oc, current, 0, totalpages,
> > -				 "Out of memory (oom_kill_allocating_task)");
> > +		oom_kill_process(oc, current, 0, totalpages);
> >  		return true;
> >  	}
> 
> Do we really want to introduce sysctl_oom_kill_allocating_task to memcg
> as well?

Not sure, but why not? We take into account dump_tasks and panic_on_oom
on memcg oom so why should we treat this sysctl differently?

> The heuristic is quite dubious even for the global context IMHO
> because it leads to a very random behavior.
>   
> >  	p = select_bad_process(oc, &points, totalpages);
> >  	/* Found nothing?!?! Either we hang forever, or we panic. */
> > -	if (!p && !is_sysrq_oom(oc)) {
> > +	if (!p && !is_sysrq_oom(oc) && !oc->memcg) {
> >  		dump_header(oc, NULL);
> >  		panic("Out of memory and no killable processes...\n");
> >  	}
> >  	if (p && p != (void *)-1UL) {
> > -		oom_kill_process(oc, p, points, totalpages, "Out of memory");
> > +		oom_kill_process(oc, p, points, totalpages);
> >  		/*
> >  		 * Give the killed process a good chance to exit before trying
> >  		 * to allocate memory again.
> >  		 */
> >  		schedule_timeout_killable(1);
> >  	}
> > -	return true;
> > +	return !!p;
> >  }
> 
> Now if you look at out_of_memory() the only shared "heuristic" with the
> memcg part is the bypass for the exiting tasks.

bypass exiting task (task_will_free_mem)
check for panic (check_panic_on_oom)
oom badness evaluation (oom_scan_process_thread or oom_evaluate_task
after your patch)
points calculation + kill (oom_kill_process)

And if you need to modify any of these function calls or add yet another
check, you have to do it twice. Ugly.

> Plus both need the oom_lock.

I believe locking could be unified for global/memcg oom cases too.

> You have to special case oom notifiers, panic on no victim handling and
> I guess the oom_kill_allocating task is not intentional either. So I
> am not really sure this is an improvement. I even hate how we conflate
> sysrq vs. regular global oom context together but my cleanup for that
> has failed in the past.
> 
> The victim selection code can be reduced because it is basically
> shared between the two, only the iterator differs. But I guess that
> can be eliminated by a simple helper.

IMHO exporting a bunch of very oom-specific helpers (like those I
enumerated above), partially revealing oom implementation, instead of
well defined memcg helpers that could be reused anywhere else looks
ugly. It's like having shrink_zone implementation both in vmscan.c and
memcontrol.c with shrink_slab, shrink_lruvec, etc. exported, because we
need to iterate over cgroups there.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] mm: oom: deduplicate victim selection code for memcg and global oom
  2016-06-08 11:18     ` Tetsuo Handa
  2016-06-08 11:24       ` Tetsuo Handa
@ 2016-06-08 14:21       ` Michal Hocko
  1 sibling, 0 replies; 14+ messages in thread
From: Michal Hocko @ 2016-06-08 14:21 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: vdavydov, akpm, rientjes, hannes, linux-mm, linux-kernel

On Wed 08-06-16 20:18:24, Tetsuo Handa wrote:
> Michal Hocko wrote:
> > The victim selection code can be reduced because it is basically
> > shared between the two, only the iterator differs. But I guess that
> > can be eliminated by a simple helper.
> 
> Thank you for CC: me. I like this clean up.
> 
> > ---
> >  include/linux/oom.h |  5 +++++
> >  mm/memcontrol.c     | 47 ++++++-----------------------------------
> >  mm/oom_kill.c       | 60 ++++++++++++++++++++++++++++-------------------------
> >  3 files changed, 43 insertions(+), 69 deletions(-)
> 
> I think we can apply your version with below changes folded into your version.
> (I think totalpages argument can be passed via oom_control as well. Also, according to
> http://lkml.kernel.org/r/201602192336.EJF90671.HMFLFSVOFJOtOQ@I-love.SAKURA.ne.jp ,
> we can safely replace oc->memcg in oom_badness() in oom_evaluate_task() with NULL. )

yes oom_badness can never see a task from outside of the memcg
hierarchy.

[...]
> +static enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
> +					       struct task_struct *task)
>  {
>  	if (oom_unkillable_task(task, NULL, oc->nodemask))
>  		return OOM_SCAN_CONTINUE;
> @@ -307,6 +314,9 @@ int oom_evaluate_task(struct oom_control *oc, struct task_struct *p, unsigned lo
>  	case OOM_SCAN_CONTINUE:
>  		return 1;
>  	case OOM_SCAN_ABORT:
> +		if (oc->chosen)
> +			put_task_struct(oc->chosen);
> +		oc->chosen = (void *) -1UL;

true including the memcg fixup.

>  		return 0;
>  	case OOM_SCAN_OK:
>  		break;

Thanks! I've updated the patch locally but I will wait for Vladimir what
he thinks about this wrt. the original approach.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/2] mm: oom: deduplicate victim selection code for memcg and global oom
  2016-06-08 13:52     ` Vladimir Davydov
@ 2016-06-08 14:46       ` Michal Hocko
  0 siblings, 0 replies; 14+ messages in thread
From: Michal Hocko @ 2016-06-08 14:46 UTC (permalink / raw)
  To: Vladimir Davydov
  Cc: Andrew Morton, Tetsuo Handa, David Rientjes, Johannes Weiner,
	linux-mm, linux-kernel

On Wed 08-06-16 16:52:04, Vladimir Davydov wrote:
> On Wed, Jun 08, 2016 at 10:33:34AM +0200, Michal Hocko wrote:
> > On Fri 27-05-16 17:17:42, Vladimir Davydov wrote:
> > [...]
> > > @@ -970,26 +1028,25 @@ bool out_of_memory(struct oom_control *oc)
> > >  	    !oom_unkillable_task(current, NULL, oc->nodemask) &&
> > >  	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
> > >  		get_task_struct(current);
> > > -		oom_kill_process(oc, current, 0, totalpages,
> > > -				 "Out of memory (oom_kill_allocating_task)");
> > > +		oom_kill_process(oc, current, 0, totalpages);
> > >  		return true;
> > >  	}
> > 
> > Do we really want to introduce sysctl_oom_kill_allocating_task to memcg
> > as well?
> 
> Not sure, but why not? We take into account dump_tasks and panic_on_oom
> on memcg oom so why should we treat this sysctl differently?

Well, for one thing nobody has requested that and it would be a user
visible change which might be unexpected. And as already said I think it
was a mistake to introduce this sysctl in the first place. The behavior
is so random that I am even not sure it is usable in the real life.
Spreading it more doesn't sound like a good idea to me.

[...]

> > Now if you look at out_of_memory() the only shared "heuristic" with the
> > memcg part is the bypass for the exiting tasks.
> 
> bypass exiting task (task_will_free_mem)
> check for panic (check_panic_on_oom)
> oom badness evaluation (oom_scan_process_thread or oom_evaluate_task
> after your patch)
> points calculation + kill (oom_kill_process)
> 
> And if you need to modify any of these function calls or add yet another
> check, you have to do it twice. Ugly.

Ideally all those changes would happen inside those helpers. Also if you
look at out_of_memory and mem_cgroup_out_of_memory it is much easier to
follow the later one because it doesn't have that different combinations
of heuristic which only make sense for sysrq or global oom.

> > Plus both need the oom_lock.
> 
> I believe locking could be unified for global/memcg oom cases too.
> 
> > You have to special case oom notifiers, panic on no victim handling and
> > I guess the oom_kill_allocating task is not intentional either. So I
> > am not really sure this is an improvement. I even hate how we conflate
> > sysrq vs. regular global oom context together but my cleanup for that
> > has failed in the past.
> > 
> > The victim selection code can be reduced because it is basically
> > shared between the two, only the iterator differs. But I guess that
> > can be eliminated by a simple helper.
> 
> IMHO exporting a bunch of very oom-specific helpers (like those I
> enumerated above), partially revealing oom implementation, instead of
> well defined memcg helpers that could be reused anywhere else looks
> ugly. It's like having shrink_zone implementation both in vmscan.c and
> memcontrol.c with shrink_slab, shrink_lruvec, etc. exported, because we
> need to iterate over cgroups there.

I agree that the API for OOM killer parts is not really great. I am just
little bit afraid that iterators are just over engineered. I am even not
sure whethers those have any other potential users. The diffstat of the
cleanup I have here right now sounds really encouranging.
---
 include/linux/oom.h | 17 ++++-------
 mm/memcontrol.c     | 48 +++--------------------------
 mm/oom_kill.c       | 87 ++++++++++++++++++++++++++++++-----------------------
 3 files changed, 60 insertions(+), 92 deletions(-)

compared to yours
 include/linux/memcontrol.h |  15 ++++
 include/linux/oom.h        |  51 -------------
 mm/memcontrol.c            | 112 ++++++++++-----------------
 mm/oom_kill.c              | 183 +++++++++++++++++++++++++++++----------------
 4 files changed, 176 insertions(+), 185 deletions(-)

we save more LOC with a smaller patch. I know this is not an absolute
metric but I would rather go with simplicity than an elaborate
APIs. This is all pretty much mm/memcg internal.

Anyway I do not have strong opinion and will not insist. I can post
the full cleanup with suggestions from Tetsuo integrated if you are
interested.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2016-06-08 14:46 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-05-27 14:17 [PATCH 1/2] mm: oom: add memcg to oom_control Vladimir Davydov
2016-05-27 14:17 ` [PATCH 2/2] mm: oom: deduplicate victim selection code for memcg and global oom Vladimir Davydov
2016-05-27 14:26   ` Michal Hocko
2016-05-27 14:45     ` Vladimir Davydov
2016-05-27 14:52       ` Michal Hocko
2016-05-27 17:23   ` Johannes Weiner
2016-06-08  8:33   ` Michal Hocko
2016-06-08 11:18     ` Tetsuo Handa
2016-06-08 11:24       ` Tetsuo Handa
2016-06-08 14:21       ` Michal Hocko
2016-06-08 13:52     ` Vladimir Davydov
2016-06-08 14:46       ` Michal Hocko
2016-05-27 14:34 ` [PATCH 1/2] mm: oom: add memcg to oom_control Michal Hocko
2016-05-27 17:20 ` Johannes Weiner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).