linux-mm.kvack.org archive mirror
* [v6 1/4] mm, oom: refactor the oom_kill_process() function
@ 2017-08-23 16:51 Roman Gushchin
  2017-08-23 16:51 ` [v6 0/4] cgroup-aware OOM killer Roman Gushchin
                   ` (4 more replies)
  0 siblings, 5 replies; 26+ messages in thread
From: Roman Gushchin @ 2017-08-23 16:51 UTC (permalink / raw)
  To: linux-mm
  Cc: Roman Gushchin, Michal Hocko, Vladimir Davydov, Johannes Weiner,
	Tetsuo Handa, David Rientjes, Tejun Heo, kernel-team, cgroups,
	linux-doc, linux-kernel

The oom_kill_process() function consists of two logical parts:
the first one is responsible for considering task's children as
a potential victim and printing the debug information.
The second half is responsible for sending SIGKILL to all
tasks sharing the mm struct with the given victim.

This commit splits the oom_kill_process() function with
an intention to re-use the second half: __oom_kill_process().

The cgroup-aware OOM killer will kill multiple tasks
belonging to the victim cgroup. We don't need to print
the debug information for each task, as well as play
with task selection (considering task's children),
so we can't use the existing oom_kill_process().

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: kernel-team@fb.com
Cc: cgroups@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org
---
 mm/oom_kill.c | 123 +++++++++++++++++++++++++++++++---------------------------
 1 file changed, 65 insertions(+), 58 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 53b44425ef35..5c29a3dd591b 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -817,67 +817,12 @@ static bool task_will_free_mem(struct task_struct *task)
 	return ret;
 }
 
-static void oom_kill_process(struct oom_control *oc, const char *message)
+static void __oom_kill_process(struct task_struct *victim)
 {
-	struct task_struct *p = oc->chosen;
-	unsigned int points = oc->chosen_points;
-	struct task_struct *victim = p;
-	struct task_struct *child;
-	struct task_struct *t;
+	struct task_struct *p;
 	struct mm_struct *mm;
-	unsigned int victim_points = 0;
-	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
-					      DEFAULT_RATELIMIT_BURST);
 	bool can_oom_reap = true;
 
-	/*
-	 * If the task is already exiting, don't alarm the sysadmin or kill
-	 * its children or threads, just set TIF_MEMDIE so it can die quickly
-	 */
-	task_lock(p);
-	if (task_will_free_mem(p)) {
-		mark_oom_victim(p);
-		wake_oom_reaper(p);
-		task_unlock(p);
-		put_task_struct(p);
-		return;
-	}
-	task_unlock(p);
-
-	if (__ratelimit(&oom_rs))
-		dump_header(oc, p);
-
-	pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
-		message, task_pid_nr(p), p->comm, points);
-
-	/*
-	 * If any of p's children has a different mm and is eligible for kill,
-	 * the one with the highest oom_badness() score is sacrificed for its
-	 * parent.  This attempts to lose the minimal amount of work done while
-	 * still freeing memory.
-	 */
-	read_lock(&tasklist_lock);
-	for_each_thread(p, t) {
-		list_for_each_entry(child, &t->children, sibling) {
-			unsigned int child_points;
-
-			if (process_shares_mm(child, p->mm))
-				continue;
-			/*
-			 * oom_badness() returns 0 if the thread is unkillable
-			 */
-			child_points = oom_badness(child,
-				oc->memcg, oc->nodemask, oc->totalpages);
-			if (child_points > victim_points) {
-				put_task_struct(victim);
-				victim = child;
-				victim_points = child_points;
-				get_task_struct(victim);
-			}
-		}
-	}
-	read_unlock(&tasklist_lock);
-
 	p = find_lock_task_mm(victim);
 	if (!p) {
 		put_task_struct(victim);
@@ -947,10 +892,72 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 		wake_oom_reaper(victim);
 
 	mmdrop(mm);
-	put_task_struct(victim);
 }
 #undef K
 
+static void oom_kill_process(struct oom_control *oc, const char *message)
+{
+	struct task_struct *p = oc->chosen;
+	unsigned int points = oc->chosen_points;
+	struct task_struct *victim = p;
+	struct task_struct *child;
+	struct task_struct *t;
+	unsigned int victim_points = 0;
+	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
+					      DEFAULT_RATELIMIT_BURST);
+
+	/*
+	 * If the task is already exiting, don't alarm the sysadmin or kill
+	 * its children or threads, just set TIF_MEMDIE so it can die quickly
+	 */
+	task_lock(p);
+	if (task_will_free_mem(p)) {
+		mark_oom_victim(p);
+		wake_oom_reaper(p);
+		task_unlock(p);
+		put_task_struct(p);
+		return;
+	}
+	task_unlock(p);
+
+	if (__ratelimit(&oom_rs))
+		dump_header(oc, p);
+
+	pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
+		message, task_pid_nr(p), p->comm, points);
+
+	/*
+	 * If any of p's children has a different mm and is eligible for kill,
+	 * the one with the highest oom_badness() score is sacrificed for its
+	 * parent.  This attempts to lose the minimal amount of work done while
+	 * still freeing memory.
+	 */
+	read_lock(&tasklist_lock);
+	for_each_thread(p, t) {
+		list_for_each_entry(child, &t->children, sibling) {
+			unsigned int child_points;
+
+			if (process_shares_mm(child, p->mm))
+				continue;
+			/*
+			 * oom_badness() returns 0 if the thread is unkillable
+			 */
+			child_points = oom_badness(child,
+				oc->memcg, oc->nodemask, oc->totalpages);
+			if (child_points > victim_points) {
+				put_task_struct(victim);
+				victim = child;
+				victim_points = child_points;
+				get_task_struct(victim);
+			}
+		}
+	}
+	read_unlock(&tasklist_lock);
+
+	__oom_kill_process(victim);
+	put_task_struct(victim);
+}
+
 /*
  * Determines whether the kernel must panic because of the panic_on_oom sysctl.
  */
-- 
2.13.5


* [v6 0/4] cgroup-aware OOM killer
  2017-08-23 16:51 [v6 1/4] mm, oom: refactor the oom_kill_process() function Roman Gushchin
@ 2017-08-23 16:51 ` Roman Gushchin
  2017-08-23 16:51 ` [v6 2/4] mm, oom: " Roman Gushchin
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 26+ messages in thread
From: Roman Gushchin @ 2017-08-23 16:51 UTC (permalink / raw)
  To: linux-mm
  Cc: Roman Gushchin, Michal Hocko, Vladimir Davydov, Johannes Weiner,
	Tetsuo Handa, David Rientjes, Tejun Heo, kernel-team, cgroups,
	linux-doc, linux-kernel

This patchset makes the OOM killer cgroup-aware.

v6:
  - Renamed oom_control.chosen to oom_control.chosen_task
  - Renamed oom_kill_all_tasks to oom_kill_all
  - Per-node NR_SLAB_UNRECLAIMABLE accounting
  - Several minor fixes and cleanups
  - Docs updated

v5:
  - Rebased on top of Michal Hocko's patches, which have changed the
    way OOM victims get access to the memory reserves. Dropped the
    corresponding part of this patchset
  - Separated the oom_kill_process() splitting into a standalone commit
  - Added debug output (suggested by David Rientjes)
  - Some minor fixes

v4:
  - Reworked per-cgroup oom_score_adj into oom_priority
    (based on ideas by David Rientjes)
  - Tasks with oom_score_adj -1000 are never selected if
    oom_kill_all_tasks is not set
  - Memcg victim selection code is reworked, and
    synchronization is based on finding tasks with OOM victim marker,
    rather than on a global counter
  - Debug output is dropped
  - Refactored TIF_MEMDIE usage

v3:
  - Merged commits 1-4 into 6
  - Separated oom_score_adj logic and debug output into separate commits
  - Fixed swap accounting

v2:
  - Reworked victim selection based on feedback
    from Michal Hocko, Vladimir Davydov and Johannes Weiner
  - "Kill all tasks" is now an opt-in option, by default
    only one process will be killed
  - Added per-cgroup oom_score_adj
  - Refined oom score calculations, suggested by Vladimir Davydov
  - Converted to a patchset

v1:
  https://lkml.org/lkml/2017/5/18/969


Roman Gushchin (4):
  mm, oom: refactor the oom_kill_process() function
  mm, oom: cgroup-aware OOM killer
  mm, oom: introduce oom_priority for memory cgroups
  mm, oom, docs: describe the cgroup-aware OOM killer

 Documentation/cgroup-v2.txt |  62 ++++++++++
 include/linux/memcontrol.h  |  36 ++++++
 include/linux/oom.h         |  12 +-
 mm/memcontrol.c             | 290 ++++++++++++++++++++++++++++++++++++++++++++
 mm/oom_kill.c               | 209 ++++++++++++++++++++-----------
 5 files changed, 539 insertions(+), 70 deletions(-)

-- 
2.13.5


* [v6 2/4] mm, oom: cgroup-aware OOM killer
  2017-08-23 16:51 [v6 1/4] mm, oom: refactor the oom_kill_process() function Roman Gushchin
  2017-08-23 16:51 ` [v6 0/4] cgroup-aware OOM killer Roman Gushchin
@ 2017-08-23 16:51 ` Roman Gushchin
  2017-08-23 23:19   ` David Rientjes
  2017-08-24 11:47   ` Michal Hocko
  2017-08-23 16:52 ` [v6 3/4] mm, oom: introduce oom_priority for memory cgroups Roman Gushchin
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 26+ messages in thread
From: Roman Gushchin @ 2017-08-23 16:51 UTC (permalink / raw)
  To: linux-mm
  Cc: Roman Gushchin, Michal Hocko, Vladimir Davydov, Johannes Weiner,
	Tetsuo Handa, David Rientjes, Tejun Heo, kernel-team, cgroups,
	linux-doc, linux-kernel

Traditionally, the OOM killer is operating on a process level.
Under oom conditions, it finds a process with the highest oom score
and kills it.

This behavior doesn't suit well the system with many running
containers:

1) There is no fairness between containers. A small container with
few large processes will be chosen over a large one with huge
number of small processes.

2) Containers often do not expect that some random process inside
will be killed. In many cases much safer behavior is to kill
all tasks in the container. Traditionally, this was implemented
in userspace, but doing it in the kernel has some advantages,
especially in a case of a system-wide OOM.

3) Per-process oom_score_adj affects global OOM, so it's a breach
in the isolation.

To address these issues, cgroup-aware OOM killer is introduced.

Under OOM conditions, it tries to find the biggest memory consumer,
and free memory by killing corresponding task(s). The difference
the "traditional" OOM killer is that it can treat memory cgroups
as memory consumers as well as single processes.

By default, it will look for the biggest leaf cgroup, and kill
the largest task inside.

But a user can change this behavior by enabling the per-cgroup
oom_kill_all_tasks option. If set, it causes the OOM killer to treat
the whole cgroup as an indivisible memory consumer. In case it's
selected as an OOM victim, all belonging tasks will be killed.

Tasks in the root cgroup are treated as independent memory consumers,
and are compared with other memory consumers (e.g. leaf cgroups).
The root cgroup doesn't support the oom_kill_all_tasks feature.
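
To illustrate the intended usage (note that in this version of the
patchset the knob is exposed as memory.oom_kill_all), here is a minimal
user-space sketch which opts a cgroup into the kill-all behavior; the
cgroup path is made up and assumes cgroup v2 mounted at /sys/fs/cgroup:

	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		/* hypothetical cgroup path, for illustration only */
		const char *path = "/sys/fs/cgroup/workload/memory.oom_kill_all";
		FILE *f = fopen(path, "w");

		if (!f) {
			perror(path);
			return EXIT_FAILURE;
		}
		if (fputs("1\n", f) == EOF) {
			perror(path);
			fclose(f);
			return EXIT_FAILURE;
		}
		if (fclose(f) == EOF) {
			perror(path);
			return EXIT_FAILURE;
		}
		return EXIT_SUCCESS;
	}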

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: kernel-team@fb.com
Cc: cgroups@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org
---
 include/linux/memcontrol.h |  33 +++++++
 include/linux/oom.h        |  12 ++-
 mm/memcontrol.c            | 242 +++++++++++++++++++++++++++++++++++++++++++++
 mm/oom_kill.c              |  92 ++++++++++++++---
 4 files changed, 364 insertions(+), 15 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8556f1b86d40..c57ee47c35bb 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -35,6 +35,7 @@ struct mem_cgroup;
 struct page;
 struct mm_struct;
 struct kmem_cache;
+struct oom_control;
 
 /* Cgroup-specific page state, on top of universal node page state */
 enum memcg_stat_item {
@@ -199,6 +200,12 @@ struct mem_cgroup {
 	/* OOM-Killer disable */
 	int		oom_kill_disable;
 
+	/* kill all tasks in the subtree in case of OOM */
+	bool oom_kill_all;
+
+	/* cached OOM score */
+	long oom_score;
+
 	/* handle for "memory.events" */
 	struct cgroup_file events_file;
 
@@ -342,6 +349,11 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
 	return css ? container_of(css, struct mem_cgroup, css) : NULL;
 }
 
+static inline void mem_cgroup_put(struct mem_cgroup *memcg)
+{
+	css_put(&memcg->css);
+}
+
 #define mem_cgroup_from_counter(counter, member)	\
 	container_of(counter, struct mem_cgroup, member)
 
@@ -480,6 +492,13 @@ static inline bool task_in_memcg_oom(struct task_struct *p)
 
 bool mem_cgroup_oom_synchronize(bool wait);
 
+bool mem_cgroup_select_oom_victim(struct oom_control *oc);
+
+static inline bool mem_cgroup_oom_kill_all(struct mem_cgroup *memcg)
+{
+	return memcg->oom_kill_all;
+}
+
 #ifdef CONFIG_MEMCG_SWAP
 extern int do_swap_account;
 #endif
@@ -743,6 +762,10 @@ static inline bool task_in_mem_cgroup(struct task_struct *task,
 	return true;
 }
 
+static inline void mem_cgroup_put(struct mem_cgroup *memcg)
+{
+}
+
 static inline struct mem_cgroup *
 mem_cgroup_iter(struct mem_cgroup *root,
 		struct mem_cgroup *prev,
@@ -930,6 +953,16 @@ static inline
 void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
 {
 }
+
+static inline bool mem_cgroup_select_oom_victim(struct oom_control *oc)
+{
+	return false;
+}
+
+static inline bool mem_cgroup_oom_kill_all(struct mem_cgroup *memcg)
+{
+	return false;
+}
 #endif /* CONFIG_MEMCG */
 
 /* idx can be of type enum memcg_stat_item or node_stat_item */
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 8a266e2be5a6..344ccb85eb74 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -7,6 +7,13 @@
 #include <linux/nodemask.h>
 #include <uapi/linux/oom.h>
 
+
+/*
+ * Special value returned by victim selection functions to indicate
+ * that are inflight OOM victims.
+ */
+#define INFLIGHT_VICTIM ((void *)-1UL)
+
 struct zonelist;
 struct notifier_block;
 struct mem_cgroup;
@@ -37,7 +44,8 @@ struct oom_control {
 
 	/* Used by oom implementation, do not set */
 	unsigned long totalpages;
-	struct task_struct *chosen;
+	struct task_struct *chosen_task;
+	struct mem_cgroup *chosen_memcg;
 	unsigned long chosen_points;
 };
 
@@ -79,6 +87,8 @@ extern void oom_killer_enable(void);
 
 extern struct task_struct *find_lock_task_mm(struct task_struct *p);
 
+extern int oom_evaluate_task(struct task_struct *task, void *arg);
+
 /* sysctls */
 extern int sysctl_oom_dump_tasks;
 extern int sysctl_oom_kill_allocating_task;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index df6f63ee95d6..a620aaae6201 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2639,6 +2639,215 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
 	return ret;
 }
 
+static long memcg_oom_badness(struct mem_cgroup *memcg,
+			      const nodemask_t *nodemask)
+{
+	long points = 0;
+	int nid;
+	pg_data_t *pgdat;
+
+	for_each_node_state(nid, N_MEMORY) {
+		if (nodemask && !node_isset(nid, *nodemask))
+			continue;
+
+		points += mem_cgroup_node_nr_lru_pages(memcg, nid,
+				LRU_ALL_ANON | BIT(LRU_UNEVICTABLE));
+
+		pgdat = NODE_DATA(nid);
+		points += lruvec_page_state(mem_cgroup_lruvec(pgdat, memcg),
+					    NR_SLAB_UNRECLAIMABLE);
+	}
+
+	points += memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) /
+		(PAGE_SIZE / 1024);
+	points += memcg_page_state(memcg, MEMCG_SOCK);
+	points += memcg_page_state(memcg, MEMCG_SWAP);
+
+	return points;
+}
+
+/*
+ * Checks if the given memcg is a valid OOM victim and returns a number,
+ * which means the following:
+ *   -1: there are inflight OOM victim tasks, belonging to the memcg
+ *    0: memcg is not eligible, e.g. all belonging tasks are protected
+ *       by oom_score_adj set to OOM_SCORE_ADJ_MIN
+ *   >0: memcg is eligible, and the returned value is an estimation
+ *       of the memory footprint
+ */
+static long oom_evaluate_memcg(struct mem_cgroup *memcg,
+			       const nodemask_t *nodemask)
+{
+	struct css_task_iter it;
+	struct task_struct *task;
+	int eligible = 0;
+
+	/*
+	 * Memcg is OOM eligible if there are OOM killable tasks inside.
+	 *
+	 * If oom_kill_all is not set, we treat tasks with oom_score_adj
+	 * equal to OOM_SCORE_ADJ_MIN as unkillable. Otherwise, we do ignore
+	 * per-process oom_score_adj.
+	 *
+	 * If there are inflight OOM victim tasks inside the memcg,
+	 * we return -1.
+	 */
+	css_task_iter_start(&memcg->css, 0, &it);
+	while ((task = css_task_iter_next(&it))) {
+		if (!eligible &&
+		    (memcg->oom_kill_all ||
+		     task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN))
+			eligible = 1;
+
+		if (tsk_is_oom_victim(task) &&
+		    !test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) {
+			eligible = -1;
+			break;
+		}
+	}
+	css_task_iter_end(&it);
+
+	if (eligible <= 0)
+		return eligible;
+
+	return memcg_oom_badness(memcg, nodemask);
+}
+
+static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc)
+{
+	struct mem_cgroup *iter, *parent;
+
+	for_each_mem_cgroup_tree(iter, root) {
+		if (memcg_has_children(iter)) {
+			iter->oom_score = 0;
+			continue;
+		}
+
+		iter->oom_score = oom_evaluate_memcg(iter, oc->nodemask);
+
+		/*
+		 * Ignore empty and non-eligible memory cgroups.
+		 */
+		if (iter->oom_score == 0)
+			continue;
+
+		/*
+		 * If there are inflight OOM victims, we don't need to look
+		 * further for new victims.
+		 */
+		if (iter->oom_score == -1) {
+			oc->chosen_memcg = INFLIGHT_VICTIM;
+			mem_cgroup_iter_break(root, iter);
+			return;
+		}
+
+		for (parent = parent_mem_cgroup(iter); parent && parent != root;
+		     parent = parent_mem_cgroup(parent))
+			parent->oom_score += iter->oom_score;
+	}
+
+	for (;;) {
+		struct cgroup_subsys_state *css;
+		struct mem_cgroup *memcg = NULL;
+		long score = LONG_MIN;
+
+		css_for_each_child(css, &root->css) {
+			struct mem_cgroup *iter = mem_cgroup_from_css(css);
+
+			/*
+			 * Ignore empty and non-eligible memory cgroups.
+			 */
+			if (iter->oom_score == 0)
+				continue;
+
+			if (iter->oom_score > score) {
+				memcg = iter;
+				score = iter->oom_score;
+			}
+		}
+
+		if (!memcg) {
+			if (oc->memcg && root == oc->memcg) {
+				oc->chosen_memcg = oc->memcg;
+				css_get(&oc->chosen_memcg->css);
+				oc->chosen_points = oc->memcg->oom_score;
+			}
+			break;
+		}
+
+		if (memcg->oom_kill_all || !memcg_has_children(memcg)) {
+			oc->chosen_memcg = memcg;
+			css_get(&oc->chosen_memcg->css);
+			oc->chosen_points = score;
+			break;
+		}
+
+		root = memcg;
+	}
+}
+
+static void select_victim_root_cgroup_task(struct oom_control *oc)
+{
+	struct css_task_iter it;
+	struct task_struct *task;
+	int ret = 0;
+
+	css_task_iter_start(&root_mem_cgroup->css, 0, &it);
+	while (!ret && (task = css_task_iter_next(&it)))
+		ret = oom_evaluate_task(task, oc);
+	css_task_iter_end(&it);
+}
+
+bool mem_cgroup_select_oom_victim(struct oom_control *oc)
+{
+	struct mem_cgroup *root = root_mem_cgroup;
+
+	oc->chosen_task = NULL;
+	oc->chosen_memcg = NULL;
+
+	if (mem_cgroup_disabled())
+		return false;
+
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		return false;
+
+	if (oc->memcg)
+		root = oc->memcg;
+
+	rcu_read_lock();
+
+	select_victim_memcg(root, oc);
+
+	/*
+	 * If existing OOM victims are found, no need to look further.
+	 */
+	if (oc->chosen_memcg == INFLIGHT_VICTIM) {
+		rcu_read_unlock();
+		return true;
+	}
+
+	/*
+	 * For system-wide OOMs we should consider tasks in the root cgroup
+	 * with oom_score larger than oc->chosen_points.
+	 */
+	if (!oc->memcg) {
+		select_victim_root_cgroup_task(oc);
+
+		if (oc->chosen_task && oc->chosen_memcg) {
+			/*
+			 * If we've decided to kill a task in the root memcg,
+			 * release chosen_memcg.
+			 */
+			css_put(&oc->chosen_memcg->css);
+			oc->chosen_memcg = NULL;
+		}
+	}
+
+	rcu_read_unlock();
+
+	return oc->chosen_task || oc->chosen_memcg;
+}
+
 /*
  * Reclaims as many pages from the given memcg as possible.
  *
@@ -5190,6 +5399,33 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
 	return nbytes;
 }
 
+static int memory_oom_kill_all_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+	bool oom_kill_all = memcg->oom_kill_all;
+
+	seq_printf(m, "%d\n", oom_kill_all);
+
+	return 0;
+}
+
+static ssize_t memory_oom_kill_all_write(struct kernfs_open_file *of,
+					       char *buf, size_t nbytes,
+					       loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	int oom_kill_all;
+	int err;
+
+	err = kstrtoint(strstrip(buf), 0, &oom_kill_all);
+	if (err)
+		return err;
+
+	memcg->oom_kill_all = oom_kill_all;
+
+	return nbytes;
+}
+
 static int memory_events_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -5310,6 +5546,12 @@ static struct cftype memory_files[] = {
 		.write = memory_max_write,
 	},
 	{
+		.name = "oom_kill_all",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = memory_oom_kill_all_show,
+		.write = memory_oom_kill_all_write,
+	},
+	{
 		.name = "events",
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.file_offset = offsetof(struct mem_cgroup, events_file),
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 5c29a3dd591b..3261020ab781 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -288,7 +288,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
 	return CONSTRAINT_NONE;
 }
 
-static int oom_evaluate_task(struct task_struct *task, void *arg)
+int oom_evaluate_task(struct task_struct *task, void *arg)
 {
 	struct oom_control *oc = arg;
 	unsigned long points;
@@ -322,26 +322,26 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 		goto next;
 
 	/* Prefer thread group leaders for display purposes */
-	if (points == oc->chosen_points && thread_group_leader(oc->chosen))
+	if (points == oc->chosen_points && thread_group_leader(oc->chosen_task))
 		goto next;
 select:
-	if (oc->chosen)
-		put_task_struct(oc->chosen);
+	if (oc->chosen_task)
+		put_task_struct(oc->chosen_task);
 	get_task_struct(task);
-	oc->chosen = task;
+	oc->chosen_task = task;
 	oc->chosen_points = points;
 next:
 	return 0;
 abort:
-	if (oc->chosen)
-		put_task_struct(oc->chosen);
-	oc->chosen = (void *)-1UL;
+	if (oc->chosen_task)
+		put_task_struct(oc->chosen_task);
+	oc->chosen_task = INFLIGHT_VICTIM;
 	return 1;
 }
 
 /*
  * Simple selection loop. We choose the process with the highest number of
- * 'points'. In case scan was aborted, oc->chosen is set to -1.
+ * 'points'. In case scan was aborted, oc->chosen_task is set to -1.
  */
 static void select_bad_process(struct oom_control *oc)
 {
@@ -823,6 +823,9 @@ static void __oom_kill_process(struct task_struct *victim)
 	struct mm_struct *mm;
 	bool can_oom_reap = true;
 
+	if (is_global_init(victim) || (victim->flags & PF_KTHREAD))
+		return;
+
 	p = find_lock_task_mm(victim);
 	if (!p) {
 		put_task_struct(victim);
@@ -897,7 +900,7 @@ static void __oom_kill_process(struct task_struct *victim)
 
 static void oom_kill_process(struct oom_control *oc, const char *message)
 {
-	struct task_struct *p = oc->chosen;
+	struct task_struct *p = oc->chosen_task;
 	unsigned int points = oc->chosen_points;
 	struct task_struct *victim = p;
 	struct task_struct *child;
@@ -958,6 +961,64 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 	put_task_struct(victim);
 }
 
+static int oom_kill_memcg_member(struct task_struct *task, void *unused)
+{
+	if (!tsk_is_oom_victim(task))
+		__oom_kill_process(task);
+	return 0;
+}
+
+static bool oom_kill_memcg_victim(struct oom_control *oc)
+{
+	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
+
+	if (oc->chosen_task) {
+		if (oc->chosen_task == INFLIGHT_VICTIM)
+			return true;
+
+		if (__ratelimit(&oom_rs))
+			dump_header(oc, oc->chosen_task);
+
+		__oom_kill_process(oc->chosen_task);
+		put_task_struct(oc->chosen_task);
+
+		schedule_timeout_killable(1);
+		return true;
+
+	} else if (oc->chosen_memcg) {
+		if (oc->chosen_memcg == INFLIGHT_VICTIM)
+			return true;
+
+		/* Always begin with the biggest task */
+		oc->chosen_points = 0;
+		oc->chosen_task = NULL;
+		mem_cgroup_scan_tasks(oc->chosen_memcg, oom_evaluate_task, oc);
+
+		if (oc->chosen_task && oc->chosen_task != INFLIGHT_VICTIM) {
+			if (__ratelimit(&oom_rs))
+				dump_header(oc, oc->chosen_task);
+
+			__oom_kill_process(oc->chosen_task);
+			put_task_struct(oc->chosen_task);
+
+			if (mem_cgroup_oom_kill_all(oc->chosen_memcg))
+				mem_cgroup_scan_tasks(oc->chosen_memcg,
+						      oom_kill_memcg_member,
+						      NULL);
+			schedule_timeout_killable(1);
+		}
+
+		mem_cgroup_put(oc->chosen_memcg);
+		oc->chosen_memcg = NULL;
+		return oc->chosen_task;
+
+	} else {
+		oc->chosen_points = 0;
+		return false;
+	}
+}
+
 /*
  * Determines whether the kernel must panic because of the panic_on_oom sysctl.
  */
@@ -1054,18 +1115,21 @@ bool out_of_memory(struct oom_control *oc)
 	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
 	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
 		get_task_struct(current);
-		oc->chosen = current;
+		oc->chosen_task = current;
 		oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
 		return true;
 	}
 
+	if (mem_cgroup_select_oom_victim(oc) && oom_kill_memcg_victim(oc))
+		return true;
+
 	select_bad_process(oc);
 	/* Found nothing?!?! Either we hang forever, or we panic. */
-	if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
+	if (!oc->chosen_task && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
 		dump_header(oc, NULL);
 		panic("Out of memory and no killable processes...\n");
 	}
-	if (oc->chosen && oc->chosen != (void *)-1UL) {
+	if (oc->chosen_task && oc->chosen_task != INFLIGHT_VICTIM) {
 		oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
 				 "Memory cgroup out of memory");
 		/*
@@ -1074,7 +1138,7 @@ bool out_of_memory(struct oom_control *oc)
 		 */
 		schedule_timeout_killable(1);
 	}
-	return !!oc->chosen;
+	return !!oc->chosen_task;
 }
 
 /*
-- 
2.13.5


* [v6 3/4] mm, oom: introduce oom_priority for memory cgroups
  2017-08-23 16:51 [v6 1/4] mm, oom: refactor the oom_kill_process() function Roman Gushchin
  2017-08-23 16:51 ` [v6 0/4] cgroup-aware OOM killer Roman Gushchin
  2017-08-23 16:51 ` [v6 2/4] mm, oom: " Roman Gushchin
@ 2017-08-23 16:52 ` Roman Gushchin
  2017-08-24 12:10   ` Michal Hocko
  2017-08-23 16:52 ` [v6 4/4] mm, oom, docs: describe the cgroup-aware OOM killer Roman Gushchin
  2017-08-24 11:15 ` [v6 1/4] mm, oom: refactor the oom_kill_process() function Michal Hocko
  4 siblings, 1 reply; 26+ messages in thread
From: Roman Gushchin @ 2017-08-23 16:52 UTC (permalink / raw)
  To: linux-mm
  Cc: Roman Gushchin, Michal Hocko, Vladimir Davydov, Johannes Weiner,
	Tetsuo Handa, David Rientjes, Tejun Heo, kernel-team, cgroups,
	linux-doc, linux-kernel

Introduce a per-memory-cgroup oom_priority setting: an integer number
within the [-10000, 10000] range, which defines the order in which
the OOM killer selects victim memory cgroups.

OOM killer prefers memory cgroups with larger priority if they are
populated with eligible tasks.

The oom_priority value is compared within sibling cgroups.

The root cgroup has the oom_priority 0, which cannot be changed.
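
To illustrate the selection order, here is a toy user-space model of the
sibling comparison this patch adds (highest priority first, size as the
tiebreaker); the cgroup names and numbers below are made up:

	#include <stdio.h>

	struct memcg { const char *name; short oom_priority; long oom_score; };

	int main(void)
	{
		struct memcg siblings[] = {
			{ "workload-a",  0, 200000 },
			{ "workload-b", 10,  50000 },	/* higher priority wins... */
			{ "workload-c", 10,  80000 },	/* ...score breaks the tie */
		};
		struct memcg *victim = NULL;
		size_t i;

		for (i = 0; i < sizeof(siblings) / sizeof(siblings[0]); i++) {
			struct memcg *iter = &siblings[i];

			if (!iter->oom_score)
				continue;	/* empty or non-eligible */
			if (!victim ||
			    iter->oom_priority > victim->oom_priority ||
			    (iter->oom_priority == victim->oom_priority &&
			     iter->oom_score > victim->oom_score))
				victim = iter;
		}
		/* prints "victim: workload-c" */
		printf("victim: %s\n", victim ? victim->name : "none");
		return 0;
	}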

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: kernel-team@fb.com
Cc: cgroups@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org
---
 include/linux/memcontrol.h |  3 +++
 mm/memcontrol.c            | 52 ++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index c57ee47c35bb..915f0c19a2b5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -206,6 +206,9 @@ struct mem_cgroup {
 	/* cached OOM score */
 	long oom_score;
 
+	/* OOM killer priority */
+	short oom_priority;
+
 	/* handle for "memory.events" */
 	struct cgroup_file events_file;
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a620aaae6201..a173e5b0d4d8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2749,6 +2749,7 @@ static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc)
 	for (;;) {
 		struct cgroup_subsys_state *css;
 		struct mem_cgroup *memcg = NULL;
+		short prio = SHRT_MIN;
 		long score = LONG_MIN;
 
 		css_for_each_child(css, &root->css) {
@@ -2760,7 +2761,12 @@ static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc)
 			if (iter->oom_score == 0)
 				continue;
 
-			if (iter->oom_score > score) {
+			if (iter->oom_priority > prio) {
+				memcg = iter;
+				prio = iter->oom_priority;
+				score = iter->oom_score;
+			} else if (iter->oom_priority == prio &&
+				   iter->oom_score > score) {
 				memcg = iter;
 				score = iter->oom_score;
 			}
@@ -2830,7 +2836,15 @@ bool mem_cgroup_select_oom_victim(struct oom_control *oc)
 	 * For system-wide OOMs we should consider tasks in the root cgroup
 	 * with oom_score larger than oc->chosen_points.
 	 */
-	if (!oc->memcg) {
+	if (!oc->memcg && !(oc->chosen_memcg &&
+			    oc->chosen_memcg->oom_priority > 0)) {
+		/*
+		 * Root memcg has priority 0, so if chosen memcg has lower
+		 * priority, any task in root cgroup is preferable.
+		 */
+		if (oc->chosen_memcg && oc->chosen_memcg->oom_priority < 0)
+			oc->chosen_points = 0;
+
 		select_victim_root_cgroup_task(oc);
 
 		if (oc->chosen_task && oc->chosen_memcg) {
@@ -5426,6 +5440,34 @@ static ssize_t memory_oom_kill_all_write(struct kernfs_open_file *of,
 	return nbytes;
 }
 
+static int memory_oom_priority_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+
+	seq_printf(m, "%d\n", memcg->oom_priority);
+
+	return 0;
+}
+
+static ssize_t memory_oom_priority_write(struct kernfs_open_file *of,
+				char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	int oom_priority;
+	int err;
+
+	err = kstrtoint(strstrip(buf), 0, &oom_priority);
+	if (err)
+		return err;
+
+	if (oom_priority < -10000 || oom_priority > 10000)
+		return -EINVAL;
+
+	memcg->oom_priority = (short)oom_priority;
+
+	return nbytes;
+}
+
 static int memory_events_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -5552,6 +5594,12 @@ static struct cftype memory_files[] = {
 		.write = memory_oom_kill_all_write,
 	},
 	{
+		.name = "oom_priority",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = memory_oom_priority_show,
+		.write = memory_oom_priority_write,
+	},
+	{
 		.name = "events",
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.file_offset = offsetof(struct mem_cgroup, events_file),
-- 
2.13.5


* [v6 4/4] mm, oom, docs: describe the cgroup-aware OOM killer
  2017-08-23 16:51 [v6 1/4] mm, oom: refactor the oom_kill_process() function Roman Gushchin
                   ` (2 preceding siblings ...)
  2017-08-23 16:52 ` [v6 3/4] mm, oom: introduce oom_priority for memory cgroups Roman Gushchin
@ 2017-08-23 16:52 ` Roman Gushchin
  2017-08-24 11:15 ` [v6 1/4] mm, oom: refactor the oom_kill_process() function Michal Hocko
  4 siblings, 0 replies; 26+ messages in thread
From: Roman Gushchin @ 2017-08-23 16:52 UTC (permalink / raw)
  To: linux-mm
  Cc: Roman Gushchin, Michal Hocko, Vladimir Davydov, Johannes Weiner,
	Tetsuo Handa, David Rientjes, Tejun Heo, kernel-team, cgroups,
	linux-doc, linux-kernel

Update cgroups v2 docs.

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: kernel-team@fb.com
Cc: cgroups@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org
---
 Documentation/cgroup-v2.txt | 62 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index dec5afdaa36d..79ac407bf5a0 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -48,6 +48,7 @@ v1 is available under Documentation/cgroup-v1/.
        5-2-1. Memory Interface Files
        5-2-2. Usage Guidelines
        5-2-3. Memory Ownership
+       5-2-4. OOM Killer
      5-3. IO
        5-3-1. IO Interface Files
        5-3-2. Writeback
@@ -1002,6 +1003,34 @@ PAGE_SIZE multiple when read back.
 	high limit is used and monitored properly, this limit's
 	utility is limited to providing the final safety net.
 
+  memory.oom_kill_all
+
+	A read-write single value file which exists on non-root
+	cgroups.  The default is "0".
+
+	If set, OOM killer will kill all processes attached to the cgroup
+	if selected as an OOM victim.
+
+	By default, the OOM killer respects the /proc/pid/oom_score_adj
+	value -1000, and will never kill the task, unless oom_kill_all
+	is set.
+
+  memory.oom_priority
+
+	A read-write single value file which exists on non-root
+	cgroups.  The default is "0".
+
+	An integer number within the [-10000, 10000] range,
+	which defines the order in which the OOM killer selects victim
+	memory cgroups.
+
+	OOM killer prefers memory cgroups with larger priority if they
+	are populated with eligible tasks.
+
+	The oom_priority value is compared within sibling cgroups.
+
+	The root cgroup has the oom_priority 0, which cannot be changed.
+
   memory.events
 	A read-only flat-keyed file which exists on non-root cgroups.
 	The following entries are defined.  Unless specified
@@ -1206,6 +1235,39 @@ POSIX_FADV_DONTNEED to relinquish the ownership of memory areas
 belonging to the affected files to ensure correct memory ownership.
 
 
+OOM Killer
+~~~~~~~~~~
+
+The cgroup v2 memory controller implements a cgroup-aware OOM killer.
+It means that it treats cgroups as first-class OOM entities.
+
+Under OOM conditions the memory controller tries to make the best
+choice of a victim, hierarchically looking for the largest memory
+consumer. By default, it will look for the biggest task in the
+biggest leaf memory cgroup.
+
+By default, all memory cgroups have oom_priority 0, and the OOM killer
+will choose the cgroup with the largest memory consumption recursively
+on each level. For non-root cgroups it's possible to change
+the oom_priority, and it will cause the OOM killer to look
+at the priority value first, and compare sizes only of memory
+cgroups with equal priority.
+
+A user can change this behavior by enabling the per-cgroup
+oom_kill_all option. If set, the OOM killer will kill all processes
+attached to the cgroup if it is selected as an OOM victim.
+
+Tasks in the root cgroup are treated as independent memory consumers,
+and are compared with other memory consumers (leaf memory cgroups).
+The root cgroup doesn't support the oom_kill_all feature.
+
+This affects both system- and cgroup-wide OOMs. For a cgroup-wide OOM
+the memory controller considers only cgroups belonging to the sub-tree
+of the OOM'ing cgroup.
+
+If there are no cgroups with the memory controller enabled,
+the OOM killer uses the "traditional" process-based approach.
+
 IO
 --
 
-- 
2.13.5


* Re: [v6 2/4] mm, oom: cgroup-aware OOM killer
  2017-08-23 16:51 ` [v6 2/4] mm, oom: " Roman Gushchin
@ 2017-08-23 23:19   ` David Rientjes
  2017-08-25 10:57     ` Roman Gushchin
  2017-08-24 11:47   ` Michal Hocko
  1 sibling, 1 reply; 26+ messages in thread
From: David Rientjes @ 2017-08-23 23:19 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: linux-mm, Michal Hocko, Vladimir Davydov, Johannes Weiner,
	Tetsuo Handa, Tejun Heo, kernel-team, cgroups, linux-doc,
	linux-kernel

On Wed, 23 Aug 2017, Roman Gushchin wrote:

> Traditionally, the OOM killer is operating on a process level.
> Under oom conditions, it finds a process with the highest oom score
> and kills it.
> 
> This behavior doesn't suit well the system with many running
> containers:
> 
> 1) There is no fairness between containers. A small container with
> few large processes will be chosen over a large one with huge
> number of small processes.
> 
> 2) Containers often do not expect that some random process inside
> will be killed. In many cases much safer behavior is to kill
> all tasks in the container. Traditionally, this was implemented
> in userspace, but doing it in the kernel has some advantages,
> especially in a case of a system-wide OOM.
> 
> 3) Per-process oom_score_adj affects global OOM, so it's a breach
> in the isolation.
> 
> To address these issues, cgroup-aware OOM killer is introduced.
> 
> Under OOM conditions, it tries to find the biggest memory consumer,
> and free memory by killing corresponding task(s). The difference
> the "traditional" OOM killer is that it can treat memory cgroups
> as memory consumers as well as single processes.
> 
> By default, it will look for the biggest leaf cgroup, and kill
> the largest task inside.
> 
> But a user can change this behavior by enabling the per-cgroup
> oom_kill_all_tasks option. If set, it causes the OOM killer to treat
> the whole cgroup as an indivisible memory consumer. In case it's
> selected as an OOM victim, all belonging tasks will be killed.
> 

I'm very happy with the rest of the patchset, but I feel that I must renew 
my objection to memory.oom_kill_all_tasks being able to override the 
admin's setting of a process to be oom disabled.  From my 
perspective, setting memory.oom_kill_all_tasks with an oom disabled 
process attached that now becomes killable either (1) overrides the 
CAP_SYS_RESOURCE oom disabled setting or (2) is lazy and doesn't modify 
/proc/pid/oom_score_adj itself.

I'm not sure what is objectionable about allowing 
memory.oom_kill_all_tasks to coexist with oom disabled processes.  Just 
kill everything else so that the oom disabled process can report the oom 
condition after notification, restart the task, etc.  If it's problematic, 
then whoever declares that everything must be killed shall also modify 
/proc/pid/oom_score_adj of oom disabled processes.  If it doesn't have 
permission to change that, then I think there's a much larger concern.
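
For completeness, clearing the protection from userspace is a trivial
write to /proc/pid/oom_score_adj; a minimal illustration, with a made-up
pid:

	#include <stdio.h>

	int main(void)
	{
		/* make a previously oom-disabled task killable again */
		FILE *f = fopen("/proc/1234/oom_score_adj", "w");

		if (!f) {
			perror("/proc/1234/oom_score_adj");
			return 1;
		}
		fprintf(f, "0\n");
		return fclose(f) == 0 ? 0 : 1;
	}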

> Tasks in the root cgroup are treated as independent memory consumers,
> and are compared with other memory consumers (e.g. leaf cgroups).
> The root cgroup doesn't support the oom_kill_all_tasks feature.


* Re: [v6 1/4] mm, oom: refactor the oom_kill_process() function
  2017-08-23 16:51 [v6 1/4] mm, oom: refactor the oom_kill_process() function Roman Gushchin
                   ` (3 preceding siblings ...)
  2017-08-23 16:52 ` [v6 4/4] mm, oom, docs: describe the cgroup-aware OOM killer Roman Gushchin
@ 2017-08-24 11:15 ` Michal Hocko
  4 siblings, 0 replies; 26+ messages in thread
From: Michal Hocko @ 2017-08-24 11:15 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: linux-mm, Vladimir Davydov, Johannes Weiner, Tetsuo Handa,
	David Rientjes, Tejun Heo, kernel-team, cgroups, linux-doc,
	linux-kernel

This patch fails to apply on top of the mmotm tree. It seems the only
reason is the missing
http://lkml.kernel.org/r/20170810075019.28998-2-mhocko@kernel.org

On Wed 23-08-17 17:51:57, Roman Gushchin wrote:
> The oom_kill_process() function consists of two logical parts:
> the first one is responsible for considering task's children as
> a potential victim and printing the debug information.
> The second half is responsible for sending SIGKILL to all
> tasks sharing the mm struct with the given victim.
> 
> This commit splits the oom_kill_process() function with
> an intention to re-use the second half: __oom_kill_process().

Yes this makes some sense even without further changes.

> The cgroup-aware OOM killer will kill multiple tasks
> belonging to the victim cgroup. We don't need to print
> the debug information for each task, as well as play
> with task selection (considering task's children),
> so we can't use the existing oom_kill_process().
> 
> Signed-off-by: Roman Gushchin <guro@fb.com>
> Cc: Michal Hocko <mhocko@kernel.org>
> Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> Cc: David Rientjes <rientjes@google.com>
> Cc: Tejun Heo <tj@kernel.org>
> Cc: kernel-team@fb.com
> Cc: cgroups@vger.kernel.org
> Cc: linux-doc@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Cc: linux-mm@kvack.org

I do agree with the patch; there is just one thing to fix up.

> ---
>  mm/oom_kill.c | 123 +++++++++++++++++++++++++++++++---------------------------
>  1 file changed, 65 insertions(+), 58 deletions(-)
> 
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 53b44425ef35..5c29a3dd591b 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -817,67 +817,12 @@ static bool task_will_free_mem(struct task_struct *task)
>  	return ret;
>  }
>  
> -static void oom_kill_process(struct oom_control *oc, const char *message)
> +static void __oom_kill_process(struct task_struct *victim)
>  {
[...]
>  	p = find_lock_task_mm(victim);
>  	if (!p) {
>  		put_task_struct(victim);

The context doesn't tell us, but there is a return right after this.

	p = find_lock_task_mm(victim);
	if (!p) {
		put_task_struct(victim);
		return;
	} else if (victim != p) {
		get_task_struct(p);
		put_task_struct(victim);
		victim = p;
	}

So we return with the reference dropped. Moreover we can change
the victim, drop the reference on the old one...

> +static void oom_kill_process(struct oom_control *oc, const char *message)
> +{
[...]
> +	__oom_kill_process(victim);
> +	put_task_struct(victim);

while we drop it here again and won't drop the changed one. If we race
with the exiting task and there is no mm then we double drop as well.
So I think that __oom_kill_process should really drop the reference for
all cases and oom_kill_process shouldn't care. Or if you absolutely
need a guarantee that the victim won't go away after __oom_kill_process
then you need to return the real victim and let the caller deal with
put_task_struct.
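
I.e., an untested sketch of the first option: let __oom_kill_process()
always consume the reference, and drop the extra put in the caller:

	static void __oom_kill_process(struct task_struct *victim)
	{
		[...]
		mmdrop(mm);
		put_task_struct(victim);	/* now dropped on this path as well */
	}

	static void oom_kill_process(struct oom_control *oc, const char *message)
	{
		[...]
		__oom_kill_process(victim);
		/* no put_task_struct(victim) here anymore */
	}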
-- 
Michal Hocko
SUSE Labs


* Re: [v6 2/4] mm, oom: cgroup-aware OOM killer
  2017-08-23 16:51 ` [v6 2/4] mm, oom: " Roman Gushchin
  2017-08-23 23:19   ` David Rientjes
@ 2017-08-24 11:47   ` Michal Hocko
  2017-08-24 12:28     ` Roman Gushchin
  1 sibling, 1 reply; 26+ messages in thread
From: Michal Hocko @ 2017-08-24 11:47 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: linux-mm, Vladimir Davydov, Johannes Weiner, Tetsuo Handa,
	David Rientjes, Tejun Heo, kernel-team, cgroups, linux-doc,
	linux-kernel

This doesn't apply on top of mmotm cleanly. You are missing
http://lkml.kernel.org/r/20170807113839.16695-3-mhocko@kernel.org

On Wed 23-08-17 17:51:59, Roman Gushchin wrote:
> Traditionally, the OOM killer is operating on a process level.
> Under oom conditions, it finds a process with the highest oom score
> and kills it.
> 
> This behavior doesn't suit well the system with many running
> containers:
> 
> 1) There is no fairness between containers. A small container with
> few large processes will be chosen over a large one with huge
> number of small processes.
> 
> 2) Containers often do not expect that some random process inside
> will be killed. In many cases much safer behavior is to kill
> all tasks in the container. Traditionally, this was implemented
> in userspace, but doing it in the kernel has some advantages,
> especially in a case of a system-wide OOM.
> 
> 3) Per-process oom_score_adj affects global OOM, so it's a breach
> in the isolation.

Please explain more. I guess you mean that an untrusted memcg could hide
itself from the global OOM killer by reducing the oom scores? Well you
need CAP_SYS_RESOURCE to reduce the current oom_score{_adj} as David has
already pointed out. I also agree that we absolutely must not kill an
oom disabled task. I am pretty sure somebody is using OOM_SCORE_ADJ_MIN
as a protection from an untrusted SIGKILL and inconsistent state as a
result. Those applications simply shouldn't behave differently in the
global and container contexts.

If nothing else we have to skip OOM_SCORE_ADJ_MIN tasks during the kill.
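
Something along these lines, as an untested sketch on top of this patch:

	static int oom_kill_memcg_member(struct task_struct *task, void *unused)
	{
		/* honour OOM_SCORE_ADJ_MIN even in the kill-all case */
		if (tsk_is_oom_victim(task) ||
		    task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
			return 0;

		__oom_kill_process(task);
		return 0;
	}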

> To address these issues, cgroup-aware OOM killer is introduced.
> 
> Under OOM conditions, it tries to find the biggest memory consumer,
> and free memory by killing corresponding task(s). The difference
> the "traditional" OOM killer is that it can treat memory cgroups
> as memory consumers as well as single processes.
> 
> By default, it will look for the biggest leaf cgroup, and kill
> the largest task inside.

Why? I believe that the semantic should be as simple as kill the largest
oom killable entity. And the entity is either a process or a memcg which
is marked that way. Why should we mix things and select a memcg to kill
a process inside it? More on that below.

> But a user can change this behavior by enabling the per-cgroup
> oom_kill_all_tasks option. If set, it causes the OOM killer to treat
> the whole cgroup as an indivisible memory consumer. In case it's
> selected as an OOM victim, all belonging tasks will be killed.
> 
> Tasks in the root cgroup are treated as independent memory consumers,
> and are compared with other memory consumers (e.g. leaf cgroups).
> The root cgroup doesn't support the oom_kill_all_tasks feature.

If anything you wouldn't have to treat the root memcg specially at all. It
will be like any other memcg which doesn't have oom_kill_all_tasks...
 
[...]

> +static long memcg_oom_badness(struct mem_cgroup *memcg,
> +			      const nodemask_t *nodemask)
> +{
> +	long points = 0;
> +	int nid;
> +	pg_data_t *pgdat;
> +
> +	for_each_node_state(nid, N_MEMORY) {
> +		if (nodemask && !node_isset(nid, *nodemask))
> +			continue;
> +
> +		points += mem_cgroup_node_nr_lru_pages(memcg, nid,
> +				LRU_ALL_ANON | BIT(LRU_UNEVICTABLE));
> +
> +		pgdat = NODE_DATA(nid);
> +		points += lruvec_page_state(mem_cgroup_lruvec(pgdat, memcg),
> +					    NR_SLAB_UNRECLAIMABLE);
> +	}
> +
> +	points += memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) /
> +		(PAGE_SIZE / 1024);
> +	points += memcg_page_state(memcg, MEMCG_SOCK);
> +	points += memcg_page_state(memcg, MEMCG_SWAP);
> +
> +	return points;

I guess I have asked already and we haven't reached any consensus. I do
not like how you treat memcgs and tasks differently. Why cannot we have
a memcg score be a sum of all its tasks? How do you want to compare a
memcg score with a task's score? This just smells like the outcome of a weird
semantic that you try to select the largest group I have mentioned
above.

This is a rather fundamental concern and I believe we should find a
consensus on it before going any further. I believe that users shouldn't
see any difference in the OOM behavior when memcg v2 is used and there
is no kill-all memcg. If there is such a memcg then we should treat only
those specially. But you might have really strong usecases which haven't
been presented or I've missed their importance.

-- 
Michal Hocko
SUSE Labs


* Re: [v6 3/4] mm, oom: introduce oom_priority for memory cgroups
  2017-08-23 16:52 ` [v6 3/4] mm, oom: introduce oom_priority for memory cgroups Roman Gushchin
@ 2017-08-24 12:10   ` Michal Hocko
  2017-08-24 12:51     ` Roman Gushchin
  0 siblings, 1 reply; 26+ messages in thread
From: Michal Hocko @ 2017-08-24 12:10 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: linux-mm, Vladimir Davydov, Johannes Weiner, Tetsuo Handa,
	David Rientjes, Tejun Heo, kernel-team, cgroups, linux-doc,
	linux-kernel

On Wed 23-08-17 17:52:00, Roman Gushchin wrote:
> Introduce a per-memory-cgroup oom_priority setting: an integer number
> within the [-10000, 10000] range, which defines the order in which
> the OOM killer selects victim memory cgroups.

Why do we need a range here?

> OOM killer prefers memory cgroups with larger priority if they are
> populated with eligible tasks.

So this is basically orthogonal to the score based selection and the
real size is only the tiebreaker for same priorities? Could you describe
the usecase? Becasuse to me this sounds like a separate oom killer
strategy. I can imagine somebody might be interested (e.g. always kill
the oldest memcgs...) but an explicit range wouldn't fly with such a
usecase very well.

That brings me back to my original suggestion. Wouldn't a "register an
oom strategy" approach much better than blending things together and
then have to wrap heads around different combinations of tunables?

[...]
> @@ -2760,7 +2761,12 @@ static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc)
>  			if (iter->oom_score == 0)
>  				continue;
>  
> -			if (iter->oom_score > score) {
> +			if (iter->oom_priority > prio) {
> +				memcg = iter;
> +				prio = iter->oom_priority;
> +				score = iter->oom_score;
> +			} else if (iter->oom_priority == prio &&
> +				   iter->oom_score > score) {
>  				memcg = iter;
>  				score = iter->oom_score;
>  			}

Just a minor thing. Why do we even have to calculate oom_score when we
use it only as a tiebreaker?
-- 
Michal Hocko
SUSE Labs


* Re: [v6 2/4] mm, oom: cgroup-aware OOM killer
  2017-08-24 11:47   ` Michal Hocko
@ 2017-08-24 12:28     ` Roman Gushchin
  2017-08-24 12:58       ` Michal Hocko
  0 siblings, 1 reply; 26+ messages in thread
From: Roman Gushchin @ 2017-08-24 12:28 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, Vladimir Davydov, Johannes Weiner, Tetsuo Handa,
	David Rientjes, Tejun Heo, kernel-team, cgroups, linux-doc,
	linux-kernel

Hi Michal!

On Thu, Aug 24, 2017 at 01:47:06PM +0200, Michal Hocko wrote:
> This doesn't apply on top of mmotm cleanly. You are missing
> http://lkml.kernel.org/r/20170807113839.16695-3-mhocko@kernel.org

I'll rebase. Thanks!

> 
> On Wed 23-08-17 17:51:59, Roman Gushchin wrote:
> > Traditionally, the OOM killer is operating on a process level.
> > Under oom conditions, it finds a process with the highest oom score
> > and kills it.
> > 
> > This behavior doesn't suit well the system with many running
> > containers:
> > 
> > 1) There is no fairness between containers. A small container with
> > few large processes will be chosen over a large one with huge
> > number of small processes.
> > 
> > 2) Containers often do not expect that some random process inside
> > will be killed. In many cases much safer behavior is to kill
> > all tasks in the container. Traditionally, this was implemented
> > in userspace, but doing it in the kernel has some advantages,
> > especially in a case of a system-wide OOM.
> > 
> > 3) Per-process oom_score_adj affects global OOM, so it's a breach
> > in the isolation.
> 
> Please explain more. I guess you mean that an untrusted memcg could hide
> itself from the global OOM killer by reducing the oom scores? Well you
> need CAP_SYS_RESOURCE to reduce the current oom_score{_adj} as David has
> already pointed out. I also agree that we absolutely must not kill an
> oom disabled task. I am pretty sure somebody is using OOM_SCORE_ADJ_MIN
> as a protection from an untrusted SIGKILL and inconsistent state as a
> result. Those applications simply shouldn't behave differently in the
> global and container contexts.

The main point of the kill_all option is to clean up the victim cgroup
_completely_. If some tasks can survive, that means userspace should
take care of them, look at the cgroup after oom, and kill the survivors
manually.

If you want to rely on OOM_SCORE_ADJ_MIN, don't set kill_all.
I really don't get the usecase for this "kill all, except this and that".

Also, it's really confusing to respect -1000 value, and completely ignore -999.

I believe that any complex userspace OOM handling should use memory.high
and handle memory shortage before an actual OOM.

> 
> If nothing else we have to skip OOM_SCORE_ADJ_MIN tasks during the kill.
> 
> > To address these issues, cgroup-aware OOM killer is introduced.
> > 
> > Under OOM conditions, it tries to find the biggest memory consumer,
> > and free memory by killing corresponding task(s). The difference
> > the "traditional" OOM killer is that it can treat memory cgroups
> > as memory consumers as well as single processes.
> > 
> > By default, it will look for the biggest leaf cgroup, and kill
> > the largest task inside.
> 
> Why? I believe that the semantic should be as simple as kill the largest
> oom killable entity. And the entity is either a process or a memcg which
> is marked that way.

So, you still need to compare memcgroups and processes.

In my case, it's more like an exception (only processes from root memcg,
and only if there are no eligible cgroups with lower oom_priority).
You suggest relying on this comparison.

> Why should we mix things and select a memcg to kill
> a process inside it? More on that below.

To have some sort of "fairness" in a containerized environment.
Say, 1 cgroup with 1 big task, another cgroup with many smaller tasks.
It's not necessarily true that the first one is a better victim.

> 
> > But a user can change this behavior by enabling the per-cgroup
> > oom_kill_all_tasks option. If set, it causes the OOM killer treat
> > the whole cgroup as an indivisible memory consumer. In case if it's
> > selected as on OOM victim, all belonging tasks will be killed.
> > 
> > Tasks in the root cgroup are treated as independent memory consumers,
> > and are compared with other memory consumers (e.g. leaf cgroups).
> > The root cgroup doesn't support the oom_kill_all_tasks feature.
> 
> If anything you wouldn't have to treat the root memcg specially at all. It
> will be like any other memcg which doesn't have oom_kill_all_tasks...
>  
> [...]
> 
> > +static long memcg_oom_badness(struct mem_cgroup *memcg,
> > +			      const nodemask_t *nodemask)
> > +{
> > +	long points = 0;
> > +	int nid;
> > +	pg_data_t *pgdat;
> > +
> > +	for_each_node_state(nid, N_MEMORY) {
> > +		if (nodemask && !node_isset(nid, *nodemask))
> > +			continue;
> > +
> > +		points += mem_cgroup_node_nr_lru_pages(memcg, nid,
> > +				LRU_ALL_ANON | BIT(LRU_UNEVICTABLE));
> > +
> > +		pgdat = NODE_DATA(nid);
> > +		points += lruvec_page_state(mem_cgroup_lruvec(pgdat, memcg),
> > +					    NR_SLAB_UNRECLAIMABLE);
> > +	}
> > +
> > +	points += memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) /
> > +		(PAGE_SIZE / 1024);
> > +	points += memcg_page_state(memcg, MEMCG_SOCK);
> > +	points += memcg_page_state(memcg, MEMCG_SWAP);
> > +
> > +	return points;
> 
> I guess I have asked already and we haven't reached any consensus. I do
> not like how you treat memcgs and tasks differently. Why cannot we have
> a memcg score a sum of all its tasks?

It sounds like a more expensive way to get almost the same result with less accuracy.
Why is it better?

> How do you want to compare memcg score with tasks score?

I have to do it for tasks in root cgroups, but it shouldn't be a common case.

> This just smells like the outcome of a weird
> semantic that you try to select the largest group I have mentioned
> above.
> 
> This is a rather fundamental concern and I believe we should find a
> consensus on it before going any further. I believe that users shouldn't
> see any difference in the OOM behavior when memcg v2 is used and there
> is no kill-all memcg. If there is such a memcg then we should treat only
> those specially. But you might have really strong usecases which haven't
> been presented or I've missed their importance.

I'll answer in the reply for your comments to the next commit.

Thanks!


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [v6 3/4] mm, oom: introduce oom_priority for memory cgroups
  2017-08-24 12:10   ` Michal Hocko
@ 2017-08-24 12:51     ` Roman Gushchin
  2017-08-24 13:48       ` Michal Hocko
  0 siblings, 1 reply; 26+ messages in thread
From: Roman Gushchin @ 2017-08-24 12:51 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, Vladimir Davydov, Johannes Weiner, Tetsuo Handa,
	David Rientjes, Tejun Heo, kernel-team, cgroups, linux-doc,
	linux-kernel

On Thu, Aug 24, 2017 at 02:10:54PM +0200, Michal Hocko wrote:
> On Wed 23-08-17 17:52:00, Roman Gushchin wrote:
> > Introduce a per-memory-cgroup oom_priority setting: an integer number
> > within the [-10000, 10000] range, which defines the order in which
> > the OOM killer selects victim memory cgroups.
> 
> Why do we need a range here?

No specific reason, both [INT_MIN, INT_MAX] and [-10000, 10000] will
work equally well. We should be able to predefine an OOM killing order for
any reasonable number of cgroups.

> 
> > OOM killer prefers memory cgroups with larger priority if they are
> > populated with eligible tasks.
> 
> So this is basically orthogonal to the score based selection and the
> real size is only the tiebreaker for same priorities? Could you describe
> the usecase? Because to me this sounds like a separate oom killer
> strategy. I can imagine somebody might be interested (e.g. always kill
> the oldest memcgs...) but an explicit range wouldn't fly with such a
> usecase very well.

The usecase: you have a machine with several containerized workloads
of different importance, and some system-level stuff, also in (memory)
cgroups.
In case of global memory shortage, some workloads should be killed first,
others should be killed only if there is no other option.
Several workloads can have equal importance. Size-based tiebreaking
is very useful to catch memory leakers amongst them.

> 
> That brings me back to my original suggestion. Wouldn't a "register an
> oom strategy" approach be much better than blending things together and
> then having to wrap our heads around different combinations of tunables?

Well, I believe that 90% of this patchset is still relevant; the only
thing you might want to customize is the size-based tiebreaking, replacing
it with something else (like the timestamp-based tiebreaking mentioned by David earlier).

As for tunables, there are two, and they are completely orthogonal:
1) oom_priority defines the order in which cgroups will be OOMed
2) oom_kill_all defines whether all tasks or just one should be killed

So, I don't think it's too complex an interface.
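
Just to illustrate how the two knobs are supposed to be combined, here is a
trivial sketch (the cgroup paths and priority values are made up, the file
names are the ones proposed by this patchset): the batch job is killed first
and killed whole, the database is touched only as a last resort.

#include <stdio.h>

static void cg_set(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (f) {
                fputs(val, f);
                fclose(f);
        }
}

int main(void)
{
        /* the batch job goes first, and is killed as a whole */
        cg_set("/sys/fs/cgroup/batch/memory.oom_priority", "100");
        cg_set("/sys/fs/cgroup/batch/memory.oom_kill_all_tasks", "1");

        /* the database is killed only if nothing else is left */
        cg_set("/sys/fs/cgroup/db/memory.oom_priority", "-100");
        return 0;
}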

Again, I'm not against the oom strategy approach, it just looks like a much
bigger project, and I do not see a big need for it.

Do you have an example which can't be effectively handled by the approach
I'm suggesting?

> 
> [...]
> > @@ -2760,7 +2761,12 @@ static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc)
> >  			if (iter->oom_score == 0)
> >  				continue;
> >  
> > -			if (iter->oom_score > score) {
> > +			if (iter->oom_priority > prio) {
> > +				memcg = iter;
> > +				prio = iter->oom_priority;
> > +				score = iter->oom_score;
> > +			} else if (iter->oom_priority == prio &&
> > +				   iter->oom_score > score) {
> >  				memcg = iter;
> >  				score = iter->oom_score;
> >  			}
> 
> Just a minor thing. Why do we even have to calculate oom_score when we
> use it only as a tiebreaker?

Right now it's necessary, because at the same time we also look for
pre-existing OOM victims. But if we can have a memcg-level counter for it,
this can be optimized.

Thanks!


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [v6 2/4] mm, oom: cgroup-aware OOM killer
  2017-08-24 12:28     ` Roman Gushchin
@ 2017-08-24 12:58       ` Michal Hocko
  2017-08-24 13:58         ` Roman Gushchin
  0 siblings, 1 reply; 26+ messages in thread
From: Michal Hocko @ 2017-08-24 12:58 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: linux-mm, Vladimir Davydov, Johannes Weiner, Tetsuo Handa,
	David Rientjes, Tejun Heo, kernel-team, cgroups, linux-doc,
	linux-kernel

On Thu 24-08-17 13:28:46, Roman Gushchin wrote:
> Hi Michal!
> 
> On Thu, Aug 24, 2017 at 01:47:06PM +0200, Michal Hocko wrote:
> > This doesn't apply on top of mmotm cleanly. You are missing
> > http://lkml.kernel.org/r/20170807113839.16695-3-mhocko@kernel.org
> 
> I'll rebase. Thanks!
> 
> > 
> > On Wed 23-08-17 17:51:59, Roman Gushchin wrote:
> > > Traditionally, the OOM killer operates on a process level.
> > > Under oom conditions, it finds a process with the highest oom score
> > > and kills it.
> > > 
> > > This behavior doesn't suit systems with many running
> > > containers well:
> > > 
> > > 1) There is no fairness between containers. A small container with
> > > few large processes will be chosen over a large one with a huge
> > > number of small processes.
> > > 
> > > 2) Containers often do not expect that some random process inside
> > > will be killed. In many cases a much safer behavior is to kill
> > > all tasks in the container. Traditionally, this was implemented
> > > in userspace, but doing it in the kernel has some advantages,
> > > especially in the case of a system-wide OOM.
> > > 
> > > 3) Per-process oom_score_adj affects global OOM, so it's a breach
> > > of the isolation.
> > 
> > Please explain more. I guess you mean that an untrusted memcg could hide
> > itself from the global OOM killer by reducing the oom scores? Well you
> > need CAP_SYS_RESOURCE to reduce the current oom_score{_adj} as David has
> > already pointed out. I also agree that we absolutely must not kill an
> > oom disabled task. I am pretty sure somebody is using OOM_SCORE_ADJ_MIN
> > as a protection from an untrusted SIGKILL and inconsistent state as a
> > result. Those applications simply shouldn't behave differently in the
> > global and container contexts.
> 
> The main point of the kill_all option is to clean up the victim cgroup
> _completely_. If some tasks can survive, that means userspace should
> take care of them, look at the cgroup after oom, and kill the survivors
> manually.
> 
> If you want to rely on OOM_SCORE_ADJ_MIN, don't set kill_all.
> I really don't get the usecase for this "kill all, except this and that".

OOM_SCORE_ADJ_MIN has become a de-facto contract. You cannot simply
expect that somebody would alter a specific workload for a container
just to be safe against an unexpected SIGKILL. kill-all might be set up in
the memcg hierarchy, which is out of your control.

> Also, it's really confusing to respect -1000 value, and completely ignore -999.
> 
> I believe that any complex userspace OOM handling should use memory.high
> and handle memory shortage before an actual OOM.
> 
> > 
> > If nothing else we have to skip OOM_SCORE_ADJ_MIN tasks during the kill.
> > 
> > > To address these issues, a cgroup-aware OOM killer is introduced.
> > > 
> > > Under OOM conditions, it tries to find the biggest memory consumer,
> > > and free memory by killing the corresponding task(s). The difference
> > > from the "traditional" OOM killer is that it can treat memory cgroups
> > > as memory consumers as well as single processes.
> > > 
> > > By default, it will look for the biggest leaf cgroup, and kill
> > > the largest task inside.
> > 
> > Why? I believe that the semantic should be as simple as kill the largest
> > oom killable entity. And the entity is either a process or a memcg which
> > is marked that way.
> 
> So, you still need to compare memcgroups and processes.
> 
> In my case, it's more like an exception (only processes from root memcg,
> and only if there are no eligible cgroups with lower oom_priority).
> You suggest to rely on this comparison.
> 
> > Why should we mix things and select a memcg to kill
> > a process inside it? More on that below.
> 
> To have some sort of "fairness" in a containerized environment.
> Say, 1 cgroup with 1 big task, another cgroup with many smaller tasks.
> It's not necessarily true that the first one is a better victim.

There is nothing like a "better victim". We are pretty much in a
catastrophic situation when we try to survive by killing a userspace.
We try to kill the largest because that assumes that we return the
most memory from it. Now I do understand that you want to treat the
memcg as a single killable entity, but I find it really questionable
to use a per-memcg metric and then not treat it like that and kill
only a single task. Just imagine a single memcg with zillions of tasks,
each very small, and you select it as the largest while killing a single
small task doesn't help to get us out of the OOM.
 
> > > But a user can change this behavior by enabling the per-cgroup
> > > oom_kill_all_tasks option. If set, it causes the OOM killer to treat
> > > the whole cgroup as an indivisible memory consumer. If it's
> > > selected as an OOM victim, all tasks belonging to it will be killed.
> > > 
> > > Tasks in the root cgroup are treated as independent memory consumers,
> > > and are compared with other memory consumers (e.g. leaf cgroups).
> > > The root cgroup doesn't support the oom_kill_all_tasks feature.
> > 
> > If anything you wouldn't have to treat the root memcg any special. It
> > will be like any other memcg which doesn't have oom_kill_all_tasks...
> >  
> > [...]
> > 
> > > +static long memcg_oom_badness(struct mem_cgroup *memcg,
> > > +			      const nodemask_t *nodemask)
> > > +{
> > > +	long points = 0;
> > > +	int nid;
> > > +	pg_data_t *pgdat;
> > > +
> > > +	for_each_node_state(nid, N_MEMORY) {
> > > +		if (nodemask && !node_isset(nid, *nodemask))
> > > +			continue;
> > > +
> > > +		points += mem_cgroup_node_nr_lru_pages(memcg, nid,
> > > +				LRU_ALL_ANON | BIT(LRU_UNEVICTABLE));
> > > +
> > > +		pgdat = NODE_DATA(nid);
> > > +		points += lruvec_page_state(mem_cgroup_lruvec(pgdat, memcg),
> > > +					    NR_SLAB_UNRECLAIMABLE);
> > > +	}
> > > +
> > > +	points += memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) /
> > > +		(PAGE_SIZE / 1024);
> > > +	points += memcg_page_state(memcg, MEMCG_SOCK);
> > > +	points += memcg_page_state(memcg, MEMCG_SWAP);
> > > +
> > > +	return points;
> > 
> > I guess I have asked already and we haven't reached any consensus. I do
> > not like how you treat memcgs and tasks differently. Why cannot we have
> > a memcg score a sum of all its tasks?
> 
> > It sounds like a more expensive way to get almost the same result with less accuracy.
> > Why is it better?

because then you are comparing apples to apples? Besides that you have
to check each task for over-killing anyway. So I do not see any
performance merits here.

> > How do you want to compare memcg score with tasks score?
> 
> I have to do it for tasks in root cgroups, but it shouldn't be a common case.

How come? I can easily imagine a setup where only some memcgs
really do need a kill-all semantic while all others can live with a single
task killed perfectly fine.
-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [v6 3/4] mm, oom: introduce oom_priority for memory cgroups
  2017-08-24 12:51     ` Roman Gushchin
@ 2017-08-24 13:48       ` Michal Hocko
  2017-08-24 14:11         ` Roman Gushchin
  0 siblings, 1 reply; 26+ messages in thread
From: Michal Hocko @ 2017-08-24 13:48 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: linux-mm, Vladimir Davydov, Johannes Weiner, Tetsuo Handa,
	David Rientjes, Tejun Heo, kernel-team, cgroups, linux-doc,
	linux-kernel

On Thu 24-08-17 13:51:13, Roman Gushchin wrote:
> On Thu, Aug 24, 2017 at 02:10:54PM +0200, Michal Hocko wrote:
> > On Wed 23-08-17 17:52:00, Roman Gushchin wrote:
> > > Introduce a per-memory-cgroup oom_priority setting: an integer number
> > > within the [-10000, 10000] range, which defines the order in which
> > > the OOM killer selects victim memory cgroups.
> > 
> > Why do we need a range here?
> 
> No specific reason, both [INT_MIN, INT_MAX] and [-10000, 10000] will
> work equally.

Then do not enforce a range because this just reduces possible usecases
(e.g. timestamp one...).

> We should be able to predefine an OOM killing order for
> any reasonable amount of cgroups.
> 
> > 
> > > OOM killer prefers memory cgroups with larger priority if they are
> > > populated with eligible tasks.
> > 
> > So this is basically orthogonal to the score based selection and the
> > real size is only the tiebreaker for same priorities? Could you describe
> > the usecase? Because to me this sounds like a separate oom killer
> > strategy. I can imagine somebody might be interested (e.g. always kill
> > the oldest memcgs...) but an explicit range wouldn't fly with such a
> > usecase very well.
> 
> The usecase: you have a machine with several containerized workloads
> of different importance, and some system-level stuff, also in (memory)
> cgroups.
> In case of global memory shortage, some workloads should be killed in
> a first order, others should be killed only if there is no other option.
> Several workloads can have equal importance. Size-based tiebreaking
> is very useful to catch memory leakers amongst them.

OK, please document that in the changelog.

> > That brings me back to my original suggestion. Wouldn't a "register an
> > oom strategy" approach much better than blending things together and
> > then have to wrap heads around different combinations of tunables?
> 
> Well, I believe that 90% of this patchset is still relevant;

agreed and didn't say otherwise.

> the only
> thing you might want to customize/replace size-based tiebreaking with
> something else (like timestamp-based tiebreaking, mentioned by David earlier).

> What about tunables, there are two, and they are completely orthogonal:
> 1) oom_priority allows to define an order, in which cgroups will be OOMed
> 2) oom_kill_all defines if all or just one task should be killed
> 
> So, I don't think it's a too complex interface.
> 
> Again, I'm not against oom strategy approach, it just looks as a much bigger
> project, and I do not see a big need.

Well, I was thinking that our current oom victim selection code is
quite extensible already. Your patches will teach it the kill-the-whole-group
semantic, which is already very useful. Now we can talk about the
selection criteria, and this is something that should be replaceable. Because
even the current discussion has shown that different people might and will
have different requirements. Can we structure the code in such a way
that a new comparison algorithm would be simple to add without reworking
the whole selection logic?
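
Something along these lines is what I have in mind (just a sketch, the ops
struct and function names are invented; oom_score/oom_priority are the fields
from your patches): keep the iteration common and hide only the "is this
memcg a better victim than the currently chosen one" decision behind a hook.

/* sketch: the ops struct is invented, oom_score/oom_priority come from the patchset */
struct oom_victim_strategy {
        /* return true if @iter should replace @chosen as the victim */
        bool (*prefer)(struct mem_cgroup *iter, struct mem_cgroup *chosen);
};

static bool prefer_larger(struct mem_cgroup *iter, struct mem_cgroup *chosen)
{
        return !chosen || iter->oom_score > chosen->oom_score;
}

static bool prefer_higher_priority(struct mem_cgroup *iter,
                                   struct mem_cgroup *chosen)
{
        if (!chosen || iter->oom_priority > chosen->oom_priority)
                return true;
        return iter->oom_priority == chosen->oom_priority &&
               iter->oom_score > chosen->oom_score;
}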

> Do you have an example, which can't be effectively handled by an approach
> I'm suggesting?

No, I do not have any which would be _explicitly_ requested but I do
envision new requirements will emerge. The most probable one would be
to kill the youngest container, because that would imply the least amount of
work wasted.
-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [v6 2/4] mm, oom: cgroup-aware OOM killer
  2017-08-24 12:58       ` Michal Hocko
@ 2017-08-24 13:58         ` Roman Gushchin
  2017-08-24 14:13           ` Michal Hocko
  0 siblings, 1 reply; 26+ messages in thread
From: Roman Gushchin @ 2017-08-24 13:58 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, Vladimir Davydov, Johannes Weiner, Tetsuo Handa,
	David Rientjes, Tejun Heo, kernel-team, cgroups, linux-doc,
	linux-kernel

On Thu, Aug 24, 2017 at 02:58:11PM +0200, Michal Hocko wrote:
> On Thu 24-08-17 13:28:46, Roman Gushchin wrote:
> > Hi Michal!
> > 
> There is nothing like a "better victim". We are pretty much in a
> catastrophic situation when we try to survive by killing a userspace.

Not necessarily, it can be a cgroup OOM.

> We try to kill the largest because that assumes that we return the
> most memory from it. Now I do understand that you want to treat the
> memcg as a single killable entity but I find it really questionable
> to do a per-memcg metric and then do not treat it like that and kill
> only a single task. Just imagine a single memcg with zillions of taks
> each very small and you select it as the largest while a small taks
> itself doesn't help to help to get us out of the OOM.

I don't think it's different from the non-containerized case: if you
have a zillion small tasks in the system, you'll meet the same issues.

> > > I guess I have asked already and we haven't reached any consensus. I do
> > > not like how you treat memcgs and tasks differently. Why cannot we have
> > > a memcg score a sum of all its tasks?
> > 
> > It sounds like a more expensive way to get almost the same with less accuracy.
> > Why is it better?
> 
> because then you are comparing apples to apples?

Well, I can say that I compare some number of pages against some other number
of pages. And the relation between a page and a memcg is more obvious than the
relation between a page and a process.

Both ways are not ideal, and the sum over processes is not ideal either,
especially if you take oom_score_adj into account. Will you respect it?

I've started actually with such approach, but then found it weird.

> Besides that you have
> to check each task for over-killing anyway. So I do not see any
> performance merits here.

It's an implementation detail, and we can hopefully get rid of it at some point.

> 
> > > How do you want to compare memcg score with tasks score?
> > 
> > I have to do it for tasks in root cgroups, but it shouldn't be a common case.
> 
> How come? I can easily imagine a setup where only some memcgs which
> really do need a kill-all semantic while all others can live with single
> task killed perfectly fine.

I mean, taking the unified cgroup hierarchy into account, there should not
be a lot of tasks in the root cgroup, if any.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [v6 3/4] mm, oom: introduce oom_priority for memory cgroups
  2017-08-24 13:48       ` Michal Hocko
@ 2017-08-24 14:11         ` Roman Gushchin
  2017-08-28 20:54           ` David Rientjes
  0 siblings, 1 reply; 26+ messages in thread
From: Roman Gushchin @ 2017-08-24 14:11 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, Vladimir Davydov, Johannes Weiner, Tetsuo Handa,
	David Rientjes, Tejun Heo, kernel-team, cgroups, linux-doc,
	linux-kernel

On Thu, Aug 24, 2017 at 03:48:59PM +0200, Michal Hocko wrote:
> On Thu 24-08-17 13:51:13, Roman Gushchin wrote:
> > On Thu, Aug 24, 2017 at 02:10:54PM +0200, Michal Hocko wrote:
> > > On Wed 23-08-17 17:52:00, Roman Gushchin wrote:
> > > > Introduce a per-memory-cgroup oom_priority setting: an integer number
> > > > within the [-10000, 10000] range, which defines the order in which
> > > > the OOM killer selects victim memory cgroups.
> > > 
> > > Why do we need a range here?
> > 
> > No specific reason, both [INT_MIN, INT_MAX] and [-10000, 10000] will
> > work equally.
> 
> Then do not enforce a range because this just reduces possible usecases
> (e.g. timestamp one...).

I agree.

> 
> > We should be able to predefine an OOM killing order for
> > any reasonable amount of cgroups.
> > 
> > > 
> > > > OOM killer prefers memory cgroups with larger priority if they are
> > > > populated with eligible tasks.
> > > 
> > > So this is basically orthogonal to the score based selection and the
> > > real size is only the tiebreaker for same priorities? Could you describe
> > > the usecase? Because to me this sounds like a separate oom killer
> > > strategy. I can imagine somebody might be interested (e.g. always kill
> > > the oldest memcgs...) but an explicit range wouldn't fly with such a
> > > usecase very well.
> > 
> > The usecase: you have a machine with several containerized workloads
> > of different importance, and some system-level stuff, also in (memory)
> > cgroups.
> > In case of global memory shortage, some workloads should be killed in
> > a first order, others should be killed only if there is no other option.
> > Several workloads can have equal importance. Size-based tiebreaking
> > is very useful to catch memory leakers amongst them.
> 
> OK, please document that in the changelog.

Sure.

> 
> > > That brings me back to my original suggestion. Wouldn't a "register an
> > > oom strategy" approach much better than blending things together and
> > > then have to wrap heads around different combinations of tunables?
> > 
> > Well, I believe that 90% of this patchset is still relevant;
> 
> agreed and didn't say otherwise.
> 
> > the only
> > thing you might want to customize/replace size-based tiebreaking with
> > something else (like timestamp-based tiebreaking, mentioned by David earlier).
> 
> > What about tunables, there are two, and they are completely orthogonal:
> > 1) oom_priority allows to define an order, in which cgroups will be OOMed
> > 2) oom_kill_all defines if all or just one task should be killed
> > 
> > So, I don't think it's a too complex interface.
> > 
> > Again, I'm not against oom strategy approach, it just looks as a much bigger
> > project, and I do not see a big need.
> 
> Well, I was thinking that our current oom victim selection code is
> quite extensible already. Your patches will teach it kill the whole
> group semantic which is already very useful. Now we can talk about the
> selection criteria and this is something to be replaceable. Because even
> the current discussion has shown that different people might and will
> have different requirements. Can we structure the code in such a way
> that new comparison algorithm would be simple to add without reworking
> the whole selection logic?

I'd say that an extended oom_priority range and a potentially customizable
memcg_oom_badness() should do the job for memcgs.

We can extract a part of the oom_badness() into something like this:

unsigned long task_oom_badness(struct task_struct *p)
{
	/*
	 * The baseline for the badness score is the proportion of RAM that each
	 * task's rss, pagetable and swap space use.
	 */
	return get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
		atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm);
}


Also, it would be nice to introduce an oom_priority for tasks, as David
suggested.

> 
> > Do you have an example, which can't be effectively handled by an approach
> > I'm suggesting?
> 
> No, I do not have any which would be _explicitly_ requested but I do
> envision new requirements will emerge. The most probable one would be
> kill the youngest container because that would imply the least amount of
> work wasted.

I agree, this is a nice feature. It can be implemented in userspace
by setting oom_priority.

Thanks!

Roman


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [v6 2/4] mm, oom: cgroup-aware OOM killer
  2017-08-24 13:58         ` Roman Gushchin
@ 2017-08-24 14:13           ` Michal Hocko
  2017-08-24 14:58             ` Roman Gushchin
  0 siblings, 1 reply; 26+ messages in thread
From: Michal Hocko @ 2017-08-24 14:13 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: linux-mm, Vladimir Davydov, Johannes Weiner, Tetsuo Handa,
	David Rientjes, Tejun Heo, kernel-team, cgroups, linux-doc,
	linux-kernel

On Thu 24-08-17 14:58:42, Roman Gushchin wrote:
> On Thu, Aug 24, 2017 at 02:58:11PM +0200, Michal Hocko wrote:
> > On Thu 24-08-17 13:28:46, Roman Gushchin wrote:
> > > Hi Michal!
> > > 
> > There is nothing like a "better victim". We are pretty much in a
> > catastrophic situation when we try to survive by killing a userspace.
> 
> > Not necessarily, it can be a cgroup OOM.

memcg OOM is no different. The catastrophe is scoped to the specific
hierarchy, but tasks in that hierarchy still fail to make further
progress.

> > We try to kill the largest because that assumes that we return the
> > most memory from it. Now I do understand that you want to treat the
> > memcg as a single killable entity but I find it really questionable
> > to do a per-memcg metric and then do not treat it like that and kill
> > only a single task. Just imagine a single memcg with zillions of tasks,
> > each very small, and you select it as the largest while killing a single
> > small task doesn't help to get us out of the OOM.
> 
> I don't think it's different from a non-containerized state: if you
> have a zillion of small tasks in the system, you'll meet the same issues.

Yes this is possible but usually you are comparing apples to apples so
you will kill the largest offender and then go on. To be honest I really
do hate how we try to kill a child rather than the selected victim
for the same reason.

> > > > I guess I have asked already and we haven't reached any consensus. I do
> > > > not like how you treat memcgs and tasks differently. Why cannot we have
> > > > a memcg score a sum of all its tasks?
> > > 
> > > It sounds like a more expensive way to get almost the same with less accuracy.
> > > Why is it better?
> > 
> > because then you are comparing apples to apples?
> 
> Well, I can say that I compare some number of pages against some other number
> of pages. And the relation between a page and memcg is more obvious, than a
> relation between a page and a process.

But you are comparing different accounting systems.
 
> Both ways are not ideal, and sum of the processes is not ideal too.
> Especially, if you take oom_score_adj into account. Will you respect it?

Yes, and I do not see any reason why we shouldn't.

> I've started actually with such approach, but then found it weird.
> 
> > Besides that you have
> > to check each task for over-killing anyway. So I do not see any
> > performance merits here.
> 
> It's an implementation detail, and we can hopefully get rid of it at some point.

Well, we might do some estimations and ignore oom scopes, but that
sounds really complicated and error prone. Until we have anything like
that, I would start from tasks and build up what's necessary to make a
decision at the higher level.
 
> > > > How do you want to compare memcg score with tasks score?
> > > 
> > > I have to do it for tasks in root cgroups, but it shouldn't be a common case.
> > 
> > How come? I can easily imagine a setup where only some memcgs which
> > really do need a kill-all semantic while all others can live with single
> > task killed perfectly fine.
> 
> I mean taking a unified cgroup hierarchy into an account, there should not
> be lot of tasks in the root cgroup, if any.

Is that really the case? I would assume that the memory controller would be
enabled only in those subtrees which really use the functionality and
the rest will be sitting in the root memcg. It might be the case if you
are running only containers but I am not really sure this is true in
general.
-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [v6 2/4] mm, oom: cgroup-aware OOM killer
  2017-08-24 14:13           ` Michal Hocko
@ 2017-08-24 14:58             ` Roman Gushchin
  2017-08-25  8:14               ` Michal Hocko
  0 siblings, 1 reply; 26+ messages in thread
From: Roman Gushchin @ 2017-08-24 14:58 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, Vladimir Davydov, Johannes Weiner, Tetsuo Handa,
	David Rientjes, Tejun Heo, kernel-team, cgroups, linux-doc,
	linux-kernel

On Thu, Aug 24, 2017 at 04:13:37PM +0200, Michal Hocko wrote:
> On Thu 24-08-17 14:58:42, Roman Gushchin wrote:
> > On Thu, Aug 24, 2017 at 02:58:11PM +0200, Michal Hocko wrote:
> > > On Thu 24-08-17 13:28:46, Roman Gushchin wrote:
> > > > Hi Michal!
> > > > 
> > > There is nothing like a "better victim". We are pretty much in a
> > > catastrophic situation when we try to survive by killing a userspace.
> > 
> > Not necessarily, it can be a cgroup OOM.
> 
> memcg OOM is no different. The catastrophe is scoped to the specific
> hierarchy, but tasks in that hierarchy still fail to make further
> progress.
> 
> > > We try to kill the largest because that assumes that we return the
> > > most memory from it. Now I do understand that you want to treat the
> > > memcg as a single killable entity but I find it really questionable
> > > to do a per-memcg metric and then do not treat it like that and kill
> > > only a single task. Just imagine a single memcg with zillions of tasks,
> > > each very small, and you select it as the largest while killing a single
> > > small task doesn't help to get us out of the OOM.
> > 
> > I don't think it's different from a non-containerized state: if you
> > have a zillion of small tasks in the system, you'll meet the same issues.
> 
> Yes this is possible but usually you are comparing apples to apples so
> you will kill the largest offender and then go on. To be honest I really
> do hate how we try to kill a child rather than the selected victim
> for the same reason.

I do hate it too.

> 
> > > > > I guess I have asked already and we haven't reached any consensus. I do
> > > > > not like how you treat memcgs and tasks differently. Why cannot we have
> > > > > a memcg score a sum of all its tasks?
> > > > 
> > > > It sounds like a more expensive way to get almost the same with less accuracy.
> > > > Why is it better?
> > > 
> > > because then you are comparing apples to apples?
> > 
> > Well, I can say that I compare some number of pages against some other number
> > of pages. And the relation between a page and memcg is more obvious, than a
> > relation between a page and a process.
> 
> But you are comparing different accounting systems.
>  
> > Both ways are not ideal, and sum of the processes is not ideal too.
> > Especially, if you take oom_score_adj into account. Will you respect it?
> 
> Yes, and I do not see any reason why we shouldn't.

It makes things even more complicated.
Right now a task's oom_score can be in the (~ -total_memory, ~ +2*total_memory) range,
and if you start summing it, it can be multiplied by the number of tasks...
Weird.
It will also be different in the system-wide and memcg-wide OOM cases.

> 
> > I've started actually with such approach, but then found it weird.
> > 
> > > Besides that you have
> > > to check each task for over-killing anyway. So I do not see any
> > > performance merits here.
> > 
> > It's an implementation detail, and we can hopefully get rid of it at some point.
> 
> Well, we might do some estimations and ignore oom scopes but I that
> sounds really complicated and error prone. Unless we have anything like
> that then I would start from tasks and build up the necessary to make a
> decision at the higher level.

Seriously speaking, do you have an example where summing per-process
oom_score will work better?

Especially if we're talking about customizing the oom_score calculation,
it makes no sense to me. How will you sum process timestamps?

>  
> > > > > How do you want to compare memcg score with tasks score?
> > > > 
> > > > I have to do it for tasks in root cgroups, but it shouldn't be a common case.
> > > 
> > > How come? I can easily imagine a setup where only some memcgs which
> > > really do need a kill-all semantic while all others can live with single
> > > task killed perfectly fine.
> > 
> > I mean taking a unified cgroup hierarchy into an account, there should not
> > be lot of tasks in the root cgroup, if any.
> 
> Is that really the case? I would assume that memory controller would be
> enabled only in those subtrees which really use the functionality and
> the rest will be sitting in the root memcg. It might be the case if you
> are running only containers but I am not really sure this is true in
> general.

Agree.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [v6 2/4] mm, oom: cgroup-aware OOM killer
  2017-08-24 14:58             ` Roman Gushchin
@ 2017-08-25  8:14               ` Michal Hocko
  2017-08-25 10:39                 ` Roman Gushchin
  2017-08-30 11:22                 ` Roman Gushchin
  0 siblings, 2 replies; 26+ messages in thread
From: Michal Hocko @ 2017-08-25  8:14 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: linux-mm, Vladimir Davydov, Johannes Weiner, Tetsuo Handa,
	David Rientjes, Tejun Heo, kernel-team, cgroups, linux-doc,
	linux-kernel

On Thu 24-08-17 15:58:01, Roman Gushchin wrote:
> On Thu, Aug 24, 2017 at 04:13:37PM +0200, Michal Hocko wrote:
> > On Thu 24-08-17 14:58:42, Roman Gushchin wrote:
[...]
> > > Both ways are not ideal, and sum of the processes is not ideal too.
> > > Especially, if you take oom_score_adj into account. Will you respect it?
> > 
> > Yes, and I do not see any reason why we shouldn't.
> 
> It makes things even more complicated.
> Right now task's oom_score can be in (~ -total_memory, ~ +2*total_memory) range,
> > and if you start summing it, it can be multiplied by the number of tasks...
> Weird.

oom_score_adj is just a normalized bias, so if tasks inside the oom scope use
it, the whole memcg will get the accumulated bias from all such tasks, so it
is not completely off. I agree that the more tasks use the bias the more
biased the whole memcg will be. This might or might not be a problem.
As you are trying to reimplement the existing oom killer implementation,
I do not think we can simply ignore an API which people are used to.

If this was a configurable oom policy then I could see how ignoring
oom_score_adj is acceptable because it would be an explicit opt-in.

> It also will be different in case of system and memcg-wide OOM.

Why, we do honor oom_score_adj for the memcg OOM now and in fact the
kernel memcg OOM killer shouldn't be very much different from the global
one except for the tasks scope.

> > > I've started actually with such approach, but then found it weird.
> > > 
> > > > Besides that you have
> > > > to check each task for over-killing anyway. So I do not see any
> > > > performance merits here.
> > > 
> > > It's an implementation detail, and we can hopefully get rid of it at some point.
> > 
> > Well, we might do some estimations and ignore oom scopes, but that
> > sounds really complicated and error prone. Unless we have anything like
> > that then I would start from tasks and build up the necessary to make a
> > decision at the higher level.
> 
> Seriously speaking, do you have an example, when summing per-process
> oom_score will work better?

The primary reason I am pushing for this is to have the common iterator
code path (which we have since Vladimir has unified memcg and global oom
paths) and only parametrize the value calculation and victim selection.

> Especially, if we're talking about customizing oom_score calculation,
> > it makes no sense to me. How will you sum process timestamps?

Well, I meant you could sum oom_badness for your particular
implementation. If we need some other policy then this wouldn't work and
that's why I've said that I would like to preserve the current common
code and only parametrize value calculation and victim selection...
-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [v6 2/4] mm, oom: cgroup-aware OOM killer
  2017-08-25  8:14               ` Michal Hocko
@ 2017-08-25 10:39                 ` Roman Gushchin
  2017-08-25 10:58                   ` Michal Hocko
  2017-08-30 11:22                 ` Roman Gushchin
  1 sibling, 1 reply; 26+ messages in thread
From: Roman Gushchin @ 2017-08-25 10:39 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, Vladimir Davydov, Johannes Weiner, Tetsuo Handa,
	David Rientjes, Tejun Heo, kernel-team, cgroups, linux-doc,
	linux-kernel

On Fri, Aug 25, 2017 at 10:14:03AM +0200, Michal Hocko wrote:
> On Thu 24-08-17 15:58:01, Roman Gushchin wrote:
> > On Thu, Aug 24, 2017 at 04:13:37PM +0200, Michal Hocko wrote:
> > > On Thu 24-08-17 14:58:42, Roman Gushchin wrote:
> [...]
> > > > Both ways are not ideal, and sum of the processes is not ideal too.
> > > > Especially, if you take oom_score_adj into account. Will you respect it?
> > > 
> > > Yes, and I do not see any reason why we shouldn't.
> > 
> > It makes things even more complicated.
> > Right now task's oom_score can be in (~ -total_memory, ~ +2*total_memory) range,
> > and if you start summing it, it can be multiplied by the number of tasks...
> > Weird.
> 
> oom_score_adj is just a normalized bias so if tasks inside oom will use
> it the whole memcg will get accumulated bias from all such tasks so it
> is not completely off. I agree that the more tasks use the bias the more
> biased the whole memcg will be. This might or might not be a problem.
> As you are trying to reimplement the existing oom killer implementation
> I do not think we can simply ignore an API which people are used to.
> 
> If this was a configurable oom policy then I could see how ignoring
> oom_score_adj is acceptable because it would be an explicit opt-in.
>
> > It also will be different in case of system and memcg-wide OOM.
> 
> Why, we do honor oom_score_adj for the memcg OOM now and in fact the
> kernel memcg OOM killer shouldn't be very much different from the global
> one except for the tasks scope.

Assume you have two tasks (2Gb and 1Gb) in a cgroup with a 3Gb limit.
The second task has oom_score_adj +100. Total memory is 64Gb, for example.

In case of a memcg-wide OOM the first task will be selected;
in case of a system-wide OOM - the second.
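
A back-of-the-envelope illustration (a user-space toy, not kernel code; it
only mimics the oom_badness() normalization, roughly
points += oom_score_adj * totalpages / 1000):

#include <stdio.h>

#define GB 262144L      /* 4k pages per 1Gb */

static long badness(long rss, long adj, long totalpages)
{
        return rss + adj * totalpages / 1000;
}

int main(void)
{
        /* memcg-wide OOM: the bias is normalized against the 3Gb limit */
        printf("memcg:  A=%ld B=%ld\n",
               badness(2 * GB, 0, 3 * GB), badness(1 * GB, 100, 3 * GB));

        /* system-wide OOM: the bias is normalized against 64Gb of RAM */
        printf("global: A=%ld B=%ld\n",
               badness(2 * GB, 0, 64 * GB), badness(1 * GB, 100, 64 * GB));
        return 0;
}

The +100 bias flips the choice only when the whole system's memory is used
as the base of the normalization.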

Personally I don't like this, but it looks like we have to respect
oom_score_adj set to -1000, I'll alter my patch.

> 
> > > > I've started actually with such approach, but then found it weird.
> > > > 
> > > > > Besides that you have
> > > > > to check each task for over-killing anyway. So I do not see any
> > > > > performance merits here.
> > > > 
> > > > It's an implementation detail, and we can hopefully get rid of it at some point.
> > > 
> > > Well, we might do some estimations and ignore oom scopes, but that
> > > sounds really complicated and error prone. Unless we have anything like
> > > that then I would start from tasks and build up the necessary to make a
> > > decision at the higher level.
> > 
> > Seriously speaking, do you have an example, when summing per-process
> > oom_score will work better?
> 
> The primary reason I am pushing for this is to have the common iterator
> code path (which we have since Vladimir has unified memcg and global oom
> paths) and only parametrize the value calculation and victim selection.

I agree, but I'm not sure that we can (or have to) totally unify the way
oom_score is calculated for processes and cgroups.

But I'd like to see a unified oom_priority approach. This will allow
defining an OOM killing order in a clear way, and using size-based tiebreaking
for items of the same priority. Root-cgroup processes will be compared with
other memory consumers by oom_priority first and by oom_score afterwards.

What do you think about it?

Thanks!


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [v6 2/4] mm, oom: cgroup-aware OOM killer
  2017-08-23 23:19   ` David Rientjes
@ 2017-08-25 10:57     ` Roman Gushchin
  0 siblings, 0 replies; 26+ messages in thread
From: Roman Gushchin @ 2017-08-25 10:57 UTC (permalink / raw)
  To: David Rientjes
  Cc: linux-mm, Michal Hocko, Vladimir Davydov, Johannes Weiner,
	Tetsuo Handa, Tejun Heo, kernel-team, cgroups, linux-doc,
	linux-kernel

Hi David!

On Wed, Aug 23, 2017 at 04:19:11PM -0700, David Rientjes wrote:
> On Wed, 23 Aug 2017, Roman Gushchin wrote:
> 
> > Traditionally, the OOM killer operates on a process level.
> > Under oom conditions, it finds a process with the highest oom score
> > and kills it.
> > 
> > This behavior doesn't suit systems with many running
> > containers well:
> > 
> > 1) There is no fairness between containers. A small container with
> > few large processes will be chosen over a large one with a huge
> > number of small processes.
> > 
> > 2) Containers often do not expect that some random process inside
> > will be killed. In many cases a much safer behavior is to kill
> > all tasks in the container. Traditionally, this was implemented
> > in userspace, but doing it in the kernel has some advantages,
> > especially in the case of a system-wide OOM.
> > 
> > 3) Per-process oom_score_adj affects global OOM, so it's a breach
> > of the isolation.
> > 
> > To address these issues, a cgroup-aware OOM killer is introduced.
> > 
> > Under OOM conditions, it tries to find the biggest memory consumer,
> > and free memory by killing the corresponding task(s). The difference
> > from the "traditional" OOM killer is that it can treat memory cgroups
> > as memory consumers as well as single processes.
> > 
> > By default, it will look for the biggest leaf cgroup, and kill
> > the largest task inside.
> > 
> > But a user can change this behavior by enabling the per-cgroup
> > oom_kill_all_tasks option. If set, it causes the OOM killer to treat
> > the whole cgroup as an indivisible memory consumer. If it's
> > selected as an OOM victim, all tasks belonging to it will be killed.
> > 
> 
> I'm very happy with the rest of the patchset, but I feel that I must renew 
> my objection to memory.oom_kill_all_tasks being able to override the 
> admin's setting of a process to be oom disabled.  From my
> perspective, setting memory.oom_kill_all_tasks with an oom disabled 
> process attached that now becomes killable either (1) overrides the 
> CAP_SYS_RESOURCE oom disabled setting or (2) is lazy and doesn't modify 
> /proc/pid/oom_score_adj itself.

Changed this in v7 (to be posted soon).

Thanks!

Roman


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [v6 2/4] mm, oom: cgroup-aware OOM killer
  2017-08-25 10:39                 ` Roman Gushchin
@ 2017-08-25 10:58                   ` Michal Hocko
  0 siblings, 0 replies; 26+ messages in thread
From: Michal Hocko @ 2017-08-25 10:58 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: linux-mm, Vladimir Davydov, Johannes Weiner, Tetsuo Handa,
	David Rientjes, Tejun Heo, kernel-team, cgroups, linux-doc,
	linux-kernel

On Fri 25-08-17 11:39:51, Roman Gushchin wrote:
> On Fri, Aug 25, 2017 at 10:14:03AM +0200, Michal Hocko wrote:
> > On Thu 24-08-17 15:58:01, Roman Gushchin wrote:
> > > On Thu, Aug 24, 2017 at 04:13:37PM +0200, Michal Hocko wrote:
> > > > On Thu 24-08-17 14:58:42, Roman Gushchin wrote:
> > [...]
> > > > > Both ways are not ideal, and sum of the processes is not ideal too.
> > > > > Especially, if you take oom_score_adj into account. Will you respect it?
> > > > 
> > > > Yes, and I do not see any reason why we shouldn't.
> > > 
> > > It makes things even more complicated.
> > > Right now task's oom_score can be in (~ -total_memory, ~ +2*total_memory) range,
> > > and if you start summing it, it can be multiplied by the number of tasks...
> > > Weird.
> > 
> > oom_score_adj is just a normalized bias so if tasks inside oom will use
> > it the whole memcg will get accumulated bias from all such tasks so it
> > is not completely off. I agree that the more tasks use the bias the more
> > biased the whole memcg will be. This might or might not be a problem.
> > As you are trying to reimplement the existing oom killer implementation
> > I do not think we can simply ignore an API which people are used to.
> > 
> > If this was a configurable oom policy then I could see how ignoring
> > oom_score_adj is acceptable because it would be an explicit opt-in.
> >
> > > It also will be different in case of system and memcg-wide OOM.
> > 
> > Why, we do honor oom_score_adj for the memcg OOM now and in fact the
> > kernel memcg OOM killer shouldn't be very much different from the global
> > one except for the tasks scope.
> 
> Assume, you have two tasks (2Gb and 1Gb) in a cgroup with limit 3Gb.
> The second task has oom_score_adj +100. Total memory is 64Gb, for example.
> 
> In case of a memcg-wide OOM the first task will be selected;
> in case of system-wide OOM - the second.
> 
> Personally I don't like this, but it looks like we have to respect
> oom_score_adj set to -1000, I'll alter my patch.

I cannot say I would love how oom_score_adj works but it's been like
that for a long time and people do rely on that. So we cannot simply
change it under people's feet.
 
> > > > > I've started actually with such approach, but then found it weird.
> > > > > 
> > > > > > Besides that you have
> > > > > > to check each task for over-killing anyway. So I do not see any
> > > > > > performance merits here.
> > > > > 
> > > > > It's an implementation detail, and we can hopefully get rid of it at some point.
> > > > 
> > > > Well, we might do some estimations and ignore oom scopes, but that
> > > > sounds really complicated and error prone. Unless we have anything like
> > > > that then I would start from tasks and build up the necessary to make a
> > > > decision at the higher level.
> > > 
> > > Seriously speaking, do you have an example, when summing per-process
> > > oom_score will work better?
> > 
> > The primary reason I am pushing for this is to have the common iterator
> > code path (which we have since Vladimir has unified memcg and global oom
> > paths) and only parametrize the value calculation and victim selection.
> 
> I agree, but I'm not sure that we can (and have to) totally unify the way,
> how oom_score is calculated for processes and cgroups.
> 
> But I'd like to see an unified oom_priority approach. This will allow
> to define an OOM killing order in a clear way, and use size-based tiebreaking
> for items of the same priority. Root-cgroup processes will be compared with
> other memory consumers by oom_priority first and oom_score afterwards.

This again changes the existing semantics, so I really think we should be
careful and this all should be opt-in.
-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [v6 3/4] mm, oom: introduce oom_priority for memory cgroups
  2017-08-24 14:11         ` Roman Gushchin
@ 2017-08-28 20:54           ` David Rientjes
  0 siblings, 0 replies; 26+ messages in thread
From: David Rientjes @ 2017-08-28 20:54 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: Michal Hocko, linux-mm, Vladimir Davydov, Johannes Weiner,
	Tetsuo Handa, Tejun Heo, kernel-team, cgroups, linux-doc,
	linux-kernel

On Thu, 24 Aug 2017, Roman Gushchin wrote:

> > > Do you have an example, which can't be effectively handled by an approach
> > > I'm suggesting?
> > 
> > No, I do not have any which would be _explicitly_ requested but I do
> > envision new requirements will emerge. The most probable one would be
> > kill the youngest container because that would imply the least amount of
> > work wasted.
> 
> I agree, this a nice feature. It can be implemented in userspace
> by setting oom_priority.
> 

Yes, the "kill the newest memory cgroup as a tiebreak" is not strictly 
required in the kernel and no cgroup should depend on this implementation 
detail to avoid being killed if it shares the same memory.oom_priority as 
another cgroup.  As you mention, it can be effectively implemented by 
userspace itself.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [v6 2/4] mm, oom: cgroup-aware OOM killer
  2017-08-25  8:14               ` Michal Hocko
  2017-08-25 10:39                 ` Roman Gushchin
@ 2017-08-30 11:22                 ` Roman Gushchin
  2017-08-30 20:56                   ` David Rientjes
  1 sibling, 1 reply; 26+ messages in thread
From: Roman Gushchin @ 2017-08-30 11:22 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, Vladimir Davydov, Johannes Weiner, Tetsuo Handa,
	David Rientjes, Tejun Heo, kernel-team, cgroups, linux-doc,
	linux-kernel

On Fri, Aug 25, 2017 at 10:14:03AM +0200, Michal Hocko wrote:
> On Thu 24-08-17 15:58:01, Roman Gushchin wrote:
> > On Thu, Aug 24, 2017 at 04:13:37PM +0200, Michal Hocko wrote:
> > > On Thu 24-08-17 14:58:42, Roman Gushchin wrote:
> [...]
> > > > Both ways are not ideal, and sum of the processes is not ideal too.
> > > > Especially, if you take oom_score_adj into account. Will you respect it?
> > > 
> > > Yes, and I do not see any reason why we shouldn't.
> > 
> > It makes things even more complicated.
> > Right now task's oom_score can be in (~ -total_memory, ~ +2*total_memory) range,
> > and if you start summing it, it can be multiplied by the number of tasks...
> > Weird.
> 
> oom_score_adj is just a normalized bias so if tasks inside oom will use
> it the whole memcg will get accumulated bias from all such tasks so it
> is not completely off. I agree that the more tasks use the bias the more
> biased the whole memcg will be. This might or might not be a problem.
> As you are trying to reimplement the existing oom killer implementation
> I do not think we can simply ignore an API which people are used to.
> 
> If this was a configurable oom policy then I could see how ignoring
> oom_score_adj is acceptable because it would be an explicit opt-in.
> 
> > It also will be different in case of system and memcg-wide OOM.
> 
> Why, we do honor oom_score_adj for the memcg OOM now and in fact the
> kernel memcg OOM killer shouldn't be very much different from the global
> one except for the tasks scope.
> 
> > > > I've started actually with such approach, but then found it weird.
> > > > 
> > > > > Besides that you have
> > > > > to check each task for over-killing anyway. So I do not see any
> > > > > performance merits here.
> > > > 
> > > > It's an implementation detail, and we can hopefully get rid of it at some point.
> > > 
> > > Well, we might do some estimations and ignore oom scopes, but that
> > > sounds really complicated and error prone. Unless we have anything like
> > > that then I would start from tasks and build up the necessary to make a
> > > decision at the higher level.
> > 
> > Seriously speaking, do you have an example, when summing per-process
> > oom_score will work better?
> 
> The primary reason I am pushing for this is to have the common iterator
> code path (which we have since Vladimir has unified memcg and global oom
> paths) and only parametrize the value calculation and victim selection.
> 
> > Especially, if we're talking about customizing oom_score calculation,
> > it makes no sense to me. How will you sum process timestamps?
> 
> Well, I meant you could sum oom_badness for your particular
> implementation. If we need some other policy then this wouldn't work and
> that's why I've said that I would like to preserve the current common
> code and only parametrize value calculation and victim selection...

I've spent some time implementing such a version.

It really became shorter and more existing code was reused;
however, I've met a couple of serious issues:

1) Simple summing of per-task oom_score doesn't make sense.
   First, we calculate oom_score per-task, while we should sum per-process
   values or, better, per-mm-struct values (see the sketch below). We can
   take only the thread-group leader's score into account, but that's also
   not 100% accurate.
   And, again, there is the question of what to do with per-task
   oom_score_adj, if we don't take the task's oom_score into account.

   Using memcg stats still looks to me like a more accurate and consistent
   way of estimating the memcg memory footprint.

2) If we're treating tasks from not-kill-all cgroups as separate oom entities,
   and comparing them with memcgs with the kill-all flag, we definitely need
   per-task oom_priority to provide a clear way to compare entities.

   Otherwise we need per-memcg size-based oom_score_adj, which is not
   the best idea, as we agreed earlier.
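
To make issue 1) concrete, per-mm summing would look roughly like this
(a sketch only: locking and corner cases are omitted, the helper names are
made up, mem_cgroup_scan_tasks() is the existing iterator):

static int count_mm_badness(struct task_struct *p, void *arg)
{
        unsigned long *points = arg;

        /* count every mm once: skip threads which are not group leaders */
        if (!thread_group_leader(p) || !p->mm)
                return 0;

        *points += get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS);
        return 0;
}

static unsigned long memcg_sum_badness(struct mem_cgroup *memcg)
{
        unsigned long points = 0;

        mem_cgroup_scan_tasks(memcg, count_mm_badness, &points);
        return points;
}

Even then, an mm shared between thread groups (CLONE_VM without CLONE_THREAD)
is counted twice, which is the "not 100% accurate" part above.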

Thanks!

Roman


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [v6 2/4] mm, oom: cgroup-aware OOM killer
  2017-08-30 11:22                 ` Roman Gushchin
@ 2017-08-30 20:56                   ` David Rientjes
  2017-08-31 13:34                     ` Roman Gushchin
  0 siblings, 1 reply; 26+ messages in thread
From: David Rientjes @ 2017-08-30 20:56 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: Michal Hocko, linux-mm, Vladimir Davydov, Johannes Weiner,
	Tetsuo Handa, Tejun Heo, kernel-team, cgroups, linux-doc,
	linux-kernel

On Wed, 30 Aug 2017, Roman Gushchin wrote:

> I've spent some time to implement such a version.
> 
> It really became shorter and more existing code were reused,
> however, I've met a couple of serious issues:
> 
> 1) Simple summing of per-task oom_score doesn't make sense.
>    First, we calculate oom_score per-task, while should sum per-process values,
>    or, better, per-mm struct. We can take only threa-group leader's score
>    into account, but it's also not 100% accurate.
>    And, again, we have a question what to do with per-task oom_score_adj,
>    if we don't task the task's oom_score into account.
> 
>    Using memcg stats still looks to me as a more accurate and consistent
>    way of estimating memcg memory footprint.
> 

The patchset is introducing a new methodology for selecting oom victims so 
you can define how cgroups are compared vs other cgroups with your own 
"badness" calculation.  I think your implementation based heavily on anon 
and unevictable lrus and unreclaimable slab is fine and you can describe 
that detail in the documentation (along with the caveat that it is only 
calculated for nodes in the allocation's mempolicy).  With 
memory.oom_priority, the user has full ability to change that selection.  
Process selection heuristics have changed over time themselves; this is not 
something that must be backwards compatible, and trying to sum the usage 
from each of the cgroup's mm_structs while respecting oom_score_adj is 
unnecessarily complex.
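
Roughly, I read the heuristic as something like the following (an
illustration only, not your actual patch; the names are made up):

/* Illustration only -- field and function names are not from the patch. */
struct memcg_usage {
        unsigned long anon;                     /* anonymous LRU pages */
        unsigned long unevictable;              /* unevictable LRU pages */
        unsigned long slab_unreclaimable;       /* unreclaimable slab pages */
};

/*
 * Per-memcg "badness": usage an OOM kill could actually free, accumulated
 * only over the nodes allowed by the allocation's mempolicy.
 */
static unsigned long memcg_oom_badness(const struct memcg_usage *u)
{
        return u->anon + u->unevictable + u->slab_unreclaimable;
}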


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [v6 2/4] mm, oom: cgroup-aware OOM killer
  2017-08-30 20:56                   ` David Rientjes
@ 2017-08-31 13:34                     ` Roman Gushchin
  2017-08-31 20:01                       ` David Rientjes
  0 siblings, 1 reply; 26+ messages in thread
From: Roman Gushchin @ 2017-08-31 13:34 UTC (permalink / raw)
  To: David Rientjes
  Cc: Michal Hocko, linux-mm, Vladimir Davydov, Johannes Weiner,
	Tetsuo Handa, Tejun Heo, kernel-team, cgroups, linux-doc,
	linux-kernel

On Wed, Aug 30, 2017 at 01:56:22PM -0700, David Rientjes wrote:
> On Wed, 30 Aug 2017, Roman Gushchin wrote:
> 
> > I've spent some time implementing such a version.
> > 
> > It really became shorter and more of the existing code was reused;
> > however, I've run into a couple of serious issues:
> > 
> > 1) Simple summing of per-task oom_score doesn't make sense.
> >    First, we calculate oom_score per-task, while we should sum per-process
> >    values or, better, per-mm_struct values. We can take only the thread-group
> >    leader's score into account, but that's also not 100% accurate
> >    (see the sketch after this list).
> >    And, again, there is the question of what to do with the per-task
> >    oom_score_adj if we don't take the task's oom_score into account.
> > 
> >    Using memcg stats still looks to me like a more accurate and consistent
> >    way of estimating a memcg's memory footprint.
> > 
> 
> The patchset is introducing a new methodology for selecting oom victims so 
> you can define how cgroups are compared vs other cgroups with your own 
> "badness" calculation.  I think your implementation based heavily on anon 
> and unevictable lrus and unreclaimable slab is fine and you can describe 
> that detail in the documentation (along with the caveat that it is only 
> calculated for nodes in the allocation's mempolicy).  With 
> memory.oom_priority, the user has full ability to change that selection.  
> Process selection heuristics have changed over time themselves; this is not 
> something that must be backwards compatible, and trying to sum the usage 
> from each of the cgroup's mm_structs while respecting oom_score_adj is 
> unnecessarily complex.

I agree.

So, it looks to me that we're close to an acceptable version,
and the only remaining question is the default behavior
(when oom_group is not set).

Michal suggests ignoring non-oom_group memcgs and comparing tasks with
the memcgs that have oom_group set. This makes the whole thing completely
opt-in, but then we probably need another knob (or value) to select between
"select memcg, kill biggest task" and "select memcg, kill all tasks".
Also, as the whole thing is based on a comparison between processes and
memcgs, we probably need oom_priority for processes.
I'm not necessarily against these options, but I do worry about the
complexity of the resulting interface.

In my implementation we always select a victim memcg first (or a task
in the root memcg), and then kill the biggest task inside it.
This does change the victim selection policy, but by doing it this way
we achieve per-memcg fairness, which makes sense in a containerized
environment.
I believe that's acceptable, but I can also add a cgroup v2 mount option
to completely revert to the per-process OOM killer for those users who,
for some reason, depend on the existing victim selection policy.
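
To make the intended behavior concrete, here is a user-space sketch of the
two steps (illustrative types and names only, not the actual patch):

#include <stddef.h>

struct task_info {
        unsigned long score;            /* per-task oom score */
};

struct memcg_info {
        unsigned long badness;          /* size-based memcg score */
        int oom_group;                  /* "kill all tasks" flag */
        struct task_info *tasks;
        size_t ntasks;
};

/* Step 1: pick the memcg with the largest badness among the candidates. */
static struct memcg_info *select_victim_memcg(struct memcg_info *memcgs,
                                              size_t n)
{
        struct memcg_info *victim = NULL;

        for (size_t i = 0; i < n; i++)
                if (!victim || memcgs[i].badness > victim->badness)
                        victim = &memcgs[i];
        return victim;
}

/* Step 2: kill everything in the victim memcg, or just its biggest task. */
static struct task_info *select_victim_task(struct memcg_info *victim)
{
        struct task_info *biggest = NULL;

        if (victim->oom_group)
                return NULL;            /* caller kills all tasks instead */

        for (size_t i = 0; i < victim->ntasks; i++)
                if (!biggest || victim->tasks[i].score > biggest->score)
                        biggest = &victim->tasks[i];
        return biggest;
}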

Any thoughts/objections?

Thanks!

Roman


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [v6 2/4] mm, oom: cgroup-aware OOM killer
  2017-08-31 13:34                     ` Roman Gushchin
@ 2017-08-31 20:01                       ` David Rientjes
  0 siblings, 0 replies; 26+ messages in thread
From: David Rientjes @ 2017-08-31 20:01 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: Michal Hocko, linux-mm, Vladimir Davydov, Johannes Weiner,
	Tetsuo Handa, Tejun Heo, kernel-team, cgroups, linux-doc,
	linux-kernel

On Thu, 31 Aug 2017, Roman Gushchin wrote:

> So, it looks to me that we're close to an acceptable version,
> and the only remaining question is the default behavior
> (when oom_group is not set).
> 

Nit: without knowledge of the implementation, I still don't think I would 
know what an "out of memory group" is.  Out of memory doesn't necessarily 
imply a kill.  I suggest oom_kill_all or something that includes the verb.

> Michal suggests ignoring non-oom_group memcgs and comparing tasks with
> the memcgs that have oom_group set. This makes the whole thing completely
> opt-in, but then we probably need another knob (or value) to select between
> "select memcg, kill biggest task" and "select memcg, kill all tasks".

It seems like that would either bias toward or against cgroups that 
opt in.  I suggest comparing memory cgroups at each level in the hierarchy 
based on your new badness heuristic, regardless of any tunables they have 
enabled.  Then kill either the largest process or all the processes 
attached, depending on oom_group or oom_kill_all.

> Also, as the whole thing is based on a comparison between processes and
> memcgs, we probably need oom_priority for processes.

I think that, with the constraints of cgroup v2, a victim memcg must first 
be chosen, and then either a victim process attached to that memcg is 
chosen or all eligible processes attached to it are killed, depending on 
the tunable.

The simplest and clearest way to define this, in my opinion, is to 
implement a heuristic that compares sibling memcgs based on usage, as you 
have done.  This can be overridden by a memory.oom_priority that userspace 
defines, and that is enough support for users to change victim selection 
(no mount option needed, just set memory.oom_priority).  Then kill the 
largest process or all eligible processes attached.  We only use per-process 
priority to override process selection compared to sibling memcgs, but 
with cgroup v2 process constraints it doesn't seem like that is within the 
scope of your patchset.
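
To spell out the order I have in mind, here is an illustrative sketch (the
names, and the assumption that a higher oom_priority value loses first, are
mine, not necessarily what your patchset does):

#include <stddef.h>

struct cg {
        unsigned long usage;    /* size-based badness of this memcg */
        int oom_priority;       /* userspace override; higher loses first */
        int oom_group;          /* kill all attached tasks if set */
        struct cg **children;
        size_t nchildren;
};

/* Among siblings, priority wins; usage breaks ties at equal priority. */
static struct cg *pick_sibling(struct cg *parent)
{
        struct cg *best = NULL;

        for (size_t i = 0; i < parent->nchildren; i++) {
                struct cg *c = parent->children[i];

                if (!best ||
                    c->oom_priority > best->oom_priority ||
                    (c->oom_priority == best->oom_priority &&
                     c->usage > best->usage))
                        best = c;
        }
        return best;
}

/* Descend level by level until we hit a leaf or a group-kill memcg. */
static struct cg *select_victim_memcg(struct cg *root)
{
        struct cg *victim = root;

        while (victim->nchildren && !victim->oom_group)
                victim = pick_sibling(victim);
        return victim;
}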


^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2017-08-31 20:01 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-08-23 16:51 [v6 1/4] mm, oom: refactor the oom_kill_process() function Roman Gushchin
2017-08-23 16:51 ` [v6 0/4] cgroup-aware OOM killer Roman Gushchin
2017-08-23 16:51 ` [v6 2/4] mm, oom: " Roman Gushchin
2017-08-23 23:19   ` David Rientjes
2017-08-25 10:57     ` Roman Gushchin
2017-08-24 11:47   ` Michal Hocko
2017-08-24 12:28     ` Roman Gushchin
2017-08-24 12:58       ` Michal Hocko
2017-08-24 13:58         ` Roman Gushchin
2017-08-24 14:13           ` Michal Hocko
2017-08-24 14:58             ` Roman Gushchin
2017-08-25  8:14               ` Michal Hocko
2017-08-25 10:39                 ` Roman Gushchin
2017-08-25 10:58                   ` Michal Hocko
2017-08-30 11:22                 ` Roman Gushchin
2017-08-30 20:56                   ` David Rientjes
2017-08-31 13:34                     ` Roman Gushchin
2017-08-31 20:01                       ` David Rientjes
2017-08-23 16:52 ` [v6 3/4] mm, oom: introduce oom_priority for memory cgroups Roman Gushchin
2017-08-24 12:10   ` Michal Hocko
2017-08-24 12:51     ` Roman Gushchin
2017-08-24 13:48       ` Michal Hocko
2017-08-24 14:11         ` Roman Gushchin
2017-08-28 20:54           ` David Rientjes
2017-08-23 16:52 ` [v6 4/4] mm, oom, docs: describe the cgroup-aware OOM killer Roman Gushchin
2017-08-24 11:15 ` [v6 1/4] mm, oom: refactor the oom_kill_process() function Michal Hocko
