Linux-kselftest Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH 1/2] cgroup: allow deletion of cgroups containing only dying processes
@ 2020-01-16  4:36 Suren Baghdasaryan
  2020-01-16  4:36 ` [PATCH 2/2] kselftest/cgroup: add cgroup destruction test Suren Baghdasaryan
  2020-01-17 15:15 ` [PATCH 0/3] cgroup: Iterate tasks that did not finish do_exit() Michal Koutný
  0 siblings, 2 replies; 19+ messages in thread
From: Suren Baghdasaryan @ 2020-01-16  4:36 UTC (permalink / raw)
  To: surenb
  Cc: tj, lizefan, hannes, matthias.bgg, cgroups, linux-kernel,
	linux-arm-kernel, linux-mediatek, shuah, guro, alex.shi, mkoutny,
	linux-kselftest, linger.lee, tomcherry, kernel-team

A cgroup containing only dying tasks will be seen as empty when a userspace
process reads its cgroup.procs or cgroup.tasks files. It should be safe to
delete such a cgroup as it is considered empty. However if one of the dying
tasks did not reach cgroup_exit then an attempt to delete the cgroup will
fail with EBUSY because cgroup_is_populated() will not consider it empty
until all tasks reach cgroup_exit. Such a condition can be triggered when
a task consumes large amounts of memory and spends enough time in exit_mm
to create delay between the moment it is flagged as PF_EXITING and the
moment it reaches cgroup_exit.
Fix this by detecting cgroups containing only dying tasks during cgroup
destruction and proceeding with it while postponing the final step of
releasing the last reference until the last task reaches cgroup_exit.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reported-by: JeiFeng Lee <linger.lee@mediatek.com>
Fixes: c03cd7738a83 ("cgroup: Include dying leaders with live threads in PROCS iterations")
---
 include/linux/cgroup-defs.h |  3 ++
 kernel/cgroup/cgroup.c      | 65 +++++++++++++++++++++++++++++++++----
 2 files changed, 61 insertions(+), 7 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 63097cb243cb..f9bcccbac8dd 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -71,6 +71,9 @@ enum {
 
 	/* Cgroup is frozen. */
 	CGRP_FROZEN,
+
+	/* Cgroup is dead. */
+	CGRP_DEAD,
 };
 
 /* cgroup_root->flags */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 735af8f15f95..a99ebddd37d9 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -795,10 +795,11 @@ static bool css_set_populated(struct css_set *cset)
  * that the content of the interface file has changed.  This can be used to
  * detect when @cgrp and its descendants become populated or empty.
  */
-static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
+static bool cgroup_update_populated(struct cgroup *cgrp, bool populated)
 {
 	struct cgroup *child = NULL;
 	int adj = populated ? 1 : -1;
+	bool state_change = false;
 
 	lockdep_assert_held(&css_set_lock);
 
@@ -817,6 +818,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
 		if (was_populated == cgroup_is_populated(cgrp))
 			break;
 
+		state_change = true;
 		cgroup1_check_for_release(cgrp);
 		TRACE_CGROUP_PATH(notify_populated, cgrp,
 				  cgroup_is_populated(cgrp));
@@ -825,6 +827,21 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
 		child = cgrp;
 		cgrp = cgroup_parent(cgrp);
 	} while (cgrp);
+
+	return state_change;
+}
+
+static void cgroup_prune_dead(struct cgroup *cgrp)
+{
+	lockdep_assert_held(&css_set_lock);
+
+	do {
+		/* put the base reference if cgroup was already destroyed */
+		if (!cgroup_is_populated(cgrp) &&
+		    test_bit(CGRP_DEAD, &cgrp->flags))
+			percpu_ref_kill(&cgrp->self.refcnt);
+		cgrp = cgroup_parent(cgrp);
+	} while (cgrp);
 }
 
 /**
@@ -838,11 +855,15 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
 static void css_set_update_populated(struct css_set *cset, bool populated)
 {
 	struct cgrp_cset_link *link;
+	bool state_change;
 
 	lockdep_assert_held(&css_set_lock);
 
-	list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
-		cgroup_update_populated(link->cgrp, populated);
+	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+		state_change = cgroup_update_populated(link->cgrp, populated);
+		if (state_change && !populated)
+			cgroup_prune_dead(link->cgrp);
+	}
 }
 
 /*
@@ -5458,8 +5479,26 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	 * Only migration can raise populated from zero and we're already
 	 * holding cgroup_mutex.
 	 */
-	if (cgroup_is_populated(cgrp))
-		return -EBUSY;
+	if (cgroup_is_populated(cgrp)) {
+		struct css_task_iter it;
+		struct task_struct *task;
+
+		/*
+		 * cgroup_is_populated does not account for exiting tasks
+		 * that did not reach cgroup_exit yet. Check if all the tasks
+		 * in this cgroup are exiting.
+		 */
+		css_task_iter_start(&cgrp->self, 0, &it);
+		do {
+			task = css_task_iter_next(&it);
+		} while (task && (task->flags & PF_EXITING));
+		css_task_iter_end(&it);
+
+		if (task) {
+			/* cgroup is indeed populated */
+			return -EBUSY;
+		}
+	}
 
 	/*
 	 * Make sure there's no live children.  We can't test emptiness of
@@ -5510,8 +5549,20 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 
 	cgroup_bpf_offline(cgrp);
 
-	/* put the base reference */
-	percpu_ref_kill(&cgrp->self.refcnt);
+	/*
+	 * Take css_set_lock because of the possible race with
+	 * cgroup_update_populated.
+	 */
+	spin_lock_irq(&css_set_lock);
+	/* The last task might have died since we last checked */
+	if (cgroup_is_populated(cgrp)) {
+		/* mark cgroup for future destruction */
+		set_bit(CGRP_DEAD, &cgrp->flags);
+	} else {
+		/* put the base reference */
+		percpu_ref_kill(&cgrp->self.refcnt);
+	}
+	spin_unlock_irq(&css_set_lock);
 
 	return 0;
 };
-- 
2.25.0.rc1.283.g88dfdc4193-goog


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 2/2] kselftest/cgroup: add cgroup destruction test
  2020-01-16  4:36 [PATCH 1/2] cgroup: allow deletion of cgroups containing only dying processes Suren Baghdasaryan
@ 2020-01-16  4:36 ` Suren Baghdasaryan
  2020-01-17 15:15 ` [PATCH 0/3] cgroup: Iterate tasks that did not finish do_exit() Michal Koutný
  1 sibling, 0 replies; 19+ messages in thread
From: Suren Baghdasaryan @ 2020-01-16  4:36 UTC (permalink / raw)
  To: surenb
  Cc: tj, lizefan, hannes, matthias.bgg, cgroups, linux-kernel,
	linux-arm-kernel, linux-mediatek, shuah, guro, alex.shi, mkoutny,
	linux-kselftest, linger.lee, tomcherry, kernel-team

Add new test to verify that a cgroup with dead processes can be destroyed.
The test spawns a child process which allocates and touches 100MB of RAM
to ensure prolonged exit. Subsequently it kills the child, waits until
the cgroup containing the child is empty and destroys the cgroup.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 tools/testing/selftests/cgroup/test_core.c | 113 +++++++++++++++++++++
 1 file changed, 113 insertions(+)

diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c
index c5ca669feb2b..2a5242ec1a49 100644
--- a/tools/testing/selftests/cgroup/test_core.c
+++ b/tools/testing/selftests/cgroup/test_core.c
@@ -2,7 +2,10 @@
 
 #include <linux/limits.h>
 #include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
 #include <unistd.h>
+#include <fcntl.h>
 #include <stdio.h>
 #include <errno.h>
 #include <signal.h>
@@ -12,6 +15,115 @@
 #include "../kselftest.h"
 #include "cgroup_util.h"
 
+static int touch_anon(char *buf, size_t size)
+{
+	int fd;
+	char *pos = buf;
+
+	fd = open("/dev/urandom", O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	while (size > 0) {
+		ssize_t ret = read(fd, pos, size);
+
+		if (ret < 0) {
+			if (errno != EINTR) {
+				close(fd);
+				return -1;
+			}
+		} else {
+			pos += ret;
+			size -= ret;
+		}
+	}
+	close(fd);
+
+	return 0;
+}
+
+static int alloc_and_touch_anon_noexit(const char *cgroup, void *arg)
+{
+	int ppid = getppid();
+	size_t size = (size_t)arg;
+	void *buf;
+
+	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
+		   0, 0);
+	if (buf == MAP_FAILED)
+		return -1;
+
+	if (touch_anon((char *)buf, size)) {
+		munmap(buf, size);
+		return -1;
+	}
+
+	while (getppid() == ppid)
+		sleep(1);
+
+	munmap(buf, size);
+	return 0;
+}
+
+/*
+ * Create a child process that allocates and touches 100MB, then waits to be
+ * killed. Wait until the child is attached to the cgroup, kill all processes
+ * in that cgroup and wait until "cgroup.events" is empty. At this point try to
+ * destroy the empty cgroup. The test helps detect race conditions between
+ * dying processes leaving the cgroup and cgroup destruction path.
+ */
+static int test_cgcore_destroy(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *cg_test = NULL;
+	int child_pid;
+	char buf[PAGE_SIZE];
+
+	cg_test = cg_name(root, "cg_test");
+
+	if (!cg_test)
+		goto cleanup;
+
+	for (int i = 0; i < 10; i++) {
+		if (cg_create(cg_test))
+			goto cleanup;
+
+		child_pid = cg_run_nowait(cg_test, alloc_and_touch_anon_noexit,
+					  (void *) MB(100));
+
+		if (child_pid < 0)
+			goto cleanup;
+
+		/* wait for the child to enter cgroup */
+		if (cg_wait_for_proc_count(cg_test, 1))
+			goto cleanup;
+
+		if (cg_killall(cg_test))
+			goto cleanup;
+
+		/* wait for cgroup to be empty */
+		while (1) {
+			if (cg_read(cg_test, "cgroup.procs", buf, sizeof(buf)))
+				goto cleanup;
+			if (buf[0] == '\0')
+				break;
+			usleep(1000);
+		}
+
+		if (rmdir(cg_test))
+			goto cleanup;
+
+		if (waitpid(child_pid, NULL, 0) < 0)
+			goto cleanup;
+	}
+	ret = KSFT_PASS;
+cleanup:
+	if (cg_test)
+		cg_destroy(cg_test);
+	free(cg_test);
+	return ret;
+}
+
 /*
  * A(0) - B(0) - C(1)
  *        \ D(0)
@@ -512,6 +624,7 @@ struct corecg_test {
 	T(test_cgcore_populated),
 	T(test_cgcore_proc_migration),
 	T(test_cgcore_thread_migration),
+	T(test_cgcore_destroy),
 };
 #undef T
 
-- 
2.25.0.rc1.283.g88dfdc4193-goog


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 0/3] cgroup: Iterate tasks that did not finish do_exit()
  2020-01-16  4:36 [PATCH 1/2] cgroup: allow deletion of cgroups containing only dying processes Suren Baghdasaryan
  2020-01-16  4:36 ` [PATCH 2/2] kselftest/cgroup: add cgroup destruction test Suren Baghdasaryan
@ 2020-01-17 15:15 ` Michal Koutný
  2020-01-17 15:15   ` [PATCH 1/3] cgroup: Unify css_set task lists Michal Koutný
                     ` (3 more replies)
  1 sibling, 4 replies; 19+ messages in thread
From: Michal Koutný @ 2020-01-17 15:15 UTC (permalink / raw)
  To: cgroups
  Cc: Johannes Weiner, Li Zefan, Tejun Heo, alex.shi, guro,
	kernel-team, linger.lee, linux-arm-kernel, linux-kernel,
	linux-kselftest, linux-mediatek, matthias.bgg, shuah, tomcherry

Hi,
I was looking into the issue and came up with an alternative solution that
changes task iteration to be consistent with cgroup_is_populated() check and
moving the responsibility to check PF_EXITING on the consumers of iterator API.

I haven't checked your approach thoroughly; however, it appears to me it
complicates the (already non-trivial) cgroup destruction path. I ran your selftest
against the iterators approach and it passed.

Michal Koutný (2):
  cgroup: Unify css_set task lists
  cgroup: Iterate tasks that did not finish do_exit()

Suren Baghdasaryan (1):
  kselftest/cgroup: add cgroup destruction test

 include/linux/cgroup-defs.h                |  15 ++-
 include/linux/cgroup.h                     |   4 +-
 kernel/cgroup/cgroup.c                     |  86 ++++++++--------
 kernel/cgroup/debug.c                      |  16 ++-
 tools/testing/selftests/cgroup/test_core.c | 113 +++++++++++++++++++++
 5 files changed, 176 insertions(+), 58 deletions(-)

-- 
2.24.1


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 1/3] cgroup: Unify css_set task lists
  2020-01-17 15:15 ` [PATCH 0/3] cgroup: Iterate tasks that did not finish do_exit() Michal Koutný
@ 2020-01-17 15:15   ` Michal Koutný
  2020-01-17 16:59     ` Tejun Heo
  2020-01-17 15:15   ` [PATCH 2/3] cgroup: Iterate tasks that did not finish do_exit() Michal Koutný
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 19+ messages in thread
From: Michal Koutný @ 2020-01-17 15:15 UTC (permalink / raw)
  To: cgroups
  Cc: Johannes Weiner, Li Zefan, Tejun Heo, alex.shi, guro,
	kernel-team, linger.lee, linux-arm-kernel, linux-kernel,
	linux-kselftest, linux-mediatek, matthias.bgg, shuah, tomcherry

We track tasks of css_set in three different lists. The iterators
unnecessarily process each list specially.
Use an array of lists and simplify the iterator code. This is
refactoring with no intended functional change.

Signed-off-by: Michal Koutný <mkoutny@suse.com>
---
 include/linux/cgroup-defs.h | 15 ++++---
 include/linux/cgroup.h      |  4 +-
 kernel/cgroup/cgroup.c      | 81 +++++++++++++++++++------------------
 kernel/cgroup/debug.c       | 16 ++++----
 4 files changed, 60 insertions(+), 56 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 63097cb243cb..e474b5a67438 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -180,6 +180,13 @@ struct cgroup_subsys_state {
 	struct cgroup_subsys_state *parent;
 };
 
+enum css_set_task_list {
+	CSS_SET_TASKS = 0,
+	CSS_SET_TASKS_MG,
+	CSS_SET_TASKS_DYING,
+	CSS_SET_TASKS_COUNT,
+};
+
 /*
  * A css_set is a structure holding pointers to a set of
  * cgroup_subsys_state objects. This saves space in the task struct
@@ -214,14 +221,12 @@ struct css_set {
 
 	/*
 	 * Lists running through all tasks using this cgroup group.
-	 * mg_tasks lists tasks which belong to this cset but are in the
+	 * CSS_SET_TASKS_MG lists tasks which belong to this cset but are in the
 	 * process of being migrated out or in.  Protected by
 	 * css_set_rwsem, but, during migration, once tasks are moved to
-	 * mg_tasks, it can be read safely while holding cgroup_mutex.
+	 * CSS_SET_TASKS_MG, it can be read safely while holding cgroup_mutex.
 	 */
-	struct list_head tasks;
-	struct list_head mg_tasks;
-	struct list_head dying_tasks;
+	struct list_head tasks[CSS_SET_TASKS_COUNT];
 
 	/* all css_task_iters currently walking this cset */
 	struct list_head task_iters;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index d7ddebd0cdec..284e6b4b43cc 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -58,10 +58,8 @@ struct css_task_iter {
 	struct list_head		*tcset_head;
 
 	struct list_head		*task_pos;
-	struct list_head		*tasks_head;
-	struct list_head		*mg_tasks_head;
-	struct list_head		*dying_tasks_head;
 
+	enum css_set_task_list		cur_list;
 	struct css_set			*cur_cset;
 	struct css_set			*cur_dcset;
 	struct task_struct		*cur_task;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 735af8f15f95..b56283e13491 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -737,9 +737,9 @@ EXPORT_SYMBOL_GPL(of_css);
 struct css_set init_css_set = {
 	.refcount		= REFCOUNT_INIT(1),
 	.dom_cset		= &init_css_set,
-	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
-	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
-	.dying_tasks		= LIST_HEAD_INIT(init_css_set.dying_tasks),
+	.tasks[CSS_SET_TASKS]	= LIST_HEAD_INIT(init_css_set.tasks[CSS_SET_TASKS]),
+	.tasks[CSS_SET_TASKS_MG]	= LIST_HEAD_INIT(init_css_set.tasks[CSS_SET_TASKS_MG]),
+	.tasks[CSS_SET_TASKS_DYING]	= LIST_HEAD_INIT(init_css_set.tasks[CSS_SET_TASKS_DYING]),
 	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
 	.threaded_csets		= LIST_HEAD_INIT(init_css_set.threaded_csets),
 	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
@@ -775,7 +775,8 @@ static bool css_set_populated(struct css_set *cset)
 {
 	lockdep_assert_held(&css_set_lock);
 
-	return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
+	return !list_empty(&cset->tasks[CSS_SET_TASKS]) ||
+	       !list_empty(&cset->tasks[CSS_SET_TASKS_MG]);
 }
 
 /**
@@ -865,7 +866,7 @@ static void css_set_skip_task_iters(struct css_set *cset,
  * @task: task being moved
  * @from_cset: css_set @task currently belongs to (may be NULL)
  * @to_cset: new css_set @task is being moved to (may be NULL)
- * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
+ * @use_mg_tasks: move to @to_cset->tasks[CSS_SET_TASKS_MG] instead of ->tasks[CSS_SET_TASKS]
  *
  * Move @task from @from_cset to @to_cset.  If @task didn't belong to any
  * css_set, @from_cset can be NULL.  If @task is being disassociated
@@ -896,6 +897,8 @@ static void css_set_move_task(struct task_struct *task,
 	}
 
 	if (to_cset) {
+		enum css_set_task_list ls = use_mg_tasks ? CSS_SET_TASKS_MG : CSS_SET_TASKS;
+
 		/*
 		 * We are synchronized through cgroup_threadgroup_rwsem
 		 * against PF_EXITING setting such that we can't race
@@ -904,8 +907,7 @@ static void css_set_move_task(struct task_struct *task,
 		WARN_ON_ONCE(task->flags & PF_EXITING);
 
 		cgroup_move_task(task, to_cset);
-		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
-							     &to_cset->tasks);
+		list_add_tail(&task->cg_list, &to_cset->tasks[ls]);
 	}
 }
 
@@ -1211,9 +1213,9 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 
 	refcount_set(&cset->refcount, 1);
 	cset->dom_cset = cset;
-	INIT_LIST_HEAD(&cset->tasks);
-	INIT_LIST_HEAD(&cset->mg_tasks);
-	INIT_LIST_HEAD(&cset->dying_tasks);
+	INIT_LIST_HEAD(&cset->tasks[CSS_SET_TASKS]);
+	INIT_LIST_HEAD(&cset->tasks[CSS_SET_TASKS_MG]);
+	INIT_LIST_HEAD(&cset->tasks[CSS_SET_TASKS_DYING]);
 	INIT_LIST_HEAD(&cset->task_iters);
 	INIT_LIST_HEAD(&cset->threaded_csets);
 	INIT_HLIST_NODE(&cset->hlist);
@@ -2285,7 +2287,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path);
  * Add @task, which is a migration target, to @mgctx->tset.  This function
  * becomes noop if @task doesn't need to be migrated.  @task's css_set
  * should have been added as a migration source and @task->cg_list will be
- * moved from the css_set's tasks list to mg_tasks one.
+ * moved from the css_set's tasks list to tasks[CSS_SET_TASKS_MG] one.
  */
 static void cgroup_migrate_add_task(struct task_struct *task,
 				    struct cgroup_mgctx *mgctx)
@@ -2307,7 +2309,7 @@ static void cgroup_migrate_add_task(struct task_struct *task,
 
 	mgctx->tset.nr_tasks++;
 
-	list_move_tail(&task->cg_list, &cset->mg_tasks);
+	list_move_tail(&task->cg_list, &cset->tasks[CSS_SET_TASKS_MG]);
 	if (list_empty(&cset->mg_node))
 		list_add_tail(&cset->mg_node,
 			      &mgctx->tset.src_csets);
@@ -2348,12 +2350,12 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
 
 	while (&cset->mg_node != tset->csets) {
 		if (!task)
-			task = list_first_entry(&cset->mg_tasks,
+			task = list_first_entry(&cset->tasks[CSS_SET_TASKS_MG],
 						struct task_struct, cg_list);
 		else
 			task = list_next_entry(task, cg_list);
 
-		if (&task->cg_list != &cset->mg_tasks) {
+		if (&task->cg_list != &cset->tasks[CSS_SET_TASKS_MG]) {
 			tset->cur_cset = cset;
 			tset->cur_task = task;
 
@@ -2416,7 +2418,7 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
 	 */
 	spin_lock_irq(&css_set_lock);
 	list_for_each_entry(cset, &tset->src_csets, mg_node) {
-		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
+		list_for_each_entry_safe(task, tmp_task, &cset->tasks[CSS_SET_TASKS_MG], cg_list) {
 			struct css_set *from_cset = task_css_set(task);
 			struct css_set *to_cset = cset->mg_dst_cset;
 
@@ -2470,7 +2472,7 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
 	spin_lock_irq(&css_set_lock);
 	list_splice_init(&tset->dst_csets, &tset->src_csets);
 	list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
-		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
+		list_splice_tail_init(&cset->tasks[CSS_SET_TASKS_MG], &cset->tasks[CSS_SET_TASKS]);
 		list_del_init(&cset->mg_node);
 	}
 	spin_unlock_irq(&css_set_lock);
@@ -2592,7 +2594,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
 
 	WARN_ON(src_cset->mg_src_cgrp);
 	WARN_ON(src_cset->mg_dst_cgrp);
-	WARN_ON(!list_empty(&src_cset->mg_tasks));
+	WARN_ON(!list_empty(&src_cset->tasks[CSS_SET_TASKS_MG]));
 	WARN_ON(!list_empty(&src_cset->mg_node));
 
 	src_cset->mg_src_cgrp = src_cgrp;
@@ -2905,7 +2907,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 		struct task_struct *task, *ntask;
 
 		/* all tasks in src_csets need to be migrated */
-		list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
+		list_for_each_entry_safe(task, ntask, &src_cset->tasks[CSS_SET_TASKS], cg_list)
 			cgroup_migrate_add_task(task, &mgctx);
 	}
 	spin_unlock_irq(&css_set_lock);
@@ -4402,18 +4404,18 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
 			it->task_pos = NULL;
 			return;
 		}
-	} while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
-
-	if (!list_empty(&cset->tasks))
-		it->task_pos = cset->tasks.next;
-	else if (!list_empty(&cset->mg_tasks))
-		it->task_pos = cset->mg_tasks.next;
-	else
-		it->task_pos = cset->dying_tasks.next;
+	} while (!css_set_populated(cset) && list_empty(&cset->tasks[CSS_SET_TASKS_DYING]));
 
-	it->tasks_head = &cset->tasks;
-	it->mg_tasks_head = &cset->mg_tasks;
-	it->dying_tasks_head = &cset->dying_tasks;
+	it->cur_list = CSS_SET_TASKS;
+	while (list_empty(&cset->tasks[it->cur_list])) {
+		/* At least one list should be non-empty */
+		if (WARN_ON(++it->cur_list == CSS_SET_TASKS_COUNT)) {
+			it->cur_list = CSS_SET_TASKS;
+			it->task_pos = NULL;
+			return;
+		}
+	}
+	it->task_pos = cset->tasks[it->cur_list].next;
 
 	/*
 	 * We don't keep css_sets locked across iteration steps and thus
@@ -4458,21 +4460,22 @@ static void css_task_iter_advance(struct css_task_iter *it)
 repeat:
 	if (it->task_pos) {
 		/*
-		 * Advance iterator to find next entry.  cset->tasks is
-		 * consumed first and then ->mg_tasks.  After ->mg_tasks,
-		 * we move onto the next cset.
+		 * Advance iterator to find next entry. All lists in
+		 * cset->tasks[] are consumed, then we move onto the next cset.
 		 */
 		if (it->flags & CSS_TASK_ITER_SKIPPED)
 			it->flags &= ~CSS_TASK_ITER_SKIPPED;
 		else
 			it->task_pos = it->task_pos->next;
 
-		if (it->task_pos == it->tasks_head)
-			it->task_pos = it->mg_tasks_head->next;
-		if (it->task_pos == it->mg_tasks_head)
-			it->task_pos = it->dying_tasks_head->next;
-		if (it->task_pos == it->dying_tasks_head)
-			css_task_iter_advance_css_set(it);
+		while (it->task_pos == &it->cur_cset->tasks[it->cur_list]) {
+			if (++it->cur_list == CSS_SET_TASKS_COUNT) {
+				css_task_iter_advance_css_set(it);
+				break;
+			}
+
+			it->task_pos = it->cur_cset->tasks[it->cur_list].next;
+		}
 	} else {
 		/* called from start, proceed to the first cset */
 		css_task_iter_advance_css_set(it);
@@ -5986,7 +5989,7 @@ void cgroup_exit(struct task_struct *tsk)
 	WARN_ON_ONCE(list_empty(&tsk->cg_list));
 	cset = task_css_set(tsk);
 	css_set_move_task(tsk, cset, NULL, false);
-	list_add_tail(&tsk->cg_list, &cset->dying_tasks);
+	list_add_tail(&tsk->cg_list, &cset->tasks[CSS_SET_TASKS_DYING]);
 	cset->nr_tasks--;
 
 	WARN_ON_ONCE(cgroup_task_frozen(tsk));
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 80aa3f027ac3..26b4908600f7 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -124,6 +124,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
 		struct task_struct *task;
 		int count = 0;
 		int refcnt = refcount_read(&cset->refcount);
+		enum css_set_task_list ls;
 
 		/*
 		 * Print out the proc_cset and threaded_cset relationship
@@ -161,17 +162,14 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
 		}
 		seq_puts(seq, "\n");
 
-		list_for_each_entry(task, &cset->tasks, cg_list) {
-			if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
-				seq_printf(seq, "  task %d\n",
-					   task_pid_vnr(task));
+		for (ls = CSS_SET_TASKS; ls < CSS_SET_TASKS_DYING; ++ls) {
+			list_for_each_entry(task, &cset->tasks[ls], cg_list) {
+				if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
+					seq_printf(seq, "  task %d\n",
+						   task_pid_vnr(task));
+			}
 		}
 
-		list_for_each_entry(task, &cset->mg_tasks, cg_list) {
-			if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
-				seq_printf(seq, "  task %d\n",
-					   task_pid_vnr(task));
-		}
 		/* show # of overflowed tasks */
 		if (count > MAX_TASKS_SHOWN_PER_CSS)
 			seq_printf(seq, "  ... (%d)\n",
-- 
2.24.1


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 2/3] cgroup: Iterate tasks that did not finish do_exit()
  2020-01-17 15:15 ` [PATCH 0/3] cgroup: Iterate tasks that did not finish do_exit() Michal Koutný
  2020-01-17 15:15   ` [PATCH 1/3] cgroup: Unify css_set task lists Michal Koutný
@ 2020-01-17 15:15   ` Michal Koutný
  2020-01-17 17:28     ` Tejun Heo
  2020-01-17 15:15   ` [PATCH " Michal Koutný
  2020-01-17 17:30   ` [PATCH 0/3] cgroup: Iterate tasks that did not finish do_exit() Suren Baghdasaryan
  3 siblings, 1 reply; 19+ messages in thread
From: Michal Koutný @ 2020-01-17 15:15 UTC (permalink / raw)
  To: cgroups
  Cc: Johannes Weiner, Li Zefan, Tejun Heo, alex.shi, guro,
	kernel-team, linger.lee, linux-arm-kernel, linux-kernel,
	linux-kselftest, linux-mediatek, matthias.bgg, shuah, tomcherry

PF_EXITING is set earlier than actual removal from css_set when a task
is exiting. This can confuse cgroup.procs readers who see no PF_EXITING
tasks, however, rmdir is checking against css_set membership so it can
transiently fail with EBUSY.

Fix this by listing tasks that weren't unlinked from css_set active
lists.
It may happen that other users of the task iterator (without
CSS_TASK_ITER_PROCS) spot a PF_EXITING task before cgroup_exit(). This
is equal to the state before commit c03cd7738a83 ("cgroup: Include dying
leaders with live threads in PROCS iterations") but it may be reviewed
later.

Reported-by: Suren Baghdasaryan <surenb@google.com>
Fixes: c03cd7738a83 ("cgroup: Include dying leaders with live threads in PROCS iterations")
Signed-off-by: Michal Koutný <mkoutny@suse.com>
---
 kernel/cgroup/cgroup.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index b56283e13491..132d258e7172 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4492,11 +4492,12 @@ static void css_task_iter_advance(struct css_task_iter *it)
 			goto repeat;
 
 		/* and dying leaders w/o live member threads */
-		if (!atomic_read(&task->signal->live))
+		if (it->cur_list == CSS_SET_TASKS_DYING &&
+		    !atomic_read(&task->signal->live))
 			goto repeat;
 	} else {
 		/* skip all dying ones */
-		if (task->flags & PF_EXITING)
+		if (it->cur_list == CSS_SET_TASKS_DYING)
 			goto repeat;
 	}
 }
-- 
2.24.1


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 3/3] kselftest/cgroup: add cgroup destruction test
  2020-01-17 15:15 ` [PATCH 0/3] cgroup: Iterate tasks that did not finish do_exit() Michal Koutný
  2020-01-17 15:15   ` [PATCH 1/3] cgroup: Unify css_set task lists Michal Koutný
  2020-01-17 15:15   ` [PATCH 2/3] cgroup: Iterate tasks that did not finish do_exit() Michal Koutný
@ 2020-01-17 15:15   ` " Michal Koutný
  2020-01-17 17:30   ` [PATCH 0/3] cgroup: Iterate tasks that did not finish do_exit() Suren Baghdasaryan
  3 siblings, 0 replies; 19+ messages in thread
From: Michal Koutný @ 2020-01-17 15:15 UTC (permalink / raw)
  To: cgroups
  Cc: Johannes Weiner, Li Zefan, Tejun Heo, alex.shi, guro,
	kernel-team, linger.lee, linux-arm-kernel, linux-kernel,
	linux-kselftest, linux-mediatek, matthias.bgg, shuah, tomcherry

From: Suren Baghdasaryan <surenb@google.com>

Add new test to verify that a cgroup with dead processes can be destroyed.
The test spawns a child process which allocates and touches 100MB of RAM
to ensure prolonged exit. Subsequently it kills the child, waits until
the cgroup containing the child is empty and destroys the cgroup.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Michal Koutný <mkoutny@suse.com>
---
 tools/testing/selftests/cgroup/test_core.c | 113 +++++++++++++++++++++
 1 file changed, 113 insertions(+)

diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c
index c5ca669feb2b..2a5242ec1a49 100644
--- a/tools/testing/selftests/cgroup/test_core.c
+++ b/tools/testing/selftests/cgroup/test_core.c
@@ -2,7 +2,10 @@
 
 #include <linux/limits.h>
 #include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
 #include <unistd.h>
+#include <fcntl.h>
 #include <stdio.h>
 #include <errno.h>
 #include <signal.h>
@@ -12,6 +15,115 @@
 #include "../kselftest.h"
 #include "cgroup_util.h"
 
+static int touch_anon(char *buf, size_t size)
+{
+	int fd;
+	char *pos = buf;
+
+	fd = open("/dev/urandom", O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	while (size > 0) {
+		ssize_t ret = read(fd, pos, size);
+
+		if (ret < 0) {
+			if (errno != EINTR) {
+				close(fd);
+				return -1;
+			}
+		} else {
+			pos += ret;
+			size -= ret;
+		}
+	}
+	close(fd);
+
+	return 0;
+}
+
+static int alloc_and_touch_anon_noexit(const char *cgroup, void *arg)
+{
+	int ppid = getppid();
+	size_t size = (size_t)arg;
+	void *buf;
+
+	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
+		   0, 0);
+	if (buf == MAP_FAILED)
+		return -1;
+
+	if (touch_anon((char *)buf, size)) {
+		munmap(buf, size);
+		return -1;
+	}
+
+	while (getppid() == ppid)
+		sleep(1);
+
+	munmap(buf, size);
+	return 0;
+}
+
+/*
+ * Create a child process that allocates and touches 100MB, then waits to be
+ * killed. Wait until the child is attached to the cgroup, kill all processes
+ * in that cgroup and wait until "cgroup.events" is empty. At this point try to
+ * destroy the empty cgroup. The test helps detect race conditions between
+ * dying processes leaving the cgroup and cgroup destruction path.
+ */
+static int test_cgcore_destroy(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *cg_test = NULL;
+	int child_pid;
+	char buf[PAGE_SIZE];
+
+	cg_test = cg_name(root, "cg_test");
+
+	if (!cg_test)
+		goto cleanup;
+
+	for (int i = 0; i < 10; i++) {
+		if (cg_create(cg_test))
+			goto cleanup;
+
+		child_pid = cg_run_nowait(cg_test, alloc_and_touch_anon_noexit,
+					  (void *) MB(100));
+
+		if (child_pid < 0)
+			goto cleanup;
+
+		/* wait for the child to enter cgroup */
+		if (cg_wait_for_proc_count(cg_test, 1))
+			goto cleanup;
+
+		if (cg_killall(cg_test))
+			goto cleanup;
+
+		/* wait for cgroup to be empty */
+		while (1) {
+			if (cg_read(cg_test, "cgroup.procs", buf, sizeof(buf)))
+				goto cleanup;
+			if (buf[0] == '\0')
+				break;
+			usleep(1000);
+		}
+
+		if (rmdir(cg_test))
+			goto cleanup;
+
+		if (waitpid(child_pid, NULL, 0) < 0)
+			goto cleanup;
+	}
+	ret = KSFT_PASS;
+cleanup:
+	if (cg_test)
+		cg_destroy(cg_test);
+	free(cg_test);
+	return ret;
+}
+
 /*
  * A(0) - B(0) - C(1)
  *        \ D(0)
@@ -512,6 +624,7 @@ struct corecg_test {
 	T(test_cgcore_populated),
 	T(test_cgcore_proc_migration),
 	T(test_cgcore_thread_migration),
+	T(test_cgcore_destroy),
 };
 #undef T
 
-- 
2.24.1


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/3] cgroup: Unify css_set task lists
  2020-01-17 15:15   ` [PATCH 1/3] cgroup: Unify css_set task lists Michal Koutný
@ 2020-01-17 16:59     ` Tejun Heo
  0 siblings, 0 replies; 19+ messages in thread
From: Tejun Heo @ 2020-01-17 16:59 UTC (permalink / raw)
  To: Michal Koutný
  Cc: cgroups, Johannes Weiner, Li Zefan, alex.shi, guro, kernel-team,
	linger.lee, linux-arm-kernel, linux-kernel, linux-kselftest,
	linux-mediatek, matthias.bgg, shuah, tomcherry

On Fri, Jan 17, 2020 at 04:15:31PM +0100, Michal Koutný wrote:
> We track tasks of css_set in three different lists. The iterators
> unnecessarily process each list specially.
> Use an array of lists and simplify the iterator code. This is
> refactoring with no intended functional change.
> 
> Signed-off-by: Michal Koutný <mkoutny@suse.com>
> ---
>  include/linux/cgroup-defs.h | 15 ++++---
>  include/linux/cgroup.h      |  4 +-
>  kernel/cgroup/cgroup.c      | 81 +++++++++++++++++++------------------
>  kernel/cgroup/debug.c       | 16 ++++----
>  4 files changed, 60 insertions(+), 56 deletions(-)

So, I get the urge to move the lists into an array and thought about
doing that while adding the third list for sure.  However, it does
make code paths which don't walk all lists wordier and the code path
is already tricky like hell.  Given that there aren't that many places
which loop over the lists, if you really wanna clean it up, maybe add
an interator over the lists so that both parts of code can look lean?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] cgroup: Iterate tasks that did not finish do_exit()
  2020-01-17 15:15   ` [PATCH 2/3] cgroup: Iterate tasks that did not finish do_exit() Michal Koutný
@ 2020-01-17 17:28     ` Tejun Heo
  2020-01-17 18:41       ` Suren Baghdasaryan
  0 siblings, 1 reply; 19+ messages in thread
From: Tejun Heo @ 2020-01-17 17:28 UTC (permalink / raw)
  To: Michal Koutný
  Cc: cgroups, Johannes Weiner, Li Zefan, alex.shi, guro, kernel-team,
	linger.lee, linux-arm-kernel, linux-kernel, linux-kselftest,
	linux-mediatek, matthias.bgg, shuah, tomcherry

On Fri, Jan 17, 2020 at 04:15:32PM +0100, Michal Koutný wrote:
> PF_EXITING is set earlier than actual removal from css_set when a task
> is exitting. This can confuse cgroup.procs readers who see no PF_EXITING
> tasks, however, rmdir is checking against css_set membership so it can
> transitionally fail with EBUSY.
> 
> Fix this by listing tasks that weren't unlinked from css_set active
> lists.
> It may happen that other users of the task iterator (without
> CSS_TASK_ITER_PROCS) spot a PF_EXITING task before cgroup_exit(). This
> is equal to the state before commit c03cd7738a83 ("cgroup: Include dying
> leaders with live threads in PROCS iterations") but it may be reviewed
> later.

Yeah, this looks fine to me.  Any chance you can order this before the
clean up so that we can mark it for -stable.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 0/3] cgroup: Iterate tasks that did not finish do_exit()
  2020-01-17 15:15 ` [PATCH 0/3] cgroup: Iterate tasks that did not finish do_exit() Michal Koutný
                     ` (2 preceding siblings ...)
  2020-01-17 15:15   ` [PATCH " Michal Koutný
@ 2020-01-17 17:30   ` Suren Baghdasaryan
  3 siblings, 0 replies; 19+ messages in thread
From: Suren Baghdasaryan @ 2020-01-17 17:30 UTC (permalink / raw)
  To: Michal Koutný
  Cc: cgroups mailinglist, Johannes Weiner, Li Zefan, Tejun Heo,
	alex.shi, Roman Gushchin, kernel-team, JeiFeng Lee,
	linux-arm-kernel, LKML, linux-kselftest, linux-mediatek,
	matthias.bgg, shuah, Tom Cherry

Hi Michal,

On Fri, Jan 17, 2020 at 7:15 AM Michal Koutný <mkoutny@suse.com> wrote:
>
> Hi,
> I was looking into the issue and came up with an alternative solution that
> changes task iteration to be consistent with cgroup_is_populated() check and
> moving the responsibility to check PF_EXITING on the consumers of iterator API.

Yeah, that was my first thought which basically reverts a part of
c03cd7738a83. When I first brought up this issue in the other email
thread, Tejun's comment was "the right thing to do is allowing
destruction of cgroups w/ only
dead processes in it". I assumed, maybe incorrectly, that the desire
here is not to include dying processes into cgroup.procs but to allow
cgroups with dying processes to be deleted.

To be clear, either way is fine with me since both ways solve the
issue and this way the code is definitely simpler. I'll rerun the
tests with your patches just to confirm the issue is gone.
Thanks!

> I haven't check your approach thoroughly, however, it appears to me it
> complicates (already non-trivial) cgroup destruction path. I ran your selftest
> on the iterators approach and it proved working.
>
>
> Michal Koutný (2):
>   cgroup: Unify css_set task lists
>   cgroup: Iterate tasks that did not finish do_exit()
>
> Suren Baghdasaryan (1):
>   kselftest/cgroup: add cgroup destruction test
>
>  include/linux/cgroup-defs.h                |  15 ++-
>  include/linux/cgroup.h                     |   4 +-
>  kernel/cgroup/cgroup.c                     |  86 ++++++++--------
>  kernel/cgroup/debug.c                      |  16 ++-
>  tools/testing/selftests/cgroup/test_core.c | 113 +++++++++++++++++++++
>  5 files changed, 176 insertions(+), 58 deletions(-)
>
> --
> 2.24.1
>
> --
> To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] cgroup: Iterate tasks that did not finish do_exit()
  2020-01-17 17:28     ` Tejun Heo
@ 2020-01-17 18:41       ` Suren Baghdasaryan
  2020-01-20 14:56         ` Michal Koutný
  0 siblings, 1 reply; 19+ messages in thread
From: Suren Baghdasaryan @ 2020-01-17 18:41 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Michal Koutný,
	cgroups mailinglist, Johannes Weiner, Li Zefan, alex.shi,
	Roman Gushchin, kernel-team, JeiFeng Lee, linux-arm-kernel, LKML,
	linux-kselftest, linux-mediatek, matthias.bgg, shuah, Tom Cherry

On Fri, Jan 17, 2020 at 9:28 AM Tejun Heo <tj@kernel.org> wrote:
>
> On Fri, Jan 17, 2020 at 04:15:32PM +0100, Michal Koutný wrote:
> > PF_EXITING is set earlier than actual removal from css_set when a task
> > is exitting. This can confuse cgroup.procs readers who see no PF_EXITING
> > tasks, however, rmdir is checking against css_set membership so it can
> > transitionally fail with EBUSY.
> >
> > Fix this by listing tasks that weren't unlinked from css_set active
> > lists.
> > It may happen that other users of the task iterator (without
> > CSS_TASK_ITER_PROCS) spot a PF_EXITING task before cgroup_exit(). This
> > is equal to the state before commit c03cd7738a83 ("cgroup: Include dying
> > leaders with live threads in PROCS iterations") but it may be reviewed
> > later.

Tested-by: Suren Baghdasaryan <surenb@google.com>

>
> Yeah, this looks fine to me.  Any chance you can order this before the
> clean up so that we can mark it for -stable.
>

+1 for reordering. Makes it easier to backport.
Thanks,
Suren.

> Thanks.
>
> --
> tejun
>
> --
> To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] cgroup: Iterate tasks that did not finish do_exit()
  2020-01-17 18:41       ` Suren Baghdasaryan
@ 2020-01-20 14:56         ` Michal Koutný
  2020-01-24 11:40           ` [PATCH v2 0/3] " Michal Koutný
  0 siblings, 1 reply; 19+ messages in thread
From: Michal Koutný @ 2020-01-20 14:56 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: Tejun Heo, cgroups mailinglist, Johannes Weiner, Li Zefan,
	alex.shi, Roman Gushchin, kernel-team, JeiFeng Lee,
	linux-arm-kernel, LKML, linux-kselftest, linux-mediatek,
	matthias.bgg, shuah, Tom Cherry

[-- Attachment #1: Type: text/plain, Size: 475 bytes --]

On Fri, Jan 17, 2020 at 10:41:29AM -0800, Suren Baghdasaryan <surenb@google.com> wrote:
> Tested-by: Suren Baghdasaryan <surenb@google.com>
Thanks.

> > Yeah, this looks fine to me.  Any chance you can order this before the
> > clean up so that we can mark it for -stable.
> +1 for reordering. Makes it easier to backport.
The grounds still need to be prepared for css_task_iter to store
additional information. Let me see how the preceding changes can be
minimized.

Michal

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH v2 0/3] cgroup: Iterate tasks that did not finish do_exit()
  2020-01-20 14:56         ` Michal Koutný
@ 2020-01-24 11:40           ` " Michal Koutný
  2020-01-24 11:40             ` [PATCH v2 1/3] " Michal Koutný
                               ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: Michal Koutný @ 2020-01-24 11:40 UTC (permalink / raw)
  To: cgroups
  Cc: alex.shi, guro, hannes, kernel-team, linger.lee,
	linux-arm-kernel, linux-kernel, linux-kselftest, linux-mediatek,
	lizefan, matthias.bgg, shuah, surenb, tj, tomcherry

Hi,
thanks for your feedback. I'm sending updated solution that changes task
iteration to be consistent with cgroup_is_populated() check and moving the
responsibility to check PF_EXITING on the consumers of iterator API.

Changes from v1
- v1: https://lore.kernel.org/lkml/20200117151533.12381-1-mkoutny@suse.com/
- Swap the fixing and refactoring patch (in order to make the fix easier for
  backport)
- Don't introduce tasks lists array because of unnecessarily long access code
- Fix comment in selftest
- I leave the CC:stable on discretion of the maintainer

Michal Koutný (2):
  cgroup: Iterate tasks that did not finish do_exit()
  cgroup: Clean up css_set task traversal

Suren Baghdasaryan (1):
  kselftest/cgroup: add cgroup destruction test

 include/linux/cgroup.h                     |   4 +-
 kernel/cgroup/cgroup.c                     |  60 ++++++-----
 tools/testing/selftests/cgroup/test_core.c | 113 +++++++++++++++++++++
 3 files changed, 146 insertions(+), 31 deletions(-)

-- 
2.24.1


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH v2 1/3] cgroup: Iterate tasks that did not finish do_exit()
  2020-01-24 11:40           ` [PATCH v2 0/3] " Michal Koutný
@ 2020-01-24 11:40             ` " Michal Koutný
  2020-01-24 22:56               ` Suren Baghdasaryan
  2020-02-12 22:03               ` Tejun Heo
  2020-01-24 11:40             ` [PATCH v2 2/3] cgroup: Clean up css_set task traversal Michal Koutný
  2020-01-24 11:40             ` [PATCH v2 3/3] kselftest/cgroup: add cgroup destruction test Michal Koutný
  2 siblings, 2 replies; 19+ messages in thread
From: Michal Koutný @ 2020-01-24 11:40 UTC (permalink / raw)
  To: cgroups
  Cc: alex.shi, guro, hannes, kernel-team, linger.lee,
	linux-arm-kernel, linux-kernel, linux-kselftest, linux-mediatek,
	lizefan, matthias.bgg, shuah, surenb, tj, tomcherry

PF_EXITING is set earlier than actual removal from css_set when a task
is exiting. This can confuse cgroup.procs readers who see no PF_EXITING
tasks, however, rmdir is checking against css_set membership so it can
transitionally fail with EBUSY.

Fix this by listing tasks that weren't unlinked from css_set active
lists.
It may happen that other users of the task iterator (without
CSS_TASK_ITER_PROCS) spot a PF_EXITING task before cgroup_exit(). This
is equal to the state before commit c03cd7738a83 ("cgroup: Include dying
leaders with live threads in PROCS iterations") but it may be reviewed
later.

Reported-by: Suren Baghdasaryan <surenb@google.com>
Fixes: c03cd7738a83 ("cgroup: Include dying leaders with live threads in PROCS iterations")
Signed-off-by: Michal Koutný <mkoutny@suse.com>
---
 include/linux/cgroup.h |  1 +
 kernel/cgroup/cgroup.c | 23 ++++++++++++++++-------
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index d7ddebd0cdec..e75d2191226b 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -62,6 +62,7 @@ struct css_task_iter {
 	struct list_head		*mg_tasks_head;
 	struct list_head		*dying_tasks_head;
 
+	struct list_head		*cur_tasks_head;
 	struct css_set			*cur_cset;
 	struct css_set			*cur_dcset;
 	struct task_struct		*cur_task;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 735af8f15f95..a6e3619e013b 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4404,12 +4404,16 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
 		}
 	} while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
 
-	if (!list_empty(&cset->tasks))
+	if (!list_empty(&cset->tasks)) {
 		it->task_pos = cset->tasks.next;
-	else if (!list_empty(&cset->mg_tasks))
+		it->cur_tasks_head = &cset->tasks;
+	} else if (!list_empty(&cset->mg_tasks)) {
 		it->task_pos = cset->mg_tasks.next;
-	else
+		it->cur_tasks_head = &cset->mg_tasks;
+	} else {
 		it->task_pos = cset->dying_tasks.next;
+		it->cur_tasks_head = &cset->dying_tasks;
+	}
 
 	it->tasks_head = &cset->tasks;
 	it->mg_tasks_head = &cset->mg_tasks;
@@ -4467,10 +4471,14 @@ static void css_task_iter_advance(struct css_task_iter *it)
 		else
 			it->task_pos = it->task_pos->next;
 
-		if (it->task_pos == it->tasks_head)
+		if (it->task_pos == it->tasks_head) {
 			it->task_pos = it->mg_tasks_head->next;
-		if (it->task_pos == it->mg_tasks_head)
+			it->cur_tasks_head = it->mg_tasks_head;
+		}
+		if (it->task_pos == it->mg_tasks_head) {
 			it->task_pos = it->dying_tasks_head->next;
+			it->cur_tasks_head = it->dying_tasks_head;
+		}
 		if (it->task_pos == it->dying_tasks_head)
 			css_task_iter_advance_css_set(it);
 	} else {
@@ -4489,11 +4497,12 @@ static void css_task_iter_advance(struct css_task_iter *it)
 			goto repeat;
 
 		/* and dying leaders w/o live member threads */
-		if (!atomic_read(&task->signal->live))
+		if (it->cur_tasks_head == it->dying_tasks_head &&
+		    !atomic_read(&task->signal->live))
 			goto repeat;
 	} else {
 		/* skip all dying ones */
-		if (task->flags & PF_EXITING)
+		if (it->cur_tasks_head == it->dying_tasks_head)
 			goto repeat;
 	}
 }
-- 
2.24.1


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH v2 2/3] cgroup: Clean up css_set task traversal
  2020-01-24 11:40           ` [PATCH v2 0/3] " Michal Koutný
  2020-01-24 11:40             ` [PATCH v2 1/3] " Michal Koutný
@ 2020-01-24 11:40             ` Michal Koutný
  2020-01-24 11:40             ` [PATCH v2 3/3] kselftest/cgroup: add cgroup destruction test Michal Koutný
  2 siblings, 0 replies; 19+ messages in thread
From: Michal Koutný @ 2020-01-24 11:40 UTC (permalink / raw)
  To: cgroups
  Cc: alex.shi, guro, hannes, kernel-team, linger.lee,
	linux-arm-kernel, linux-kernel, linux-kselftest, linux-mediatek,
	lizefan, matthias.bgg, shuah, surenb, tj, tomcherry

css_task_iter stores pointer to head of each iterable list, this dates
back to commit 0f0a2b4fa621 ("cgroup: reorganize css_task_iter") when we
did not store cur_cset. Let us utilize list heads directly in cur_cset
and streamline css_task_iter_advance_css_set a bit. There is no
intended functional change.

Signed-off-by: Michal Koutný <mkoutny@suse.com>
---
 include/linux/cgroup.h |  3 ---
 kernel/cgroup/cgroup.c | 61 +++++++++++++++++++-----------------------
 2 files changed, 28 insertions(+), 36 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e75d2191226b..f1219b927817 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -58,9 +58,6 @@ struct css_task_iter {
 	struct list_head		*tcset_head;
 
 	struct list_head		*task_pos;
-	struct list_head		*tasks_head;
-	struct list_head		*mg_tasks_head;
-	struct list_head		*dying_tasks_head;
 
 	struct list_head		*cur_tasks_head;
 	struct css_set			*cur_cset;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index a6e3619e013b..14e0e360a2b4 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4395,29 +4395,24 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
 
 	lockdep_assert_held(&css_set_lock);
 
-	/* Advance to the next non-empty css_set */
-	do {
-		cset = css_task_iter_next_css_set(it);
-		if (!cset) {
-			it->task_pos = NULL;
-			return;
+	/* Advance to the next non-empty css_set and find first non-empty tasks list*/
+	while ((cset = css_task_iter_next_css_set(it))) {
+		if (!list_empty(&cset->tasks)) {
+			it->cur_tasks_head = &cset->tasks;
+			break;
+		} else if (!list_empty(&cset->mg_tasks)) {
+			it->cur_tasks_head = &cset->mg_tasks;
+			break;
+		} else if (!list_empty(&cset->dying_tasks)) {
+			it->cur_tasks_head = &cset->dying_tasks;
+			break;
 		}
-	} while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
-
-	if (!list_empty(&cset->tasks)) {
-		it->task_pos = cset->tasks.next;
-		it->cur_tasks_head = &cset->tasks;
-	} else if (!list_empty(&cset->mg_tasks)) {
-		it->task_pos = cset->mg_tasks.next;
-		it->cur_tasks_head = &cset->mg_tasks;
-	} else {
-		it->task_pos = cset->dying_tasks.next;
-		it->cur_tasks_head = &cset->dying_tasks;
 	}
-
-	it->tasks_head = &cset->tasks;
-	it->mg_tasks_head = &cset->mg_tasks;
-	it->dying_tasks_head = &cset->dying_tasks;
+	if (!cset) {
+		it->task_pos = NULL;
+		return;
+	}
+	it->task_pos = it->cur_tasks_head->next;
 
 	/*
 	 * We don't keep css_sets locked across iteration steps and thus
@@ -4462,24 +4457,24 @@ static void css_task_iter_advance(struct css_task_iter *it)
 repeat:
 	if (it->task_pos) {
 		/*
-		 * Advance iterator to find next entry.  cset->tasks is
-		 * consumed first and then ->mg_tasks.  After ->mg_tasks,
-		 * we move onto the next cset.
+		 * Advance iterator to find next entry. We go through cset
+		 * tasks, mg_tasks and dying_tasks, when consumed we move onto
+		 * the next cset.
 		 */
 		if (it->flags & CSS_TASK_ITER_SKIPPED)
 			it->flags &= ~CSS_TASK_ITER_SKIPPED;
 		else
 			it->task_pos = it->task_pos->next;
 
-		if (it->task_pos == it->tasks_head) {
-			it->task_pos = it->mg_tasks_head->next;
-			it->cur_tasks_head = it->mg_tasks_head;
+		if (it->task_pos == &it->cur_cset->tasks) {
+			it->cur_tasks_head = &it->cur_cset->mg_tasks;
+			it->task_pos = it->cur_tasks_head->next;
 		}
-		if (it->task_pos == it->mg_tasks_head) {
-			it->task_pos = it->dying_tasks_head->next;
-			it->cur_tasks_head = it->dying_tasks_head;
+		if (it->task_pos == &it->cur_cset->mg_tasks) {
+			it->cur_tasks_head = &it->cur_cset->dying_tasks;
+			it->task_pos = it->cur_tasks_head->next;
 		}
-		if (it->task_pos == it->dying_tasks_head)
+		if (it->task_pos == &it->cur_cset->dying_tasks)
 			css_task_iter_advance_css_set(it);
 	} else {
 		/* called from start, proceed to the first cset */
@@ -4497,12 +4492,12 @@ static void css_task_iter_advance(struct css_task_iter *it)
 			goto repeat;
 
 		/* and dying leaders w/o live member threads */
-		if (it->cur_tasks_head == it->dying_tasks_head &&
+		if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
 		    !atomic_read(&task->signal->live))
 			goto repeat;
 	} else {
 		/* skip all dying ones */
-		if (it->cur_tasks_head == it->dying_tasks_head)
+		if (it->cur_tasks_head == &it->cur_cset->dying_tasks)
 			goto repeat;
 	}
 }
-- 
2.24.1


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH v2 3/3] kselftest/cgroup: add cgroup destruction test
  2020-01-24 11:40           ` [PATCH v2 0/3] " Michal Koutný
  2020-01-24 11:40             ` [PATCH v2 1/3] " Michal Koutný
  2020-01-24 11:40             ` [PATCH v2 2/3] cgroup: Clean up css_set task traversal Michal Koutný
@ 2020-01-24 11:40             ` Michal Koutný
  2020-02-12 22:10               ` Tejun Heo
  2 siblings, 1 reply; 19+ messages in thread
From: Michal Koutný @ 2020-01-24 11:40 UTC (permalink / raw)
  To: cgroups
  Cc: alex.shi, guro, hannes, kernel-team, linger.lee,
	linux-arm-kernel, linux-kernel, linux-kselftest, linux-mediatek,
	lizefan, matthias.bgg, shuah, surenb, tj, tomcherry

From: Suren Baghdasaryan <surenb@google.com>

Add new test to verify that a cgroup with dead processes can be destroyed.
The test spawns a child process which allocates and touches 100MB of RAM
to ensure prolonged exit. Subsequently it kills the child, waits until
the cgroup containing the child is empty and destroys the cgroup.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
[mkoutny@suse.com: Fix typo in test_cgcore_destroy comment]
Acked-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Michal Koutný <mkoutny@suse.com>
---
 tools/testing/selftests/cgroup/test_core.c | 113 +++++++++++++++++++++
 1 file changed, 113 insertions(+)

diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c
index c5ca669feb2b..76c9dd578ba5 100644
--- a/tools/testing/selftests/cgroup/test_core.c
+++ b/tools/testing/selftests/cgroup/test_core.c
@@ -2,7 +2,10 @@
 
 #include <linux/limits.h>
 #include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
 #include <unistd.h>
+#include <fcntl.h>
 #include <stdio.h>
 #include <errno.h>
 #include <signal.h>
@@ -12,6 +15,115 @@
 #include "../kselftest.h"
 #include "cgroup_util.h"
 
+static int touch_anon(char *buf, size_t size)
+{
+	int fd;
+	char *pos = buf;
+
+	fd = open("/dev/urandom", O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	while (size > 0) {
+		ssize_t ret = read(fd, pos, size);
+
+		if (ret < 0) {
+			if (errno != EINTR) {
+				close(fd);
+				return -1;
+			}
+		} else {
+			pos += ret;
+			size -= ret;
+		}
+	}
+	close(fd);
+
+	return 0;
+}
+
+static int alloc_and_touch_anon_noexit(const char *cgroup, void *arg)
+{
+	int ppid = getppid();
+	size_t size = (size_t)arg;
+	void *buf;
+
+	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
+		   0, 0);
+	if (buf == MAP_FAILED)
+		return -1;
+
+	if (touch_anon((char *)buf, size)) {
+		munmap(buf, size);
+		return -1;
+	}
+
+	while (getppid() == ppid)
+		sleep(1);
+
+	munmap(buf, size);
+	return 0;
+}
+
+/*
+ * Create a child process that allocates and touches 100MB, then waits to be
+ * killed. Wait until the child is attached to the cgroup, kill all processes
+ * in that cgroup and wait until "cgroup.procs" is empty. At this point try to
+ * destroy the empty cgroup. The test helps detect race conditions between
+ * dying processes leaving the cgroup and cgroup destruction path.
+ */
+static int test_cgcore_destroy(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *cg_test = NULL;
+	int child_pid;
+	char buf[PAGE_SIZE];
+
+	cg_test = cg_name(root, "cg_test");
+
+	if (!cg_test)
+		goto cleanup;
+
+	for (int i = 0; i < 10; i++) {
+		if (cg_create(cg_test))
+			goto cleanup;
+
+		child_pid = cg_run_nowait(cg_test, alloc_and_touch_anon_noexit,
+					  (void *) MB(100));
+
+		if (child_pid < 0)
+			goto cleanup;
+
+		/* wait for the child to enter cgroup */
+		if (cg_wait_for_proc_count(cg_test, 1))
+			goto cleanup;
+
+		if (cg_killall(cg_test))
+			goto cleanup;
+
+		/* wait for cgroup to be empty */
+		while (1) {
+			if (cg_read(cg_test, "cgroup.procs", buf, sizeof(buf)))
+				goto cleanup;
+			if (buf[0] == '\0')
+				break;
+			usleep(1000);
+		}
+
+		if (rmdir(cg_test))
+			goto cleanup;
+
+		if (waitpid(child_pid, NULL, 0) < 0)
+			goto cleanup;
+	}
+	ret = KSFT_PASS;
+cleanup:
+	if (cg_test)
+		cg_destroy(cg_test);
+	free(cg_test);
+	return ret;
+}
+
 /*
  * A(0) - B(0) - C(1)
  *        \ D(0)
@@ -512,6 +624,7 @@ struct corecg_test {
 	T(test_cgcore_populated),
 	T(test_cgcore_proc_migration),
 	T(test_cgcore_thread_migration),
+	T(test_cgcore_destroy),
 };
 #undef T
 
-- 
2.24.1


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 1/3] cgroup: Iterate tasks that did not finish do_exit()
  2020-01-24 11:40             ` [PATCH v2 1/3] " Michal Koutný
@ 2020-01-24 22:56               ` Suren Baghdasaryan
  2020-02-05 17:27                 ` Suren Baghdasaryan
  2020-02-12 22:03               ` Tejun Heo
  1 sibling, 1 reply; 19+ messages in thread
From: Suren Baghdasaryan @ 2020-01-24 22:56 UTC (permalink / raw)
  To: Michal Koutný
  Cc: cgroups mailinglist, alex.shi, Roman Gushchin, Johannes Weiner,
	kernel-team, JeiFeng Lee, linux-arm-kernel, LKML,
	linux-kselftest, linux-mediatek, Li Zefan, matthias.bgg, shuah,
	Tejun Heo, Tom Cherry

On Fri, Jan 24, 2020 at 3:40 AM Michal Koutný <mkoutny@suse.com> wrote:
>
> PF_EXITING is set earlier than actual removal from css_set when a task
> is exitting. This can confuse cgroup.procs readers who see no PF_EXITING
> tasks, however, rmdir is checking against css_set membership so it can
> transitionally fail with EBUSY.
>
> Fix this by listing tasks that weren't unlinked from css_set active
> lists.
> It may happen that other users of the task iterator (without
> CSS_TASK_ITER_PROCS) spot a PF_EXITING task before cgroup_exit(). This
> is equal to the state before commit c03cd7738a83 ("cgroup: Include dying
> leaders with live threads in PROCS iterations") but it may be reviewed
> later.
>
> Reported-by: Suren Baghdasaryan <surenb@google.com>
> Fixes: c03cd7738a83 ("cgroup: Include dying leaders with live threads in PROCS iterations")
> Signed-off-by: Michal Koutný <mkoutny@suse.com>
> ---
>  include/linux/cgroup.h |  1 +
>  kernel/cgroup/cgroup.c | 23 ++++++++++++++++-------
>  2 files changed, 17 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index d7ddebd0cdec..e75d2191226b 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -62,6 +62,7 @@ struct css_task_iter {
>         struct list_head                *mg_tasks_head;
>         struct list_head                *dying_tasks_head;
>
> +       struct list_head                *cur_tasks_head;
>         struct css_set                  *cur_cset;
>         struct css_set                  *cur_dcset;
>         struct task_struct              *cur_task;
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 735af8f15f95..a6e3619e013b 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -4404,12 +4404,16 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
>                 }
>         } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
>
> -       if (!list_empty(&cset->tasks))
> +       if (!list_empty(&cset->tasks)) {
>                 it->task_pos = cset->tasks.next;
> -       else if (!list_empty(&cset->mg_tasks))
> +               it->cur_tasks_head = &cset->tasks;
> +       } else if (!list_empty(&cset->mg_tasks)) {
>                 it->task_pos = cset->mg_tasks.next;
> -       else
> +               it->cur_tasks_head = &cset->mg_tasks;
> +       } else {
>                 it->task_pos = cset->dying_tasks.next;
> +               it->cur_tasks_head = &cset->dying_tasks;
> +       }
>
>         it->tasks_head = &cset->tasks;
>         it->mg_tasks_head = &cset->mg_tasks;
> @@ -4467,10 +4471,14 @@ static void css_task_iter_advance(struct css_task_iter *it)
>                 else
>                         it->task_pos = it->task_pos->next;
>
> -               if (it->task_pos == it->tasks_head)
> +               if (it->task_pos == it->tasks_head) {
>                         it->task_pos = it->mg_tasks_head->next;
> -               if (it->task_pos == it->mg_tasks_head)
> +                       it->cur_tasks_head = it->mg_tasks_head;
> +               }
> +               if (it->task_pos == it->mg_tasks_head) {
>                         it->task_pos = it->dying_tasks_head->next;
> +                       it->cur_tasks_head = it->dying_tasks_head;
> +               }
>                 if (it->task_pos == it->dying_tasks_head)
>                         css_task_iter_advance_css_set(it);
>         } else {
> @@ -4489,11 +4497,12 @@ static void css_task_iter_advance(struct css_task_iter *it)
>                         goto repeat;
>
>                 /* and dying leaders w/o live member threads */
> -               if (!atomic_read(&task->signal->live))
> +               if (it->cur_tasks_head == it->dying_tasks_head &&
> +                   !atomic_read(&task->signal->live))
>                         goto repeat;
>         } else {
>                 /* skip all dying ones */
> -               if (task->flags & PF_EXITING)
> +               if (it->cur_tasks_head == it->dying_tasks_head)
>                         goto repeat;
>         }
>  }
> --
> 2.24.1
>

Tested-by: Suren Baghdasaryan <surenb@google.com>

Thanks!

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 1/3] cgroup: Iterate tasks that did not finish do_exit()
  2020-01-24 22:56               ` Suren Baghdasaryan
@ 2020-02-05 17:27                 ` Suren Baghdasaryan
  0 siblings, 0 replies; 19+ messages in thread
From: Suren Baghdasaryan @ 2020-02-05 17:27 UTC (permalink / raw)
  To: Michal Koutný
  Cc: cgroups mailinglist, alex.shi, Roman Gushchin, Johannes Weiner,
	kernel-team, JeiFeng Lee, linux-arm-kernel, LKML,
	linux-kselftest, linux-mediatek, Li Zefan, matthias.bgg, shuah,
	Tejun Heo, Tom Cherry

On Fri, Jan 24, 2020 at 2:56 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Fri, Jan 24, 2020 at 3:40 AM Michal Koutný <mkoutny@suse.com> wrote:
> >
> > PF_EXITING is set earlier than actual removal from css_set when a task
> > is exitting. This can confuse cgroup.procs readers who see no PF_EXITING
> > tasks, however, rmdir is checking against css_set membership so it can
> > transitionally fail with EBUSY.
> >
> > Fix this by listing tasks that weren't unlinked from css_set active
> > lists.
> > It may happen that other users of the task iterator (without
> > CSS_TASK_ITER_PROCS) spot a PF_EXITING task before cgroup_exit(). This
> > is equal to the state before commit c03cd7738a83 ("cgroup: Include dying
> > leaders with live threads in PROCS iterations") but it may be reviewed
> > later.
> >
> > Reported-by: Suren Baghdasaryan <surenb@google.com>
> > Fixes: c03cd7738a83 ("cgroup: Include dying leaders with live threads in PROCS iterations")
> > Signed-off-by: Michal Koutný <mkoutny@suse.com>
> > ---
> >  include/linux/cgroup.h |  1 +
> >  kernel/cgroup/cgroup.c | 23 ++++++++++++++++-------
> >  2 files changed, 17 insertions(+), 7 deletions(-)
> >
> > diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> > index d7ddebd0cdec..e75d2191226b 100644
> > --- a/include/linux/cgroup.h
> > +++ b/include/linux/cgroup.h
> > @@ -62,6 +62,7 @@ struct css_task_iter {
> >         struct list_head                *mg_tasks_head;
> >         struct list_head                *dying_tasks_head;
> >
> > +       struct list_head                *cur_tasks_head;
> >         struct css_set                  *cur_cset;
> >         struct css_set                  *cur_dcset;
> >         struct task_struct              *cur_task;
> > diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> > index 735af8f15f95..a6e3619e013b 100644
> > --- a/kernel/cgroup/cgroup.c
> > +++ b/kernel/cgroup/cgroup.c
> > @@ -4404,12 +4404,16 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
> >                 }
> >         } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
> >
> > -       if (!list_empty(&cset->tasks))
> > +       if (!list_empty(&cset->tasks)) {
> >                 it->task_pos = cset->tasks.next;
> > -       else if (!list_empty(&cset->mg_tasks))
> > +               it->cur_tasks_head = &cset->tasks;
> > +       } else if (!list_empty(&cset->mg_tasks)) {
> >                 it->task_pos = cset->mg_tasks.next;
> > -       else
> > +               it->cur_tasks_head = &cset->mg_tasks;
> > +       } else {
> >                 it->task_pos = cset->dying_tasks.next;
> > +               it->cur_tasks_head = &cset->dying_tasks;
> > +       }
> >
> >         it->tasks_head = &cset->tasks;
> >         it->mg_tasks_head = &cset->mg_tasks;
> > @@ -4467,10 +4471,14 @@ static void css_task_iter_advance(struct css_task_iter *it)
> >                 else
> >                         it->task_pos = it->task_pos->next;
> >
> > -               if (it->task_pos == it->tasks_head)
> > +               if (it->task_pos == it->tasks_head) {
> >                         it->task_pos = it->mg_tasks_head->next;
> > -               if (it->task_pos == it->mg_tasks_head)
> > +                       it->cur_tasks_head = it->mg_tasks_head;
> > +               }
> > +               if (it->task_pos == it->mg_tasks_head) {
> >                         it->task_pos = it->dying_tasks_head->next;
> > +                       it->cur_tasks_head = it->dying_tasks_head;
> > +               }
> >                 if (it->task_pos == it->dying_tasks_head)
> >                         css_task_iter_advance_css_set(it);
> >         } else {
> > @@ -4489,11 +4497,12 @@ static void css_task_iter_advance(struct css_task_iter *it)
> >                         goto repeat;
> >
> >                 /* and dying leaders w/o live member threads */
> > -               if (!atomic_read(&task->signal->live))
> > +               if (it->cur_tasks_head == it->dying_tasks_head &&
> > +                   !atomic_read(&task->signal->live))
> >                         goto repeat;
> >         } else {
> >                 /* skip all dying ones */
> > -               if (task->flags & PF_EXITING)
> > +               if (it->cur_tasks_head == it->dying_tasks_head)
> >                         goto repeat;
> >         }
> >  }
> > --
> > 2.24.1
> >
>
> Tested-by: Suren Baghdasaryan <surenb@google.com>
>
> Thanks!

Hi Folks,
If this new version looks good could we get an Ack please? I need to
start backporting this fix to Android and would like a confirmation
before doing so.
Thanks!

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 1/3] cgroup: Iterate tasks that did not finish do_exit()
  2020-01-24 11:40             ` [PATCH v2 1/3] " Michal Koutný
  2020-01-24 22:56               ` Suren Baghdasaryan
@ 2020-02-12 22:03               ` Tejun Heo
  1 sibling, 0 replies; 19+ messages in thread
From: Tejun Heo @ 2020-02-12 22:03 UTC (permalink / raw)
  To: Michal Koutný
  Cc: cgroups, alex.shi, guro, hannes, kernel-team, linger.lee,
	linux-arm-kernel, linux-kernel, linux-kselftest, linux-mediatek,
	lizefan, matthias.bgg, shuah, surenb, tomcherry

On Fri, Jan 24, 2020 at 12:40:15PM +0100, Michal Koutný wrote:
> PF_EXITING is set earlier than actual removal from css_set when a task
> is exiting. This can confuse cgroup.procs readers, who see no PF_EXITING
> tasks; however, rmdir checks against css_set membership, so it can
> transiently fail with EBUSY.
> 
> Fix this by listing tasks that weren't unlinked from css_set active
> lists.
> It may happen that other users of the task iterator (without
> CSS_TASK_ITER_PROCS) spot a PF_EXITING task before cgroup_exit(). This
> is equal to the state before commit c03cd7738a83 ("cgroup: Include dying
> leaders with live threads in PROCS iterations") but it may be reviewed
> later.
> 
> Reported-by: Suren Baghdasaryan <surenb@google.com>
> Fixes: c03cd7738a83 ("cgroup: Include dying leaders with live threads in PROCS iterations")
> Signed-off-by: Michal Koutný <mkoutny@suse.com>

Applied to cgroup/for-5.6-fixes.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2 3/3] kselftest/cgroup: add cgroup destruction test
  2020-01-24 11:40             ` [PATCH v2 3/3] kselftest/cgroup: add cgroup destruction test Michal Koutný
@ 2020-02-12 22:10               ` Tejun Heo
  0 siblings, 0 replies; 19+ messages in thread
From: Tejun Heo @ 2020-02-12 22:10 UTC (permalink / raw)
  To: Michal Koutný
  Cc: cgroups, alex.shi, guro, hannes, kernel-team, linger.lee,
	linux-arm-kernel, linux-kernel, linux-kselftest, linux-mediatek,
	lizefan, matthias.bgg, shuah, surenb, tomcherry

On Fri, Jan 24, 2020 at 12:40:17PM +0100, Michal Koutný wrote:
> From: Suren Baghdasaryan <surenb@google.com>
> 
> Add new test to verify that a cgroup with dead processes can be destroyed.
> The test spawns a child process which allocates and touches 100MB of RAM
> to ensure prolonged exit. Subsequently it kills the child, waits until
> the cgroup containing the child is empty and destroys the cgroup.
> 
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> [mkoutny@suse.com: Fix typo in test_cgcore_destroy comment]
> Acked-by: Michal Koutný <mkoutny@suse.com>
> Signed-off-by: Michal Koutný <mkoutny@suse.com>

Applied 2-3 to cgroup/for-5.7.

Thanks a lot for working on this!

-- 
tejun

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, back to index

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-01-16  4:36 [PATCH 1/2] cgroup: allow deletion of cgroups containing only dying processes Suren Baghdasaryan
2020-01-16  4:36 ` [PATCH 2/2] kselftest/cgroup: add cgroup destruction test Suren Baghdasaryan
2020-01-17 15:15 ` [PATCH 0/3] cgroup: Iterate tasks that did not finish do_exit() Michal Koutný
2020-01-17 15:15   ` [PATCH 1/3] cgroup: Unify css_set task lists Michal Koutný
2020-01-17 16:59     ` Tejun Heo
2020-01-17 15:15   ` [PATCH 2/3] cgroup: Iterate tasks that did not finish do_exit() Michal Koutný
2020-01-17 17:28     ` Tejun Heo
2020-01-17 18:41       ` Suren Baghdasaryan
2020-01-20 14:56         ` Michal Koutný
2020-01-24 11:40           ` [PATCH v2 0/3] " Michal Koutný
2020-01-24 11:40             ` [PATCH v2 1/3] " Michal Koutný
2020-01-24 22:56               ` Suren Baghdasaryan
2020-02-05 17:27                 ` Suren Baghdasaryan
2020-02-12 22:03               ` Tejun Heo
2020-01-24 11:40             ` [PATCH v2 2/3] cgroup: Clean up css_set task traversal Michal Koutný
2020-01-24 11:40             ` [PATCH v2 3/3] kselftest/cgroup: add cgroup destruction test Michal Koutný
2020-02-12 22:10               ` Tejun Heo
2020-01-17 15:15   ` [PATCH " Michal Koutný
2020-01-17 17:30   ` [PATCH 0/3] cgroup: Iterate tasks that did not finish do_exit() Suren Baghdasaryan

Linux-kselftest Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-kselftest/0 linux-kselftest/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-kselftest linux-kselftest/ https://lore.kernel.org/linux-kselftest \
		linux-kselftest@vger.kernel.org
	public-inbox-index linux-kselftest

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-kselftest


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git