linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] cgroup/pids: turn cgroup_subsys->free() into cgroup_subsys->release() to fix the accounting
@ 2019-01-28 16:00 Oleg Nesterov
  2019-01-31 14:56 ` Tejun Heo
  0 siblings, 1 reply; 2+ messages in thread
From: Oleg Nesterov @ 2019-01-28 16:00 UTC (permalink / raw)
  To: Johannes Weiner, Li Zefan, Tejun Heo
  Cc: Herton Krzesinski, Jan Stancek, cgroups, linux-kernel

The only user of the cgroup_subsys->free() callback is pids_cgrp_subsys, which
needs pids_free() to uncharge the pid.

However, ->free() is called from __put_task_struct()->cgroup_free(), which is
too late. Even a trivial program which does

	for (;;) {
		int pid = fork();
		assert(pid >= 0);
		if (pid)
			wait(NULL);
		else
			exit(0);
	}

can hit the pids limit because release_task()->call_rcu(delayed_put_task_struct)
implies an RCU grace period after the task/pid goes away and before the final
put(), so the already-exited task stays charged until then.

Test-case:

	mkdir -p /tmp/CG
	mount -t cgroup2 none /tmp/CG
	echo '+pids' > /tmp/CG/cgroup.subtree_control

	mkdir /tmp/CG/PID
	echo 2 > /tmp/CG/PID/pids.max

	perl -e 'while ($p = fork) { wait; } $p // die "fork failed: $!\n"' &
	echo $! > /tmp/CG/PID/cgroup.procs

Without this patch the forking process fails soon after migration.

Rename cgroup_subsys->free() to cgroup_subsys->release() and move the callsite
into the new helper, cgroup_release(), called by release_task() which actually
frees the pid(s).

Reported-by: Herton R. Krzesinski <hkrzesin@redhat.com>
Reported-by: Jan Stancek <jstancek@redhat.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/cgroup-defs.h |  2 +-
 include/linux/cgroup.h      |  2 ++
 kernel/cgroup/cgroup.c      | 15 +++++++++------
 kernel/cgroup/pids.c        |  4 ++--
 kernel/exit.c               |  1 +
 5 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 8fcbae1..120d1d4 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -602,7 +602,7 @@ struct cgroup_subsys {
 	void (*cancel_fork)(struct task_struct *task);
 	void (*fork)(struct task_struct *task);
 	void (*exit)(struct task_struct *task);
-	void (*free)(struct task_struct *task);
+	void (*release)(struct task_struct *task);
 	void (*bind)(struct cgroup_subsys_state *root_css);
 
 	bool early_init:1;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 9968332..81f58b4 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -121,6 +121,7 @@ extern int cgroup_can_fork(struct task_struct *p);
 extern void cgroup_cancel_fork(struct task_struct *p);
 extern void cgroup_post_fork(struct task_struct *p);
 void cgroup_exit(struct task_struct *p);
+void cgroup_release(struct task_struct *p);
 void cgroup_free(struct task_struct *p);
 
 int cgroup_init_early(void);
@@ -697,6 +698,7 @@ static inline int cgroup_can_fork(struct task_struct *p) { return 0; }
 static inline void cgroup_cancel_fork(struct task_struct *p) {}
 static inline void cgroup_post_fork(struct task_struct *p) {}
 static inline void cgroup_exit(struct task_struct *p) {}
+static inline void cgroup_release(struct task_struct *p) {}
 static inline void cgroup_free(struct task_struct *p) {}
 
 static inline int cgroup_init_early(void) { return 0; }
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index f31bd61..f441837 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -197,7 +197,7 @@ static u64 css_serial_nr_next = 1;
  */
 static u16 have_fork_callback __read_mostly;
 static u16 have_exit_callback __read_mostly;
-static u16 have_free_callback __read_mostly;
+static u16 have_release_callback __read_mostly;
 static u16 have_canfork_callback __read_mostly;
 
 /* cgroup namespace for init task */
@@ -5313,7 +5313,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
 
 	have_fork_callback |= (bool)ss->fork << ss->id;
 	have_exit_callback |= (bool)ss->exit << ss->id;
-	have_free_callback |= (bool)ss->free << ss->id;
+	have_release_callback |= (bool)ss->release << ss->id;
 	have_canfork_callback |= (bool)ss->can_fork << ss->id;
 
 	/* At system boot, before all subsystems have been
@@ -5749,16 +5749,19 @@ void cgroup_exit(struct task_struct *tsk)
 	} while_each_subsys_mask();
 }
 
-void cgroup_free(struct task_struct *task)
+void cgroup_release(struct task_struct *task)
 {
-	struct css_set *cset = task_css_set(task);
 	struct cgroup_subsys *ss;
 	int ssid;
 
-	do_each_subsys_mask(ss, ssid, have_free_callback) {
-		ss->free(task);
+	do_each_subsys_mask(ss, ssid, have_release_callback) {
+		ss->release(task);
 	} while_each_subsys_mask();
+}
 
+void cgroup_free(struct task_struct *task)
+{
+	struct css_set *cset = task_css_set(task);
 	put_css_set(cset);
 }
 
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 9829c67..c9960baa 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -247,7 +247,7 @@ static void pids_cancel_fork(struct task_struct *task)
 	pids_uncharge(pids, 1);
 }
 
-static void pids_free(struct task_struct *task)
+static void pids_release(struct task_struct *task)
 {
 	struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));
 
@@ -342,7 +342,7 @@ struct cgroup_subsys pids_cgrp_subsys = {
 	.cancel_attach 	= pids_cancel_attach,
 	.can_fork	= pids_can_fork,
 	.cancel_fork	= pids_cancel_fork,
-	.free		= pids_free,
+	.release	= pids_release,
 	.legacy_cftypes	= pids_files,
 	.dfl_cftypes	= pids_files,
 	.threaded	= true,
diff --git a/kernel/exit.c b/kernel/exit.c
index 3fb7be0..c2b8443 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -219,6 +219,7 @@ void release_task(struct task_struct *p)
 	}
 
 	write_unlock_irq(&tasklist_lock);
+	cgroup_release(p);
 	release_thread(p);
 	call_rcu(&p->rcu, delayed_put_task_struct);
 
-- 
2.5.0



^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH] cgroup/pids: turn cgroup_subsys->free() into cgroup_subsys->release() to fix the accounting
  2019-01-28 16:00 [PATCH] cgroup/pids: turn cgroup_subsys->free() into cgroup_subsys->release() to fix the accounting Oleg Nesterov
@ 2019-01-31 14:56 ` Tejun Heo
  0 siblings, 0 replies; 2+ messages in thread
From: Tejun Heo @ 2019-01-31 14:56 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Johannes Weiner, Li Zefan, Herton Krzesinski, Jan Stancek,
	cgroups, linux-kernel

On Mon, Jan 28, 2019 at 05:00:13PM +0100, Oleg Nesterov wrote:
> The only user of cgroup_subsys->free() callback is pids_cgrp_subsys which
> needs pids_free() to uncharge the pid.
> 
> However, ->free() is called from __put_task_struct()->cgroup_free() and this
> is too late. Even the trivial program which does
> 
> 	for (;;) {
> 		int pid = fork();
> 		assert(pid >= 0);
> 		if (pid)
> 			wait(NULL);
> 		else
> 			exit(0);
> 	}
> 
> can run out of limits because release_task()->call_rcu(delayed_put_task_struct)
> implies an RCU gp after the task/pid goes away and before the final put().
> 
> Test-case:
> 
> 	mkdir -p /tmp/CG
> 	mount -t cgroup2 none /tmp/CG
> 	echo '+pids' > /tmp/CG/cgroup.subtree_control
> 
> 	mkdir /tmp/CG/PID
> 	echo 2 > /tmp/CG/PID/pids.max
> 
> 	perl -e 'while ($p = fork) { wait; } $p // die "fork failed: $!\n"' &
> 	echo $! > /tmp/CG/PID/cgroup.procs
> 
> Without this patch the forking process fails soon after migration.
> 
> Rename cgroup_subsys->free() to cgroup_subsys->release() and move the callsite
> into the new helper, cgroup_release(), called by release_task() which actually
> frees the pid(s).
> 
> Reported-by: Herton R. Krzesinski <hkrzesin@redhat.com>
> Reported-by: Jan Stancek <jstancek@redhat.com>
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>

Applied to cgroup/for-5.0.

Thanks, Oleg.

-- 
tejun

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2019-01-31 14:56 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-01-28 16:00 [PATCH] cgroup/pids: turn cgroup_subsys->free() into cgroup_subsys->release() to fix the accounting Oleg Nesterov
2019-01-31 14:56 ` Tejun Heo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).