From: Tejun Heo <tj@kernel.org>
To: Alexei Starovoitov <ast@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	Andrii Nakryiko <andrii@kernel.org>,
	Martin KaFai Lau <kafai@fb.com>, Ingo Molnar <mingo@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>
Cc: bpf@vger.kernel.org, kernel-team@fb.com, linux-kernel@vger.kernel.org
Subject: [PATCH 2/3] sched, cgroup: Generalize threadgroup_rwsem
Date: Wed, 20 Oct 2021 10:17:48 -1000
Message-ID: <YXB5bGiFmACYWw2y@slm.duckdns.org>
In-Reply-To: <YXB5Mec4ahxXRx8K@slm.duckdns.org>

From 1b07d36b074acb8a97c8bb5c0f1604960763578e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 19 Oct 2021 10:12:27 -1000

Generalize threadgroup stabilization through threadgroup_rwsem so that it
can be used outside the cgroup subsystem.

* A new config option, CONFIG_THREADGROUP_RWSEM, which is selected by
  CONFIG_CGROUPS, enables threadgroup_rwsem.

* The declarations are moved to linux/sched/threadgroup_rwsem.h and the
  rwsem is now defined in kernel/sched/core.c.

* cgroup_mutex nests outside threadgroup_rwsem. During fork,
  cgroup_css_set_fork(), which is called from cgroup_can_fork(), acquired
  both. However, generalizing threadgroup_rwsem means that it needs to be
  acquired and released in the outer copy_process(). To maintain the locking
  order, break out the cgroup_mutex acquisition into a separate function,
  cgroup_prep_fork(), which copy_process() calls before acquiring
  threadgroup_rwsem (see the sketch after this list).
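
For illustration, the resulting ordering in copy_process() looks roughly
like the following (a simplified sketch of the fork.c hunks below; error
handling and unrelated setup elided):

	cgroup_prep_fork(args);            /* cgroup_mutex if CLONE_INTO_CGROUP */
	threadgroup_change_begin(current); /* read-locks threadgroup_rwsem */

	retval = cgroup_can_fork(p, args); /* no longer takes either lock */
	if (retval)
		goto bad_fork_threadgroup_change_end;
	...
	cgroup_post_fork(p, args);         /* drops cgroup_mutex */
	threadgroup_change_end(current);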

No functional changes.
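
With the rwsem generalized, a hypothetical user outside cgroup could
write-lock it to stabilize a threadgroup, e.g. (illustrative sketch only,
not part of this patch; examine_thread() is a made-up helper, and any
additional locking the thread-list walk needs is elided):

	#include <linux/sched/threadgroup_rwsem.h>

	/* write-locking excludes fork/exit/exec changes to threadgroups */
	percpu_down_write(&threadgroup_rwsem);
	for_each_thread(p, t)
		examine_thread(t);	/* hypothetical per-thread work */
	percpu_up_write(&threadgroup_rwsem);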

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/exec.c                               |  1 +
 include/linux/cgroup-defs.h             | 33 ------------------
 include/linux/cgroup.h                  | 11 +++---
 include/linux/sched/threadgroup_rwsem.h | 46 +++++++++++++++++++++++++
 init/Kconfig                            |  4 +++
 kernel/cgroup/cgroup-v1.c               |  1 +
 kernel/cgroup/cgroup.c                  | 38 +++++++++++++-------
 kernel/fork.c                           | 10 +++++-
 kernel/sched/core.c                     |  4 +++
 kernel/sched/sched.h                    |  1 +
 kernel/signal.c                         |  1 +
 11 files changed, 98 insertions(+), 52 deletions(-)
 create mode 100644 include/linux/sched/threadgroup_rwsem.h

diff --git a/fs/exec.c b/fs/exec.c
index caedd06a6d472..b18abc76e1ce0 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -39,6 +39,7 @@
 #include <linux/sched/signal.h>
 #include <linux/sched/numa_balancing.h>
 #include <linux/sched/task.h>
+#include <linux/sched/threadgroup_rwsem.h>
 #include <linux/pagemap.h>
 #include <linux/perf_event.h>
 #include <linux/highmem.h>
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1a77731e33096..b7e89b0c17057 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -16,7 +16,6 @@
 #include <linux/rcupdate.h>
 #include <linux/refcount.h>
 #include <linux/percpu-refcount.h>
-#include <linux/percpu-rwsem.h>
 #include <linux/u64_stats_sync.h>
 #include <linux/workqueue.h>
 #include <linux/bpf-cgroup.h>
@@ -708,42 +707,10 @@ struct cgroup_subsys {
 	unsigned int depends_on;
 };
 
-extern struct percpu_rw_semaphore threadgroup_rwsem;
-
-/**
- * threadgroup_change_begin - threadgroup exclusion for cgroups
- * @tsk: target task
- *
- * Allows cgroup operations to synchronize against threadgroup changes
- * using a percpu_rw_semaphore.
- */
-static inline void threadgroup_change_begin(struct task_struct *tsk)
-{
-	percpu_down_read(&threadgroup_rwsem);
-}
-
-/**
- * threadgroup_change_end - threadgroup exclusion for cgroups
- * @tsk: target task
- *
- * Counterpart of threadgroup_change_begin().
- */
-static inline void threadgroup_change_end(struct task_struct *tsk)
-{
-	percpu_up_read(&threadgroup_rwsem);
-}
-
 #else	/* CONFIG_CGROUPS */
 
 #define CGROUP_SUBSYS_COUNT 0
 
-static inline void threadgroup_change_begin(struct task_struct *tsk)
-{
-	might_sleep();
-}
-
-static inline void threadgroup_change_end(struct task_struct *tsk) {}
-
 #endif	/* CONFIG_CGROUPS */
 
 #ifdef CONFIG_SOCK_CGROUP_DATA
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 75c151413fda8..aa3df6361105f 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -121,12 +121,10 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 		     struct pid *pid, struct task_struct *tsk);
 
 void cgroup_fork(struct task_struct *p);
-extern int cgroup_can_fork(struct task_struct *p,
-			   struct kernel_clone_args *kargs);
-extern void cgroup_cancel_fork(struct task_struct *p,
-			       struct kernel_clone_args *kargs);
-extern void cgroup_post_fork(struct task_struct *p,
-			     struct kernel_clone_args *kargs);
+void cgroup_prep_fork(struct kernel_clone_args *kargs);
+int cgroup_can_fork(struct task_struct *p, struct kernel_clone_args *kargs);
+void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs);
+void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs);
 void cgroup_exit(struct task_struct *p);
 void cgroup_release(struct task_struct *p);
 void cgroup_free(struct task_struct *p);
@@ -713,6 +711,7 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
 				    struct dentry *dentry) { return -EINVAL; }
 
 static inline void cgroup_fork(struct task_struct *p) {}
+static inline void cgroup_prep_fork(struct kernel_clone_args *kargs) { }
 static inline int cgroup_can_fork(struct task_struct *p,
 				  struct kernel_clone_args *kargs) { return 0; }
 static inline void cgroup_cancel_fork(struct task_struct *p,
diff --git a/include/linux/sched/threadgroup_rwsem.h b/include/linux/sched/threadgroup_rwsem.h
new file mode 100644
index 0000000000000..31ab72703724b
--- /dev/null
+++ b/include/linux/sched/threadgroup_rwsem.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_THREADGROUP_RWSEM_H
+#define _LINUX_SCHED_THREADGROUP_RWSEM_H
+
+#ifdef CONFIG_THREADGROUP_RWSEM
+/* including this before the task_struct definition causes a dependency loop */
+#include <linux/percpu-rwsem.h>
+
+extern struct percpu_rw_semaphore threadgroup_rwsem;
+
+/**
+ * threadgroup_change_begin - mark the beginning of changes to a threadgroup
+ * @tsk: task causing the changes
+ *
+ * All operations which modify a threadgroup - a new thread joining the group,
+ * death of a member thread (the assertion of PF_EXITING) and exec(2)
+ * dethreading the process and replacing the leader - read-lock
+ * threadgroup_rwsem, so that write-locking it stabilizes threadgroups.
+ */
+static inline void threadgroup_change_begin(struct task_struct *tsk)
+{
+	percpu_down_read(&threadgroup_rwsem);
+}
+
+/**
+ * threadgroup_change_end - mark the end of changes to a threadgroup
+ * @tsk: task causing the changes
+ *
+ * See threadgroup_change_begin().
+ */
+static inline void threadgroup_change_end(struct task_struct *tsk)
+{
+	percpu_up_read(&threadgroup_rwsem);
+}
+#else
+static inline void threadgroup_change_begin(struct task_struct *tsk)
+{
+	might_sleep();
+}
+
+static inline void threadgroup_change_end(struct task_struct *tsk)
+{
+}
+#endif
+
+#endif
diff --git a/init/Kconfig b/init/Kconfig
index 11f8a845f259d..3a3699ccff3ce 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -917,8 +917,12 @@ config NUMA_BALANCING_DEFAULT_ENABLED
 	  If set, automatic NUMA balancing will be enabled if running on a NUMA
 	  machine.
 
+config THREADGROUP_RWSEM
+	bool
+
 menuconfig CGROUPS
 	bool "Control Group support"
+	select THREADGROUP_RWSEM
 	select KERNFS
 	help
 	  This option adds support for grouping sets of processes together, for
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 03808e7deb2ea..9c747e258ae7c 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -8,6 +8,7 @@
 #include <linux/mm.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/task.h>
+#include <linux/sched/threadgroup_rwsem.h>
 #include <linux/magic.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 2fd01c901b1ae..937888386210a 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -42,6 +42,7 @@
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
 #include <linux/sched/task.h>
+#include <linux/sched/threadgroup_rwsem.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/percpu-rwsem.h>
@@ -109,8 +110,6 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
  */
 static DEFINE_SPINLOCK(cgroup_file_kn_lock);
 
-DEFINE_PERCPU_RWSEM(threadgroup_rwsem);
-
 #define cgroup_assert_mutex_or_rcu_locked()				\
 	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
 			   !lockdep_is_held(&cgroup_mutex),		\
@@ -6050,7 +6049,6 @@ static struct cgroup *cgroup_get_from_file(struct file *f)
  * to the target cgroup.
  */
 static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
-	__acquires(&cgroup_mutex) __acquires(&threadgroup_rwsem)
 {
 	int ret;
 	struct cgroup *dst_cgrp = NULL;
@@ -6058,11 +6056,6 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
 	struct super_block *sb;
 	struct file *f;
 
-	if (kargs->flags & CLONE_INTO_CGROUP)
-		mutex_lock(&cgroup_mutex);
-
-	threadgroup_change_begin(current);
-
 	spin_lock_irq(&css_set_lock);
 	cset = task_css_set(current);
 	get_css_set(cset);
@@ -6118,7 +6111,6 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
 	return ret;
 
 err:
-	threadgroup_change_end(current);
 	mutex_unlock(&cgroup_mutex);
 	if (f)
 		fput(f);
@@ -6138,10 +6130,8 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
  * CLONE_INTO_CGROUP was requested.
  */
 static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
-	__releases(&threadgroup_rwsem) __releases(&cgroup_mutex)
+	__releases(&cgroup_mutex)
 {
-	threadgroup_change_end(current);
-
 	if (kargs->flags & CLONE_INTO_CGROUP) {
 		struct cgroup *cgrp = kargs->cgrp;
 		struct css_set *cset = kargs->cset;
@@ -6160,9 +6150,26 @@ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
 	}
 }
 
+/**
+ * cgroup_prep_fork - called during fork before threadgroup_rwsem is acquired
+ * @kargs: the arguments passed to create the child process
+ *
+ * CLONE_INTO_CGROUP requires cgroup_mutex as we're migrating while forking.
+ * However, cgroup_mutex must nest outside threadgroup_rwsem, which is
+ * read-locked before cgroup_can_fork(). cgroup_mutex locking is thus broken
+ * out into this function to preserve the locking order.
+ */
+void cgroup_prep_fork(struct kernel_clone_args *kargs)
+	__acquires(&cgroup_mutex)
+{
+	if (kargs->flags & CLONE_INTO_CGROUP)
+		mutex_lock(&cgroup_mutex);
+}
+
 /**
  * cgroup_can_fork - called on a new task before the process is exposed
  * @child: the child process
+ * @kargs: the arguments passed to create the child process
  *
  * This prepares a new css_set for the child process which the child will
  * be attached to in cgroup_post_fork().
@@ -6175,6 +6182,13 @@ int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
 	struct cgroup_subsys *ss;
 	int i, j, ret;
 
+	/*
+	 * cgroup_mutex should have been acquired by cgroup_prep_fork() if
+	 * CLONE_INTO_CGROUP
+	 */
+	if (kargs->flags & CLONE_INTO_CGROUP)
+		lockdep_assert_held(&cgroup_mutex);
+
 	ret = cgroup_css_set_fork(kargs);
 	if (ret)
 		return ret;
diff --git a/kernel/fork.c b/kernel/fork.c
index 38681ad44c76b..34fb9db59148b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -23,6 +23,7 @@
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
 #include <linux/sched/cputime.h>
+#include <linux/sched/threadgroup_rwsem.h>
 #include <linux/seq_file.h>
 #include <linux/rtmutex.h>
 #include <linux/init.h>
@@ -2285,6 +2286,10 @@ static __latent_entropy struct task_struct *copy_process(
 	p->kretprobe_instances.first = NULL;
 #endif
 
+	cgroup_prep_fork(args);
+
+	threadgroup_change_begin(current);
+
 	/*
 	 * Ensure that the cgroup subsystem policies allow the new process to be
 	 * forked. It should be noted that the new process's css_set can be changed
@@ -2293,7 +2298,7 @@ static __latent_entropy struct task_struct *copy_process(
 	 */
 	retval = cgroup_can_fork(p, args);
 	if (retval)
-		goto bad_fork_put_pidfd;
+		goto bad_fork_threadgroup_change_end;
 
 	/*
 	 * From this point on we must avoid any synchronous user-space
@@ -2407,6 +2412,7 @@ static __latent_entropy struct task_struct *copy_process(
 	proc_fork_connector(p);
 	sched_post_fork(p);
 	cgroup_post_fork(p, args);
+	threadgroup_change_end(current);
 	perf_event_fork(p);
 
 	trace_task_newtask(p, clone_flags);
@@ -2421,6 +2427,8 @@ static __latent_entropy struct task_struct *copy_process(
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
 	cgroup_cancel_fork(p, args);
+bad_fork_threadgroup_change_end:
+	threadgroup_change_end(current);
 bad_fork_put_pidfd:
 	if (clone_flags & CLONE_PIDFD) {
 		fput(pidfile);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1bba4128a3e68..bee6bf6d9659d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -84,6 +84,10 @@ unsigned int sysctl_sched_rt_period = 1000000;
 
 __read_mostly int scheduler_running;
 
+#ifdef CONFIG_THREADGROUP_RWSEM
+DEFINE_PERCPU_RWSEM(threadgroup_rwsem);
+#endif
+
 #ifdef CONFIG_SCHED_CORE
 
 DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3d3e5793e1172..135e4265fd259 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -28,6 +28,7 @@
 #include <linux/sched/sysctl.h>
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
+#include <linux/sched/threadgroup_rwsem.h>
 #include <linux/sched/topology.h>
 #include <linux/sched/user.h>
 #include <linux/sched/wake_q.h>
diff --git a/kernel/signal.c b/kernel/signal.c
index f01b249369ce2..d46e63266faf4 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -20,6 +20,7 @@
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
 #include <linux/sched/cputime.h>
+#include <linux/sched/threadgroup_rwsem.h>
 #include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/proc_fs.h>
-- 
2.33.1

