From: Tejun Heo <tj@kernel.org>
To: Alexei Starovoitov <ast@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	Andrii Nakryiko <andrii@kernel.org>,
	Martin KaFai Lau <kafai@fb.com>, Ingo Molnar <mingo@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>
Cc: bpf@vger.kernel.org, kernel-team@fb.com, linux-kernel@vger.kernel.org
Subject: [PATCH 2/3] sched, cgroup: Generalize threadgroup_rwsem
Date: Wed, 20 Oct 2021 10:17:48 -1000
Message-ID: <YXB5bGiFmACYWw2y@slm.duckdns.org>
In-Reply-To: <YXB5Mec4ahxXRx8K@slm.duckdns.org>

From 1b07d36b074acb8a97c8bb5c0f1604960763578e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 19 Oct 2021 10:12:27 -1000

Generalize threadgroup stabilization through threadgroup_rwsem so that it
can be used outside the cgroup subsystem.

* A new config option, CONFIG_THREADGROUP_RWSEM, which is selected by
  CONFIG_CGROUPS, enables threadgroup_rwsem.

* The declarations are moved to linux/sched/threadgroup_rwsem.h and the
  rwsem is now defined in kernel/sched/core.c.

* cgroup_mutex nests outside threadgroup_rwsem. During fork,
  cgroup_css_set_fork(), which is called from cgroup_can_fork(), acquired
  both. However, generalizing threadgroup_rwsem means that it needs to be
  acquired and released in the outer copy_process(). To maintain the locking
  order, break out the cgroup_mutex acquisition into a separate function,
  cgroup_prep_fork(), which copy_process() calls before acquiring
  threadgroup_rwsem (see the sketch after this list).
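
For illustration, the resulting ordering in copy_process() looks roughly
like the following (a simplified sketch of the fork.c hunks below; error
handling and unrelated setup elided):

	cgroup_prep_fork(args);            /* cgroup_mutex if CLONE_INTO_CGROUP */
	threadgroup_change_begin(current); /* read-locks threadgroup_rwsem */

	retval = cgroup_can_fork(p, args); /* no longer takes either lock */
	if (retval)
		goto bad_fork_threadgroup_change_end;
	...
	cgroup_post_fork(p, args);         /* drops cgroup_mutex */
	threadgroup_change_end(current);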

No functional changes.
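
With the rwsem generalized, a hypothetical user outside cgroup could
write-lock it to stabilize a threadgroup, e.g. (illustrative sketch only,
not part of this patch; examine_thread() is a made-up helper, and any
additional locking the thread-list walk needs is elided):

	#include <linux/sched/threadgroup_rwsem.h>

	/* write-locking excludes fork/exit/exec changes to threadgroups */
	percpu_down_write(&threadgroup_rwsem);
	for_each_thread(p, t)
		examine_thread(t);	/* hypothetical per-thread work */
	percpu_up_write(&threadgroup_rwsem);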

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/exec.c                               |  1 +
 include/linux/cgroup-defs.h             | 33 ------------------
 include/linux/cgroup.h                  | 11 +++---
 include/linux/sched/threadgroup_rwsem.h | 46 +++++++++++++++++++++++++
 init/Kconfig                            |  4 +++
 kernel/cgroup/cgroup-v1.c               |  1 +
 kernel/cgroup/cgroup.c                  | 38 +++++++++++++-------
 kernel/fork.c                           | 10 +++++-
 kernel/sched/core.c                     |  4 +++
 kernel/sched/sched.h                    |  1 +
 kernel/signal.c                         |  1 +
 11 files changed, 98 insertions(+), 52 deletions(-)
 create mode 100644 include/linux/sched/threadgroup_rwsem.h

diff --git a/fs/exec.c b/fs/exec.c
index caedd06a6d472..b18abc76e1ce0 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -39,6 +39,7 @@
 #include <linux/sched/signal.h>
 #include <linux/sched/numa_balancing.h>
 #include <linux/sched/task.h>
+#include <linux/sched/threadgroup_rwsem.h>
 #include <linux/pagemap.h>
 #include <linux/perf_event.h>
 #include <linux/highmem.h>
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1a77731e33096..b7e89b0c17057 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -16,7 +16,6 @@
 #include <linux/rcupdate.h>
 #include <linux/refcount.h>
 #include <linux/percpu-refcount.h>
-#include <linux/percpu-rwsem.h>
 #include <linux/u64_stats_sync.h>
 #include <linux/workqueue.h>
 #include <linux/bpf-cgroup.h>
@@ -708,42 +707,10 @@ struct cgroup_subsys {
 	unsigned int depends_on;
 };
 
-extern struct percpu_rw_semaphore threadgroup_rwsem;
-
-/**
- * threadgroup_change_begin - threadgroup exclusion for cgroups
- * @tsk: target task
- *
- * Allows cgroup operations to synchronize against threadgroup changes
- * using a percpu_rw_semaphore.
- */
-static inline void threadgroup_change_begin(struct task_struct *tsk)
-{
-	percpu_down_read(&threadgroup_rwsem);
-}
-
-/**
- * threadgroup_change_end - threadgroup exclusion for cgroups
- * @tsk: target task
- *
- * Counterpart of threadgroup_change_begin().
- */
-static inline void threadgroup_change_end(struct task_struct *tsk)
-{
-	percpu_up_read(&threadgroup_rwsem);
-}
-
 #else	/* CONFIG_CGROUPS */
 
 #define CGROUP_SUBSYS_COUNT 0
 
-static inline void threadgroup_change_begin(struct task_struct *tsk)
-{
-	might_sleep();
-}
-
-static inline void threadgroup_change_end(struct task_struct *tsk) {}
-
 #endif	/* CONFIG_CGROUPS */
 
 #ifdef CONFIG_SOCK_CGROUP_DATA
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 75c151413fda8..aa3df6361105f 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -121,12 +121,10 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 		     struct pid *pid, struct task_struct *tsk);
 
 void cgroup_fork(struct task_struct *p);
-extern int cgroup_can_fork(struct task_struct *p,
-			   struct kernel_clone_args *kargs);
-extern void cgroup_cancel_fork(struct task_struct *p,
-			       struct kernel_clone_args *kargs);
-extern void cgroup_post_fork(struct task_struct *p,
-			     struct kernel_clone_args *kargs);
+void cgroup_prep_fork(struct kernel_clone_args *kargs);
+int cgroup_can_fork(struct task_struct *p, struct kernel_clone_args *kargs);
+void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs);
+void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs);
 void cgroup_exit(struct task_struct *p);
 void cgroup_release(struct task_struct *p);
 void cgroup_free(struct task_struct *p);
@@ -713,6 +711,7 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
 				    struct dentry *dentry) { return -EINVAL; }
 
 static inline void cgroup_fork(struct task_struct *p) {}
+static inline void cgroup_prep_fork(struct kernel_clone_args *kargs) { }
 static inline int cgroup_can_fork(struct task_struct *p,
 				  struct kernel_clone_args *kargs) { return 0; }
 static inline void cgroup_cancel_fork(struct task_struct *p,
diff --git a/include/linux/sched/threadgroup_rwsem.h b/include/linux/sched/threadgroup_rwsem.h
new file mode 100644
index 0000000000000..31ab72703724b
--- /dev/null
+++ b/include/linux/sched/threadgroup_rwsem.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_THREADGROUP_RWSEM_H
+#define _LINUX_SCHED_THREADGROUP_RWSEM_H
+
+#ifdef CONFIG_THREADGROUP_RWSEM
+/* including this before the task_struct definition causes a dependency loop */
+#include <linux/percpu-rwsem.h>
+
+extern struct percpu_rw_semaphore threadgroup_rwsem;
+
+/**
+ * threadgroup_change_begin - mark the beginning of changes to a threadgroup
+ * @tsk: task causing the changes
+ *
+ * All operations which modify a threadgroup - a new thread joining the group,
+ * death of a member thread (the assertion of PF_EXITING) and exec(2)
+ * dethreading the process and replacing the leader - read-lock
+ * threadgroup_rwsem, so that write-locking it stabilizes threadgroups.
+ */
+static inline void threadgroup_change_begin(struct task_struct *tsk)
+{
+	percpu_down_read(&threadgroup_rwsem);
+}
+
+/**
+ * threadgroup_change_end - mark the end of changes to a threadgroup
+ * @tsk: task causing the changes
+ *
+ * See threadgroup_change_begin().
+ */
+static inline void threadgroup_change_end(struct task_struct *tsk)
+{
+	percpu_up_read(&threadgroup_rwsem);
+}
+#else
+static inline void threadgroup_change_begin(struct task_struct *tsk)
+{
+	might_sleep();
+}
+
+static inline void threadgroup_change_end(struct task_struct *tsk)
+{
+}
+#endif
+
+#endif
diff --git a/init/Kconfig b/init/Kconfig
index 11f8a845f259d..3a3699ccff3ce 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -917,8 +917,12 @@ config NUMA_BALANCING_DEFAULT_ENABLED
 	  If set, automatic NUMA balancing will be enabled if running on a NUMA
 	  machine.
 
+config THREADGROUP_RWSEM
+	bool
+
 menuconfig CGROUPS
 	bool "Control Group support"
+	select THREADGROUP_RWSEM
 	select KERNFS
 	help
 	  This option adds support for grouping sets of processes together, for
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 03808e7deb2ea..9c747e258ae7c 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -8,6 +8,7 @@
 #include <linux/mm.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/task.h>
+#include <linux/sched/threadgroup_rwsem.h>
 #include <linux/magic.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 2fd01c901b1ae..937888386210a 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -42,6 +42,7 @@
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
 #include <linux/sched/task.h>
+#include <linux/sched/threadgroup_rwsem.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/percpu-rwsem.h>
@@ -109,8 +110,6 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
  */
 static DEFINE_SPINLOCK(cgroup_file_kn_lock);
 
-DEFINE_PERCPU_RWSEM(threadgroup_rwsem);
-
 #define cgroup_assert_mutex_or_rcu_locked()				\
 	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
 			   !lockdep_is_held(&cgroup_mutex),		\
@@ -6050,7 +6049,6 @@ static struct cgroup *cgroup_get_from_file(struct file *f)
  * to the target cgroup.
  */
 static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
-	__acquires(&cgroup_mutex) __acquires(&threadgroup_rwsem)
 {
 	int ret;
 	struct cgroup *dst_cgrp = NULL;
@@ -6058,11 +6056,6 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
 	struct super_block *sb;
 	struct file *f;
 
-	if (kargs->flags & CLONE_INTO_CGROUP)
-		mutex_lock(&cgroup_mutex);
-
-	threadgroup_change_begin(current);
-
 	spin_lock_irq(&css_set_lock);
 	cset = task_css_set(current);
 	get_css_set(cset);
@@ -6118,7 +6111,6 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
 	return ret;
 
 err:
-	threadgroup_change_end(current);
 	mutex_unlock(&cgroup_mutex);
 	if (f)
 		fput(f);
@@ -6138,10 +6130,8 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
  * CLONE_INTO_CGROUP was requested.
  */
 static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
-	__releases(&threadgroup_rwsem) __releases(&cgroup_mutex)
+	__releases(&cgroup_mutex)
 {
-	threadgroup_change_end(current);
-
 	if (kargs->flags & CLONE_INTO_CGROUP) {
 		struct cgroup *cgrp = kargs->cgrp;
 		struct css_set *cset = kargs->cset;
@@ -6160,9 +6150,26 @@ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
 	}
 }
 
+/**
+ * cgroup_prep_fork - called during fork before threadgroup_rwsem is acquired
+ * @kargs: the arguments passed to create the child process
+ *
+ * CLONE_INTO_CGROUP requires cgroup_mutex as we're migrating while forking.
+ * However, cgroup_mutex must nest outside threadgroup_rwsem, which is
+ * read-locked before cgroup_can_fork(). cgroup_mutex locking is thus broken
+ * out into this function to preserve the locking order.
+ */
+void cgroup_prep_fork(struct kernel_clone_args *kargs)
+	__acquires(&cgroup_mutex)
+{
+	if (kargs->flags & CLONE_INTO_CGROUP)
+		mutex_lock(&cgroup_mutex);
+}
+
 /**
  * cgroup_can_fork - called on a new task before the process is exposed
  * @child: the child process
+ * @kargs: the arguments passed to create the child process
  *
  * This prepares a new css_set for the child process which the child will
  * be attached to in cgroup_post_fork().
@@ -6175,6 +6182,13 @@ int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
 	struct cgroup_subsys *ss;
 	int i, j, ret;
 
+	/*
+	 * cgroup_mutex should have been acquired by cgroup_prep_fork() if
+	 * CLONE_INTO_CGROUP
+	 */
+	if (kargs->flags & CLONE_INTO_CGROUP)
+		lockdep_assert_held(&cgroup_mutex);
+
 	ret = cgroup_css_set_fork(kargs);
 	if (ret)
 		return ret;
diff --git a/kernel/fork.c b/kernel/fork.c
index 38681ad44c76b..34fb9db59148b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -23,6 +23,7 @@
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
 #include <linux/sched/cputime.h>
+#include <linux/sched/threadgroup_rwsem.h>
 #include <linux/seq_file.h>
 #include <linux/rtmutex.h>
 #include <linux/init.h>
@@ -2285,6 +2286,10 @@ static __latent_entropy struct task_struct *copy_process(
 	p->kretprobe_instances.first = NULL;
 #endif
 
+	cgroup_prep_fork(args);
+
+	threadgroup_change_begin(current);
+
 	/*
 	 * Ensure that the cgroup subsystem policies allow the new process to be
 	 * forked. It should be noted that the new process's css_set can be changed
@@ -2293,7 +2298,7 @@ static __latent_entropy struct task_struct *copy_process(
 	 */
 	retval = cgroup_can_fork(p, args);
 	if (retval)
-		goto bad_fork_put_pidfd;
+		goto bad_fork_threadgroup_change_end;
 
 	/*
 	 * From this point on we must avoid any synchronous user-space
@@ -2407,6 +2412,7 @@ static __latent_entropy struct task_struct *copy_process(
 	proc_fork_connector(p);
 	sched_post_fork(p);
 	cgroup_post_fork(p, args);
+	threadgroup_change_end(current);
 	perf_event_fork(p);
 
 	trace_task_newtask(p, clone_flags);
@@ -2421,6 +2427,8 @@ static __latent_entropy struct task_struct *copy_process(
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
 	cgroup_cancel_fork(p, args);
+bad_fork_threadgroup_change_end:
+	threadgroup_change_end(current);
 bad_fork_put_pidfd:
 	if (clone_flags & CLONE_PIDFD) {
 		fput(pidfile);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1bba4128a3e68..bee6bf6d9659d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -84,6 +84,10 @@ unsigned int sysctl_sched_rt_period = 1000000;
 
 __read_mostly int scheduler_running;
 
+#ifdef CONFIG_THREADGROUP_RWSEM
+DEFINE_PERCPU_RWSEM(threadgroup_rwsem);
+#endif
+
 #ifdef CONFIG_SCHED_CORE
 
 DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3d3e5793e1172..135e4265fd259 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -28,6 +28,7 @@
 #include <linux/sched/sysctl.h>
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
+#include <linux/sched/threadgroup_rwsem.h>
 #include <linux/sched/topology.h>
 #include <linux/sched/user.h>
 #include <linux/sched/wake_q.h>
diff --git a/kernel/signal.c b/kernel/signal.c
index f01b249369ce2..d46e63266faf4 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -20,6 +20,7 @@
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
 #include <linux/sched/cputime.h>
+#include <linux/sched/threadgroup_rwsem.h>
 #include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/proc_fs.h>
-- 
2.33.1

