* [PATCH 1/3] cgroup: Drop cgroup_ prefix from cgroup_threadgroup_rwsem and friends
2021-10-20 20:16 [RFC] bpf: Implement prealloc for task_local_storage Tejun Heo
@ 2021-10-20 20:17 ` Tejun Heo
2021-10-20 20:17 ` [PATCH 2/3] sched, cgroup: Generalize threadgroup_rwsem Tejun Heo
` (2 subsequent siblings)
3 siblings, 0 replies; 9+ messages in thread
From: Tejun Heo @ 2021-10-20 20:17 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Ingo Molnar, Peter Zijlstra
Cc: bpf, kernel-team, linux-kernel
From e9bad0a8967987edae58ad498b7ba5ba91923e1e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 19 Oct 2021 09:50:17 -1000
threadgroup stabilization is being generalized so that it can be used
outside cgroup. Let's drop the cgroup_ prefix in preparation. No functional
changes.
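For illustration, a minimal sketch of the call pattern after the rename
(mirroring the converted users below such as de_thread() and exit_signals();
the bracketed work is elided):

	threadgroup_change_begin(tsk);
	/* modify the threadgroup, e.g. set PF_EXITING or dethread on exec */
	threadgroup_change_end(tsk);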
Signed-off-by: Tejun Heo <tj@kernel.org>
---
fs/exec.c | 6 ++---
include/linux/cgroup-defs.h | 20 +++++++-------
kernel/cgroup/cgroup-internal.h | 4 +--
kernel/cgroup/cgroup-v1.c | 8 +++---
kernel/cgroup/cgroup.c | 48 ++++++++++++++++-----------------
kernel/cgroup/pids.c | 2 +-
kernel/signal.c | 6 ++---
7 files changed, 47 insertions(+), 47 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c
index a098c133d8d7..caedd06a6d47 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1080,7 +1080,7 @@ static int de_thread(struct task_struct *tsk)
struct task_struct *leader = tsk->group_leader;
for (;;) {
- cgroup_threadgroup_change_begin(tsk);
+ threadgroup_change_begin(tsk);
write_lock_irq(&tasklist_lock);
/*
* Do this under tasklist_lock to ensure that
@@ -1091,7 +1091,7 @@ static int de_thread(struct task_struct *tsk)
break;
__set_current_state(TASK_KILLABLE);
write_unlock_irq(&tasklist_lock);
- cgroup_threadgroup_change_end(tsk);
+ threadgroup_change_end(tsk);
schedule();
if (__fatal_signal_pending(tsk))
goto killed;
@@ -1146,7 +1146,7 @@ static int de_thread(struct task_struct *tsk)
if (unlikely(leader->ptrace))
__wake_up_parent(leader, leader->parent);
write_unlock_irq(&tasklist_lock);
- cgroup_threadgroup_change_end(tsk);
+ threadgroup_change_end(tsk);
release_task(leader);
}
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index db2e147e069f..1a77731e3309 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -708,41 +708,41 @@ struct cgroup_subsys {
unsigned int depends_on;
};
-extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+extern struct percpu_rw_semaphore threadgroup_rwsem;
/**
- * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups
+ * threadgroup_change_begin - threadgroup exclusion for cgroups
* @tsk: target task
*
* Allows cgroup operations to synchronize against threadgroup changes
* using a percpu_rw_semaphore.
*/
-static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
+static inline void threadgroup_change_begin(struct task_struct *tsk)
{
- percpu_down_read(&cgroup_threadgroup_rwsem);
+ percpu_down_read(&threadgroup_rwsem);
}
/**
- * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups
+ * threadgroup_change_end - threadgroup exclusion for cgroups
* @tsk: target task
*
- * Counterpart of cgroup_threadcgroup_change_begin().
+ * Counterpart of threadgroup_change_begin().
*/
-static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
+static inline void threadgroup_change_end(struct task_struct *tsk)
{
- percpu_up_read(&cgroup_threadgroup_rwsem);
+ percpu_up_read(&threadgroup_rwsem);
}
#else /* CONFIG_CGROUPS */
#define CGROUP_SUBSYS_COUNT 0
-static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
+static inline void threadgroup_change_begin(struct task_struct *tsk)
{
might_sleep();
}
-static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {}
+static inline void threadgroup_change_end(struct task_struct *tsk) {}
#endif /* CONFIG_CGROUPS */
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index bfbeabc17a9d..9f76fc5aec8d 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -233,9 +233,9 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
bool threadgroup);
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
bool *locked)
- __acquires(&cgroup_threadgroup_rwsem);
+ __acquires(&threadgroup_rwsem);
void cgroup_procs_write_finish(struct task_struct *task, bool locked)
- __releases(&cgroup_threadgroup_rwsem);
+ __releases(&threadgroup_rwsem);
void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 35b920328344..03808e7deb2e 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -59,7 +59,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
int retval = 0;
mutex_lock(&cgroup_mutex);
- percpu_down_write(&cgroup_threadgroup_rwsem);
+ percpu_down_write(&threadgroup_rwsem);
for_each_root(root) {
struct cgroup *from_cgrp;
@@ -74,7 +74,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
if (retval)
break;
}
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ percpu_up_write(&threadgroup_rwsem);
mutex_unlock(&cgroup_mutex);
return retval;
@@ -111,7 +111,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
mutex_lock(&cgroup_mutex);
- percpu_down_write(&cgroup_threadgroup_rwsem);
+ percpu_down_write(&threadgroup_rwsem);
/* all tasks in @from are being moved, all csets are source */
spin_lock_irq(&css_set_lock);
@@ -147,7 +147,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
} while (task && !ret);
out_err:
cgroup_migrate_finish(&mgctx);
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ percpu_up_write(&threadgroup_rwsem);
mutex_unlock(&cgroup_mutex);
return ret;
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 570b0c97392a..2fd01c901b1a 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -109,7 +109,7 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
*/
static DEFINE_SPINLOCK(cgroup_file_kn_lock);
-DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
+DEFINE_PERCPU_RWSEM(threadgroup_rwsem);
#define cgroup_assert_mutex_or_rcu_locked() \
RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
@@ -918,9 +918,9 @@ static void css_set_move_task(struct task_struct *task,
if (to_cset) {
/*
- * We are synchronized through cgroup_threadgroup_rwsem
- * against PF_EXITING setting such that we can't race
- * against cgroup_exit()/cgroup_free() dropping the css_set.
+ * We are synchronized through threadgroup_rwsem against
+ * PF_EXITING setting such that we can't race against
+ * cgroup_exit()/cgroup_free() dropping the css_set.
*/
WARN_ON_ONCE(task->flags & PF_EXITING);
@@ -2338,7 +2338,7 @@ static void cgroup_migrate_add_task(struct task_struct *task,
if (task->flags & PF_EXITING)
return;
- /* cgroup_threadgroup_rwsem protects racing against forks */
+ /* threadgroup_rwsem protects racing against forks */
WARN_ON_ONCE(list_empty(&task->cg_list));
cset = task_css_set(task);
@@ -2602,7 +2602,7 @@ void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
* @src_cset and add it to @mgctx->src_csets, which should later be cleaned
* up by cgroup_migrate_finish().
*
- * This function may be called without holding cgroup_threadgroup_rwsem
+ * This function may be called without holding threadgroup_rwsem
* even if the target is a process. Threads may be created and destroyed
* but as long as cgroup_mutex is not dropped, no new css_set can be put
* into play and the preloaded css_sets are guaranteed to cover all
@@ -2711,7 +2711,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
* @mgctx: migration context
*
* Migrate a process or task denoted by @leader. If migrating a process,
- * the caller must be holding cgroup_threadgroup_rwsem. The caller is also
+ * the caller must be holding threadgroup_rwsem. The caller is also
* responsible for invoking cgroup_migrate_add_src() and
* cgroup_migrate_prepare_dst() on the targets before invoking this
* function and following up with cgroup_migrate_finish().
@@ -2752,7 +2752,7 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
* @leader: the task or the leader of the threadgroup to be attached
* @threadgroup: attach the whole threadgroup?
*
- * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
+ * Call holding cgroup_mutex and threadgroup_rwsem.
*/
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
bool threadgroup)
@@ -2788,7 +2788,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
bool *locked)
- __acquires(&cgroup_threadgroup_rwsem)
+ __acquires(&threadgroup_rwsem)
{
struct task_struct *tsk;
pid_t pid;
@@ -2806,7 +2806,7 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
*/
lockdep_assert_held(&cgroup_mutex);
if (pid || threadgroup) {
- percpu_down_write(&cgroup_threadgroup_rwsem);
+ percpu_down_write(&threadgroup_rwsem);
*locked = true;
} else {
*locked = false;
@@ -2842,7 +2842,7 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
out_unlock_threadgroup:
if (*locked) {
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ percpu_up_write(&threadgroup_rwsem);
*locked = false;
}
out_unlock_rcu:
@@ -2851,7 +2851,7 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
}
void cgroup_procs_write_finish(struct task_struct *task, bool locked)
- __releases(&cgroup_threadgroup_rwsem)
+ __releases(&threadgroup_rwsem)
{
struct cgroup_subsys *ss;
int ssid;
@@ -2860,7 +2860,7 @@ void cgroup_procs_write_finish(struct task_struct *task, bool locked)
put_task_struct(task);
if (locked)
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ percpu_up_write(&threadgroup_rwsem);
for_each_subsys(ss, ssid)
if (ss->post_attach)
ss->post_attach();
@@ -2919,7 +2919,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
lockdep_assert_held(&cgroup_mutex);
- percpu_down_write(&cgroup_threadgroup_rwsem);
+ percpu_down_write(&threadgroup_rwsem);
/* look up all csses currently attached to @cgrp's subtree */
spin_lock_irq(&css_set_lock);
@@ -2949,7 +2949,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
ret = cgroup_migrate_execute(&mgctx);
out_finish:
cgroup_migrate_finish(&mgctx);
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ percpu_up_write(&threadgroup_rwsem);
return ret;
}
@@ -5784,7 +5784,7 @@ int __init cgroup_init(void)
* The latency of the synchronize_rcu() is too high for cgroups,
* avoid it at the cost of forcing all readers into the slow path.
*/
- rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
+ rcu_sync_enter_start(&threadgroup_rwsem.rss);
get_user_ns(init_cgroup_ns.user_ns);
@@ -6044,13 +6044,13 @@ static struct cgroup *cgroup_get_from_file(struct file *f)
* If CLONE_INTO_CGROUP is specified this function will try to find an
* existing css_set which includes the requested cgroup and if not create
* a new css_set that the child will be attached to later. If this function
- * succeeds it will hold cgroup_threadgroup_rwsem on return. If
+ * succeeds it will hold threadgroup_rwsem on return. If
* CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
- * before grabbing cgroup_threadgroup_rwsem and will hold a reference
+ * before grabbing threadgroup_rwsem and will hold a reference
* to the target cgroup.
*/
static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
- __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
+ __acquires(&cgroup_mutex) __acquires(&threadgroup_rwsem)
{
int ret;
struct cgroup *dst_cgrp = NULL;
@@ -6061,7 +6061,7 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
if (kargs->flags & CLONE_INTO_CGROUP)
mutex_lock(&cgroup_mutex);
- cgroup_threadgroup_change_begin(current);
+ threadgroup_change_begin(current);
spin_lock_irq(&css_set_lock);
cset = task_css_set(current);
@@ -6118,7 +6118,7 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
return ret;
err:
- cgroup_threadgroup_change_end(current);
+ threadgroup_change_end(current);
mutex_unlock(&cgroup_mutex);
if (f)
fput(f);
@@ -6138,9 +6138,9 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
* CLONE_INTO_CGROUP was requested.
*/
static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
- __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
+ __releases(&threadgroup_rwsem) __releases(&cgroup_mutex)
{
- cgroup_threadgroup_change_end(current);
+ threadgroup_change_end(current);
if (kargs->flags & CLONE_INTO_CGROUP) {
struct cgroup *cgrp = kargs->cgrp;
@@ -6231,7 +6231,7 @@ void cgroup_cancel_fork(struct task_struct *child,
*/
void cgroup_post_fork(struct task_struct *child,
struct kernel_clone_args *kargs)
- __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
+ __releases(&threadgroup_rwsem) __releases(&cgroup_mutex)
{
unsigned long cgrp_flags = 0;
bool kill = false;
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 511af87f685e..368bc3ea4dbb 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -213,7 +213,7 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
/*
* task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
- * on cgroup_threadgroup_change_begin() held by the copy_process().
+ * on threadgroup_change_begin() held by the copy_process().
*/
static int pids_can_fork(struct task_struct *task, struct css_set *cset)
{
diff --git a/kernel/signal.c b/kernel/signal.c
index 952741f6d0f9..f01b249369ce 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2956,11 +2956,11 @@ void exit_signals(struct task_struct *tsk)
* @tsk is about to have PF_EXITING set - lock out users which
* expect stable threadgroup.
*/
- cgroup_threadgroup_change_begin(tsk);
+ threadgroup_change_begin(tsk);
if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
tsk->flags |= PF_EXITING;
- cgroup_threadgroup_change_end(tsk);
+ threadgroup_change_end(tsk);
return;
}
@@ -2971,7 +2971,7 @@ void exit_signals(struct task_struct *tsk)
*/
tsk->flags |= PF_EXITING;
- cgroup_threadgroup_change_end(tsk);
+ threadgroup_change_end(tsk);
if (!task_sigpending(tsk))
goto out;
--
2.33.1
* [PATCH 2/3] sched, cgroup: Generalize threadgroup_rwsem
2021-10-20 20:16 [RFC] bpf: Implement prealloc for task_local_storage Tejun Heo
2021-10-20 20:17 ` [PATCH 1/3] cgroup: Drop cgroup_ prefix from cgroup_threadgroup_rwsem and friends Tejun Heo
@ 2021-10-20 20:17 ` Tejun Heo
2021-10-20 20:18 ` [PATCH 3/3] bpf: Implement prealloc for task_local_storage Tejun Heo
2021-10-22 17:01 ` [RFC] " Andrii Nakryiko
3 siblings, 0 replies; 9+ messages in thread
From: Tejun Heo @ 2021-10-20 20:17 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Ingo Molnar, Peter Zijlstra
Cc: bpf, kernel-team, linux-kernel
From 1b07d36b074acb8a97c8bb5c0f1604960763578e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 19 Oct 2021 10:12:27 -1000
Generalize threadgroup stabilization through threadgroup_rwsem so that it
can be used outside cgroup.
* A new config option, CONFIG_THREADGROUP_RWSEM, selected by CONFIG_CGROUPS,
enables threadgroup_rwsem.
* The declarations are moved to linux/sched/threadgroup_rwsem.h and the
rwsem is now defined in kernel/sched/core.c.
* cgroup_mutex nests outside threadgroup_rwsem. During fork,
cgroup_css_set_fork(), which is called from cgroup_can_fork(), was acquiring
both. However, generalizing threadgroup_rwsem means that it needs to be
acquired and released in the outer copy_process(). To maintain the locking
order, break out the cgroup_mutex acquisition into a separate function,
cgroup_prep_fork(), which copy_process() calls before acquiring
threadgroup_rwsem.
No functional changes.
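For reference, a minimal sketch of the resulting ordering in copy_process()
(mirroring the fork.c hunk below; error handling abbreviated):

	cgroup_prep_fork(args);            /* takes cgroup_mutex if CLONE_INTO_CGROUP */
	threadgroup_change_begin(current); /* read-locks threadgroup_rwsem */

	retval = cgroup_can_fork(p, args);
	if (retval)
		goto bad_fork_threadgroup_change_end;

	/* ... remainder of copy_process() ... */

	cgroup_post_fork(p, args);         /* drops cgroup_mutex for CLONE_INTO_CGROUP */
	threadgroup_change_end(current);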
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christian Brauner <christian.brauner@ubuntu.com>
---
fs/exec.c | 1 +
include/linux/cgroup-defs.h | 33 ------------------
include/linux/cgroup.h | 11 +++---
include/linux/sched/threadgroup_rwsem.h | 46 +++++++++++++++++++++++++
init/Kconfig | 4 +++
kernel/cgroup/cgroup-v1.c | 1 +
kernel/cgroup/cgroup.c | 38 +++++++++++++-------
kernel/fork.c | 10 +++++-
kernel/sched/core.c | 4 +++
kernel/sched/sched.h | 1 +
kernel/signal.c | 1 +
11 files changed, 98 insertions(+), 52 deletions(-)
create mode 100644 include/linux/sched/threadgroup_rwsem.h
diff --git a/fs/exec.c b/fs/exec.c
index caedd06a6d472..b18abc76e1ce0 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -39,6 +39,7 @@
#include <linux/sched/signal.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
+#include <linux/sched/threadgroup_rwsem.h>
#include <linux/pagemap.h>
#include <linux/perf_event.h>
#include <linux/highmem.h>
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1a77731e33096..b7e89b0c17057 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -16,7 +16,6 @@
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/percpu-refcount.h>
-#include <linux/percpu-rwsem.h>
#include <linux/u64_stats_sync.h>
#include <linux/workqueue.h>
#include <linux/bpf-cgroup.h>
@@ -708,42 +707,10 @@ struct cgroup_subsys {
unsigned int depends_on;
};
-extern struct percpu_rw_semaphore threadgroup_rwsem;
-
-/**
- * threadgroup_change_begin - threadgroup exclusion for cgroups
- * @tsk: target task
- *
- * Allows cgroup operations to synchronize against threadgroup changes
- * using a percpu_rw_semaphore.
- */
-static inline void threadgroup_change_begin(struct task_struct *tsk)
-{
- percpu_down_read(&threadgroup_rwsem);
-}
-
-/**
- * threadgroup_change_end - threadgroup exclusion for cgroups
- * @tsk: target task
- *
- * Counterpart of threadgroup_change_begin().
- */
-static inline void threadgroup_change_end(struct task_struct *tsk)
-{
- percpu_up_read(&threadgroup_rwsem);
-}
-
#else /* CONFIG_CGROUPS */
#define CGROUP_SUBSYS_COUNT 0
-static inline void threadgroup_change_begin(struct task_struct *tsk)
-{
- might_sleep();
-}
-
-static inline void threadgroup_change_end(struct task_struct *tsk) {}
-
#endif /* CONFIG_CGROUPS */
#ifdef CONFIG_SOCK_CGROUP_DATA
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 75c151413fda8..aa3df6361105f 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -121,12 +121,10 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *tsk);
void cgroup_fork(struct task_struct *p);
-extern int cgroup_can_fork(struct task_struct *p,
- struct kernel_clone_args *kargs);
-extern void cgroup_cancel_fork(struct task_struct *p,
- struct kernel_clone_args *kargs);
-extern void cgroup_post_fork(struct task_struct *p,
- struct kernel_clone_args *kargs);
+void cgroup_prep_fork(struct kernel_clone_args *kargs);
+int cgroup_can_fork(struct task_struct *p, struct kernel_clone_args *kargs);
+void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs);
+void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs);
void cgroup_exit(struct task_struct *p);
void cgroup_release(struct task_struct *p);
void cgroup_free(struct task_struct *p);
@@ -713,6 +711,7 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
struct dentry *dentry) { return -EINVAL; }
static inline void cgroup_fork(struct task_struct *p) {}
+static inline void cgroup_prep_fork(struct kernel_clone_args *kargs) { }
static inline int cgroup_can_fork(struct task_struct *p,
struct kernel_clone_args *kargs) { return 0; }
static inline void cgroup_cancel_fork(struct task_struct *p,
diff --git a/include/linux/sched/threadgroup_rwsem.h b/include/linux/sched/threadgroup_rwsem.h
new file mode 100644
index 0000000000000..31ab72703724b
--- /dev/null
+++ b/include/linux/sched/threadgroup_rwsem.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_THREADGROUP_RWSEM_H
+#define _LINUX_SCHED_THREADGROUP_RWSEM_H
+
+#ifdef CONFIG_THREADGROUP_RWSEM
+/* including before task_struct definition causes dependency loop */
+#include <linux/percpu-rwsem.h>
+
+extern struct percpu_rw_semaphore threadgroup_rwsem;
+
+/**
+ * threadgroup_change_begin - mark the beginning of changes to a threadgroup
+ * @tsk: task causing the changes
+ *
+ * All operations which modify a threadgroup - a new thread joining the group,
+ * death of a member thread (the assertion of PF_EXITING) and exec(2)
+ * dethreading the process and replacing the leader - read-locks
+ * threadgroup_rwsem so that write-locking stabilizes thread groups.
+ */
+static inline void threadgroup_change_begin(struct task_struct *tsk)
+{
+ percpu_down_read(&threadgroup_rwsem);
+}
+
+/**
+ * threadgroup_change_end - mark the end of changes to a threadgroup
+ * @tsk: task causing the changes
+ *
+ * See threadgroup_change_begin().
+ */
+static inline void threadgroup_change_end(struct task_struct *tsk)
+{
+ percpu_up_read(&threadgroup_rwsem);
+}
+#else
+static inline void threadgroup_change_begin(struct task_struct *tsk)
+{
+ might_sleep();
+}
+
+static inline void threadgroup_change_end(struct task_struct *tsk)
+{
+}
+#endif
+
+#endif
diff --git a/init/Kconfig b/init/Kconfig
index 11f8a845f259d..3a3699ccff3ce 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -917,8 +917,12 @@ config NUMA_BALANCING_DEFAULT_ENABLED
If set, automatic NUMA balancing will be enabled if running on a NUMA
machine.
+config THREADGROUP_RWSEM
+ bool
+
menuconfig CGROUPS
bool "Control Group support"
+ select THREADGROUP_RWSEM
select KERNFS
help
This option adds support for grouping sets of processes together, for
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 03808e7deb2ea..9c747e258ae7c 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -8,6 +8,7 @@
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
+#include <linux/sched/threadgroup_rwsem.h>
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 2fd01c901b1ae..937888386210a 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -42,6 +42,7 @@
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
+#include <linux/sched/threadgroup_rwsem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/percpu-rwsem.h>
@@ -109,8 +110,6 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
*/
static DEFINE_SPINLOCK(cgroup_file_kn_lock);
-DEFINE_PERCPU_RWSEM(threadgroup_rwsem);
-
#define cgroup_assert_mutex_or_rcu_locked() \
RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
!lockdep_is_held(&cgroup_mutex), \
@@ -6050,7 +6049,6 @@ static struct cgroup *cgroup_get_from_file(struct file *f)
* to the target cgroup.
*/
static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
- __acquires(&cgroup_mutex) __acquires(&threadgroup_rwsem)
{
int ret;
struct cgroup *dst_cgrp = NULL;
@@ -6058,11 +6056,6 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
struct super_block *sb;
struct file *f;
- if (kargs->flags & CLONE_INTO_CGROUP)
- mutex_lock(&cgroup_mutex);
-
- threadgroup_change_begin(current);
-
spin_lock_irq(&css_set_lock);
cset = task_css_set(current);
get_css_set(cset);
@@ -6118,7 +6111,6 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
return ret;
err:
- threadgroup_change_end(current);
mutex_unlock(&cgroup_mutex);
if (f)
fput(f);
@@ -6138,10 +6130,8 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
* CLONE_INTO_CGROUP was requested.
*/
static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
- __releases(&threadgroup_rwsem) __releases(&cgroup_mutex)
+ __releases(&cgroup_mutex)
{
- threadgroup_change_end(current);
-
if (kargs->flags & CLONE_INTO_CGROUP) {
struct cgroup *cgrp = kargs->cgrp;
struct css_set *cset = kargs->cset;
@@ -6160,9 +6150,26 @@ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
}
}
+/**
+ * cgroup_prep_fork - called during fork before threadgroup_rwsem is acquired
+ * @kargs: the arguments passed to create the child process
+ *
+ * CLONE_INTO_CGROUP requires cgroup_mutex as we're migrating while forking.
+ * However, cgroup_mutex must nest outside threadgroup_rwsem which is
+ * read-locked before cgroup_can_fork(). Break out cgroup_mutex locking to this
+ * function to follow the locking order.
+ */
+void cgroup_prep_fork(struct kernel_clone_args *kargs)
+ __acquires(&cgroup_mutex)
+{
+ if (kargs->flags & CLONE_INTO_CGROUP)
+ mutex_lock(&cgroup_mutex);
+}
+
/**
* cgroup_can_fork - called on a new task before the process is exposed
* @child: the child process
+ * @kargs: the arguments passed to create the child process
*
* This prepares a new css_set for the child process which the child will
* be attached to in cgroup_post_fork().
@@ -6175,6 +6182,13 @@ int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
struct cgroup_subsys *ss;
int i, j, ret;
+ /*
+ * cgroup_mutex should have been acquired by cgroup_prep_fork() if
+ * CLONE_INTO_CGROUP
+ */
+ if (kargs->flags & CLONE_INTO_CGROUP)
+ lockdep_assert_held(&cgroup_mutex);
+
ret = cgroup_css_set_fork(kargs);
if (ret)
return ret;
diff --git a/kernel/fork.c b/kernel/fork.c
index 38681ad44c76b..34fb9db59148b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -23,6 +23,7 @@
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
+#include <linux/sched/threadgroup_rwsem.h>
#include <linux/seq_file.h>
#include <linux/rtmutex.h>
#include <linux/init.h>
@@ -2285,6 +2286,10 @@ static __latent_entropy struct task_struct *copy_process(
p->kretprobe_instances.first = NULL;
#endif
+ cgroup_prep_fork(args);
+
+ threadgroup_change_begin(current);
+
/*
* Ensure that the cgroup subsystem policies allow the new process to be
* forked. It should be noted that the new process's css_set can be changed
@@ -2293,7 +2298,7 @@ static __latent_entropy struct task_struct *copy_process(
*/
retval = cgroup_can_fork(p, args);
if (retval)
- goto bad_fork_put_pidfd;
+ goto bad_fork_threadgroup_change_end;
/*
* From this point on we must avoid any synchronous user-space
@@ -2407,6 +2412,7 @@ static __latent_entropy struct task_struct *copy_process(
proc_fork_connector(p);
sched_post_fork(p);
cgroup_post_fork(p, args);
+ threadgroup_change_end(current);
perf_event_fork(p);
trace_task_newtask(p, clone_flags);
@@ -2421,6 +2427,8 @@ static __latent_entropy struct task_struct *copy_process(
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
cgroup_cancel_fork(p, args);
+bad_fork_threadgroup_change_end:
+ threadgroup_change_end(current);
bad_fork_put_pidfd:
if (clone_flags & CLONE_PIDFD) {
fput(pidfile);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1bba4128a3e68..bee6bf6d9659d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -84,6 +84,10 @@ unsigned int sysctl_sched_rt_period = 1000000;
__read_mostly int scheduler_running;
+#ifdef CONFIG_THREADGROUP_RWSEM
+DEFINE_PERCPU_RWSEM(threadgroup_rwsem);
+#endif
+
#ifdef CONFIG_SCHED_CORE
DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3d3e5793e1172..135e4265fd259 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -28,6 +28,7 @@
#include <linux/sched/sysctl.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
+#include <linux/sched/threadgroup_rwsem.h>
#include <linux/sched/topology.h>
#include <linux/sched/user.h>
#include <linux/sched/wake_q.h>
diff --git a/kernel/signal.c b/kernel/signal.c
index f01b249369ce2..d46e63266faf4 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -20,6 +20,7 @@
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
+#include <linux/sched/threadgroup_rwsem.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
--
2.33.1
* [PATCH 3/3] bpf: Implement prealloc for task_local_storage
2021-10-20 20:16 [RFC] bpf: Implement prealloc for task_local_storage Tejun Heo
2021-10-20 20:17 ` [PATCH 1/3] cgroup: Drop cgroup_ prefix from cgroup_threadgroup_rwsem and friends Tejun Heo
2021-10-20 20:17 ` [PATCH 2/3] sched, cgroup: Generalize threadgroup_rwsem Tejun Heo
@ 2021-10-20 20:18 ` Tejun Heo
2021-10-21 9:50 ` kernel test robot
2021-10-22 22:47 ` Martin KaFai Lau
2021-10-22 17:01 ` [RFC] " Andrii Nakryiko
3 siblings, 2 replies; 9+ messages in thread
From: Tejun Heo @ 2021-10-20 20:18 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Ingo Molnar, Peter Zijlstra
Cc: bpf, kernel-team, linux-kernel
From 5e3ad0d4a0b0732e7ebe035582d282ab752397ed Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 20 Oct 2021 08:56:53 -1000
task_local_storage currently does not support pre-allocation; memory is
allocated on demand with the GFP_ATOMIC mask. While atomic allocations
succeed most of the time and the occasional failures aren't a problem for
many use cases, some can benefit from reliable allocations - e.g. tracking
acquisitions and releases of specific resources to root-cause long-term
reference leaks.
Prealloc semantics for task_local_storage:
* When a prealloc map is created, the map's elements for all existing tasks
are allocated.
* Afterwards, whenever a new task is forked, elements for all existing
prealloc maps are allocated for it automatically.
To synchronize against concurrent forks, CONFIG_BPF_SYSCALL now selects
CONFIG_THREADGROUP_RWSEM and the prealloc task_local_storage creation path
write-locks threadgroup_rwsem. The rest of the implementation is
straightforward.
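For illustration, a preallocated task storage map is declared by leaving
BPF_F_NO_PREALLOC out of map_flags, as in the selftest added below (a minimal
sketch; the int value type is just an example):

	struct {
		__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
		__uint(map_flags, 0);	/* no BPF_F_NO_PREALLOC -> preallocated */
		__type(key, int);
		__type(value, int);
	} prealloc_map SEC(".maps");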
Signed-off-by: Tejun Heo <tj@kernel.org>
---
include/linux/bpf.h | 6 +
include/linux/bpf_local_storage.h | 12 ++
kernel/bpf/Kconfig | 1 +
kernel/bpf/bpf_local_storage.c | 112 ++++++++++----
kernel/bpf/bpf_task_storage.c | 138 +++++++++++++++++-
kernel/fork.c | 8 +-
.../bpf/prog_tests/task_local_storage.c | 101 +++++++++++++
.../selftests/bpf/progs/task_ls_prealloc.c | 15 ++
8 files changed, 361 insertions(+), 32 deletions(-)
create mode 100644 tools/testing/selftests/bpf/progs/task_ls_prealloc.c
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d604c8251d88..7f9e5dea0660 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1673,6 +1673,7 @@ struct bpf_prog *bpf_prog_by_id(u32 id);
struct bpf_link *bpf_link_by_id(u32 id);
const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id);
+int bpf_task_storage_fork(struct task_struct *task);
void bpf_task_storage_free(struct task_struct *task);
bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog);
const struct btf_func_model *
@@ -1882,6 +1883,11 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return NULL;
}
+static inline int bpf_task_storage_fork(struct task_struct *p)
+{
+ return 0;
+}
+
static inline void bpf_task_storage_free(struct task_struct *task)
{
}
diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 24496bc28e7b..bbb4cedbd2b2 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -51,6 +51,12 @@ struct bpf_local_storage_map {
u32 bucket_log;
u16 elem_size;
u16 cache_idx;
+ /* Maps with prealloc need to be tracked and allocated when a new
+ * containing object is created. The following node can be used to keep
+ * track of the prealloc maps. Outside of initializing the field, the
+ * shared local_storage code doesn't use it directly.
+ */
+ struct list_head prealloc_node;
};
struct bpf_local_storage_data {
@@ -118,6 +124,7 @@ void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
/* Helper functions for bpf_local_storage */
int bpf_local_storage_map_alloc_check(union bpf_attr *attr);
+int bpf_local_storage_prealloc_map_alloc_check(union bpf_attr *attr);
struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr);
@@ -158,6 +165,11 @@ bpf_local_storage_alloc(void *owner,
struct bpf_local_storage_elem *first_selem);
struct bpf_local_storage_data *
+__bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
+ void *value, u64 map_flags,
+ struct bpf_local_storage **local_storage_prealloc,
+ struct bpf_local_storage_elem **selem_prealloc);
+struct bpf_local_storage_data *
bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
void *value, u64 map_flags);
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index a82d6de86522..4d816664026f 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -29,6 +29,7 @@ config BPF_SYSCALL
select IRQ_WORK
select TASKS_TRACE_RCU
select BINARY_PRINTF
+ select THREADGROUP_RWSEM
select NET_SOCK_MSG if NET
default n
help
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index b305270b7a4b..0a6bf3e4bbcd 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -258,24 +258,13 @@ static int check_flags(const struct bpf_local_storage_data *old_sdata,
return 0;
}
-int bpf_local_storage_alloc(void *owner,
- struct bpf_local_storage_map *smap,
- struct bpf_local_storage_elem *first_selem)
+static int bpf_local_storage_link(void *owner,
+ struct bpf_local_storage_map *smap,
+ struct bpf_local_storage_elem *first_selem,
+ struct bpf_local_storage *storage)
{
- struct bpf_local_storage *prev_storage, *storage;
+ struct bpf_local_storage *prev_storage;
struct bpf_local_storage **owner_storage_ptr;
- int err;
-
- err = mem_charge(smap, owner, sizeof(*storage));
- if (err)
- return err;
-
- storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
- GFP_ATOMIC | __GFP_NOWARN);
- if (!storage) {
- err = -ENOMEM;
- goto uncharge;
- }
INIT_HLIST_HEAD(&storage->list);
raw_spin_lock_init(&storage->lock);
@@ -299,8 +288,7 @@ int bpf_local_storage_alloc(void *owner,
prev_storage = cmpxchg(owner_storage_ptr, NULL, storage);
if (unlikely(prev_storage)) {
bpf_selem_unlink_map(first_selem);
- err = -EAGAIN;
- goto uncharge;
+ return -EAGAIN;
/* Note that even first_selem was linked to smap's
* bucket->list, first_selem can be freed immediately
@@ -313,6 +301,31 @@ int bpf_local_storage_alloc(void *owner,
}
return 0;
+}
+
+int bpf_local_storage_alloc(void *owner,
+ struct bpf_local_storage_map *smap,
+ struct bpf_local_storage_elem *first_selem)
+{
+ struct bpf_local_storage *storage;
+ int err;
+
+ err = mem_charge(smap, owner, sizeof(*storage));
+ if (err)
+ return err;
+
+ storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (!storage) {
+ err = -ENOMEM;
+ goto uncharge;
+ }
+
+ err = bpf_local_storage_link(owner, smap, first_selem, storage);
+ if (err)
+ goto uncharge;
+
+ return 0;
uncharge:
kfree(storage);
@@ -326,8 +339,10 @@ int bpf_local_storage_alloc(void *owner,
* during map destruction).
*/
struct bpf_local_storage_data *
-bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
- void *value, u64 map_flags)
+__bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
+ void *value, u64 map_flags,
+ struct bpf_local_storage **local_storage_prealloc,
+ struct bpf_local_storage_elem **selem_prealloc)
{
struct bpf_local_storage_data *old_sdata = NULL;
struct bpf_local_storage_elem *selem;
@@ -349,17 +364,30 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (err)
return ERR_PTR(err);
- selem = bpf_selem_alloc(smap, owner, value, true);
+ if (*selem_prealloc)
+ selem = *selem_prealloc;
+ else
+ selem = bpf_selem_alloc(smap, owner, value, true);
if (!selem)
return ERR_PTR(-ENOMEM);
- err = bpf_local_storage_alloc(owner, smap, selem);
+ if (*local_storage_prealloc) {
+ err = bpf_local_storage_link(owner, smap, selem,
+ *local_storage_prealloc);
+ } else {
+ err = bpf_local_storage_alloc(owner, smap, selem);
+ }
if (err) {
- kfree(selem);
- mem_uncharge(smap, owner, smap->elem_size);
+ if (!*selem_prealloc) {
+ kfree(selem);
+ mem_uncharge(smap, owner, smap->elem_size);
+ }
return ERR_PTR(err);
}
+ *selem_prealloc = NULL;
+ *local_storage_prealloc = NULL;
+
return SDATA(selem);
}
@@ -414,10 +442,15 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
* old_sdata will not be uncharged later during
* bpf_selem_unlink_storage_nolock().
*/
- selem = bpf_selem_alloc(smap, owner, value, !old_sdata);
- if (!selem) {
- err = -ENOMEM;
- goto unlock_err;
+ if (*selem_prealloc) {
+ selem = *selem_prealloc;
+ *selem_prealloc = NULL;
+ } else {
+ selem = bpf_selem_alloc(smap, owner, value, !old_sdata);
+ if (!selem) {
+ err = -ENOMEM;
+ goto unlock_err;
+ }
}
/* First, link the new selem to the map */
@@ -442,6 +475,17 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
return ERR_PTR(err);
}
+struct bpf_local_storage_data *
+bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
+ void *value, u64 map_flags)
+{
+ struct bpf_local_storage *local_storage_prealloc = NULL;
+ struct bpf_local_storage_elem *selem_prealloc = NULL;
+
+ return __bpf_local_storage_update(owner, smap, value, map_flags,
+ &local_storage_prealloc, &selem_prealloc);
+}
+
u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
{
u64 min_usage = U64_MAX;
@@ -536,10 +580,9 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
kfree(smap);
}
-int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
+int bpf_local_storage_prealloc_map_alloc_check(union bpf_attr *attr)
{
if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK ||
- !(attr->map_flags & BPF_F_NO_PREALLOC) ||
attr->max_entries ||
attr->key_size != sizeof(int) || !attr->value_size ||
/* Enforce BTF for userspace sk dumping */
@@ -555,6 +598,13 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
return 0;
}
+int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
+{
+ if (!(attr->map_flags & BPF_F_NO_PREALLOC))
+ return -EINVAL;
+ return bpf_local_storage_prealloc_map_alloc_check(attr);
+}
+
struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
{
struct bpf_local_storage_map *smap;
@@ -586,6 +636,8 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
smap->elem_size =
sizeof(struct bpf_local_storage_elem) + attr->value_size;
+ INIT_LIST_HEAD(&smap->prealloc_node);
+
return smap;
}
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index ebfa8bc90892..6f8b781647f7 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -17,11 +17,15 @@
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/fdtable.h>
+#include <linux/sched/threadgroup_rwsem.h>
DEFINE_BPF_STORAGE_CACHE(task_cache);
static DEFINE_PER_CPU(int, bpf_task_storage_busy);
+/* Protected by threadgroup_rwsem. */
+static LIST_HEAD(prealloc_smaps);
+
static void bpf_task_storage_lock(void)
{
migrate_disable();
@@ -280,14 +284,103 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
return -ENOTSUPP;
}
+static int task_storage_map_populate(struct bpf_local_storage_map *smap)
+{
+ struct bpf_local_storage *storage = NULL;
+ struct bpf_local_storage_elem *selem = NULL;
+ struct task_struct *p, *g;
+ int err = 0;
+
+ lockdep_assert_held(&threadgroup_rwsem);
+retry:
+ if (!storage)
+ storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
+ GFP_USER);
+ if (!selem)
+ selem = bpf_map_kzalloc(&smap->map, smap->elem_size, GFP_USER);
+ if (!storage || !selem) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+
+ rcu_read_lock();
+ bpf_task_storage_lock();
+
+ for_each_process_thread(g, p) {
+ struct bpf_local_storage_data *sdata;
+
+ /* Try inserting with atomic allocations. On failure, retry with
+ * the preallocated ones.
+ */
+ sdata = bpf_local_storage_update(p, smap, NULL, BPF_NOEXIST);
+
+ if (PTR_ERR(sdata) == -ENOMEM && storage && selem) {
+ sdata = __bpf_local_storage_update(p, smap, NULL,
+ BPF_NOEXIST,
+ &storage, &selem);
+ }
+
+ /* Check -EEXIST before need_resched() to guarantee forward
+ * progress.
+ */
+ if (PTR_ERR(sdata) == -EEXIST)
+ continue;
+
+ /* If requested or alloc failed, take a breather and loop back
+ * to preallocate.
+ */
+ if (need_resched() ||
+ PTR_ERR(sdata) == -EAGAIN || PTR_ERR(sdata) == -ENOMEM) {
+ bpf_task_storage_unlock();
+ rcu_read_unlock();
+ cond_resched();
+ goto retry;
+ }
+
+ if (IS_ERR(sdata)) {
+ err = PTR_ERR(sdata);
+ goto out_unlock;
+ }
+ }
+out_unlock:
+ bpf_task_storage_unlock();
+ rcu_read_unlock();
+out_free:
+ if (storage)
+ kfree(storage);
+ if (selem)
+ kfree(selem);
+ return err;
+}
+
static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
{
struct bpf_local_storage_map *smap;
+ int err;
smap = bpf_local_storage_map_alloc(attr);
if (IS_ERR(smap))
return ERR_CAST(smap);
+ if (!(attr->map_flags & BPF_F_NO_PREALLOC)) {
+ /* We're going to exercise the regular update path to populate
+ * the map for the existing tasks, which will call into map ops
+ * which is normally initialized after this function returns.
+ * Initialize it early here.
+ */
+ smap->map.ops = &task_storage_map_ops;
+
+ percpu_down_write(&threadgroup_rwsem);
+ list_add_tail(&smap->prealloc_node, &prealloc_smaps);
+ err = task_storage_map_populate(smap);
+ percpu_up_write(&threadgroup_rwsem);
+ if (err) {
+ bpf_local_storage_map_free(smap,
+ &bpf_task_storage_busy);
+ return ERR_PTR(err);
+ }
+ }
+
smap->cache_idx = bpf_local_storage_cache_idx_get(&task_cache);
return &smap->map;
}
@@ -298,13 +391,20 @@ static void task_storage_map_free(struct bpf_map *map)
smap = (struct bpf_local_storage_map *)map;
bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx);
+
+ if (!list_empty(&smap->prealloc_node)) {
+ percpu_down_write(&threadgroup_rwsem);
+ list_del_init(&smap->prealloc_node);
+ percpu_up_write(&threadgroup_rwsem);
+ }
+
bpf_local_storage_map_free(smap, &bpf_task_storage_busy);
}
static int task_storage_map_btf_id;
const struct bpf_map_ops task_storage_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
- .map_alloc_check = bpf_local_storage_map_alloc_check,
+ .map_alloc_check = bpf_local_storage_prealloc_map_alloc_check,
.map_alloc = task_storage_map_alloc,
.map_free = task_storage_map_free,
.map_get_next_key = notsupp_get_next_key,
@@ -317,6 +417,42 @@ const struct bpf_map_ops task_storage_map_ops = {
.map_owner_storage_ptr = task_storage_ptr,
};
+int bpf_task_storage_fork(struct task_struct *task)
+{
+ struct bpf_local_storage_map *smap;
+
+ percpu_rwsem_assert_held(&threadgroup_rwsem);
+
+ list_for_each_entry(smap, &prealloc_smaps, prealloc_node) {
+ struct bpf_local_storage *storage;
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_data *sdata;
+
+ storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
+ GFP_USER);
+ selem = bpf_map_kzalloc(&smap->map, smap->elem_size, GFP_USER);
+
+ rcu_read_lock();
+ bpf_task_storage_lock();
+ sdata = __bpf_local_storage_update(task, smap, NULL, BPF_NOEXIST,
+ &storage, &selem);
+ bpf_task_storage_unlock();
+ rcu_read_unlock();
+
+ if (storage)
+ kfree(storage);
+ if (selem)
+ kfree(selem);
+
+ if (IS_ERR(sdata)) {
+ bpf_task_storage_free(task);
+ return PTR_ERR(sdata);
+ }
+ }
+
+ return 0;
+}
+
const struct bpf_func_proto bpf_task_storage_get_proto = {
.func = bpf_task_storage_get,
.gpl_only = false,
diff --git a/kernel/fork.c b/kernel/fork.c
index 34fb9db59148..845c49c6e89b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2290,6 +2290,10 @@ static __latent_entropy struct task_struct *copy_process(
threadgroup_change_begin(current);
+ retval = bpf_task_storage_fork(p);
+ if (retval)
+ goto bad_fork_threadgroup_change_end;
+
/*
* Ensure that the cgroup subsystem policies allow the new process to be
* forked. It should be noted that the new process's css_set can be changed
@@ -2298,7 +2302,7 @@ static __latent_entropy struct task_struct *copy_process(
*/
retval = cgroup_can_fork(p, args);
if (retval)
- goto bad_fork_threadgroup_change_end;
+ goto bad_fork_bpf_task_storage_free;
/*
* From this point on we must avoid any synchronous user-space
@@ -2427,6 +2431,8 @@ static __latent_entropy struct task_struct *copy_process(
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
cgroup_cancel_fork(p, args);
+bad_fork_bpf_task_storage_free:
+ bpf_task_storage_free(p);
bad_fork_threadgroup_change_end:
threadgroup_change_end(current);
bad_fork_put_pidfd:
diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
index 035c263aab1b..ad35470db991 100644
--- a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
+++ b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
@@ -5,10 +5,21 @@
#include <unistd.h>
#include <sys/syscall.h> /* For SYS_xxx definitions */
#include <sys/types.h>
+#include <sys/wait.h>
#include <test_progs.h>
#include "task_local_storage.skel.h"
#include "task_local_storage_exit_creds.skel.h"
#include "task_ls_recursion.skel.h"
+#include "task_ls_prealloc.skel.h"
+
+#ifndef __NR_pidfd_open
+#define __NR_pidfd_open 434
+#endif
+
+static inline int sys_pidfd_open(pid_t pid, unsigned int flags)
+{
+ return syscall(__NR_pidfd_open, pid, flags);
+}
static void test_sys_enter_exit(void)
{
@@ -81,6 +92,94 @@ static void test_recursion(void)
task_ls_recursion__destroy(skel);
}
+static int fork_prealloc_child(int *pipe_fd)
+{
+ int pipe_fds[2], pid_fd, err;
+ pid_t pid;
+
+ err = pipe(pipe_fds);
+ if (!ASSERT_OK(err, "pipe"))
+ return -1;
+
+ *pipe_fd = pipe_fds[1];
+
+ pid = fork();
+ if (pid == 0) {
+ char ch;
+ close(pipe_fds[1]);
+ read(pipe_fds[0], &ch, 1);
+ exit(0);
+ }
+
+ if (!ASSERT_GE(pid, 0, "fork"))
+ return -1;
+
+ pid_fd = sys_pidfd_open(pid, 0);
+ if (!ASSERT_GE(pid_fd, 0, "pidfd_open"))
+ return -1;
+
+ return pid_fd;
+}
+
+static void test_prealloc_elem(int map_fd, int pid_fd)
+{
+ int val, err;
+
+ err = bpf_map_lookup_elem(map_fd, &pid_fd, &val);
+ if (ASSERT_OK(err, "bpf_map_lookup_elem"))
+ ASSERT_EQ(val, 0, "elem value == 0");
+
+ val = 0xdeadbeef;
+ err = bpf_map_update_elem(map_fd, &pid_fd, &val, BPF_EXIST);
+ ASSERT_OK(err, "bpf_map_update_elem to 0xdeadbeef");
+
+ err = bpf_map_lookup_elem(map_fd, &pid_fd, &val);
+ if (ASSERT_OK(err, "bpf_map_lookup_elem"))
+ ASSERT_EQ(val, 0xdeadbeef, "elem value == 0xdeadbeef");
+}
+
+static void test_prealloc(void)
+{
+ struct task_ls_prealloc *skel = NULL;
+ int pre_pipe_fd = -1, post_pipe_fd = -1;
+ int pre_pid_fd, post_pid_fd;
+ int map_fd, err;
+
+ pre_pid_fd = fork_prealloc_child(&pre_pipe_fd);
+ if (pre_pid_fd < 0)
+ goto out;
+
+ skel = task_ls_prealloc__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
+ goto out;
+
+ err = task_ls_prealloc__attach(skel);
+ if (!ASSERT_OK(err, "skel_attach"))
+ goto out;
+
+ post_pid_fd = fork_prealloc_child(&post_pipe_fd);
+ if (post_pid_fd < 0)
+ goto out;
+
+ map_fd = bpf_map__fd(skel->maps.prealloc_map);
+ if (!ASSERT_GE(map_fd, 0, "bpf_map__fd"))
+ goto out;
+
+ test_prealloc_elem(map_fd, pre_pid_fd);
+ test_prealloc_elem(map_fd, post_pid_fd);
+out:
+ if (pre_pipe_fd >= 0)
+ close(pre_pipe_fd);
+ if (post_pipe_fd >= 0)
+ close(post_pipe_fd);
+ do {
+ err = wait4(-1, NULL, 0, NULL);
+ } while (!err);
+
+ if (skel)
+ task_ls_prealloc__destroy(skel);
+}
+
void test_task_local_storage(void)
{
if (test__start_subtest("sys_enter_exit"))
@@ -89,4 +188,6 @@ void test_task_local_storage(void)
test_exit_creds();
if (test__start_subtest("recursion"))
test_recursion();
+ if (test__start_subtest("prealloc"))
+ test_prealloc();
}
diff --git a/tools/testing/selftests/bpf/progs/task_ls_prealloc.c b/tools/testing/selftests/bpf/progs/task_ls_prealloc.c
new file mode 100644
index 000000000000..8b252ee3511e
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/task_ls_prealloc.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+ __uint(map_flags, 0);
+ __type(key, int);
+ __type(value, int);
+} prealloc_map SEC(".maps");
--
2.33.1