* [PATCH v4 1/2] cgroup: fix cgroup_path() vs rename() race
@ 2013-02-28 7:34 Li Zefan
2013-02-28 7:35 ` [PATCH 2/2] cpuset: use cgroup_name() in cpuset_print_task_mems_allowed() Li Zefan
0 siblings, 1 reply; 2+ messages in thread
From: Li Zefan @ 2013-02-28 7:34 UTC (permalink / raw)
To: Tejun Heo; +Cc: Al Viro, cgroups, LKML
rename() will change dentry->d_name. The result of this race can
be worse than seeing partially rewritten name, but we might access
a stale pointer because rename() will re-allocate memory to hold
a longer name.
As accessing dentry->name must be protected by dentry->d_lock or
parent inode's i_mutex, while on the other hand cgroup-path() can
be called with some irq-safe spinlocks held, we can't generate
cgroup path using dentry->d_name.
Alternatively we make a copy of dentry->d_name and save it in
cgrp->name when a cgroup is created, and update cgrp->name at
rename().
v4: - allocate root_cgroup_name and all root_cgroup->name points to it.
- add cgroup_name() wrapper.
v3: use kfree_rcu() instead of synchronize_rcu() in user-visible path.
v2: make cgrp->name RCU safe.
Signed-off-by: Li Zefan <lizefan@huawei.com>
---
block/blk-cgroup.h | 2 -
include/linux/cgroup.h | 24 +++++++++++
kernel/cgroup.c | 111 ++++++++++++++++++++++++++++++++++++-------------
3 files changed, 105 insertions(+), 32 deletions(-)
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 2459730..e2e3404 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -216,9 +216,7 @@ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
{
int ret;
- rcu_read_lock();
ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
- rcu_read_unlock();
if (ret)
strncpy(buf, "<unavailable>", buflen);
return ret;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 900af59..25fc6c9 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -150,6 +150,11 @@ enum {
CGRP_CPUSET_CLONE_CHILDREN,
};
+struct cgroup_name {
+ struct rcu_head rcu_head;
+ char name[0];
+};
+
struct cgroup {
unsigned long flags; /* "unsigned long" so bitops work */
@@ -172,6 +177,19 @@ struct cgroup {
struct cgroup *parent; /* my parent */
struct dentry *dentry; /* cgroup fs entry, RCU protected */
+ /*
+ * This is a copy of dentry->d_name, and it's needed because
+ * we can't use dentry->d_name in cgroup_path().
+ *
+ * You must acquire rcu_read_lock() to access cgrp->name, and
+ * the only place that can change it is rename(), which is
+ * protected by parent dir's i_mutex.
+ *
+ * Normally you should use cgroup_name() wrapper rather than
+ * access it directly.
+ */
+ struct cgroup_name __rcu *name;
+
/* Private pointers for each registered subsystem */
struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
@@ -404,6 +422,12 @@ struct cgroup_scanner {
void *data;
};
+/* Caller should hold rcu_read_lock() */
+static inline const char *cgroup_name(const struct cgroup *cgrp)
+{
+ return rcu_dereference(cgrp->name)->name;
+}
+
int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b5c6432..2be28af 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -238,6 +238,8 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
#define dummytop (&rootnode.top_cgroup)
+static struct cgroup_name *root_cgroup_name;
+
/* This flag indicates whether tasks in the fork and exit paths should
* check for fork/exit handlers to call. This avoids us having to do
* extra work in the fork/exit path if none of the subsystems need to
@@ -860,6 +862,17 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
return inode;
}
+static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
+{
+ struct cgroup_name *name;
+
+ name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL);
+ if (!name)
+ return NULL;
+ strcpy(name->name, dentry->d_name.name);
+ return name;
+}
+
static void cgroup_free_fn(struct work_struct *work)
{
struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
@@ -890,6 +903,7 @@ static void cgroup_free_fn(struct work_struct *work)
simple_xattrs_free(&cgrp->xattrs);
ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
+ kfree(rcu_dereference_raw(cgrp->name));
kfree(cgrp);
}
@@ -1422,6 +1436,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
INIT_LIST_HEAD(&root->allcg_list);
root->number_of_cgroups = 1;
cgrp->root = root;
+ cgrp->name = root_cgroup_name;
cgrp->top_cgroup = cgrp;
init_cgroup_housekeeping(cgrp);
list_add_tail(&cgrp->allcg_node, &root->allcg_list);
@@ -1771,49 +1786,45 @@ static struct kobject *cgroup_kobj;
* @buf: the buffer to write the path into
* @buflen: the length of the buffer
*
- * Called with cgroup_mutex held or else with an RCU-protected cgroup
- * reference. Writes path of cgroup into buf. Returns 0 on success,
- * -errno on error.
+ * Writes path of cgroup into buf. Returns 0 on success, -errno on error.
+ *
+ * We can't generate cgroup path using dentry->d_name, as accessing
+ * dentry->name must be protected by irq-unsafe dentry->d_lock or parent
+ * inode's i_mutex, while on the other hand cgroup_path() can be called
+ * with some irq-safe spinlocks held.
*/
int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
{
- struct dentry *dentry = cgrp->dentry;
+ int ret = -ENAMETOOLONG;
char *start;
- rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
- "cgroup_path() called without proper locking");
-
- if (cgrp == dummytop) {
- /*
- * Inactive subsystems have no dentry for their root
- * cgroup
- */
- strcpy(buf, "/");
- return 0;
- }
-
start = buf + buflen - 1;
-
*start = '\0';
- for (;;) {
- int len = dentry->d_name.len;
+ rcu_read_lock();
+ while (cgrp) {
+ const char *name = cgroup_name(cgrp);
+ int len;
+
+ len = strlen(name);
if ((start -= len) < buf)
- return -ENAMETOOLONG;
- memcpy(start, dentry->d_name.name, len);
- cgrp = cgrp->parent;
- if (!cgrp)
- break;
+ goto out;
+ memcpy(start, name, len);
- dentry = cgrp->dentry;
if (!cgrp->parent)
- continue;
+ break;
+
if (--start < buf)
- return -ENAMETOOLONG;
+ goto out;
*start = '/';
+
+ cgrp = cgrp->parent;
}
+ ret = 0;
memmove(buf, start, buf + buflen - start);
- return 0;
+out:
+ rcu_read_unlock();
+ return ret;
}
EXPORT_SYMBOL_GPL(cgroup_path);
@@ -2539,13 +2550,40 @@ static int cgroup_file_release(struct inode *inode, struct file *file)
static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
+ int ret;
+ struct cgroup_name *name, *old_name;
+ struct cgroup *cgrp;
+
+ /*
+ * It's convinient to use parent dir's i_mutex to protected
+ * cgrp->name.
+ */
+ lockdep_assert_held(&old_dir->i_mutex);
+
if (!S_ISDIR(old_dentry->d_inode->i_mode))
return -ENOTDIR;
if (new_dentry->d_inode)
return -EEXIST;
if (old_dir != new_dir)
return -EIO;
- return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+
+ cgrp = __d_cgrp(old_dentry);
+
+ name = cgroup_alloc_name(new_dentry);
+ if (!name)
+ return -ENOMEM;
+
+ ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+ if (ret) {
+ kfree(name);
+ return ret;
+ }
+
+ old_name = cgrp->name;
+ rcu_assign_pointer(cgrp->name, name);
+
+ kfree_rcu(old_name, rcu_head);
+ return 0;
}
static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
@@ -4160,6 +4198,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
umode_t mode)
{
struct cgroup *cgrp;
+ struct cgroup_name *name;
struct cgroupfs_root *root = parent->root;
int err = 0;
struct cgroup_subsys *ss;
@@ -4170,9 +4209,14 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
if (!cgrp)
return -ENOMEM;
+ name = cgroup_alloc_name(dentry);
+ if (!name)
+ goto err_free_cgrp;
+ rcu_assign_pointer(cgrp->name, name);
+
cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
if (cgrp->id < 0)
- goto err_free_cgrp;
+ goto err_free_name;
/*
* Only live parents can have children. Note that the liveliness
@@ -4278,6 +4322,8 @@ err_free_all:
deactivate_super(sb);
err_free_id:
ida_simple_remove(&root->cgroup_ida, cgrp->id);
+err_free_name:
+ kfree(rcu_dereference_raw(cgrp->name));
err_free_cgrp:
kfree(cgrp);
return err;
@@ -4739,6 +4785,11 @@ int __init cgroup_init(void)
hash_add(css_set_table, &init_css_set.hlist, key);
BUG_ON(!init_root_id(&rootnode));
+ root_cgroup_name = kmalloc(sizeof(struct cgroup_name) + 2, GFP_KERNEL);
+ BUG_ON(!root_cgroup_name);
+ strcpy(root_cgroup_name->name, "/");
+ rootnode.top_cgroup.name = root_cgroup_name;
+
cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
if (!cgroup_kobj) {
err = -ENOMEM;
--
1.8.0.2
^ permalink raw reply related [flat|nested] 2+ messages in thread
* [PATCH 2/2] cpuset: use cgroup_name() in cpuset_print_task_mems_allowed()
2013-02-28 7:34 [PATCH v4 1/2] cgroup: fix cgroup_path() vs rename() race Li Zefan
@ 2013-02-28 7:35 ` Li Zefan
0 siblings, 0 replies; 2+ messages in thread
From: Li Zefan @ 2013-02-28 7:35 UTC (permalink / raw)
To: Tejun Heo; +Cc: Al Viro, cgroups, LKML
Use cgroup_name() instead of cgrp->dentry->name. This makes the code
a bit simpler.
While at it, remove cpuset_name and make cpuset_nodelist a local variable
to cpuset_print_task_mems_allowed().
Signed-off-by: Li Zefan <lizefan@huawei.com>
---
kernel/cpuset.c | 32 +++++++++-----------------------
1 file changed, 9 insertions(+), 23 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4f9dfe4..ace5bfc 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -265,17 +265,6 @@ static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_MUTEX(callback_mutex);
/*
- * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
- * buffers. They are statically allocated to prevent using excess stack
- * when calling cpuset_print_task_mems_allowed().
- */
-#define CPUSET_NAME_LEN (128)
-#define CPUSET_NODELIST_LEN (256)
-static char cpuset_name[CPUSET_NAME_LEN];
-static char cpuset_nodelist[CPUSET_NODELIST_LEN];
-static DEFINE_SPINLOCK(cpuset_buffer_lock);
-
-/*
* CPU / memory hotplug is handled asynchronously.
*/
static struct workqueue_struct *cpuset_propagate_hotplug_wq;
@@ -2592,6 +2581,8 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}
+#define CPUSET_NODELIST_LEN (256)
+
/**
* cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
* @task: pointer to task_struct of some task.
@@ -2602,24 +2593,19 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
*/
void cpuset_print_task_mems_allowed(struct task_struct *tsk)
{
- struct dentry *dentry;
+ /* Statically allocated to prevent using excess stack. */
+ static char cpuset_nodelist[CPUSET_NODELIST_LEN];
+ static DEFINE_SPINLOCK(cpuset_buffer_lock);
- dentry = task_cs(tsk)->css.cgroup->dentry;
- spin_lock(&cpuset_buffer_lock);
+ struct cgroup *cgrp = task_cs(tsk)->css.cgroup;
- if (!dentry) {
- strcpy(cpuset_name, "/");
- } else {
- spin_lock(&dentry->d_lock);
- strlcpy(cpuset_name, (const char *)dentry->d_name.name,
- CPUSET_NAME_LEN);
- spin_unlock(&dentry->d_lock);
- }
+ spin_lock(&cpuset_buffer_lock);
nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
tsk->mems_allowed);
printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
- tsk->comm, cpuset_name, cpuset_nodelist);
+ tsk->comm, cgroup_name(cgrp), cpuset_nodelist);
+
spin_unlock(&cpuset_buffer_lock);
}
--
1.8.0.2
^ permalink raw reply related [flat|nested] 2+ messages in thread
end of thread, other threads:[~2013-02-28 7:40 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-02-28 7:34 [PATCH v4 1/2] cgroup: fix cgroup_path() vs rename() race Li Zefan
2013-02-28 7:35 ` [PATCH 2/2] cpuset: use cgroup_name() in cpuset_print_task_mems_allowed() Li Zefan
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).