All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] cgroup: revise how we re-populate root directory
@ 2012-01-16  8:06 ` Li Zefan
  0 siblings, 0 replies; 29+ messages in thread
From: Li Zefan @ 2012-01-16  8:06 UTC (permalink / raw)
  To: LKML; +Cc: Cgroups, Tejun Heo, Lennart Poettering, Kay Sievers

When remounting cgroupfs with some subsystems added to it and some
removed, cgroup will remove all the files in root directory and then
re-popluate it.

What I'm doing here is, only remove files which belong to subsystems that
are to be unbound, and only create files for newly-added subsystems.
The purpose is to have all other files untouched.

This is a preparation for cgroup xattr support.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 include/linux/cgroup.h |   11 +++---
 kernel/cgroup.c        |   92 +++++++++++++++++++++++++++++++----------------
 2 files changed, 67 insertions(+), 36 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e9b6021..13db9e8 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -327,6 +327,9 @@ struct cftype {
 	 */
 	size_t max_write_len;
 
+	/* The subsystem this cgroup file belongs to */
+	struct cgroup_subsys *subsys;
+
 	int (*open)(struct inode *inode, struct file *file);
 	ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft,
 			struct file *file,
@@ -419,16 +422,14 @@ struct cgroup_scanner {
  * called by subsystems from within a populate() method
  */
 int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
-		       const struct cftype *cft);
+		    struct cftype *cft);
 
 /*
  * Add a set of new files to the given cgroup directory. Should
  * only be called by subsystems from within a populate() method
  */
-int cgroup_add_files(struct cgroup *cgrp,
-			struct cgroup_subsys *subsys,
-			const struct cftype cft[],
-			int count);
+int cgroup_add_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+		     struct cftype cft[], int count);
 
 int cgroup_is_removed(const struct cgroup *cgrp);
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a5d3b53..c4ed6fe 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -781,6 +781,8 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
 static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
 static int cgroup_populate_dir(struct cgroup *cgrp);
+static int cgroup_repopulate_dir(struct cgroup *cgrp, unsigned long added_bits,
+				 unsigned long removed_bits);
 static const struct inode_operations cgroup_dir_inode_operations;
 static const struct file_operations proc_cgroupstats_operations;
 
@@ -882,34 +884,44 @@ static void remove_dir(struct dentry *d)
 	dput(parent);
 }
 
-static void cgroup_clear_directory(struct dentry *dentry)
+static void cgroup_clear_directory(struct dentry *dentry, bool remove_all,
+				   unsigned long removed_bits)
 {
-	struct list_head *node;
+	LIST_HEAD(head);
+	struct dentry *d, *node;
 
 	BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
+
 	spin_lock(&dentry->d_lock);
-	node = dentry->d_subdirs.next;
-	while (node != &dentry->d_subdirs) {
-		struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
+	list_for_each_entry_safe(d, node, &dentry->d_subdirs, d_u.d_child) {
+		struct cftype *cft = d->d_fsdata;
+
+		if (!remove_all && cft->subsys &&
+		    !test_bit(cft->subsys->subsys_id, &removed_bits))
+			continue;
 
 		spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
-		list_del_init(node);
+		list_move(&d->d_u.d_child, &head);
 		if (d->d_inode) {
 			/* This should never be called on a cgroup
 			 * directory with child cgroups */
 			BUG_ON(d->d_inode->i_mode & S_IFDIR);
 			dget_dlock(d);
-			spin_unlock(&d->d_lock);
-			spin_unlock(&dentry->d_lock);
+		}
+		spin_unlock(&d->d_lock);
+	}
+	spin_unlock(&dentry->d_lock);
+
+	list_for_each_entry_safe(d, node, &head, d_u.d_child) {
+		spin_lock(&d->d_lock);
+		list_del_init(&d->d_u.d_child);
+		spin_unlock(&d->d_lock);
+		if (d->d_inode) {
 			d_delete(d);
 			simple_unlink(dentry->d_inode, d);
 			dput(d);
-			spin_lock(&dentry->d_lock);
-		} else
-			spin_unlock(&d->d_lock);
-		node = dentry->d_subdirs.next;
+		}
 	}
-	spin_unlock(&dentry->d_lock);
 }
 
 /*
@@ -919,7 +931,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
 {
 	struct dentry *parent;
 
-	cgroup_clear_directory(dentry);
+	cgroup_clear_directory(dentry, true, 0);
 
 	parent = dentry->d_parent;
 	spin_lock(&parent->d_lock);
@@ -1284,6 +1296,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	struct cgroupfs_root *root = sb->s_fs_info;
 	struct cgroup *cgrp = &root->top_cgroup;
 	struct cgroup_sb_opts opts;
+	unsigned long added_bits, removed_bits;
 
 	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
 	mutex_lock(&cgroup_mutex);
@@ -1294,6 +1307,9 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	if (ret)
 		goto out_unlock;
 
+	added_bits = opts.subsys_bits & ~root->subsys_bits;
+	removed_bits = root->subsys_bits & ~opts.subsys_bits;
+
 	/* Don't allow flags or name to change at remount */
 	if (opts.flags != root->flags ||
 	    (opts.name && strcmp(opts.name, root->name))) {
@@ -1308,8 +1324,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 		goto out_unlock;
 	}
 
-	/* (re)populate subsystem files */
-	cgroup_populate_dir(cgrp);
+	/* re-populate subsystem files */
+	cgroup_repopulate_dir(cgrp, added_bits, removed_bits);
 
 	if (opts.release_agent)
 		strcpy(root->release_agent_path, opts.release_agent);
@@ -2710,16 +2726,17 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
 	return mode;
 }
 
-int cgroup_add_file(struct cgroup *cgrp,
-		       struct cgroup_subsys *subsys,
-		       const struct cftype *cft)
+int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+		    struct cftype *cft)
 {
 	struct dentry *dir = cgrp->dentry;
 	struct dentry *dentry;
 	int error;
 	umode_t mode;
-
 	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
+
+	cft->subsys = subsys;
+
 	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
 		strcpy(name, subsys->name);
 		strcat(name, ".");
@@ -2740,10 +2757,8 @@ int cgroup_add_file(struct cgroup *cgrp,
 }
 EXPORT_SYMBOL_GPL(cgroup_add_file);
 
-int cgroup_add_files(struct cgroup *cgrp,
-			struct cgroup_subsys *subsys,
-			const struct cftype cft[],
-			int count)
+int cgroup_add_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+		     struct cftype cft[], int count)
 {
 	int i, err;
 	for (i = 0; i < count; i++) {
@@ -3703,26 +3718,27 @@ static struct cftype cft_release_agent = {
 	.max_write_len = PATH_MAX,
 };
 
-static int cgroup_populate_dir(struct cgroup *cgrp)
+static int __cgroup_populate_dir(struct cgroup *cgrp, unsigned long added_bits)
 {
 	int err;
 	struct cgroup_subsys *ss;
 
-	/* First clear out any existing files */
-	cgroup_clear_directory(cgrp->dentry);
-
 	err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
 	if (err < 0)
 		return err;
 
 	if (cgrp == cgrp->top_cgroup) {
-		if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
+		err = cgroup_add_file(cgrp, NULL, &cft_release_agent);
+		if (err < 0)
 			return err;
 	}
 
 	for_each_subsys(cgrp->root, ss) {
-		if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
-			return err;
+		if (test_bit(ss->subsys_id, &added_bits) && ss->populate) {
+			err = ss->populate(ss, cgrp);
+			if (err < 0)
+				return err;
+		}
 	}
 	/* This cgroup is ready now */
 	for_each_subsys(cgrp->root, ss) {
@@ -3739,6 +3755,20 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 	return 0;
 }
 
+static int cgroup_populate_dir(struct cgroup *cgrp)
+{
+	return __cgroup_populate_dir(cgrp, cgrp->root->subsys_bits);
+}
+
+static int cgroup_repopulate_dir(struct cgroup *cgrp, unsigned long added_bits,
+				 unsigned long removed_bits)
+{
+	/* First clear out files */
+	cgroup_clear_directory(cgrp->dentry, false, removed_bits);
+
+	return __cgroup_populate_dir(cgrp, added_bits);
+}
+
 static void init_cgroup_css(struct cgroup_subsys_state *css,
 			       struct cgroup_subsys *ss,
 			       struct cgroup *cgrp)
-- 
1.7.3.1

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 1/2] cgroup: revise how we re-populate root directory
@ 2012-01-16  8:06 ` Li Zefan
  0 siblings, 0 replies; 29+ messages in thread
From: Li Zefan @ 2012-01-16  8:06 UTC (permalink / raw)
  To: LKML; +Cc: Cgroups, Tejun Heo, Lennart Poettering, Kay Sievers

When remounting cgroupfs with some subsystems added to it and some
removed, cgroup will remove all the files in root directory and then
re-popluate it.

What I'm doing here is, only remove files which belong to subsystems that
are to be unbound, and only create files for newly-added subsystems.
The purpose is to have all other files untouched.

This is a preparation for cgroup xattr support.

Signed-off-by: Li Zefan <lizf-BthXqXjhjHXQFUHtdCDX3A@public.gmane.org>
---
 include/linux/cgroup.h |   11 +++---
 kernel/cgroup.c        |   92 +++++++++++++++++++++++++++++++----------------
 2 files changed, 67 insertions(+), 36 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e9b6021..13db9e8 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -327,6 +327,9 @@ struct cftype {
 	 */
 	size_t max_write_len;
 
+	/* The subsystem this cgroup file belongs to */
+	struct cgroup_subsys *subsys;
+
 	int (*open)(struct inode *inode, struct file *file);
 	ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft,
 			struct file *file,
@@ -419,16 +422,14 @@ struct cgroup_scanner {
  * called by subsystems from within a populate() method
  */
 int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
-		       const struct cftype *cft);
+		    struct cftype *cft);
 
 /*
  * Add a set of new files to the given cgroup directory. Should
  * only be called by subsystems from within a populate() method
  */
-int cgroup_add_files(struct cgroup *cgrp,
-			struct cgroup_subsys *subsys,
-			const struct cftype cft[],
-			int count);
+int cgroup_add_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+		     struct cftype cft[], int count);
 
 int cgroup_is_removed(const struct cgroup *cgrp);
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a5d3b53..c4ed6fe 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -781,6 +781,8 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
 static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
 static int cgroup_populate_dir(struct cgroup *cgrp);
+static int cgroup_repopulate_dir(struct cgroup *cgrp, unsigned long added_bits,
+				 unsigned long removed_bits);
 static const struct inode_operations cgroup_dir_inode_operations;
 static const struct file_operations proc_cgroupstats_operations;
 
@@ -882,34 +884,44 @@ static void remove_dir(struct dentry *d)
 	dput(parent);
 }
 
-static void cgroup_clear_directory(struct dentry *dentry)
+static void cgroup_clear_directory(struct dentry *dentry, bool remove_all,
+				   unsigned long removed_bits)
 {
-	struct list_head *node;
+	LIST_HEAD(head);
+	struct dentry *d, *node;
 
 	BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
+
 	spin_lock(&dentry->d_lock);
-	node = dentry->d_subdirs.next;
-	while (node != &dentry->d_subdirs) {
-		struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
+	list_for_each_entry_safe(d, node, &dentry->d_subdirs, d_u.d_child) {
+		struct cftype *cft = d->d_fsdata;
+
+		if (!remove_all && cft->subsys &&
+		    !test_bit(cft->subsys->subsys_id, &removed_bits))
+			continue;
 
 		spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
-		list_del_init(node);
+		list_move(&d->d_u.d_child, &head);
 		if (d->d_inode) {
 			/* This should never be called on a cgroup
 			 * directory with child cgroups */
 			BUG_ON(d->d_inode->i_mode & S_IFDIR);
 			dget_dlock(d);
-			spin_unlock(&d->d_lock);
-			spin_unlock(&dentry->d_lock);
+		}
+		spin_unlock(&d->d_lock);
+	}
+	spin_unlock(&dentry->d_lock);
+
+	list_for_each_entry_safe(d, node, &head, d_u.d_child) {
+		spin_lock(&d->d_lock);
+		list_del_init(&d->d_u.d_child);
+		spin_unlock(&d->d_lock);
+		if (d->d_inode) {
 			d_delete(d);
 			simple_unlink(dentry->d_inode, d);
 			dput(d);
-			spin_lock(&dentry->d_lock);
-		} else
-			spin_unlock(&d->d_lock);
-		node = dentry->d_subdirs.next;
+		}
 	}
-	spin_unlock(&dentry->d_lock);
 }
 
 /*
@@ -919,7 +931,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
 {
 	struct dentry *parent;
 
-	cgroup_clear_directory(dentry);
+	cgroup_clear_directory(dentry, true, 0);
 
 	parent = dentry->d_parent;
 	spin_lock(&parent->d_lock);
@@ -1284,6 +1296,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	struct cgroupfs_root *root = sb->s_fs_info;
 	struct cgroup *cgrp = &root->top_cgroup;
 	struct cgroup_sb_opts opts;
+	unsigned long added_bits, removed_bits;
 
 	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
 	mutex_lock(&cgroup_mutex);
@@ -1294,6 +1307,9 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	if (ret)
 		goto out_unlock;
 
+	added_bits = opts.subsys_bits & ~root->subsys_bits;
+	removed_bits = root->subsys_bits & ~opts.subsys_bits;
+
 	/* Don't allow flags or name to change at remount */
 	if (opts.flags != root->flags ||
 	    (opts.name && strcmp(opts.name, root->name))) {
@@ -1308,8 +1324,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 		goto out_unlock;
 	}
 
-	/* (re)populate subsystem files */
-	cgroup_populate_dir(cgrp);
+	/* re-populate subsystem files */
+	cgroup_repopulate_dir(cgrp, added_bits, removed_bits);
 
 	if (opts.release_agent)
 		strcpy(root->release_agent_path, opts.release_agent);
@@ -2710,16 +2726,17 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
 	return mode;
 }
 
-int cgroup_add_file(struct cgroup *cgrp,
-		       struct cgroup_subsys *subsys,
-		       const struct cftype *cft)
+int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+		    struct cftype *cft)
 {
 	struct dentry *dir = cgrp->dentry;
 	struct dentry *dentry;
 	int error;
 	umode_t mode;
-
 	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
+
+	cft->subsys = subsys;
+
 	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
 		strcpy(name, subsys->name);
 		strcat(name, ".");
@@ -2740,10 +2757,8 @@ int cgroup_add_file(struct cgroup *cgrp,
 }
 EXPORT_SYMBOL_GPL(cgroup_add_file);
 
-int cgroup_add_files(struct cgroup *cgrp,
-			struct cgroup_subsys *subsys,
-			const struct cftype cft[],
-			int count)
+int cgroup_add_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+		     struct cftype cft[], int count)
 {
 	int i, err;
 	for (i = 0; i < count; i++) {
@@ -3703,26 +3718,27 @@ static struct cftype cft_release_agent = {
 	.max_write_len = PATH_MAX,
 };
 
-static int cgroup_populate_dir(struct cgroup *cgrp)
+static int __cgroup_populate_dir(struct cgroup *cgrp, unsigned long added_bits)
 {
 	int err;
 	struct cgroup_subsys *ss;
 
-	/* First clear out any existing files */
-	cgroup_clear_directory(cgrp->dentry);
-
 	err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
 	if (err < 0)
 		return err;
 
 	if (cgrp == cgrp->top_cgroup) {
-		if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
+		err = cgroup_add_file(cgrp, NULL, &cft_release_agent);
+		if (err < 0)
 			return err;
 	}
 
 	for_each_subsys(cgrp->root, ss) {
-		if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
-			return err;
+		if (test_bit(ss->subsys_id, &added_bits) && ss->populate) {
+			err = ss->populate(ss, cgrp);
+			if (err < 0)
+				return err;
+		}
 	}
 	/* This cgroup is ready now */
 	for_each_subsys(cgrp->root, ss) {
@@ -3739,6 +3755,20 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 	return 0;
 }
 
+static int cgroup_populate_dir(struct cgroup *cgrp)
+{
+	return __cgroup_populate_dir(cgrp, cgrp->root->subsys_bits);
+}
+
+static int cgroup_repopulate_dir(struct cgroup *cgrp, unsigned long added_bits,
+				 unsigned long removed_bits)
+{
+	/* First clear out files */
+	cgroup_clear_directory(cgrp->dentry, false, removed_bits);
+
+	return __cgroup_populate_dir(cgrp, added_bits);
+}
+
 static void init_cgroup_css(struct cgroup_subsys_state *css,
 			       struct cgroup_subsys *ss,
 			       struct cgroup *cgrp)
-- 
1.7.3.1

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 2/2] cgroup: add xattr support
@ 2012-01-16  8:07   ` Li Zefan
  0 siblings, 0 replies; 29+ messages in thread
From: Li Zefan @ 2012-01-16  8:07 UTC (permalink / raw)
  To: LKML; +Cc: Cgroups, Tejun Heo, Lennart Poettering, Kay Sievers

This is one of the items in the plumber's wish list.

For use cases:

>> What would the use case be for this?
>
> Attaching meta information to services, in an easily discoverable
> way. For example, in systemd we create one cgroup for each service, and
> could then store data like the main pid of the specific service as an
> xattr on the cgroup itself. That way we'd have almost all service state
> in the cgroupfs, which would make it possible to terminate systemd and
> later restart it without losing any state information. But there's more:
> for example, some very peculiar services cannot be terminated on
> shutdown (i.e. fakeraid DM stuff) and it would be really nice if the
> services in question could just mark that on their cgroup, by setting an
> xattr. On the more desktopy side of things there are other
> possibilities: for example there are plans defining what an application
> is along the lines of a cgroup (i.e. an app being a collection of
> processes). With xattrs one could then attach an icon or human readable
> program name on the cgroup.
>
> The key idea is that this would allow attaching runtime meta information
> to cgroups and everything they model (services, apps, vms), that doesn't
> need any complex userspace infrastructure, has good access control
> (i.e. because the file system enforces that anyway, and there's the
> "trusted." xattr namespace), notifications (inotify), and can easily be
> shared among applications.
>
> Lennart

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 include/linux/cgroup.h |   15 +++
 init/Kconfig           |   12 ++
 kernel/cgroup.c        |  272 ++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 289 insertions(+), 10 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 13db9e8..a5ac3be 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -16,6 +16,8 @@
 #include <linux/prio_heap.h>
 #include <linux/rwsem.h>
 #include <linux/idr.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
 
 #ifdef CONFIG_CGROUPS
 
@@ -42,6 +44,13 @@ extern void cgroup_unload_subsys(struct cgroup_subsys *ss);
 
 extern const struct file_operations proc_cgroup_operations;
 
+struct cgroup_xattr_root {
+#ifdef CONFIG_CGROUP_XATTR
+	struct rb_root root;
+	spinlock_t lock;
+#endif
+};
+
 /* Define the enumeration of all builtin cgroup subsystems */
 #define SUBSYS(_x) _x ## _subsys_id,
 enum cgroup_subsys_id {
@@ -243,6 +252,9 @@ struct cgroup {
 	/* List of events which userspace want to receive */
 	struct list_head event_list;
 	spinlock_t event_list_lock;
+
+	/* directory xattrs */
+	struct cgroup_xattr_root xattr_root;
 };
 
 /*
@@ -330,6 +342,9 @@ struct cftype {
 	/* The subsystem this cgroup file belongs to */
 	struct cgroup_subsys *subsys;
 
+	/* file xattrs */
+	struct cgroup_xattr_root xattr_root;
+
 	int (*open)(struct inode *inode, struct file *file);
 	ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft,
 			struct file *file,
diff --git a/init/Kconfig b/init/Kconfig
index 6ac2236..28990ec 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -587,6 +587,18 @@ menuconfig CGROUPS
 
 if CGROUPS
 
+config CGROUP_XATTR
+	bool "Cgroup extended attributes"
+	default n
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  Currently the system.* namespace is not supported.
+
+	  If unsure, say N.
+
 config CGROUP_DEBUG
 	bool "Example debug cgroup subsystem"
 	default n
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c4ed6fe..ab4cca5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,7 +60,8 @@
 #include <linux/eventfd.h>
 #include <linux/poll.h>
 #include <linux/flex_array.h> /* used in cgroup_attach_proc */
-
+#include <linux/xattr.h>
+#include <linux/rbtree.h>
 #include <linux/atomic.h>
 
 /*
@@ -786,6 +787,9 @@ static int cgroup_repopulate_dir(struct cgroup *cgrp, unsigned long added_bits,
 static const struct inode_operations cgroup_dir_inode_operations;
 static const struct file_operations proc_cgroupstats_operations;
 
+static void cgroup_xattrs_init(struct cgroup_xattr_root *root);
+static void cgroup_xattrs_destroy(struct cgroup_xattr_root *root);
+
 static struct backing_dev_info cgroup_backing_dev_info = {
 	.name		= "cgroup",
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
@@ -865,7 +869,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 		 */
 		BUG_ON(!list_empty(&cgrp->pidlists));
 
+		cgroup_xattrs_destroy(&cgrp->xattr_root);
+
 		kfree_rcu(cgrp, rcu_head);
+	} else {
+		struct cftype *cft = dentry->d_fsdata;
+		cgroup_xattrs_destroy(&cft->xattr_root);
 	}
 	iput(inode);
 }
@@ -1355,6 +1364,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	mutex_init(&cgrp->pidlist_mutex);
 	INIT_LIST_HEAD(&cgrp->event_list);
 	spin_lock_init(&cgrp->event_list_lock);
+	cgroup_xattrs_init(&cgrp->xattr_root);
 }
 
 static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1700,6 +1710,8 @@ static void cgroup_kill_sb(struct super_block *sb) {
 	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
 
+	cgroup_xattrs_destroy(&cgrp->xattr_root);
+
 	kill_litter_super(sb);
 	cgroup_drop_root(root);
 }
@@ -2608,18 +2620,256 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
 }
 
 static const struct file_operations cgroup_file_operations = {
-	.read = cgroup_file_read,
-	.write = cgroup_file_write,
-	.llseek = generic_file_llseek,
-	.open = cgroup_file_open,
-	.release = cgroup_file_release,
+	.read		= cgroup_file_read,
+	.write		= cgroup_file_write,
+	.llseek		= generic_file_llseek,
+	.open		= cgroup_file_open,
+	.release	= cgroup_file_release,
+};
+
+#ifdef CONFIG_CGROUP_XATTR
+
+struct cgroup_xattr_entry {
+	struct rb_node node;
+	char *name;
+	char *val;
+	int len;
+};
+
+static void free_xattr_entry(struct cgroup_xattr_entry *entry)
+{
+	kfree(entry->name);
+	kfree(entry->val);
+	kfree(entry);
+}
+
+static struct cgroup_xattr_root *xattr_root(struct dentry *dentry)
+{
+	if (S_ISDIR(dentry->d_inode->i_mode))
+		return &__d_cgrp(dentry)->xattr_root;
+	else
+		return &__d_cft(dentry)->xattr_root;
+}
+
+static void cgroup_xattrs_init(struct cgroup_xattr_root *root)
+{
+	spin_lock_init(&root->lock);
+	root->root = RB_ROOT;
+}
+
+static void cgroup_xattrs_destroy(struct cgroup_xattr_root *xattr_root)
+{
+	struct rb_root *root = &xattr_root->root;
+	struct rb_node *node;
+	struct cgroup_xattr_entry *entry;
+
+	while (true) {
+		node = rb_first(root);
+		if (!node)
+			break;
+		entry = rb_entry(node, struct cgroup_xattr_entry, node);
+
+		rb_erase(node, root);
+		free_xattr_entry(entry);
+	}
+}
+
+static bool is_valid_xattr(const char *name)
+{
+	if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) ||
+	    !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
+	    !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
+		return true;
+	return false;
+}
+
+static int __cgroup_setxattr(struct dentry *dentry, const char *name,
+			     const void *value, size_t size, int flags)
+{
+	struct cgroup_xattr_root *root = xattr_root(dentry);
+	struct cgroup_xattr_entry *entry = NULL;
+	struct cgroup_xattr_entry *new = NULL;
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	int cmp;
+	int ret = 0;
+	char tmp[200];
+
+	if (!is_valid_xattr(name))
+		return -EOPNOTSUPP;
+
+	if (value) {
+		new = kzalloc(sizeof(*new), GFP_KERNEL);
+		if (!new)
+			return -ENOMEM;
+		new->name = kstrdup(name, GFP_KERNEL);
+		new->val = kmemdup(value, size, GFP_KERNEL);
+		new->len = size;
+		if (!new->name || !new->val) {
+			free_xattr_entry(new);
+			return -ENOMEM;
+		}
+	}
+
+	memcpy(tmp, value, size);
+	tmp[size] = '\0';
+
+	spin_lock(&root->lock);
+
+	p = &root->root.rb_node;
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct cgroup_xattr_entry, node);
+
+		cmp = strcmp(name, entry->name);
+		if (cmp > 0)
+			p = &(*p)->rb_right;
+		else if (cmp < 0)
+			p = &(*p)->rb_left;
+		else
+			break;
+	}
+
+	if (*p) {
+		if (flags & XATTR_CREATE) {
+			ret = -EEXIST;
+		} else if (new) {
+			swap(entry->val, new->val);
+			swap(entry->len, new->len);
+		} else {
+			rb_erase(&entry->node, &root->root);
+			new = entry;
+		}
+
+		free_xattr_entry(new);
+	} else {
+		if (!new || (flags & XATTR_REPLACE)) {
+			ret = -ENOENT;
+		} else {
+			rb_link_node(&new->node, parent, p);
+			rb_insert_color(&new->node, &root->root);
+		}
+	}
+
+	spin_unlock(&root->lock);
+
+	return ret;
+}
+
+static int cgroup_setxattr(struct dentry *dentry, const char *name,
+			   const void *value, size_t size, int flags)
+{
+	if (size == 0)
+		value = "";
+
+	return __cgroup_setxattr(dentry, name, value, size, flags);
+}
+
+static int cgroup_removexattr(struct dentry *dentry, const char *name)
+{
+	return __cgroup_setxattr(dentry, name, NULL, 0, XATTR_REPLACE);
+}
+
+static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name,
+			       void *buf, size_t size)
+{
+	struct cgroup_xattr_root *root = xattr_root(dentry);
+	struct cgroup_xattr_entry *entry;
+	struct rb_node *node;
+	int cmp;
+	int ret = -ENOENT;
+
+	if (!is_valid_xattr(name))
+		return -EOPNOTSUPP;
+
+	spin_lock(&root->lock);
+	node = root->root.rb_node;
+	while (node) {
+		entry = rb_entry(node, struct cgroup_xattr_entry, node);
+
+		cmp = strcmp(name, entry->name);
+		if (cmp > 0) {
+			node = node->rb_right;
+		} else if (cmp < 0) {
+			node = node->rb_left;
+		} else {
+			ret = entry->len;
+			if (buf) {
+				if (size < entry->len)
+					ret = -ERANGE;
+				else
+					memcpy(buf, entry->val, entry->len);
+			}
+			break;
+		}
+	}
+	spin_unlock(&root->lock);
+	return ret;
+}
+
+static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
+{
+	struct cgroup_xattr_root *root = xattr_root(dentry);
+	struct cgroup_xattr_entry *entry;
+	struct rb_node *node;
+	int total_len = 0;
+	int len;
+
+	spin_lock(&root->lock);
+	node = rb_first(&root->root);
+	while (node) {
+		entry = rb_entry(node, struct cgroup_xattr_entry, node);
+
+		if (!capable(CAP_SYS_ADMIN) &&
+		    strncmp(entry->name, XATTR_TRUSTED_PREFIX,
+			    XATTR_TRUSTED_PREFIX_LEN) == 0)
+			continue;
+
+		len = strlen(entry->name) + 1;
+		total_len += len;
+		if (buf) {
+			if (size < total_len) {
+				total_len = -ERANGE;
+				break;
+			}
+			memcpy(buf, entry->name, len);
+			buf += len;
+		}
+
+		node = rb_next(node);
+	}
+	spin_unlock(&root->lock);
+
+	return total_len;
+}
+
+#else /* CONFIG_CGROUP_XATTR */
+
+static void cgroup_xattrs_init(struct cgroup_xattr_root *root) {}
+static void cgroup_xattrs_destroy(struct cgroup_xattr_root *root) {}
+
+#endif
+
+static const struct inode_operations cgroup_file_inode_operations = {
+#ifdef CONFIG_CGROUP_XATTR
+	.setxattr	= cgroup_setxattr,
+	.getxattr	= cgroup_getxattr,
+	.listxattr	= cgroup_listxattr,
+	.removexattr	= cgroup_removexattr,
+#endif
 };
 
 static const struct inode_operations cgroup_dir_inode_operations = {
-	.lookup = cgroup_lookup,
-	.mkdir = cgroup_mkdir,
-	.rmdir = cgroup_rmdir,
-	.rename = cgroup_rename,
+	.lookup		= cgroup_lookup,
+	.mkdir		= cgroup_mkdir,
+	.rmdir		= cgroup_rmdir,
+	.rename		= cgroup_rename,
+#ifdef CONFIG_CGROUP_XATTR
+	.setxattr	= cgroup_setxattr,
+	.getxattr	= cgroup_getxattr,
+	.listxattr	= cgroup_listxattr,
+	.removexattr	= cgroup_removexattr,
+#endif
 };
 
 static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
@@ -2667,6 +2917,7 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
 	} else if (S_ISREG(mode)) {
 		inode->i_size = 0;
 		inode->i_fop = &cgroup_file_operations;
+		inode->i_op = &cgroup_file_inode_operations;
 	}
 	d_instantiate(dentry, inode);
 	dget(dentry);	/* Extra count - pin the dentry in core */
@@ -2736,6 +2987,7 @@ int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
 
 	cft->subsys = subsys;
+	cgroup_xattrs_init(&cft->xattr_root);
 
 	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
 		strcpy(name, subsys->name);
-- 
1.7.3.1

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 2/2] cgroup: add xattr support
@ 2012-01-16  8:07   ` Li Zefan
  0 siblings, 0 replies; 29+ messages in thread
From: Li Zefan @ 2012-01-16  8:07 UTC (permalink / raw)
  To: LKML; +Cc: Cgroups, Tejun Heo, Lennart Poettering, Kay Sievers

This is one of the items in the plumber's wish list.

For use cases:

>> What would the use case be for this?
>
> Attaching meta information to services, in an easily discoverable
> way. For example, in systemd we create one cgroup for each service, and
> could then store data like the main pid of the specific service as an
> xattr on the cgroup itself. That way we'd have almost all service state
> in the cgroupfs, which would make it possible to terminate systemd and
> later restart it without losing any state information. But there's more:
> for example, some very peculiar services cannot be terminated on
> shutdown (i.e. fakeraid DM stuff) and it would be really nice if the
> services in question could just mark that on their cgroup, by setting an
> xattr. On the more desktopy side of things there are other
> possibilities: for example there are plans defining what an application
> is along the lines of a cgroup (i.e. an app being a collection of
> processes). With xattrs one could then attach an icon or human readable
> program name on the cgroup.
>
> The key idea is that this would allow attaching runtime meta information
> to cgroups and everything they model (services, apps, vms), that doesn't
> need any complex userspace infrastructure, has good access control
> (i.e. because the file system enforces that anyway, and there's the
> "trusted." xattr namespace), notifications (inotify), and can easily be
> shared among applications.
>
> Lennart

Signed-off-by: Li Zefan <lizf-BthXqXjhjHXQFUHtdCDX3A@public.gmane.org>
---
 include/linux/cgroup.h |   15 +++
 init/Kconfig           |   12 ++
 kernel/cgroup.c        |  272 ++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 289 insertions(+), 10 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 13db9e8..a5ac3be 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -16,6 +16,8 @@
 #include <linux/prio_heap.h>
 #include <linux/rwsem.h>
 #include <linux/idr.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
 
 #ifdef CONFIG_CGROUPS
 
@@ -42,6 +44,13 @@ extern void cgroup_unload_subsys(struct cgroup_subsys *ss);
 
 extern const struct file_operations proc_cgroup_operations;
 
+struct cgroup_xattr_root {
+#ifdef CONFIG_CGROUP_XATTR
+	struct rb_root root;
+	spinlock_t lock;
+#endif
+};
+
 /* Define the enumeration of all builtin cgroup subsystems */
 #define SUBSYS(_x) _x ## _subsys_id,
 enum cgroup_subsys_id {
@@ -243,6 +252,9 @@ struct cgroup {
 	/* List of events which userspace want to receive */
 	struct list_head event_list;
 	spinlock_t event_list_lock;
+
+	/* directory xattrs */
+	struct cgroup_xattr_root xattr_root;
 };
 
 /*
@@ -330,6 +342,9 @@ struct cftype {
 	/* The subsystem this cgroup file belongs to */
 	struct cgroup_subsys *subsys;
 
+	/* file xattrs */
+	struct cgroup_xattr_root xattr_root;
+
 	int (*open)(struct inode *inode, struct file *file);
 	ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft,
 			struct file *file,
diff --git a/init/Kconfig b/init/Kconfig
index 6ac2236..28990ec 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -587,6 +587,18 @@ menuconfig CGROUPS
 
 if CGROUPS
 
+config CGROUP_XATTR
+	bool "Cgroup extended attributes"
+	default n
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  Currently the system.* namespace is not supported.
+
+	  If unsure, say N.
+
 config CGROUP_DEBUG
 	bool "Example debug cgroup subsystem"
 	default n
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c4ed6fe..ab4cca5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,7 +60,8 @@
 #include <linux/eventfd.h>
 #include <linux/poll.h>
 #include <linux/flex_array.h> /* used in cgroup_attach_proc */
-
+#include <linux/xattr.h>
+#include <linux/rbtree.h>
 #include <linux/atomic.h>
 
 /*
@@ -786,6 +787,9 @@ static int cgroup_repopulate_dir(struct cgroup *cgrp, unsigned long added_bits,
 static const struct inode_operations cgroup_dir_inode_operations;
 static const struct file_operations proc_cgroupstats_operations;
 
+static void cgroup_xattrs_init(struct cgroup_xattr_root *root);
+static void cgroup_xattrs_destroy(struct cgroup_xattr_root *root);
+
 static struct backing_dev_info cgroup_backing_dev_info = {
 	.name		= "cgroup",
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
@@ -865,7 +869,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 		 */
 		BUG_ON(!list_empty(&cgrp->pidlists));
 
+		cgroup_xattrs_destroy(&cgrp->xattr_root);
+
 		kfree_rcu(cgrp, rcu_head);
+	} else {
+		struct cftype *cft = dentry->d_fsdata;
+		cgroup_xattrs_destroy(&cft->xattr_root);
 	}
 	iput(inode);
 }
@@ -1355,6 +1364,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	mutex_init(&cgrp->pidlist_mutex);
 	INIT_LIST_HEAD(&cgrp->event_list);
 	spin_lock_init(&cgrp->event_list_lock);
+	cgroup_xattrs_init(&cgrp->xattr_root);
 }
 
 static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1700,6 +1710,8 @@ static void cgroup_kill_sb(struct super_block *sb) {
 	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
 
+	cgroup_xattrs_destroy(&cgrp->xattr_root);
+
 	kill_litter_super(sb);
 	cgroup_drop_root(root);
 }
@@ -2608,18 +2620,256 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
 }
 
 static const struct file_operations cgroup_file_operations = {
-	.read = cgroup_file_read,
-	.write = cgroup_file_write,
-	.llseek = generic_file_llseek,
-	.open = cgroup_file_open,
-	.release = cgroup_file_release,
+	.read		= cgroup_file_read,
+	.write		= cgroup_file_write,
+	.llseek		= generic_file_llseek,
+	.open		= cgroup_file_open,
+	.release	= cgroup_file_release,
+};
+
+#ifdef CONFIG_CGROUP_XATTR
+
+struct cgroup_xattr_entry {
+	struct rb_node node;
+	char *name;
+	char *val;
+	int len;
+};
+
+static void free_xattr_entry(struct cgroup_xattr_entry *entry)
+{
+	kfree(entry->name);
+	kfree(entry->val);
+	kfree(entry);
+}
+
+static struct cgroup_xattr_root *xattr_root(struct dentry *dentry)
+{
+	if (S_ISDIR(dentry->d_inode->i_mode))
+		return &__d_cgrp(dentry)->xattr_root;
+	else
+		return &__d_cft(dentry)->xattr_root;
+}
+
+static void cgroup_xattrs_init(struct cgroup_xattr_root *root)
+{
+	spin_lock_init(&root->lock);
+	root->root = RB_ROOT;
+}
+
+static void cgroup_xattrs_destroy(struct cgroup_xattr_root *xattr_root)
+{
+	struct rb_root *root = &xattr_root->root;
+	struct rb_node *node;
+	struct cgroup_xattr_entry *entry;
+
+	while (true) {
+		node = rb_first(root);
+		if (!node)
+			break;
+		entry = rb_entry(node, struct cgroup_xattr_entry, node);
+
+		rb_erase(node, root);
+		free_xattr_entry(entry);
+	}
+}
+
+static bool is_valid_xattr(const char *name)
+{
+	if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) ||
+	    !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
+	    !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
+		return true;
+	return false;
+}
+
+static int __cgroup_setxattr(struct dentry *dentry, const char *name,
+			     const void *value, size_t size, int flags)
+{
+	struct cgroup_xattr_root *root = xattr_root(dentry);
+	struct cgroup_xattr_entry *entry = NULL;
+	struct cgroup_xattr_entry *new = NULL;
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	int cmp;
+	int ret = 0;
+	char tmp[200];
+
+	if (!is_valid_xattr(name))
+		return -EOPNOTSUPP;
+
+	if (value) {
+		new = kzalloc(sizeof(*new), GFP_KERNEL);
+		if (!new)
+			return -ENOMEM;
+		new->name = kstrdup(name, GFP_KERNEL);
+		new->val = kmemdup(value, size, GFP_KERNEL);
+		new->len = size;
+		if (!new->name || !new->val) {
+			free_xattr_entry(new);
+			return -ENOMEM;
+		}
+	}
+
+	memcpy(tmp, value, size);
+	tmp[size] = '\0';
+
+	spin_lock(&root->lock);
+
+	p = &root->root.rb_node;
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct cgroup_xattr_entry, node);
+
+		cmp = strcmp(name, entry->name);
+		if (cmp > 0)
+			p = &(*p)->rb_right;
+		else if (cmp < 0)
+			p = &(*p)->rb_left;
+		else
+			break;
+	}
+
+	if (*p) {
+		if (flags & XATTR_CREATE) {
+			ret = -EEXIST;
+		} else if (new) {
+			swap(entry->val, new->val);
+			swap(entry->len, new->len);
+		} else {
+			rb_erase(&entry->node, &root->root);
+			new = entry;
+		}
+
+		free_xattr_entry(new);
+	} else {
+		if (!new || (flags & XATTR_REPLACE)) {
+			ret = -ENOENT;
+		} else {
+			rb_link_node(&new->node, parent, p);
+			rb_insert_color(&new->node, &root->root);
+		}
+	}
+
+	spin_unlock(&root->lock);
+
+	return ret;
+}
+
+static int cgroup_setxattr(struct dentry *dentry, const char *name,
+			   const void *value, size_t size, int flags)
+{
+	if (size == 0)
+		value = "";
+
+	return __cgroup_setxattr(dentry, name, value, size, flags);
+}
+
+static int cgroup_removexattr(struct dentry *dentry, const char *name)
+{
+	return __cgroup_setxattr(dentry, name, NULL, 0, XATTR_REPLACE);
+}
+
+static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name,
+			       void *buf, size_t size)
+{
+	struct cgroup_xattr_root *root = xattr_root(dentry);
+	struct cgroup_xattr_entry *entry;
+	struct rb_node *node;
+	int cmp;
+	int ret = -ENOENT;
+
+	if (!is_valid_xattr(name))
+		return -EOPNOTSUPP;
+
+	spin_lock(&root->lock);
+	node = root->root.rb_node;
+	while (node) {
+		entry = rb_entry(node, struct cgroup_xattr_entry, node);
+
+		cmp = strcmp(name, entry->name);
+		if (cmp > 0) {
+			node = node->rb_right;
+		} else if (cmp < 0) {
+			node = node->rb_left;
+		} else {
+			ret = entry->len;
+			if (buf) {
+				if (size < entry->len)
+					ret = -ERANGE;
+				else
+					memcpy(buf, entry->val, entry->len);
+			}
+			break;
+		}
+	}
+	spin_unlock(&root->lock);
+	return ret;
+}
+
+static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
+{
+	struct cgroup_xattr_root *root = xattr_root(dentry);
+	struct cgroup_xattr_entry *entry;
+	struct rb_node *node;
+	int total_len = 0;
+	int len;
+
+	spin_lock(&root->lock);
+	node = rb_first(&root->root);
+	while (node) {
+		entry = rb_entry(node, struct cgroup_xattr_entry, node);
+
+		if (!capable(CAP_SYS_ADMIN) &&
+		    strncmp(entry->name, XATTR_TRUSTED_PREFIX,
+			    XATTR_TRUSTED_PREFIX_LEN) == 0)
+			continue;
+
+		len = strlen(entry->name) + 1;
+		total_len += len;
+		if (buf) {
+			if (size < total_len) {
+				total_len = -ERANGE;
+				break;
+			}
+			memcpy(buf, entry->name, len);
+			buf += len;
+		}
+
+		node = rb_next(node);
+	}
+	spin_unlock(&root->lock);
+
+	return total_len;
+}
+
+#else /* CONFIG_CGROUP_XATTR */
+
+static void cgroup_xattrs_init(struct cgroup_xattr_root *root) {}
+static void cgroup_xattrs_destroy(struct cgroup_xattr_root *root) {}
+
+#endif
+
+static const struct inode_operations cgroup_file_inode_operations = {
+#ifdef CONFIG_CGROUP_XATTR
+	.setxattr	= cgroup_setxattr,
+	.getxattr	= cgroup_getxattr,
+	.listxattr	= cgroup_listxattr,
+	.removexattr	= cgroup_removexattr,
+#endif
 };
 
 static const struct inode_operations cgroup_dir_inode_operations = {
-	.lookup = cgroup_lookup,
-	.mkdir = cgroup_mkdir,
-	.rmdir = cgroup_rmdir,
-	.rename = cgroup_rename,
+	.lookup		= cgroup_lookup,
+	.mkdir		= cgroup_mkdir,
+	.rmdir		= cgroup_rmdir,
+	.rename		= cgroup_rename,
+#ifdef CONFIG_CGROUP_XATTR
+	.setxattr	= cgroup_setxattr,
+	.getxattr	= cgroup_getxattr,
+	.listxattr	= cgroup_listxattr,
+	.removexattr	= cgroup_removexattr,
+#endif
 };
 
 static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
@@ -2667,6 +2917,7 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
 	} else if (S_ISREG(mode)) {
 		inode->i_size = 0;
 		inode->i_fop = &cgroup_file_operations;
+		inode->i_op = &cgroup_file_inode_operations;
 	}
 	d_instantiate(dentry, inode);
 	dget(dentry);	/* Extra count - pin the dentry in core */
@@ -2736,6 +2987,7 @@ int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
 
 	cft->subsys = subsys;
+	cgroup_xattrs_init(&cft->xattr_root);
 
 	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
 		strcpy(name, subsys->name);
-- 
1.7.3.1

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] cgroup: add xattr support
@ 2012-01-17 17:53     ` Tejun Heo
  0 siblings, 0 replies; 29+ messages in thread
From: Tejun Heo @ 2012-01-17 17:53 UTC (permalink / raw)
  To: Li Zefan; +Cc: LKML, Cgroups, Lennart Poettering, Kay Sievers

Hello,

On Mon, Jan 16, 2012 at 04:07:05PM +0800, Li Zefan wrote:
> This is one of the items in the plumber's wish list.
> 
> For use cases:
> 
> >> What would the use case be for this?
> >
> > Attaching meta information to services, in an easily discoverable
> > way. For example, in systemd we create one cgroup for each service, and
> > could then store data like the main pid of the specific service as an
> > xattr on the cgroup itself. That way we'd have almost all service state
> > in the cgroupfs, which would make it possible to terminate systemd and
> > later restart it without losing any state information. But there's more:
> > for example, some very peculiar services cannot be terminated on
> > shutdown (i.e. fakeraid DM stuff) and it would be really nice if the
> > services in question could just mark that on their cgroup, by setting an
> > xattr. On the more desktopy side of things there are other
> > possibilities: for example there are plans defining what an application
> > is along the lines of a cgroup (i.e. an app being a collection of
> > processes). With xattrs one could then attach an icon or human readable
> > program name on the cgroup.
> >
> > The key idea is that this would allow attaching runtime meta information
> > to cgroups and everything they model (services, apps, vms), that doesn't
> > need any complex userspace infrastructure, has good access control
> > (i.e. because the file system enforces that anyway, and there's the
> > "trusted." xattr namespace), notifications (inotify), and can easily be
> > shared among applications.
> >
> > Lennart
> 
> Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>

Ummm... I don't feel too good about this.  Why can't those extra
information be kept somewhere else?  Overloading cgroupfs as storage
for essentially opaque userland information doesn't seem like a good
idea to me.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] cgroup: add xattr support
@ 2012-01-17 17:53     ` Tejun Heo
  0 siblings, 0 replies; 29+ messages in thread
From: Tejun Heo @ 2012-01-17 17:53 UTC (permalink / raw)
  To: Li Zefan; +Cc: LKML, Cgroups, Lennart Poettering, Kay Sievers

Hello,

On Mon, Jan 16, 2012 at 04:07:05PM +0800, Li Zefan wrote:
> This is one of the items in the plumber's wish list.
> 
> For use cases:
> 
> >> What would the use case be for this?
> >
> > Attaching meta information to services, in an easily discoverable
> > way. For example, in systemd we create one cgroup for each service, and
> > could then store data like the main pid of the specific service as an
> > xattr on the cgroup itself. That way we'd have almost all service state
> > in the cgroupfs, which would make it possible to terminate systemd and
> > later restart it without losing any state information. But there's more:
> > for example, some very peculiar services cannot be terminated on
> > shutdown (i.e. fakeraid DM stuff) and it would be really nice if the
> > services in question could just mark that on their cgroup, by setting an
> > xattr. On the more desktopy side of things there are other
> > possibilities: for example there are plans defining what an application
> > is along the lines of a cgroup (i.e. an app being a collection of
> > processes). With xattrs one could then attach an icon or human readable
> > program name on the cgroup.
> >
> > The key idea is that this would allow attaching runtime meta information
> > to cgroups and everything they model (services, apps, vms), that doesn't
> > need any complex userspace infrastructure, has good access control
> > (i.e. because the file system enforces that anyway, and there's the
> > "trusted." xattr namespace), notifications (inotify), and can easily be
> > shared among applications.
> >
> > Lennart
> 
> Signed-off-by: Li Zefan <lizf-BthXqXjhjHXQFUHtdCDX3A@public.gmane.org>

Ummm... I don't feel too good about this.  Why can't those extra
information be kept somewhere else?  Overloading cgroupfs as storage
for essentially opaque userland information doesn't seem like a good
idea to me.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 1/2] cgroup: revise how we re-populate root directory
@ 2012-01-18  7:23   ` Sha
  0 siblings, 0 replies; 29+ messages in thread
From: Sha @ 2012-01-18  7:23 UTC (permalink / raw)
  To: Li Zefan; +Cc: LKML, Cgroups, Tejun Heo, Lennart Poettering, Kay Sievers

Hi Zefan,

On Mon, Jan 16, 2012 at 4:06 PM, Li Zefan <lizf@cn.fujitsu.com> wrote:
> When remounting cgroupfs with some subsystems added to it and some
> removed, cgroup will remove all the files in root directory and then
> re-popluate it.
>
> What I'm doing here is, only remove files which belong to subsystems that
> are to be unbound, and only create files for newly-added subsystems.
> The purpose is to have all other files untouched.
>
> This is a preparation for cgroup xattr support.
>
> Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
> ---
>  include/linux/cgroup.h |   11 +++---
>  kernel/cgroup.c        |   92 +++++++++++++++++++++++++++++++----------------
>  2 files changed, 67 insertions(+), 36 deletions(-)
>
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index e9b6021..13db9e8 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -327,6 +327,9 @@ struct cftype {
>         */
>        size_t max_write_len;
>
> +       /* The subsystem this cgroup file belongs to */
> +       struct cgroup_subsys *subsys;
> +
>        int (*open)(struct inode *inode, struct file *file);
>        ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft,
>                        struct file *file,
> @@ -419,16 +422,14 @@ struct cgroup_scanner {
>  * called by subsystems from within a populate() method
>  */
>  int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
> -                      const struct cftype *cft);
> +                   struct cftype *cft);
>
>  /*
>  * Add a set of new files to the given cgroup directory. Should
>  * only be called by subsystems from within a populate() method
>  */
> -int cgroup_add_files(struct cgroup *cgrp,
> -                       struct cgroup_subsys *subsys,
> -                       const struct cftype cft[],
> -                       int count);
> +int cgroup_add_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
> +                    struct cftype cft[], int count);
>
>  int cgroup_is_removed(const struct cgroup *cgrp);
>
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index a5d3b53..c4ed6fe 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -781,6 +781,8 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
>  static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
>  static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
>  static int cgroup_populate_dir(struct cgroup *cgrp);
> +static int cgroup_repopulate_dir(struct cgroup *cgrp, unsigned long added_bits,
> +                                unsigned long removed_bits);
>  static const struct inode_operations cgroup_dir_inode_operations;
>  static const struct file_operations proc_cgroupstats_operations;
>
> @@ -882,34 +884,44 @@ static void remove_dir(struct dentry *d)
>        dput(parent);
>  }
>
> -static void cgroup_clear_directory(struct dentry *dentry)
> +static void cgroup_clear_directory(struct dentry *dentry, bool remove_all,
> +                                  unsigned long removed_bits)
>  {
> -       struct list_head *node;
> +       LIST_HEAD(head);
> +       struct dentry *d, *node;
>
>        BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
> +
>        spin_lock(&dentry->d_lock);
> -       node = dentry->d_subdirs.next;
> -       while (node != &dentry->d_subdirs) {
> -               struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
> +       list_for_each_entry_safe(d, node, &dentry->d_subdirs, d_u.d_child) {
> +               struct cftype *cft = d->d_fsdata;
> +
> +               if (!remove_all && cft->subsys &&
> +                   !test_bit(cft->subsys->subsys_id, &removed_bits))
> +                       continue;
>
>                spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
> -               list_del_init(node);
> +               list_move(&d->d_u.d_child, &head);
>                if (d->d_inode) {
>                        /* This should never be called on a cgroup
>                         * directory with child cgroups */
>                        BUG_ON(d->d_inode->i_mode & S_IFDIR);
>                        dget_dlock(d);
> -                       spin_unlock(&d->d_lock);
> -                       spin_unlock(&dentry->d_lock);
> +               }
> +               spin_unlock(&d->d_lock);
> +       }
> +       spin_unlock(&dentry->d_lock);
> +
> +       list_for_each_entry_safe(d, node, &head, d_u.d_child) {
> +               spin_lock(&d->d_lock);
> +               list_del_init(&d->d_u.d_child);
> +               spin_unlock(&d->d_lock);
> +               if (d->d_inode) {
>                        d_delete(d);
>                        simple_unlink(dentry->d_inode, d);
>                        dput(d);
> -                       spin_lock(&dentry->d_lock);
> -               } else
> -                       spin_unlock(&d->d_lock);
> -               node = dentry->d_subdirs.next;
> +               }
>        }
> -       spin_unlock(&dentry->d_lock);
>  }
>
>  /*
> @@ -919,7 +931,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
>  {
>        struct dentry *parent;
>
> -       cgroup_clear_directory(dentry);
> +       cgroup_clear_directory(dentry, true, 0);
>
>        parent = dentry->d_parent;
>        spin_lock(&parent->d_lock);
> @@ -1284,6 +1296,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
>        struct cgroupfs_root *root = sb->s_fs_info;
>        struct cgroup *cgrp = &root->top_cgroup;
>        struct cgroup_sb_opts opts;
> +       unsigned long added_bits, removed_bits;
>
>        mutex_lock(&cgrp->dentry->d_inode->i_mutex);
>        mutex_lock(&cgroup_mutex);
> @@ -1294,6 +1307,9 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
>        if (ret)
>                goto out_unlock;
>
> +       added_bits = opts.subsys_bits & ~root->subsys_bits;
> +       removed_bits = root->subsys_bits & ~opts.subsys_bits;
> +

Should it be the following?:
          added_bits = opts.subsys_bits & ~root->actual_subsys_bits;
          removed_bits = root->actual_subsys_bits & ~opts.subsys_bits;

Thanks,
Sha

>        /* Don't allow flags or name to change at remount */
>        if (opts.flags != root->flags ||
>            (opts.name && strcmp(opts.name, root->name))) {
> @@ -1308,8 +1324,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
>                goto out_unlock;
>        }
>
> -       /* (re)populate subsystem files */
> -       cgroup_populate_dir(cgrp);
> +       /* re-populate subsystem files */
> +       cgroup_repopulate_dir(cgrp, added_bits, removed_bits);
>
>        if (opts.release_agent)
>                strcpy(root->release_agent_path, opts.release_agent);
> @@ -2710,16 +2726,17 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
>        return mode;
>  }
>
> -int cgroup_add_file(struct cgroup *cgrp,
> -                      struct cgroup_subsys *subsys,
> -                      const struct cftype *cft)
> +int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
> +                   struct cftype *cft)
>  {
>        struct dentry *dir = cgrp->dentry;
>        struct dentry *dentry;
>        int error;
>        umode_t mode;
> -
>        char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
> +
> +       cft->subsys = subsys;
> +
>        if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
>                strcpy(name, subsys->name);
>                strcat(name, ".");
> @@ -2740,10 +2757,8 @@ int cgroup_add_file(struct cgroup *cgrp,
>  }
>  EXPORT_SYMBOL_GPL(cgroup_add_file);
>
> -int cgroup_add_files(struct cgroup *cgrp,
> -                       struct cgroup_subsys *subsys,
> -                       const struct cftype cft[],
> -                       int count)
> +int cgroup_add_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
> +                    struct cftype cft[], int count)
>  {
>        int i, err;
>        for (i = 0; i < count; i++) {
> @@ -3703,26 +3718,27 @@ static struct cftype cft_release_agent = {
>        .max_write_len = PATH_MAX,
>  };
>
> -static int cgroup_populate_dir(struct cgroup *cgrp)
> +static int __cgroup_populate_dir(struct cgroup *cgrp, unsigned long added_bits)
>  {
>        int err;
>        struct cgroup_subsys *ss;
>
> -       /* First clear out any existing files */
> -       cgroup_clear_directory(cgrp->dentry);
> -
>        err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
>        if (err < 0)
>                return err;
>
>        if (cgrp == cgrp->top_cgroup) {
> -               if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
> +               err = cgroup_add_file(cgrp, NULL, &cft_release_agent);
> +               if (err < 0)
>                        return err;
>        }
>
>        for_each_subsys(cgrp->root, ss) {
> -               if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
> -                       return err;
> +               if (test_bit(ss->subsys_id, &added_bits) && ss->populate) {
> +                       err = ss->populate(ss, cgrp);
> +                       if (err < 0)
> +                               return err;
> +               }
>        }
>        /* This cgroup is ready now */
>        for_each_subsys(cgrp->root, ss) {
> @@ -3739,6 +3755,20 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
>        return 0;
>  }
>
> +static int cgroup_populate_dir(struct cgroup *cgrp)
> +{
> +       return __cgroup_populate_dir(cgrp, cgrp->root->subsys_bits);
> +}
> +
> +static int cgroup_repopulate_dir(struct cgroup *cgrp, unsigned long added_bits,
> +                                unsigned long removed_bits)
> +{
> +       /* First clear out files */
> +       cgroup_clear_directory(cgrp->dentry, false, removed_bits);
> +
> +       return __cgroup_populate_dir(cgrp, added_bits);
> +}
> +
>  static void init_cgroup_css(struct cgroup_subsys_state *css,
>                               struct cgroup_subsys *ss,
>                               struct cgroup *cgrp)
> --
> 1.7.3.1
> --
> To unsubscribe from this list: send the line "unsubscribe cgroups" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 1/2] cgroup: revise how we re-populate root directory
@ 2012-01-18  7:23   ` Sha
  0 siblings, 0 replies; 29+ messages in thread
From: Sha @ 2012-01-18  7:23 UTC (permalink / raw)
  To: Li Zefan; +Cc: LKML, Cgroups, Tejun Heo, Lennart Poettering, Kay Sievers

Hi Zefan,

On Mon, Jan 16, 2012 at 4:06 PM, Li Zefan <lizf-BthXqXjhjHXQFUHtdCDX3A@public.gmane.org> wrote:
> When remounting cgroupfs with some subsystems added to it and some
> removed, cgroup will remove all the files in root directory and then
> re-popluate it.
>
> What I'm doing here is, only remove files which belong to subsystems that
> are to be unbound, and only create files for newly-added subsystems.
> The purpose is to have all other files untouched.
>
> This is a preparation for cgroup xattr support.
>
> Signed-off-by: Li Zefan <lizf-BthXqXjhjHXQFUHtdCDX3A@public.gmane.org>
> ---
>  include/linux/cgroup.h |   11 +++---
>  kernel/cgroup.c        |   92 +++++++++++++++++++++++++++++++----------------
>  2 files changed, 67 insertions(+), 36 deletions(-)
>
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index e9b6021..13db9e8 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -327,6 +327,9 @@ struct cftype {
>         */
>        size_t max_write_len;
>
> +       /* The subsystem this cgroup file belongs to */
> +       struct cgroup_subsys *subsys;
> +
>        int (*open)(struct inode *inode, struct file *file);
>        ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft,
>                        struct file *file,
> @@ -419,16 +422,14 @@ struct cgroup_scanner {
>  * called by subsystems from within a populate() method
>  */
>  int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
> -                      const struct cftype *cft);
> +                   struct cftype *cft);
>
>  /*
>  * Add a set of new files to the given cgroup directory. Should
>  * only be called by subsystems from within a populate() method
>  */
> -int cgroup_add_files(struct cgroup *cgrp,
> -                       struct cgroup_subsys *subsys,
> -                       const struct cftype cft[],
> -                       int count);
> +int cgroup_add_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
> +                    struct cftype cft[], int count);
>
>  int cgroup_is_removed(const struct cgroup *cgrp);
>
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index a5d3b53..c4ed6fe 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -781,6 +781,8 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
>  static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
>  static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
>  static int cgroup_populate_dir(struct cgroup *cgrp);
> +static int cgroup_repopulate_dir(struct cgroup *cgrp, unsigned long added_bits,
> +                                unsigned long removed_bits);
>  static const struct inode_operations cgroup_dir_inode_operations;
>  static const struct file_operations proc_cgroupstats_operations;
>
> @@ -882,34 +884,44 @@ static void remove_dir(struct dentry *d)
>        dput(parent);
>  }
>
> -static void cgroup_clear_directory(struct dentry *dentry)
> +static void cgroup_clear_directory(struct dentry *dentry, bool remove_all,
> +                                  unsigned long removed_bits)
>  {
> -       struct list_head *node;
> +       LIST_HEAD(head);
> +       struct dentry *d, *node;
>
>        BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
> +
>        spin_lock(&dentry->d_lock);
> -       node = dentry->d_subdirs.next;
> -       while (node != &dentry->d_subdirs) {
> -               struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
> +       list_for_each_entry_safe(d, node, &dentry->d_subdirs, d_u.d_child) {
> +               struct cftype *cft = d->d_fsdata;
> +
> +               if (!remove_all && cft->subsys &&
> +                   !test_bit(cft->subsys->subsys_id, &removed_bits))
> +                       continue;
>
>                spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
> -               list_del_init(node);
> +               list_move(&d->d_u.d_child, &head);
>                if (d->d_inode) {
>                        /* This should never be called on a cgroup
>                         * directory with child cgroups */
>                        BUG_ON(d->d_inode->i_mode & S_IFDIR);
>                        dget_dlock(d);
> -                       spin_unlock(&d->d_lock);
> -                       spin_unlock(&dentry->d_lock);
> +               }
> +               spin_unlock(&d->d_lock);
> +       }
> +       spin_unlock(&dentry->d_lock);
> +
> +       list_for_each_entry_safe(d, node, &head, d_u.d_child) {
> +               spin_lock(&d->d_lock);
> +               list_del_init(&d->d_u.d_child);
> +               spin_unlock(&d->d_lock);
> +               if (d->d_inode) {
>                        d_delete(d);
>                        simple_unlink(dentry->d_inode, d);
>                        dput(d);
> -                       spin_lock(&dentry->d_lock);
> -               } else
> -                       spin_unlock(&d->d_lock);
> -               node = dentry->d_subdirs.next;
> +               }
>        }
> -       spin_unlock(&dentry->d_lock);
>  }
>
>  /*
> @@ -919,7 +931,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
>  {
>        struct dentry *parent;
>
> -       cgroup_clear_directory(dentry);
> +       cgroup_clear_directory(dentry, true, 0);
>
>        parent = dentry->d_parent;
>        spin_lock(&parent->d_lock);
> @@ -1284,6 +1296,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
>        struct cgroupfs_root *root = sb->s_fs_info;
>        struct cgroup *cgrp = &root->top_cgroup;
>        struct cgroup_sb_opts opts;
> +       unsigned long added_bits, removed_bits;
>
>        mutex_lock(&cgrp->dentry->d_inode->i_mutex);
>        mutex_lock(&cgroup_mutex);
> @@ -1294,6 +1307,9 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
>        if (ret)
>                goto out_unlock;
>
> +       added_bits = opts.subsys_bits & ~root->subsys_bits;
> +       removed_bits = root->subsys_bits & ~opts.subsys_bits;
> +

Should it be the following?:
          added_bits = opts.subsys_bits & ~root->actual_subsys_bits;
          removed_bits = root->actual_subsys_bits & ~opts.subsys_bits;

Thanks,
Sha

>        /* Don't allow flags or name to change at remount */
>        if (opts.flags != root->flags ||
>            (opts.name && strcmp(opts.name, root->name))) {
> @@ -1308,8 +1324,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
>                goto out_unlock;
>        }
>
> -       /* (re)populate subsystem files */
> -       cgroup_populate_dir(cgrp);
> +       /* re-populate subsystem files */
> +       cgroup_repopulate_dir(cgrp, added_bits, removed_bits);
>
>        if (opts.release_agent)
>                strcpy(root->release_agent_path, opts.release_agent);
> @@ -2710,16 +2726,17 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
>        return mode;
>  }
>
> -int cgroup_add_file(struct cgroup *cgrp,
> -                      struct cgroup_subsys *subsys,
> -                      const struct cftype *cft)
> +int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
> +                   struct cftype *cft)
>  {
>        struct dentry *dir = cgrp->dentry;
>        struct dentry *dentry;
>        int error;
>        umode_t mode;
> -
>        char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
> +
> +       cft->subsys = subsys;
> +
>        if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
>                strcpy(name, subsys->name);
>                strcat(name, ".");
> @@ -2740,10 +2757,8 @@ int cgroup_add_file(struct cgroup *cgrp,
>  }
>  EXPORT_SYMBOL_GPL(cgroup_add_file);
>
> -int cgroup_add_files(struct cgroup *cgrp,
> -                       struct cgroup_subsys *subsys,
> -                       const struct cftype cft[],
> -                       int count)
> +int cgroup_add_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
> +                    struct cftype cft[], int count)
>  {
>        int i, err;
>        for (i = 0; i < count; i++) {
> @@ -3703,26 +3718,27 @@ static struct cftype cft_release_agent = {
>        .max_write_len = PATH_MAX,
>  };
>
> -static int cgroup_populate_dir(struct cgroup *cgrp)
> +static int __cgroup_populate_dir(struct cgroup *cgrp, unsigned long added_bits)
>  {
>        int err;
>        struct cgroup_subsys *ss;
>
> -       /* First clear out any existing files */
> -       cgroup_clear_directory(cgrp->dentry);
> -
>        err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
>        if (err < 0)
>                return err;
>
>        if (cgrp == cgrp->top_cgroup) {
> -               if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
> +               err = cgroup_add_file(cgrp, NULL, &cft_release_agent);
> +               if (err < 0)
>                        return err;
>        }
>
>        for_each_subsys(cgrp->root, ss) {
> -               if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
> -                       return err;
> +               if (test_bit(ss->subsys_id, &added_bits) && ss->populate) {
> +                       err = ss->populate(ss, cgrp);
> +                       if (err < 0)
> +                               return err;
> +               }
>        }
>        /* This cgroup is ready now */
>        for_each_subsys(cgrp->root, ss) {
> @@ -3739,6 +3755,20 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
>        return 0;
>  }
>
> +static int cgroup_populate_dir(struct cgroup *cgrp)
> +{
> +       return __cgroup_populate_dir(cgrp, cgrp->root->subsys_bits);
> +}
> +
> +static int cgroup_repopulate_dir(struct cgroup *cgrp, unsigned long added_bits,
> +                                unsigned long removed_bits)
> +{
> +       /* First clear out files */
> +       cgroup_clear_directory(cgrp->dentry, false, removed_bits);
> +
> +       return __cgroup_populate_dir(cgrp, added_bits);
> +}
> +
>  static void init_cgroup_css(struct cgroup_subsys_state *css,
>                               struct cgroup_subsys *ss,
>                               struct cgroup *cgrp)
> --
> 1.7.3.1
> --
> To unsubscribe from this list: send the line "unsubscribe cgroups" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 1/2] cgroup: revise how we re-populate root directory
  2012-01-18  7:23   ` Sha
  (?)
@ 2012-01-18  7:59   ` Li Zefan
  -1 siblings, 0 replies; 29+ messages in thread
From: Li Zefan @ 2012-01-18  7:59 UTC (permalink / raw)
  To: Sha; +Cc: LKML, Cgroups, Tejun Heo, Lennart Poettering, Kay Sievers

>> @@ -1284,6 +1296,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
>>        struct cgroupfs_root *root = sb->s_fs_info;
>>        struct cgroup *cgrp = &root->top_cgroup;
>>        struct cgroup_sb_opts opts;
>> +       unsigned long added_bits, removed_bits;
>>
>>        mutex_lock(&cgrp->dentry->d_inode->i_mutex);
>>        mutex_lock(&cgroup_mutex);
>> @@ -1294,6 +1307,9 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
>>        if (ret)
>>                goto out_unlock;
>>
>> +       added_bits = opts.subsys_bits & ~root->subsys_bits;
>> +       removed_bits = root->subsys_bits & ~opts.subsys_bits;
>> +
> 
> Should it be the following?:
>           added_bits = opts.subsys_bits & ~root->actual_subsys_bits;
>           removed_bits = root->actual_subsys_bits & ~opts.subsys_bits;
> 

subsys_bits and actual_subsys_bits differ during cgroup_mount() only,
and in other places they are replaceable with each other.

Thanks for looking into this.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] cgroup: add xattr support
@ 2012-01-18  8:27       ` Li Zefan
  0 siblings, 0 replies; 29+ messages in thread
From: Li Zefan @ 2012-01-18  8:27 UTC (permalink / raw)
  To: Tejun Heo; +Cc: LKML, Cgroups, Lennart Poettering, Kay Sievers

Tejun Heo wrote:
> Hello,
> 
> On Mon, Jan 16, 2012 at 04:07:05PM +0800, Li Zefan wrote:
>> This is one of the items in the plumber's wish list.
>>
>> For use cases:
>>
>>>> What would the use case be for this?
>>>
>>> Attaching meta information to services, in an easily discoverable
>>> way. For example, in systemd we create one cgroup for each service, and
>>> could then store data like the main pid of the specific service as an
>>> xattr on the cgroup itself. That way we'd have almost all service state
>>> in the cgroupfs, which would make it possible to terminate systemd and
>>> later restart it without losing any state information. But there's more:
>>> for example, some very peculiar services cannot be terminated on
>>> shutdown (i.e. fakeraid DM stuff) and it would be really nice if the
>>> services in question could just mark that on their cgroup, by setting an
>>> xattr. On the more desktopy side of things there are other
>>> possibilities: for example there are plans defining what an application
>>> is along the lines of a cgroup (i.e. an app being a collection of
>>> processes). With xattrs one could then attach an icon or human readable
>>> program name on the cgroup.
>>>
>>> The key idea is that this would allow attaching runtime meta information
>>> to cgroups and everything they model (services, apps, vms), that doesn't
>>> need any complex userspace infrastructure, has good access control
>>> (i.e. because the file system enforces that anyway, and there's the
>>> "trusted." xattr namespace), notifications (inotify), and can easily be
>>> shared among applications.
>>>
>>> Lennart
>>
>> Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
> 
> Ummm... I don't feel too good about this.  Why can't those extra
> information be kept somewhere else?  Overloading cgroupfs as storage
> for essentially opaque userland information doesn't seem like a good
> idea to me.
> 

Long ago Paul M toyed with a patch that adds a control file for userspace
to read/write per-cgroup user information, but there were no use cases.
This patchset has a similar purpose, but this interface is more flexable
and easier to use, and we do have systemd as a use case this time.

I'll let Lennart answer if we can easily live without this.

Furthermore, I noticed tmpfs also implemented xattr support, and we
should be able to share most of the code, which makes the cost for
having this feature smaller.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] cgroup: add xattr support
@ 2012-01-18  8:27       ` Li Zefan
  0 siblings, 0 replies; 29+ messages in thread
From: Li Zefan @ 2012-01-18  8:27 UTC (permalink / raw)
  To: Tejun Heo; +Cc: LKML, Cgroups, Lennart Poettering, Kay Sievers

Tejun Heo wrote:
> Hello,
> 
> On Mon, Jan 16, 2012 at 04:07:05PM +0800, Li Zefan wrote:
>> This is one of the items in the plumber's wish list.
>>
>> For use cases:
>>
>>>> What would the use case be for this?
>>>
>>> Attaching meta information to services, in an easily discoverable
>>> way. For example, in systemd we create one cgroup for each service, and
>>> could then store data like the main pid of the specific service as an
>>> xattr on the cgroup itself. That way we'd have almost all service state
>>> in the cgroupfs, which would make it possible to terminate systemd and
>>> later restart it without losing any state information. But there's more:
>>> for example, some very peculiar services cannot be terminated on
>>> shutdown (i.e. fakeraid DM stuff) and it would be really nice if the
>>> services in question could just mark that on their cgroup, by setting an
>>> xattr. On the more desktopy side of things there are other
>>> possibilities: for example there are plans defining what an application
>>> is along the lines of a cgroup (i.e. an app being a collection of
>>> processes). With xattrs one could then attach an icon or human readable
>>> program name on the cgroup.
>>>
>>> The key idea is that this would allow attaching runtime meta information
>>> to cgroups and everything they model (services, apps, vms), that doesn't
>>> need any complex userspace infrastructure, has good access control
>>> (i.e. because the file system enforces that anyway, and there's the
>>> "trusted." xattr namespace), notifications (inotify), and can easily be
>>> shared among applications.
>>>
>>> Lennart
>>
>> Signed-off-by: Li Zefan <lizf-BthXqXjhjHXQFUHtdCDX3A@public.gmane.org>
> 
> Ummm... I don't feel too good about this.  Why can't those extra
> information be kept somewhere else?  Overloading cgroupfs as storage
> for essentially opaque userland information doesn't seem like a good
> idea to me.
> 

Long ago Paul M toyed with a patch that adds a control file for userspace
to read/write per-cgroup user information, but there were no use cases.
This patchset has a similar purpose, but this interface is more flexable
and easier to use, and we do have systemd as a use case this time.

I'll let Lennart answer if we can easily live without this.

Furthermore, I noticed tmpfs also implemented xattr support, and we
should be able to share most of the code, which makes the cost for
having this feature smaller.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] cgroup: add xattr support
@ 2012-01-18 17:47         ` Tejun Heo
  0 siblings, 0 replies; 29+ messages in thread
From: Tejun Heo @ 2012-01-18 17:47 UTC (permalink / raw)
  To: Li Zefan; +Cc: LKML, Cgroups, Lennart Poettering, Kay Sievers

Hello,

On Wed, Jan 18, 2012 at 04:27:18PM +0800, Li Zefan wrote:
> Furthermore, I noticed tmpfs also implemented xattr support, and we
> should be able to share most of the code, which makes the cost for
> having this feature smaller.

Yes, maybe, but tmpfs doesn't have any special semantics attached to
files.  It's supposed to store opaque data from userland.  Regardless
of code complexity, I just don't think this is a good idea.  What if
we later want to attach certain config meaning to or export statistics
via xattrs?  That would be quite unusual and require strong rationale
but at least fall in the similar usage as with the rest of the fs.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] cgroup: add xattr support
@ 2012-01-18 17:47         ` Tejun Heo
  0 siblings, 0 replies; 29+ messages in thread
From: Tejun Heo @ 2012-01-18 17:47 UTC (permalink / raw)
  To: Li Zefan; +Cc: LKML, Cgroups, Lennart Poettering, Kay Sievers

Hello,

On Wed, Jan 18, 2012 at 04:27:18PM +0800, Li Zefan wrote:
> Furthermore, I noticed tmpfs also implemented xattr support, and we
> should be able to share most of the code, which makes the cost for
> having this feature smaller.

Yes, maybe, but tmpfs doesn't have any special semantics attached to
files.  It's supposed to store opaque data from userland.  Regardless
of code complexity, I just don't think this is a good idea.  What if
we later want to attach certain config meaning to or export statistics
via xattrs?  That would be quite unusual and require strong rationale
but at least fall in the similar usage as with the rest of the fs.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] cgroup: add xattr support
@ 2012-01-18 21:28       ` Kay Sievers
  0 siblings, 0 replies; 29+ messages in thread
From: Kay Sievers @ 2012-01-18 21:28 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Li Zefan, LKML, Cgroups, Lennart Poettering

On Tue, Jan 17, 2012 at 18:53, Tejun Heo <tj@kernel.org> wrote:

>> > The key idea is that this would allow attaching runtime meta information
>> > to cgroups and everything they model (services, apps, vms), that doesn't
>> > need any complex userspace infrastructure, has good access control
>> > (i.e. because the file system enforces that anyway, and there's the
>> > "trusted." xattr namespace), notifications (inotify), and can easily be
>> > shared among applications.

> Ummm... I don't feel too good about this.  Why can't those extra
> information be kept somewhere else?  Overloading cgroupfs as storage
> for essentially opaque userland information doesn't seem like a good
> idea to me.

A bit of the background:

The /sys/fs/cgroup/systemd/ hierarchy represents something like a
first class kernel-related system object, which we call an
'application' or 'service' in userspace. Every 'service' or
'application' has zero, one or many processes, and has some high-level
metadata attached.

The general idea, not specifcally related to cgroups, is to try as
hard as possible to stick metadata about objects to the objects
themselves. It's a bit like the difference of lock files vs. locks
attached to a process. In the later case, if the process dies, the
lock is automatically cleaned up, in the former, we have a stale file.

The idea with the cgroup fs xattrs was to be able to attach some
general useful attributes to the 'service container' itself, instead
of keeping them in the memory of the managing process or store them on
disk which can get out-of-sync much easier.

Kay

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] cgroup: add xattr support
@ 2012-01-18 21:28       ` Kay Sievers
  0 siblings, 0 replies; 29+ messages in thread
From: Kay Sievers @ 2012-01-18 21:28 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Li Zefan, LKML, Cgroups, Lennart Poettering

On Tue, Jan 17, 2012 at 18:53, Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org> wrote:

>> > The key idea is that this would allow attaching runtime meta information
>> > to cgroups and everything they model (services, apps, vms), that doesn't
>> > need any complex userspace infrastructure, has good access control
>> > (i.e. because the file system enforces that anyway, and there's the
>> > "trusted." xattr namespace), notifications (inotify), and can easily be
>> > shared among applications.

> Ummm... I don't feel too good about this.  Why can't those extra
> information be kept somewhere else?  Overloading cgroupfs as storage
> for essentially opaque userland information doesn't seem like a good
> idea to me.

A bit of the background:

The /sys/fs/cgroup/systemd/ hierarchy represents something like a
first class kernel-related system object, which we call an
'application' or 'service' in userspace. Every 'service' or
'application' has zero, one or many processes, and has some high-level
metadata attached.

The general idea, not specifcally related to cgroups, is to try as
hard as possible to stick metadata about objects to the objects
themselves. It's a bit like the difference of lock files vs. locks
attached to a process. In the later case, if the process dies, the
lock is automatically cleaned up, in the former, we have a stale file.

The idea with the cgroup fs xattrs was to be able to attach some
general useful attributes to the 'service container' itself, instead
of keeping them in the memory of the managing process or store them on
disk which can get out-of-sync much easier.

Kay

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] cgroup: add xattr support
@ 2012-01-18 21:36         ` Tejun Heo
  0 siblings, 0 replies; 29+ messages in thread
From: Tejun Heo @ 2012-01-18 21:36 UTC (permalink / raw)
  To: Kay Sievers; +Cc: Li Zefan, LKML, Cgroups, Lennart Poettering

Hello,

On Wed, Jan 18, 2012 at 10:28:42PM +0100, Kay Sievers wrote:
> The idea with the cgroup fs xattrs was to be able to attach some
> general useful attributes to the 'service container' itself, instead
> of keeping them in the memory of the managing process or store them on
> disk which can get out-of-sync much easier.

Hmmm.... I can see the attraction but there really is nothing which
binds that information to cgroup.  The same information might as well
live in /proc/PID/userland_data or whatever.  It may be convenient now
but I'm pretty skeptical it's a good idea in the long run.

Given that cgroups themselves need to be explicitly created and
destroyed, maintaining a parallel tmpfs hierarchy for metadata, if
necessary, shouldn't be too bothersome, right?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] cgroup: add xattr support
@ 2012-01-18 21:36         ` Tejun Heo
  0 siblings, 0 replies; 29+ messages in thread
From: Tejun Heo @ 2012-01-18 21:36 UTC (permalink / raw)
  To: Kay Sievers; +Cc: Li Zefan, LKML, Cgroups, Lennart Poettering

Hello,

On Wed, Jan 18, 2012 at 10:28:42PM +0100, Kay Sievers wrote:
> The idea with the cgroup fs xattrs was to be able to attach some
> general useful attributes to the 'service container' itself, instead
> of keeping them in the memory of the managing process or store them on
> disk which can get out-of-sync much easier.

Hmmm.... I can see the attraction but there really is nothing which
binds that information to cgroup.  The same information might as well
live in /proc/PID/userland_data or whatever.  It may be convenient now
but I'm pretty skeptical it's a good idea in the long run.

Given that cgroups themselves need to be explicitly created and
destroyed, maintaining a parallel tmpfs hierarchy for metadata, if
necessary, shouldn't be too bothersome, right?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] cgroup: add xattr support
  2012-01-18 21:36         ` Tejun Heo
  (?)
@ 2012-01-19  1:47         ` Lennart Poettering
  2012-01-19  2:20             ` Tejun Heo
  -1 siblings, 1 reply; 29+ messages in thread
From: Lennart Poettering @ 2012-01-19  1:47 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Kay Sievers, Li Zefan, LKML, Cgroups

On Wed, 18.01.12 13:36, Tejun Heo (tj@kernel.org) wrote:

> 
> Hello,
> 
> On Wed, Jan 18, 2012 at 10:28:42PM +0100, Kay Sievers wrote:
> > The idea with the cgroup fs xattrs was to be able to attach some
> > general useful attributes to the 'service container' itself, instead
> > of keeping them in the memory of the managing process or store them on
> > disk which can get out-of-sync much easier.
> 
> Hmmm.... I can see the attraction but there really is nothing which
> binds that information to cgroup.  The same information might as well
> live in /proc/PID/userland_data or whatever.  It may be convenient now
> but I'm pretty skeptical it's a good idea in the long run.
> 
> Given that cgroups themselves need to be explicitly created and
> destroyed, maintaining a parallel tmpfs hierarchy for metadata, if
> necessary, shouldn't be too bothersome, right?

Well, the interesting bit here is that to make things robust we'd like
to attach the meta information userspace needs to the object itself, so
that it follows the same lifecycle, and we can detect changes to it with
the usual APIs such as inotify, without any complex logic in userspace
that tries to ensure that meta data stored elsewhere is always kept in
sync with what cgroups are currently around.

I mean, certainly, it's possible to store this data elsewhere, and
that's what all current cgroup client code does, including systemd for
example. But ultimately that is very fragile and cumbersome, since it
requires us to proxy all cgroup events to this meta information,
emulating in userspace that we align the lifecycles of cgroups with the
lifecycle of its metadata. It also makes it necessary to define new
userspace APIs if we want different userspace components to share meta
data on cgroups (and we do want to share metadata!), which is always
difficult, and would look an awful lot like xattrs without actually
being xattrs, but just suck as they'd be a userspace emulation of the
real thing with all the complexities and problems it creates, for
example the fact that they'd userspace emulated xattrs can only
asynchronously follow the lifecycle of their cgroup, thus creating races
one has to deal with and so on. Also, proper xattrs can distuingish
"trusted.xxx" and "user.xxx" namespaces which influences access control
on the xattr. Something like that is very useful but really hard to
emulate without kernel support.

In summary: we want to be able to hook into the lifecycle of the cgroup
right to make things robust. We want the xattrs to synchronously follow
the lifecycle. We want a simple API that's already used for the same
purpose on files. We don't want to to have to intrdouce fragile
userspace code, that deals with races, and robustness, if we can
trivially benefit from code that mostly already exists in the kernel.

Note that there already are a couple of attributes of cgroups userspace
manipulates to encode meta data about services, for example via the
mode/uid/gid of the cgroup dir and the tasks file. And people are
actually already encoding information into those which better should be
stored elsewhere, i.e. in my opinion the best place would be xattrs.

But yeah, to clarify this: this feature can be emulated in userspace, so
this is not about making things possible that previously weren't. It's
mostly about making things clean, safe, and robust which they previously
weren't, without introducing non-trivial additional userspace components
and interfaces.

Lennart

-- 
Lennart Poettering - Red Hat, Inc.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] cgroup: add xattr support
  2012-01-18 17:47         ` Tejun Heo
  (?)
@ 2012-01-19  1:49         ` Lennart Poettering
  -1 siblings, 0 replies; 29+ messages in thread
From: Lennart Poettering @ 2012-01-19  1:49 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Li Zefan, LKML, Cgroups, Kay Sievers

On Wed, 18.01.12 09:47, Tejun Heo (tj@kernel.org) wrote:

> 
> Hello,
> 
> On Wed, Jan 18, 2012 at 04:27:18PM +0800, Li Zefan wrote:
> > Furthermore, I noticed tmpfs also implemented xattr support, and we
> > should be able to share most of the code, which makes the cost for
> > having this feature smaller.
> 
> Yes, maybe, but tmpfs doesn't have any special semantics attached to
> files.  It's supposed to store opaque data from userland.  Regardless
> of code complexity, I just don't think this is a good idea.  What if
> we later want to attach certain config meaning to or export statistics
> via xattrs?  That would be quite unusual and require strong rationale
> but at least fall in the similar usage as with the rest of the fs.

If you plan to export kernel data to userspace via xattrs, then you
should probably place that in the system.xxx xattr namespace, not in
the user.xxx or trusted.xxx namespaces which are the only ones which can
be manipulated by userspace.

See attr(5) for more information about the four available xattr
namespaces.

In other words: kernel exported xattrs should never collide with xattrs
set by userspace.

Lennart

-- 
Lennart Poettering - Red Hat, Inc.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] cgroup: add xattr support
@ 2012-01-19  2:20             ` Tejun Heo
  0 siblings, 0 replies; 29+ messages in thread
From: Tejun Heo @ 2012-01-19  2:20 UTC (permalink / raw)
  To: Lennart Poettering; +Cc: Kay Sievers, Li Zefan, LKML, Cgroups

Hello, Lennart, Li.

Two things.

* Probably I'm missing something but isn't the systemd cgroup
  hierarchy already managed by systemd?  If so, I don't see how
  managing tmpfs on the side would noticeably make things more
  fragile.  It would take a bit more care after, for example, restart
  but it shouldn't be too complex, no?

* FS attributes already being used for userland information seems like
  a good argument, but we shouldn't add separate specialized xattr
  implementation to different pseudo filesystems.  For it to be
  acceptable, it should be a libfs thing easily applicable to any
  pseudo FS and definitely shouldn't be using kmem for storage.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] cgroup: add xattr support
@ 2012-01-19  2:20             ` Tejun Heo
  0 siblings, 0 replies; 29+ messages in thread
From: Tejun Heo @ 2012-01-19  2:20 UTC (permalink / raw)
  To: Lennart Poettering; +Cc: Kay Sievers, Li Zefan, LKML, Cgroups

Hello, Lennart, Li.

Two things.

* Probably I'm missing something but isn't the systemd cgroup
  hierarchy already managed by systemd?  If so, I don't see how
  managing tmpfs on the side would noticeably make things more
  fragile.  It would take a bit more care after, for example, restart
  but it shouldn't be too complex, no?

* FS attributes already being used for userland information seems like
  a good argument, but we shouldn't add separate specialized xattr
  implementation to different pseudo filesystems.  For it to be
  acceptable, it should be a libfs thing easily applicable to any
  pseudo FS and definitely shouldn't be using kmem for storage.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] cgroup: add xattr support
@ 2012-01-19  2:40               ` Tejun Heo
  0 siblings, 0 replies; 29+ messages in thread
From: Tejun Heo @ 2012-01-19  2:40 UTC (permalink / raw)
  To: Lennart Poettering; +Cc: Kay Sievers, Li Zefan, LKML, Cgroups

On Wed, Jan 18, 2012 at 06:20:05PM -0800, Tejun Heo wrote:
> Hello, Lennart, Li.
> 
> Two things.
> 
> * Probably I'm missing something but isn't the systemd cgroup
>   hierarchy already managed by systemd?  If so, I don't see how
>   managing tmpfs on the side would noticeably make things more
>   fragile.  It would take a bit more care after, for example, restart
>   but it shouldn't be too complex, no?
> 
> * FS attributes already being used for userland information seems like
>   a good argument, but we shouldn't add separate specialized xattr
>   implementation to different pseudo filesystems.  For it to be
>   acceptable, it should be a libfs thing easily applicable to any
>   pseudo FS and definitely shouldn't be using kmem for storage.

Also note that tmpfs also implies size limit.  We definitely need some
form of control over the amount of memory xattr may consume.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] cgroup: add xattr support
@ 2012-01-19  2:40               ` Tejun Heo
  0 siblings, 0 replies; 29+ messages in thread
From: Tejun Heo @ 2012-01-19  2:40 UTC (permalink / raw)
  To: Lennart Poettering; +Cc: Kay Sievers, Li Zefan, LKML, Cgroups

On Wed, Jan 18, 2012 at 06:20:05PM -0800, Tejun Heo wrote:
> Hello, Lennart, Li.
> 
> Two things.
> 
> * Probably I'm missing something but isn't the systemd cgroup
>   hierarchy already managed by systemd?  If so, I don't see how
>   managing tmpfs on the side would noticeably make things more
>   fragile.  It would take a bit more care after, for example, restart
>   but it shouldn't be too complex, no?
> 
> * FS attributes already being used for userland information seems like
>   a good argument, but we shouldn't add separate specialized xattr
>   implementation to different pseudo filesystems.  For it to be
>   acceptable, it should be a libfs thing easily applicable to any
>   pseudo FS and definitely shouldn't be using kmem for storage.

Also note that tmpfs also implies size limit.  We definitely need some
form of control over the amount of memory xattr may consume.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] cgroup: add xattr support
@ 2012-01-21  2:59               ` Lennart Poettering
  0 siblings, 0 replies; 29+ messages in thread
From: Lennart Poettering @ 2012-01-21  2:59 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Kay Sievers, Li Zefan, LKML, Cgroups

On Wed, 18.01.12 18:20, Tejun Heo (tj@kernel.org) wrote:

Heya,

> * Probably I'm missing something but isn't the systemd cgroup
>   hierarchy already managed by systemd?  If so, I don't see how
>   managing tmpfs on the side would noticeably make things more
>   fragile.  It would take a bit more care after, for example, restart
>   but it shouldn't be too complex, no?

Well, it becomes more complex, for example, if we miss a cgroups event,
or we go away and come back (i.e. systemd can reexecute itself for
upgrades), and as soon as it isn't just systemd anymore which needs meta
data attached to a cgroup, but we need something shared. For example,
the logic described in
http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups
regarding "persistant" cgroups would be much nicer using xattrs (it
currently misuses the sticky bit as flag instead).

It's just much nicer if meta data doesn't get stale just because the
process owning it goes awol. And if process want to share meta data, too.
 
> * FS attributes already being used for userland information seems like
>   a good argument, but we shouldn't add separate specialized xattr
>   implementation to different pseudo filesystems.  For it to be
>   acceptable, it should be a libfs thing easily applicable to any
>   pseudo FS and definitely shouldn't be using kmem for storage.

I think the code that was recently added to tmpfs for the same purposes
was supposed to be reusable on other virtual file systems. Might be
worth looking into that.

Lennart

-- 
Lennart Poettering - Red Hat, Inc.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] cgroup: add xattr support
@ 2012-01-21  2:59               ` Lennart Poettering
  0 siblings, 0 replies; 29+ messages in thread
From: Lennart Poettering @ 2012-01-21  2:59 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Kay Sievers, Li Zefan, LKML, Cgroups

On Wed, 18.01.12 18:20, Tejun Heo (tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org) wrote:

Heya,

> * Probably I'm missing something but isn't the systemd cgroup
>   hierarchy already managed by systemd?  If so, I don't see how
>   managing tmpfs on the side would noticeably make things more
>   fragile.  It would take a bit more care after, for example, restart
>   but it shouldn't be too complex, no?

Well, it becomes more complex, for example, if we miss a cgroups event,
or we go away and come back (i.e. systemd can reexecute itself for
upgrades), and as soon as it isn't just systemd anymore which needs meta
data attached to a cgroup, but we need something shared. For example,
the logic described in
http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups
regarding "persistant" cgroups would be much nicer using xattrs (it
currently misuses the sticky bit as flag instead).

It's just much nicer if meta data doesn't get stale just because the
process owning it goes awol. And if process want to share meta data, too.
 
> * FS attributes already being used for userland information seems like
>   a good argument, but we shouldn't add separate specialized xattr
>   implementation to different pseudo filesystems.  For it to be
>   acceptable, it should be a libfs thing easily applicable to any
>   pseudo FS and definitely shouldn't be using kmem for storage.

I think the code that was recently added to tmpfs for the same purposes
was supposed to be reusable on other virtual file systems. Might be
worth looking into that.

Lennart

-- 
Lennart Poettering - Red Hat, Inc.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] cgroup: add xattr support
@ 2012-01-21  3:02                 ` Lennart Poettering
  0 siblings, 0 replies; 29+ messages in thread
From: Lennart Poettering @ 2012-01-21  3:02 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Kay Sievers, Li Zefan, LKML, Cgroups

On Wed, 18.01.12 18:40, Tejun Heo (tj@kernel.org) wrote:

> 
> On Wed, Jan 18, 2012 at 06:20:05PM -0800, Tejun Heo wrote:
> > Hello, Lennart, Li.
> > 
> > Two things.
> > 
> > * Probably I'm missing something but isn't the systemd cgroup
> >   hierarchy already managed by systemd?  If so, I don't see how
> >   managing tmpfs on the side would noticeably make things more
> >   fragile.  It would take a bit more care after, for example, restart
> >   but it shouldn't be too complex, no?
> > 
> > * FS attributes already being used for userland information seems like
> >   a good argument, but we shouldn't add separate specialized xattr
> >   implementation to different pseudo filesystems.  For it to be
> >   acceptable, it should be a libfs thing easily applicable to any
> >   pseudo FS and definitely shouldn't be using kmem for storage.
> 
> Also note that tmpfs also implies size limit.  We definitely need some
> form of control over the amount of memory xattr may consume.

Good point. But then again we don't even have anything resembling for
tmpfs either, where it would be much more important... :-(

Given that cgroupfs is probably mostly read-only for normal users, the
requirement for quotas on it is probably much less important than for tmpfs
though.

Lennart

-- 
Lennart Poettering - Red Hat, Inc.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] cgroup: add xattr support
@ 2012-01-21  3:02                 ` Lennart Poettering
  0 siblings, 0 replies; 29+ messages in thread
From: Lennart Poettering @ 2012-01-21  3:02 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Kay Sievers, Li Zefan, LKML, Cgroups

On Wed, 18.01.12 18:40, Tejun Heo (tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org) wrote:

> 
> On Wed, Jan 18, 2012 at 06:20:05PM -0800, Tejun Heo wrote:
> > Hello, Lennart, Li.
> > 
> > Two things.
> > 
> > * Probably I'm missing something but isn't the systemd cgroup
> >   hierarchy already managed by systemd?  If so, I don't see how
> >   managing tmpfs on the side would noticeably make things more
> >   fragile.  It would take a bit more care after, for example, restart
> >   but it shouldn't be too complex, no?
> > 
> > * FS attributes already being used for userland information seems like
> >   a good argument, but we shouldn't add separate specialized xattr
> >   implementation to different pseudo filesystems.  For it to be
> >   acceptable, it should be a libfs thing easily applicable to any
> >   pseudo FS and definitely shouldn't be using kmem for storage.
> 
> Also note that tmpfs also implies size limit.  We definitely need some
> form of control over the amount of memory xattr may consume.

Good point. But then again we don't even have anything resembling for
tmpfs either, where it would be much more important... :-(

Given that cgroupfs is probably mostly read-only for normal users, the
requirement for quotas on it is probably much less important than for tmpfs
though.

Lennart

-- 
Lennart Poettering - Red Hat, Inc.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] cgroup: add xattr support
@ 2012-01-21  4:00                   ` Hugh Dickins
  0 siblings, 0 replies; 29+ messages in thread
From: Hugh Dickins @ 2012-01-21  4:00 UTC (permalink / raw)
  To: Lennart Poettering; +Cc: Tejun Heo, Kay Sievers, Li Zefan, LKML, Cgroups

On Sat, 21 Jan 2012, Lennart Poettering wrote:
> On Wed, 18.01.12 18:40, Tejun Heo (tj@kernel.org) wrote:
> > On Wed, Jan 18, 2012 at 06:20:05PM -0800, Tejun Heo wrote:
> > 
> > Also note that tmpfs also implies size limit.  We definitely need some
> > form of control over the amount of memory xattr may consume.
> 
> Good point. But then again we don't even have anything resembling for
> tmpfs either, where it would be much more important... :-(

But although the tmpfs framework for xattrs is general, the validation
function only permits XATTR_SECURITY_PREFIX and XATTR_TRUSTED_PREFIX.

Yes, it would be very necessary to impose size limits if tmpfs went
beyond those to permit XATTR_USER_PREFIX - which the cgroupfs patch
under discussion does permit.

> 
> Given that cgroupfs is probably mostly read-only for normal users, the
> requirement for quotas on it is probably much less important than for tmpfs
> though.

Hugh

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] cgroup: add xattr support
@ 2012-01-21  4:00                   ` Hugh Dickins
  0 siblings, 0 replies; 29+ messages in thread
From: Hugh Dickins @ 2012-01-21  4:00 UTC (permalink / raw)
  To: Lennart Poettering; +Cc: Tejun Heo, Kay Sievers, Li Zefan, LKML, Cgroups

On Sat, 21 Jan 2012, Lennart Poettering wrote:
> On Wed, 18.01.12 18:40, Tejun Heo (tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org) wrote:
> > On Wed, Jan 18, 2012 at 06:20:05PM -0800, Tejun Heo wrote:
> > 
> > Also note that tmpfs also implies size limit.  We definitely need some
> > form of control over the amount of memory xattr may consume.
> 
> Good point. But then again we don't even have anything resembling for
> tmpfs either, where it would be much more important... :-(

But although the tmpfs framework for xattrs is general, the validation
function only permits XATTR_SECURITY_PREFIX and XATTR_TRUSTED_PREFIX.

Yes, it would be very necessary to impose size limits if tmpfs went
beyond those to permit XATTR_USER_PREFIX - which the cgroupfs patch
under discussion does permit.

> 
> Given that cgroupfs is probably mostly read-only for normal users, the
> requirement for quotas on it is probably much less important than for tmpfs
> though.

Hugh

^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2012-01-21  4:01 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-01-16  8:06 [PATCH 1/2] cgroup: revise how we re-populate root directory Li Zefan
2012-01-16  8:06 ` Li Zefan
2012-01-16  8:07 ` [PATCH 2/2] cgroup: add xattr support Li Zefan
2012-01-16  8:07   ` Li Zefan
2012-01-17 17:53   ` Tejun Heo
2012-01-17 17:53     ` Tejun Heo
2012-01-18  8:27     ` Li Zefan
2012-01-18  8:27       ` Li Zefan
2012-01-18 17:47       ` Tejun Heo
2012-01-18 17:47         ` Tejun Heo
2012-01-19  1:49         ` Lennart Poettering
2012-01-18 21:28     ` Kay Sievers
2012-01-18 21:28       ` Kay Sievers
2012-01-18 21:36       ` Tejun Heo
2012-01-18 21:36         ` Tejun Heo
2012-01-19  1:47         ` Lennart Poettering
2012-01-19  2:20           ` Tejun Heo
2012-01-19  2:20             ` Tejun Heo
2012-01-19  2:40             ` Tejun Heo
2012-01-19  2:40               ` Tejun Heo
2012-01-21  3:02               ` Lennart Poettering
2012-01-21  3:02                 ` Lennart Poettering
2012-01-21  4:00                 ` Hugh Dickins
2012-01-21  4:00                   ` Hugh Dickins
2012-01-21  2:59             ` Lennart Poettering
2012-01-21  2:59               ` Lennart Poettering
2012-01-18  7:23 ` [PATCH 1/2] cgroup: revise how we re-populate root directory Sha
2012-01-18  7:23   ` Sha
2012-01-18  7:59   ` Li Zefan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.