All of lore.kernel.org
 help / color / mirror / Atom feed
From: Li Zefan <lizf-BthXqXjhjHXQFUHtdCDX3A@public.gmane.org>
To: Paul Menage <menage-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
Subject: Re: [PATCH 4/7] [RFC] Allow cgroup hierarchies to be created with no	bound subsystems
Date: Tue, 17 Mar 2009 14:45:33 +0800	[thread overview]
Message-ID: <49BF470D.8080600@cn.fujitsu.com> (raw)
In-Reply-To: <20090312105137.24154.34890.stgit-B63HFAS8fGlSzHKm+aFRNNkmqwFzkYv6@public.gmane.org>

Paul Menage wrote:
> [RFC] Allow cgroup hierarchies to be created with no bound subsystems
> 
> This patch removes the restriction that a cgroup hierarchy must have
> at least one bound subsystem. The mount option "none" is treated as an
> explicit request for no bound subsystems.
> 
> A hierarchy with no subsystems can be useful for plan task tracking,

s/plan/plain/ ;)

> and is also a step towards the support for multiply-bindable
> subsystems in a later patch in this series.
> 
> As part of this change, the hierarchy id is no longer calculated from
> the bitmask of subsystems in the hierarchy (since this is not
> guaranteed to be unique) but is allocated via an ida. Reference counts
> on cgroups from css_set objects are now taken explicitly one per
> hierarchy, rather than one per subsystem.
> 
> Example usage:
> 
> mount -t cgroup -o none,name=foo cgroup /mnt/cgroup
> 
> Based on the "no-op"/"none" subsystem concept proposed by
> kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A@public.gmane.org
> 
> Signed-off-by: Paul Menage <menage-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> 
> ---
> 
>  kernel/cgroup.c |  158 ++++++++++++++++++++++++++++++++++++-------------------
>  1 files changed, 104 insertions(+), 54 deletions(-)
> 
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 9cdbbac..4a7ef2c 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -72,6 +72,9 @@ struct cgroupfs_root {
>  	 */
>  	unsigned long subsys_bits;
>  
> +	/* Unique id for this hierarchy. */
> +	int hierarchy_id;
> +
>  	/* The bitmask of subsystems currently attached to this hierarchy */
>  	unsigned long actual_subsys_bits;
>  
> @@ -142,6 +145,10 @@ struct css_id {
>  static LIST_HEAD(roots);
>  static int root_count;
>  
> +static DEFINE_IDA(hierarchy_ida);
> +static int next_hierarchy_id;
> +static DEFINE_SPINLOCK(hierarchy_id_lock);
> +
>  /* dummytop is a shorthand for the dummy hierarchy's top cgroup */
>  #define dummytop (&rootnode.top_cgroup)
>  
> @@ -259,42 +266,10 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
>   * compiled into their kernel but not actually in use */
>  static int use_task_css_set_links __read_mostly;
>  
> -/* When we create or destroy a css_set, the operation simply
> - * takes/releases a reference count on all the cgroups referenced
> - * by subsystems in this css_set. This can end up multiple-counting
> - * some cgroups, but that's OK - the ref-count is just a
> - * busy/not-busy indicator; ensuring that we only count each cgroup
> - * once would require taking a global lock to ensure that no
> - * subsystems moved between hierarchies while we were doing so.
> - *
> - * Possible TODO: decide at boot time based on the number of
> - * registered subsystems and the number of CPUs or NUMA nodes whether
> - * it's better for performance to ref-count every subsystem, or to
> - * take a global lock and only add one ref count to each hierarchy.
> - */
> -
> -/*
> - * unlink a css_set from the list and free it
> - */
> -static void unlink_css_set(struct css_set *cg)
> +static void __put_css_set(struct css_set *cg, int taskexit)
>  {
>  	struct cg_cgroup_link *link;
>  	struct cg_cgroup_link *saved_link;
> -
> -	hlist_del(&cg->hlist);
> -	css_set_count--;
> -
> -	list_for_each_entry_safe(link, saved_link, &cg->cg_links,
> -				 cg_link_list) {
> -		list_del(&link->cg_link_list);
> -		list_del(&link->cgrp_link_list);
> -		kfree(link);
> -	}
> -}
> -
> -static void __put_css_set(struct css_set *cg, int taskexit)
> -{
> -	int i;
>  	/*
>  	 * Ensure that the refcount doesn't hit zero while any readers
>  	 * can see it. Similar to atomic_dec_and_lock(), but for an
> @@ -307,20 +282,27 @@ static void __put_css_set(struct css_set *cg, int taskexit)
>  		write_unlock(&css_set_lock);
>  		return;
>  	}
> -	unlink_css_set(cg);
> -	write_unlock(&css_set_lock);
>  
> -	rcu_read_lock();
> -	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> -		struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
> +	/* This css_set is dead. unlink it and release cgroup refcounts */
> +	hlist_del(&cg->hlist);
> +	css_set_count--;
> +
> +	list_for_each_entry_safe(link, saved_link, &cg->cg_links,
> +				 cg_link_list) {
> +		struct cgroup *cgrp = link->cgrp;
> +		list_del(&link->cg_link_list);
> +		list_del(&link->cgrp_link_list);
>  		if (atomic_dec_and_test(&cgrp->count) &&
>  		    notify_on_release(cgrp)) {
>  			if (taskexit)
>  				set_bit(CGRP_RELEASABLE, &cgrp->flags);
>  			check_for_release(cgrp);
>  		}
> +
> +		kfree(link);
>  	}
> -	rcu_read_unlock();
> +
> +	write_unlock(&css_set_lock);
>  	kfree(cg);
>  }
>  
> @@ -513,6 +495,7 @@ static void link_css_set(struct list_head *tmp_cg_links,
>  				cgrp_link_list);
>  	link->cg = cg;
>  	link->cgrp = cgrp;
> +	atomic_inc(&cgrp->count);
>  	list_move(&link->cgrp_link_list, &cgrp->css_sets);
>  	/*
>  	 * Always add links to the tail of the list so that the list
> @@ -533,7 +516,6 @@ static struct css_set *find_css_set(
>  {
>  	struct css_set *res;
>  	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
> -	int i;
>  
>  	struct list_head tmp_cg_links;
>  
> @@ -572,10 +554,6 @@ static struct css_set *find_css_set(
>  
>  	write_lock(&css_set_lock);
>  	/* Add reference counts and links from the new css_set. */
> -	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> -		struct cgroup *cgrp = res->subsys[i]->cgroup;
> -		atomic_inc(&cgrp->count);
> -	}
>  	list_for_each_entry_safe(link, saved_link, &oldcg->cg_links,
>  				 cg_link_list) {
>  		struct cgroup *c = link->cgrp;
> @@ -955,6 +933,10 @@ struct cgroup_sb_opts {
>  	unsigned long flags;
>  	char *release_agent;
>  	char *name;
> +
> +	/* User explicitly requested empty subsystem */
> +	bool none;
> +
>  	/* A flag indicating that a root was created from this options block */
>  	bool created_root;
>  };
> @@ -983,6 +965,9 @@ static int parse_cgroupfs_options(char *data,
>  				if (!ss->disabled)
>  					opts->subsys_bits |= 1ul << i;
>  			}
> +		} else if (!strcmp(token, "none")) {
> +			/* Explicitly have no subsystems */
> +			opts->none = true;
>  		} else if (!strcmp(token, "noprefix")) {
>  			set_bit(ROOT_NOPREFIX, &opts->flags);
>  		} else if (!strncmp(token, "release_agent=", 14)) {
> @@ -1019,7 +1004,16 @@ static int parse_cgroupfs_options(char *data,
>  		}
>  	}
>  
> -	/* We can't have an empty hierarchy */
> +	/* Consistency checks */
> +
> +	/* Can't specify "none" and some subsystems */
> +	if (opts->subsys_bits && opts->none)
> +		return -EINVAL;
> +
> +	/*
> +	 * We either have to specify by name or by subsystems. (So all
> +	 * empty hierarchies must have a name).
> +	 */
>  	if (!opts->subsys_bits && !opts->name)
>  		return -EINVAL;
>  
> @@ -1085,6 +1079,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
>  	INIT_LIST_HEAD(&cgrp->release_list);
>  	init_rwsem(&cgrp->pids_mutex);
>  }
> +
>  static void init_cgroup_root(struct cgroupfs_root *root)
>  {
>  	struct cgroup *cgrp = &root->top_cgroup;
> @@ -1096,6 +1091,18 @@ static void init_cgroup_root(struct cgroupfs_root *root)
>  	init_cgroup_housekeeping(cgrp);
>  }
>  
> +static void init_root_id(struct cgroupfs_root *root)
> +{
> +	BUG_ON(!ida_pre_get(&hierarchy_ida, GFP_KERNEL));

we should return -errno, BUG_ON() on this is wrong

> +	spin_lock(&hierarchy_id_lock);
> +	if (ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
> +			      &root->hierarchy_id)) {
> +		BUG_ON(ida_get_new(&hierarchy_ida, &root->hierarchy_id));
> +	}

next_hierarchy_id is redundant, just use ida_get_new() instead of
ida_get_new_above()

and I think we should check EAGAIN:

/*
 * ida_get_new - allocate new ID
 ...
 * If memory is required, it will return -EAGAIN, you should unlock
 * and go back to the idr_pre_get() call. ...
 */

> +	next_hierarchy_id = root->hierarchy_id + 1;
> +	spin_unlock(&hierarchy_id_lock);
> +}
> +
>  static int cgroup_test_super(struct super_block *sb, void *data)
>  {
>  	struct cgroup_sb_opts *new = data;
> @@ -1116,7 +1123,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
>  {
>  	struct cgroupfs_root *root;
>  
> -	if (!opts->subsys_bits)
> +	if (!opts->subsys_bits && !opts->none)
>  		return ERR_PTR(-EINVAL);
>  

Shouldn't we check this in parse_cgroupfs_options() instead?

>  	root = kzalloc(sizeof(*root), GFP_KERNEL);
> @@ -1124,6 +1131,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
>  		return ERR_PTR(-ENOMEM);
>  
>  	init_cgroup_root(root);
> +	init_root_id(root);
> +
>  	root->subsys_bits = opts->subsys_bits;
>  	root->flags = opts->flags;
>  	if (opts->release_agent)
> @@ -1134,6 +1143,15 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
>  	return root;
>  }
>  
> +static void cgroup_drop_root(struct cgroupfs_root *root)
> +{
> +	BUG_ON(!root->hierarchy_id);

I think this BUG_ON() is redundant, if root->hierarchy_id == 0,
we are freeing the dummy root, and kfree(root) should trigger
kernel oops.

> +	spin_lock(&hierarchy_id_lock);
> +	ida_remove(&hierarchy_ida, root->hierarchy_id);
> +	spin_unlock(&hierarchy_id_lock);
> +	kfree(root);
> +}
> +
>  static int cgroup_set_super(struct super_block *sb, void *data)
>  {
>  	int ret;
> @@ -1145,7 +1163,7 @@ static int cgroup_set_super(struct super_block *sb, void *data)
>  		return PTR_ERR(root);
>  	ret = set_anon_super(sb, NULL);
>  	if (ret) {
> -		kfree(root);
> +		cgroup_drop_root(root);
>  		return ret;
>  	}
>  
> @@ -1331,7 +1349,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
>  	mutex_unlock(&cgroup_mutex);
>  
>  	kill_litter_super(sb);
> -	kfree(root);
> +	cgroup_drop_root(root);
>  }
>  
>  static struct file_system_type cgroup_fs_type = {
> @@ -2975,7 +2993,7 @@ int __init cgroup_init(void)
>  	/* Add init_css_set to the hash table */
>  	hhead = css_set_hash(init_css_set.subsys);
>  	hlist_add_head(&init_css_set.hlist, hhead);
> -
> +	init_root_id(&rootnode);
>  	err = register_filesystem(&cgroup_fs_type);
>  	if (err < 0)
>  		goto out;
> @@ -3030,7 +3048,7 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
>  		struct cgroup *cgrp;
>  		int count = 0;
>  
> -		seq_printf(m, "%lu:", root->subsys_bits);
> +		seq_printf(m, "%d:", root->hierarchy_id);
>  		for_each_subsys(root, ss)
>  			seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
>  		if (strlen(root->name))
> @@ -3076,8 +3094,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
>  	mutex_lock(&cgroup_mutex);
>  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
>  		struct cgroup_subsys *ss = subsys[i];
> -		seq_printf(m, "%s\t%lu\t%d\t%d\n",
> -			   ss->name, ss->root->subsys_bits,
> +		seq_printf(m, "%s\t%d\t%d\t%d\n",
> +			   ss->name, ss->root->hierarchy_id,
>  			   ss->root->number_of_cgroups, !ss->disabled);
>  	}
>  	mutex_unlock(&cgroup_mutex);
> @@ -3800,8 +3818,8 @@ static int current_css_set_cg_links_read(struct cgroup *cont,
>  			name = c->dentry->d_name.name;
>  		else
>  			name = "?";
> -		seq_printf(seq, "Root %lu group %s\n",
> -			   c->root->subsys_bits, name);
> +		seq_printf(seq, "Root %d group %s\n",
> +			   c->root->hierarchy_id, name);
>  		rcu_read_unlock();
>  	}
>  	task_unlock(current);
> @@ -3809,6 +3827,33 @@ static int current_css_set_cg_links_read(struct cgroup *cont,
>  	return 0;
>  }
>  
> +#define MAX_TASKS_SHOWN_PER_CSS 25
> +static int cgroup_css_links_read(struct cgroup *cont,
> +				 struct cftype *cft,
> +				 struct seq_file *seq)
> +{
> +	struct cg_cgroup_link *link, *saved_link;
> +	write_lock_irq(&css_set_lock);
> +	list_for_each_entry_safe(link, saved_link, &cont->css_sets,
> +				 cgrp_link_list) {
> +		struct css_set *cg = link->cg;
> +		struct task_struct *task, *saved_task;
> +		int count = 0;

> +		seq_printf(seq, "css_set %p\n", cg);
> +		list_for_each_entry_safe(task, saved_task, &cg->tasks,
> +					 cg_list) {
> +			if (count++ > MAX_TASKS_SHOWN_PER_CSS) {

actually this prints at most 26 tasks but not 25

and what's the concern to not print out all the tasks? the buffer
limit of seqfile?

> +				seq_puts(seq, "  ...\n");
> +				break;
> +			} else {
> +				seq_printf(seq, "  task %d\n", task->pid);

I guess we should call task_pid_vnr(tsk) instead of accessing task->pid
directly.

> +			}
> +		}
> +	}
> +	write_unlock_irq(&css_set_lock);
> +	return 0;
> +}
> +
>  static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
>  {
>  	return test_bit(CGRP_RELEASABLE, &cgrp->flags);
> @@ -3840,6 +3885,11 @@ static struct cftype debug_files[] =  {
>  	},
>  
>  	{
> +		.name = "cgroup_css_links",
> +		.read_seq_string = cgroup_css_links_read,
> +	},
> +
> +	{
>  		.name = "releasable",
>  		.read_u64 = releasable_read,
>  	},
> 
> 
> 

  parent reply	other threads:[~2009-03-17  6:45 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-03-12 10:51 [PATCH 0/7][RFC] CGroup hierarchy extensions Paul Menage
     [not found] ` <20090312104507.24154.71691.stgit-B63HFAS8fGlSzHKm+aFRNNkmqwFzkYv6@public.gmane.org>
2009-03-12 10:51   ` [PATCH 1/7] [RFC] Support named cgroups hierarchies Paul Menage
     [not found]     ` <20090312105122.24154.73633.stgit-B63HFAS8fGlSzHKm+aFRNNkmqwFzkYv6@public.gmane.org>
2009-03-17  6:44       ` Li Zefan
     [not found]         ` <49BF46BC.4080302-BthXqXjhjHXQFUHtdCDX3A@public.gmane.org>
2009-03-31  8:12           ` Paul Menage
     [not found]             ` <6599ad830903310112x626252c4je760a80eb1eaa1d-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2009-04-01  6:24               ` Li Zefan
2009-03-12 10:51   ` [PATCH 2/7] [RFC]Move the cgroup debug subsys into cgroup.c to access internal state Paul Menage
     [not found]     ` <20090312105127.24154.39200.stgit-B63HFAS8fGlSzHKm+aFRNNkmqwFzkYv6@public.gmane.org>
2009-03-17  6:44       ` Li Zefan
2009-03-12 10:51   ` [PATCH 3/7] [RFC] Add a back-pointer from struct cg_cgroup_link to struct cgroup Paul Menage
     [not found]     ` <20090312105132.24154.99250.stgit-B63HFAS8fGlSzHKm+aFRNNkmqwFzkYv6@public.gmane.org>
2009-03-17  6:44       ` Li Zefan
2009-03-12 10:51   ` [PATCH 4/7] [RFC] Allow cgroup hierarchies to be created with no bound subsystems Paul Menage
     [not found]     ` <20090312105137.24154.34890.stgit-B63HFAS8fGlSzHKm+aFRNNkmqwFzkYv6@public.gmane.org>
2009-03-17  6:45       ` Li Zefan [this message]
     [not found]         ` <49BF470D.8080600-BthXqXjhjHXQFUHtdCDX3A@public.gmane.org>
2009-03-31  8:45           ` Paul Menage
     [not found]             ` <6599ad830903310145l7b24ad0vb4462f94390ac264-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2009-04-01  6:40               ` Li Zefan
     [not found]                 ` <49D30C44.8050802-BthXqXjhjHXQFUHtdCDX3A@public.gmane.org>
2009-04-02  8:17                   ` Paul Menage
2009-03-12 10:51   ` [PATCH 5/7] [RFC] Remove cgroup_subsys.root pointer Paul Menage
2009-03-12 10:51   ` [PATCH 6/7] [RFC] Support multiply-bindable cgroup subsystems Paul Menage
     [not found]     ` <20090312105147.24154.62638.stgit-B63HFAS8fGlSzHKm+aFRNNkmqwFzkYv6@public.gmane.org>
2009-03-17  6:46       ` Li Zefan
     [not found]         ` <49BF4744.5060309-BthXqXjhjHXQFUHtdCDX3A@public.gmane.org>
2009-03-18  2:09           ` Li Zefan
     [not found]             ` <49C057EB.1000307-BthXqXjhjHXQFUHtdCDX3A@public.gmane.org>
2009-03-18  2:16               ` Paul Menage
     [not found]                 ` <6599ad830903171916x7364ec7cw76975d71d5125d82-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2009-03-18  2:39                   ` Li Zefan
     [not found]                     ` <49C05EDF.5010607-BthXqXjhjHXQFUHtdCDX3A@public.gmane.org>
2009-03-18  2:43                       ` Paul Menage
     [not found]                         ` <6599ad830903171943x7884cb03w8f22fa1629d667b3-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2009-03-18  3:09                           ` Li Zefan
     [not found]                             ` <49C065E7.8060901-BthXqXjhjHXQFUHtdCDX3A@public.gmane.org>
2009-03-18  5:43                               ` Balbir Singh
2009-03-31  9:02                               ` Paul Menage
     [not found]                                 ` <6599ad830903310202p1c268237lff283b2676f78864-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2009-04-01  6:44                                   ` Li Zefan
2009-03-12 10:51   ` [PATCH 7/7] [RFC] Example multi-bindable subsystem: a per-cgroup notes field Paul Menage
     [not found]     ` <20090312105153.24154.29389.stgit-B63HFAS8fGlSzHKm+aFRNNkmqwFzkYv6@public.gmane.org>
2009-03-17  6:46       ` Li Zefan
2009-03-16  1:10   ` [PATCH 0/7][RFC] CGroup hierarchy extensions KAMEZAWA Hiroyuki

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=49BF470D.8080600@cn.fujitsu.com \
    --to=lizf-bthxqxjhjhxqfuhtdcdx3a@public.gmane.org \
    --cc=containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org \
    --cc=menage-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.