* [PATCH 1/3] cpuset: implement cgroup_rightmost_descendant()
2013-01-03 21:44 [PATCHSET] cpuset: drop cpuset->stack_list and ->parent, take#2 Tejun Heo
@ 2013-01-03 21:44 ` Tejun Heo
2013-01-03 21:44 ` [PATCH 2/3] cpuset: replace cpuset->stack_list with cpuset_for_each_descendant_pre() Tejun Heo
` (2 subsequent siblings)
3 siblings, 0 replies; 9+ messages in thread
From: Tejun Heo @ 2013-01-03 21:44 UTC (permalink / raw)
To: lizefan, paul, glommer
Cc: containers, cgroups, peterz, mhocko, linux-kernel, Tejun Heo
Implement cgroup_rightmost_descendant() which returns the right most
descendant of the specified cgroup. This can be used to skip the
cgroup's subtree while iterating with
cgroup_for_each_descendant_pre().
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
---
include/linux/cgroup.h | 1 +
kernel/cgroup.c | 26 ++++++++++++++++++++++++++
2 files changed, 27 insertions(+)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 7d73905..860ca0f 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -558,6 +558,7 @@ static inline struct cgroup* task_cgroup(struct task_struct *task,
struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
struct cgroup *cgroup);
+struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos);
/**
* cgroup_for_each_descendant_pre - pre-order walk of a cgroup's descendants
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4855892..6643f70 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3017,6 +3017,32 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
}
EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
+/**
+ * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup
+ * @pos: cgroup of interest
+ *
+ * Return the rightmost descendant of @pos. If there's no descendant,
+ * @pos is returned. This can be used during pre-order traversal to skip
+ * subtree of @pos.
+ */
+struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
+{
+ struct cgroup *last, *tmp;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ do {
+ last = pos;
+ /* ->prev isn't RCU safe, walk ->next till the end */
+ pos = NULL;
+ list_for_each_entry_rcu(tmp, &last->children, sibling)
+ pos = tmp;
+ } while (pos);
+
+ return last;
+}
+EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant);
+
static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
{
struct cgroup *last;
--
1.8.0.2
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 2/3] cpuset: replace cpuset->stack_list with cpuset_for_each_descendant_pre()
2013-01-03 21:44 [PATCHSET] cpuset: drop cpuset->stack_list and ->parent, take#2 Tejun Heo
2013-01-03 21:44 ` [PATCH 1/3] cpuset: implement cgroup_rightmost_descendant() Tejun Heo
@ 2013-01-03 21:44 ` Tejun Heo
2013-01-03 21:44 ` [PATCH 3/3] cpuset: remove cpuset->parent Tejun Heo
2013-01-06 9:27 ` [PATCHSET] cpuset: drop cpuset->stack_list and ->parent, take#2 Li Zefan
3 siblings, 0 replies; 9+ messages in thread
From: Tejun Heo @ 2013-01-03 21:44 UTC (permalink / raw)
To: lizefan, paul, glommer
Cc: containers, cgroups, peterz, mhocko, linux-kernel, Tejun Heo
Implement cpuset_for_each_descendant_pre() and replace the
cpuset-specific tree walking using cpuset->stack_list with it.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
---
kernel/cpuset.c | 123 ++++++++++++++++++++++----------------------------------
1 file changed, 48 insertions(+), 75 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 58aa99b..b2f8dad 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -103,9 +103,6 @@ struct cpuset {
/* for custom sched domain */
int relax_domain_level;
- /* used for walking a cpuset hierarchy */
- struct list_head stack_list;
-
struct work_struct hotplug_work;
};
@@ -207,6 +204,20 @@ static struct cpuset top_cpuset = {
cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \
if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp)))))
+/**
+ * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
+ * @des_cs: loop cursor pointing to the current descendant
+ * @pos_cgrp: used for iteration
+ * @root_cs: target cpuset to walk descendants of
+ *
+ * Walk @des_cs through the online descendants of @root_cs. Must be used
+ * with RCU read locked. The caller may modify @pos_cgrp by calling
+ * cgroup_rightmost_descendant() to skip subtree.
+ */
+#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \
+ cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \
+ if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp)))))
+
/*
* There are two global mutexes guarding cpuset structures - cpuset_mutex
* and callback_mutex. The latter may nest inside the former. We also
@@ -507,31 +518,24 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
return;
}
-static void
-update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
+static void update_domain_attr_tree(struct sched_domain_attr *dattr,
+ struct cpuset *root_cs)
{
- LIST_HEAD(q);
-
- list_add(&c->stack_list, &q);
- while (!list_empty(&q)) {
- struct cpuset *cp;
- struct cgroup *cont;
- struct cpuset *child;
-
- cp = list_first_entry(&q, struct cpuset, stack_list);
- list_del(q.next);
+ struct cpuset *cp;
+ struct cgroup *pos_cgrp;
- if (cpumask_empty(cp->cpus_allowed))
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
+ /* skip the whole subtree if @cp doesn't have any CPU */
+ if (cpumask_empty(cp->cpus_allowed)) {
+ pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
continue;
+ }
if (is_sched_load_balance(cp))
update_domain_attr(dattr, cp);
-
- rcu_read_lock();
- cpuset_for_each_child(child, cont, cp)
- list_add_tail(&child->stack_list, &q);
- rcu_read_unlock();
}
+ rcu_read_unlock();
}
/*
@@ -591,7 +595,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr **attributes)
{
- LIST_HEAD(q); /* queue of cpusets to be scanned */
struct cpuset *cp; /* scans q */
struct cpuset **csa; /* array of all cpuset ptrs */
int csn; /* how many cpuset ptrs in csa so far */
@@ -600,6 +603,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
+ struct cgroup *pos_cgrp;
doms = NULL;
dattr = NULL;
@@ -627,33 +631,27 @@ static int generate_sched_domains(cpumask_var_t **domains,
goto done;
csn = 0;
- list_add(&top_cpuset.stack_list, &q);
- while (!list_empty(&q)) {
- struct cgroup *cont;
- struct cpuset *child; /* scans child cpusets of cp */
-
- cp = list_first_entry(&q, struct cpuset, stack_list);
- list_del(q.next);
-
- if (cpumask_empty(cp->cpus_allowed))
- continue;
-
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) {
/*
- * All child cpusets contain a subset of the parent's cpus, so
- * just skip them, and then we call update_domain_attr_tree()
- * to calc relax_domain_level of the corresponding sched
- * domain.
+ * Continue traversing beyond @cp iff @cp has some CPUs and
+ * isn't load balancing. The former is obvious. The
+ * latter: All child cpusets contain a subset of the
+ * parent's cpus, so just skip them, and then we call
+ * update_domain_attr_tree() to calc relax_domain_level of
+ * the corresponding sched domain.
*/
- if (is_sched_load_balance(cp)) {
- csa[csn++] = cp;
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ !is_sched_load_balance(cp))
continue;
- }
- rcu_read_lock();
- cpuset_for_each_child(child, cont, cp)
- list_add_tail(&child->stack_list, &q);
- rcu_read_unlock();
- }
+ if (is_sched_load_balance(cp))
+ csa[csn++] = cp;
+
+ /* skip @cp's subtree */
+ pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
+ }
+ rcu_read_unlock();
for (i = 0; i < csn; i++)
csa[i]->pn = i;
@@ -2068,31 +2066,6 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
move_member_tasks_to_cpuset(cs, parent);
}
-/*
- * Helper function to traverse cpusets.
- * It can be used to walk the cpuset tree from top to bottom, completing
- * one layer before dropping down to the next (thus always processing a
- * node before any of its children).
- */
-static struct cpuset *cpuset_next(struct list_head *queue)
-{
- struct cpuset *cp;
- struct cpuset *child; /* scans child cpusets of cp */
- struct cgroup *cont;
-
- if (list_empty(queue))
- return NULL;
-
- cp = list_first_entry(queue, struct cpuset, stack_list);
- list_del(queue->next);
- rcu_read_lock();
- cpuset_for_each_child(child, cont, cp)
- list_add_tail(&child->stack_list, queue);
- rcu_read_unlock();
-
- return cp;
-}
-
/**
* cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset
* @cs: cpuset in interest
@@ -2229,12 +2202,12 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
/* if cpus or mems went down, we need to propagate to descendants */
if (cpus_offlined || mems_offlined) {
struct cpuset *cs;
- LIST_HEAD(queue);
+ struct cgroup *pos_cgrp;
- list_add_tail(&top_cpuset.stack_list, &queue);
- while ((cs = cpuset_next(&queue)))
- if (cs != &top_cpuset)
- schedule_cpuset_propagate_hotplug(cs);
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset)
+ schedule_cpuset_propagate_hotplug(cs);
+ rcu_read_unlock();
}
mutex_unlock(&cpuset_mutex);
--
1.8.0.2
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 3/3] cpuset: remove cpuset->parent
2013-01-03 21:44 [PATCHSET] cpuset: drop cpuset->stack_list and ->parent, take#2 Tejun Heo
2013-01-03 21:44 ` [PATCH 1/3] cpuset: implement cgroup_rightmost_descendant() Tejun Heo
2013-01-03 21:44 ` [PATCH 2/3] cpuset: replace cpuset->stack_list with cpuset_for_each_descendant_pre() Tejun Heo
@ 2013-01-03 21:44 ` Tejun Heo
2013-01-06 9:27 ` [PATCHSET] cpuset: drop cpuset->stack_list and ->parent, take#2 Li Zefan
3 siblings, 0 replies; 9+ messages in thread
From: Tejun Heo @ 2013-01-03 21:44 UTC (permalink / raw)
To: lizefan, paul, glommer
Cc: containers, cgroups, peterz, mhocko, linux-kernel, Tejun Heo
cgroup already tracks the hierarchy. Follow cgroup->parent to find
the parent and drop cpuset->parent.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
---
kernel/cpuset.c | 28 +++++++++++++++++-----------
1 file changed, 17 insertions(+), 11 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b2f8dad..afadbb99 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -87,8 +87,6 @@ struct cpuset {
cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
- struct cpuset *parent; /* my parent */
-
struct fmeter fmeter; /* memory_pressure filter */
/*
@@ -120,6 +118,15 @@ static inline struct cpuset *task_cs(struct task_struct *task)
struct cpuset, css);
}
+static inline struct cpuset *parent_cs(const struct cpuset *cs)
+{
+ struct cgroup *pcgrp = cs->css.cgroup->parent;
+
+ if (pcgrp)
+ return cgroup_cs(pcgrp);
+ return NULL;
+}
+
#ifdef CONFIG_NUMA
static inline bool task_has_mempolicy(struct task_struct *task)
{
@@ -323,7 +330,7 @@ static void guarantee_online_cpus(const struct cpuset *cs,
struct cpumask *pmask)
{
while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
- cs = cs->parent;
+ cs = parent_cs(cs);
if (cs)
cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
else
@@ -348,7 +355,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
{
while (cs && !nodes_intersects(cs->mems_allowed,
node_states[N_MEMORY]))
- cs = cs->parent;
+ cs = parent_cs(cs);
if (cs)
nodes_and(*pmask, cs->mems_allowed,
node_states[N_MEMORY]);
@@ -461,7 +468,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
if (cur == &top_cpuset)
goto out;
- par = cur->parent;
+ par = parent_cs(cur);
/* We must be a subset of our parent cpuset */
ret = -EACCES;
@@ -1866,7 +1873,6 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
fmeter_init(&cs->fmeter);
INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn);
cs->relax_domain_level = -1;
- cs->parent = cgroup_cs(cont->parent);
return &cs->css;
}
@@ -1874,7 +1880,7 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
static int cpuset_css_online(struct cgroup *cgrp)
{
struct cpuset *cs = cgroup_cs(cgrp);
- struct cpuset *parent = cs->parent;
+ struct cpuset *parent = parent_cs(cs);
struct cpuset *tmp_cs;
struct cgroup *pos_cg;
@@ -2058,10 +2064,10 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
* Find its next-highest non-empty parent, (top cpuset
* has online cpus, so can't be empty).
*/
- parent = cs->parent;
+ parent = parent_cs(cs);
while (cpumask_empty(parent->cpus_allowed) ||
nodes_empty(parent->mems_allowed))
- parent = parent->parent;
+ parent = parent_cs(parent);
move_member_tasks_to_cpuset(cs, parent);
}
@@ -2373,8 +2379,8 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
*/
static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
{
- while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
- cs = cs->parent;
+ while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
+ cs = parent_cs(cs);
return cs;
}
--
1.8.0.2
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCHSET] cpuset: drop cpuset->stack_list and ->parent, take#2
2013-01-03 21:44 [PATCHSET] cpuset: drop cpuset->stack_list and ->parent, take#2 Tejun Heo
` (2 preceding siblings ...)
2013-01-03 21:44 ` [PATCH 3/3] cpuset: remove cpuset->parent Tejun Heo
@ 2013-01-06 9:27 ` Li Zefan
2013-01-07 16:47 ` Tejun Heo
3 siblings, 1 reply; 9+ messages in thread
From: Li Zefan @ 2013-01-06 9:27 UTC (permalink / raw)
To: Tejun Heo
Cc: paul, glommer, containers, cgroups, peterz, mhocko, linux-kernel
On 2013/1/4 5:44, Tejun Heo wrote:
> Hello, guys.
>
> This is the second take of "drop cpuset->stack_list and ->parent"
> patchset. Other than being rebased on top of v3.8-rc2 + "cpuset:
> decouple cpuset locking from cgroup core, take#2", nothing really has
> changed.
>
> The original patchset description follows.
>
> cpuset implements its own descendant iteration using
> cpuset->stack_list and has its own ->parent pointer. There's nothing
> cpuset specific about descendant walking or finding the parent. This
> patchset makes cpuset use cgroup generic API instead.
>
> 0001-cpuset-implement-cgroup_rightmost_descendant.patch
> 0002-cpuset-replace-cpuset-stack_list-with-cpuset_for_eac.patch
> 0003-cpuset-remove-cpuset-parent.patch
>
> 0001 implements cgroup_rightmost_descendant() which can be used to
> skip subtree during pre-order tree walk. Michal, maybe memcg can use
> it too?
>
> 0002 replaces cpuset->stack_list with generic
> for_each_descendant_pre().
>
> 0003 replaces cpuset->parent with cgroup->parent.
>
> This patchset is on top of
>
> v3.8-rc2 d1c3ed669a2d452cacfb48c2d171a1f364dae2ed
> + [1] "[PATCHSET cgroup/for-3.8] cpuset: decouple cpuset locking from cgroup core, take#2"
>
> and available in the following git branch.
>
> git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git review-cpuset-iter
>
> include/linux/cgroup.h | 1
> kernel/cgroup.c | 26 ++++++++
> kernel/cpuset.c | 151 +++++++++++++++++++++----------------------------
> 3 files changed, 92 insertions(+), 86 deletions(-)
>
Acked-by: Li Zefan <lizefan@huawei.com>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCHSET] cpuset: drop cpuset->stack_list and ->parent, take#2
2013-01-06 9:27 ` [PATCHSET] cpuset: drop cpuset->stack_list and ->parent, take#2 Li Zefan
@ 2013-01-07 16:47 ` Tejun Heo
2013-01-07 16:52 ` Tejun Heo
0 siblings, 1 reply; 9+ messages in thread
From: Tejun Heo @ 2013-01-07 16:47 UTC (permalink / raw)
To: Li Zefan; +Cc: paul, glommer, containers, cgroups, peterz, mhocko, linux-kernel
On Sun, Jan 06, 2013 at 05:27:46PM +0800, Li Zefan wrote:
> Acked-by: Li Zefan <lizefan@huawei.com>
I'll route this together with "decouple cpuset locking" patchset in a
separate branch in cgroup tree. If anyone objects, please let me
know.
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCHSET] cpuset: drop cpuset->stack_list and ->parent, take#2
2013-01-07 16:47 ` Tejun Heo
@ 2013-01-07 16:52 ` Tejun Heo
0 siblings, 0 replies; 9+ messages in thread
From: Tejun Heo @ 2013-01-07 16:52 UTC (permalink / raw)
To: Li Zefan; +Cc: paul, glommer, containers, cgroups, peterz, mhocko, linux-kernel
On Mon, Jan 07, 2013 at 08:47:07AM -0800, Tejun Heo wrote:
> On Sun, Jan 06, 2013 at 05:27:46PM +0800, Li Zefan wrote:
> > Acked-by: Li Zefan <lizefan@huawei.com>
>
> I'll route this together with "decouple cpuset locking" patchset in a
> separate branch in cgroup tree. If anyone objects, please let me
> know.
I put the cgroup_rightmost_descendant() patch in cgroup/for-3.9 and
the rest in cgroup/for-3.9-cpuset.
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 2/3] cpuset: replace cpuset->stack_list with cpuset_for_each_descendant_pre()
2012-11-28 22:26 [PATCHSET cgroup/for-3.8] cpuset: drop cpuset->stack_list and ->parent Tejun Heo
@ 2012-11-28 22:27 ` Tejun Heo
2012-12-03 16:18 ` Michal Hocko
0 siblings, 1 reply; 9+ messages in thread
From: Tejun Heo @ 2012-11-28 22:27 UTC (permalink / raw)
To: lizefan, paul, glommer
Cc: containers, cgroups, peterz, mhocko, linux-kernel, Tejun Heo
Implement cpuset_for_each_descendant_pre() and replace the
cpuset-specific tree walking using cpuset->stack_list with it.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/cpuset.c | 123 ++++++++++++++++++++++----------------------------------
1 file changed, 48 insertions(+), 75 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 2ee0e03..3a01730 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -103,9 +103,6 @@ struct cpuset {
/* for custom sched domain */
int relax_domain_level;
- /* used for walking a cpuset hierarchy */
- struct list_head stack_list;
-
struct work_struct hotplug_work;
};
@@ -207,6 +204,20 @@ static struct cpuset top_cpuset = {
cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \
if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp)))))
+/**
+ * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
+ * @des_cs: loop cursor pointing to the current descendant
+ * @pos_cgrp: used for iteration
+ * @root_cs: target cpuset to walk descendants of
+ *
+ * Walk @des_cs through the online descendants of @root_cs. Must be used
+ * with RCU read locked. The caller may modify @pos_cgrp by calling
+ * cgroup_rightmost_descendant() to skip subtree.
+ */
+#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \
+ cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \
+ if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp)))))
+
/*
* There are two global mutexes guarding cpuset structures - cpuset_mutex
* and callback_mutex. The latter may nest inside the former. We also
@@ -507,31 +518,24 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
return;
}
-static void
-update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
+static void update_domain_attr_tree(struct sched_domain_attr *dattr,
+ struct cpuset *root_cs)
{
- LIST_HEAD(q);
-
- list_add(&c->stack_list, &q);
- while (!list_empty(&q)) {
- struct cpuset *cp;
- struct cgroup *cont;
- struct cpuset *child;
-
- cp = list_first_entry(&q, struct cpuset, stack_list);
- list_del(q.next);
+ struct cpuset *cp;
+ struct cgroup *pos_cgrp;
- if (cpumask_empty(cp->cpus_allowed))
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
+ /* skip the whole subtree if @cp doesn't have any CPU */
+ if (cpumask_empty(cp->cpus_allowed)) {
+ pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
continue;
+ }
if (is_sched_load_balance(cp))
update_domain_attr(dattr, cp);
-
- rcu_read_lock();
- cpuset_for_each_child(child, cont, cp)
- list_add_tail(&child->stack_list, &q);
- rcu_read_unlock();
}
+ rcu_read_unlock();
}
/*
@@ -591,7 +595,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr **attributes)
{
- LIST_HEAD(q); /* queue of cpusets to be scanned */
struct cpuset *cp; /* scans q */
struct cpuset **csa; /* array of all cpuset ptrs */
int csn; /* how many cpuset ptrs in csa so far */
@@ -600,6 +603,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
+ struct cgroup *pos_cgrp;
doms = NULL;
dattr = NULL;
@@ -627,33 +631,27 @@ static int generate_sched_domains(cpumask_var_t **domains,
goto done;
csn = 0;
- list_add(&top_cpuset.stack_list, &q);
- while (!list_empty(&q)) {
- struct cgroup *cont;
- struct cpuset *child; /* scans child cpusets of cp */
-
- cp = list_first_entry(&q, struct cpuset, stack_list);
- list_del(q.next);
-
- if (cpumask_empty(cp->cpus_allowed))
- continue;
-
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) {
/*
- * All child cpusets contain a subset of the parent's cpus, so
- * just skip them, and then we call update_domain_attr_tree()
- * to calc relax_domain_level of the corresponding sched
- * domain.
+ * Continue traversing beyond @cp iff @cp has some CPUs and
+ * isn't load balancing. The former is obvious. The
+ * latter: All child cpusets contain a subset of the
+ * parent's cpus, so just skip them, and then we call
+ * update_domain_attr_tree() to calc relax_domain_level of
+ * the corresponding sched domain.
*/
- if (is_sched_load_balance(cp)) {
- csa[csn++] = cp;
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ !is_sched_load_balance(cp))
continue;
- }
- rcu_read_lock();
- cpuset_for_each_child(child, cont, cp)
- list_add_tail(&child->stack_list, &q);
- rcu_read_unlock();
- }
+ if (is_sched_load_balance(cp))
+ csa[csn++] = cp;
+
+ /* skip @cp's subtree */
+ pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
+ }
+ rcu_read_unlock();
for (i = 0; i < csn; i++)
csa[i]->pn = i;
@@ -2059,31 +2057,6 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
move_member_tasks_to_cpuset(cs, parent);
}
-/*
- * Helper function to traverse cpusets.
- * It can be used to walk the cpuset tree from top to bottom, completing
- * one layer before dropping down to the next (thus always processing a
- * node before any of its children).
- */
-static struct cpuset *cpuset_next(struct list_head *queue)
-{
- struct cpuset *cp;
- struct cpuset *child; /* scans child cpusets of cp */
- struct cgroup *cont;
-
- if (list_empty(queue))
- return NULL;
-
- cp = list_first_entry(queue, struct cpuset, stack_list);
- list_del(queue->next);
- rcu_read_lock();
- cpuset_for_each_child(child, cont, cp)
- list_add_tail(&child->stack_list, queue);
- rcu_read_unlock();
-
- return cp;
-}
-
/**
* cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset
* @cs: cpuset in interest
@@ -2220,12 +2193,12 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
/* if cpus or mems went down, we need to propagate to descendants */
if (cpus_offlined || mems_offlined) {
struct cpuset *cs;
- LIST_HEAD(queue);
+ struct cgroup *pos_cgrp;
- list_add_tail(&top_cpuset.stack_list, &queue);
- while ((cs = cpuset_next(&queue)))
- if (cs != &top_cpuset)
- schedule_cpuset_propagate_hotplug(cs);
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset)
+ schedule_cpuset_propagate_hotplug(cs);
+ rcu_read_unlock();
}
mutex_unlock(&cpuset_mutex);
--
1.7.11.7
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH 2/3] cpuset: replace cpuset->stack_list with cpuset_for_each_descendant_pre()
2012-11-28 22:27 ` [PATCH 2/3] cpuset: replace cpuset->stack_list with cpuset_for_each_descendant_pre() Tejun Heo
@ 2012-12-03 16:18 ` Michal Hocko
0 siblings, 0 replies; 9+ messages in thread
From: Michal Hocko @ 2012-12-03 16:18 UTC (permalink / raw)
To: Tejun Heo
Cc: lizefan, paul, glommer, containers, cgroups, peterz, linux-kernel
On Wed 28-11-12 14:27:00, Tejun Heo wrote:
> Implement cpuset_for_each_descendant_pre() and replace the
> cpuset-specific tree walking using cpuset->stack_list with it.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
> ---
> kernel/cpuset.c | 123 ++++++++++++++++++++++----------------------------------
> 1 file changed, 48 insertions(+), 75 deletions(-)
>
> diff --git a/kernel/cpuset.c b/kernel/cpuset.c
> index 2ee0e03..3a01730 100644
> --- a/kernel/cpuset.c
> +++ b/kernel/cpuset.c
> @@ -103,9 +103,6 @@ struct cpuset {
> /* for custom sched domain */
> int relax_domain_level;
>
> - /* used for walking a cpuset hierarchy */
> - struct list_head stack_list;
> -
> struct work_struct hotplug_work;
> };
>
> @@ -207,6 +204,20 @@ static struct cpuset top_cpuset = {
> cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \
> if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp)))))
>
> +/**
> + * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
> + * @des_cs: loop cursor pointing to the current descendant
> + * @pos_cgrp: used for iteration
> + * @root_cs: target cpuset to walk descendants of
> + *
> + * Walk @des_cs through the online descendants of @root_cs. Must be used
> + * with RCU read locked. The caller may modify @pos_cgrp by calling
> + * cgroup_rightmost_descendant() to skip subtree.
> + */
> +#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \
> + cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \
> + if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp)))))
> +
> /*
> * There are two global mutexes guarding cpuset structures - cpuset_mutex
> * and callback_mutex. The latter may nest inside the former. We also
> @@ -507,31 +518,24 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
> return;
> }
>
> -static void
> -update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
> +static void update_domain_attr_tree(struct sched_domain_attr *dattr,
> + struct cpuset *root_cs)
> {
> - LIST_HEAD(q);
> -
> - list_add(&c->stack_list, &q);
> - while (!list_empty(&q)) {
> - struct cpuset *cp;
> - struct cgroup *cont;
> - struct cpuset *child;
> -
> - cp = list_first_entry(&q, struct cpuset, stack_list);
> - list_del(q.next);
> + struct cpuset *cp;
> + struct cgroup *pos_cgrp;
>
> - if (cpumask_empty(cp->cpus_allowed))
> + rcu_read_lock();
> + cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
> + /* skip the whole subtree if @cp doesn't have any CPU */
> + if (cpumask_empty(cp->cpus_allowed)) {
> + pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
> continue;
> + }
>
> if (is_sched_load_balance(cp))
> update_domain_attr(dattr, cp);
> -
> - rcu_read_lock();
> - cpuset_for_each_child(child, cont, cp)
> - list_add_tail(&child->stack_list, &q);
> - rcu_read_unlock();
> }
> + rcu_read_unlock();
> }
>
> /*
> @@ -591,7 +595,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
> static int generate_sched_domains(cpumask_var_t **domains,
> struct sched_domain_attr **attributes)
> {
> - LIST_HEAD(q); /* queue of cpusets to be scanned */
> struct cpuset *cp; /* scans q */
> struct cpuset **csa; /* array of all cpuset ptrs */
> int csn; /* how many cpuset ptrs in csa so far */
> @@ -600,6 +603,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
> struct sched_domain_attr *dattr; /* attributes for custom domains */
> int ndoms = 0; /* number of sched domains in result */
> int nslot; /* next empty doms[] struct cpumask slot */
> + struct cgroup *pos_cgrp;
>
> doms = NULL;
> dattr = NULL;
> @@ -627,33 +631,27 @@ static int generate_sched_domains(cpumask_var_t **domains,
> goto done;
> csn = 0;
>
> - list_add(&top_cpuset.stack_list, &q);
> - while (!list_empty(&q)) {
> - struct cgroup *cont;
> - struct cpuset *child; /* scans child cpusets of cp */
> -
> - cp = list_first_entry(&q, struct cpuset, stack_list);
> - list_del(q.next);
> -
> - if (cpumask_empty(cp->cpus_allowed))
> - continue;
> -
> + rcu_read_lock();
> + cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) {
> /*
> - * All child cpusets contain a subset of the parent's cpus, so
> - * just skip them, and then we call update_domain_attr_tree()
> - * to calc relax_domain_level of the corresponding sched
> - * domain.
> + * Continue traversing beyond @cp iff @cp has some CPUs and
> + * isn't load balancing. The former is obvious. The
> + * latter: All child cpusets contain a subset of the
> + * parent's cpus, so just skip them, and then we call
> + * update_domain_attr_tree() to calc relax_domain_level of
> + * the corresponding sched domain.
> */
> - if (is_sched_load_balance(cp)) {
> - csa[csn++] = cp;
> + if (!cpumask_empty(cp->cpus_allowed) &&
> + !is_sched_load_balance(cp))
> continue;
> - }
>
> - rcu_read_lock();
> - cpuset_for_each_child(child, cont, cp)
> - list_add_tail(&child->stack_list, &q);
> - rcu_read_unlock();
> - }
> + if (is_sched_load_balance(cp))
> + csa[csn++] = cp;
> +
> + /* skip @cp's subtree */
> + pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
> + }
> + rcu_read_unlock();
>
> for (i = 0; i < csn; i++)
> csa[i]->pn = i;
> @@ -2059,31 +2057,6 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
> move_member_tasks_to_cpuset(cs, parent);
> }
>
> -/*
> - * Helper function to traverse cpusets.
> - * It can be used to walk the cpuset tree from top to bottom, completing
> - * one layer before dropping down to the next (thus always processing a
> - * node before any of its children).
> - */
> -static struct cpuset *cpuset_next(struct list_head *queue)
> -{
> - struct cpuset *cp;
> - struct cpuset *child; /* scans child cpusets of cp */
> - struct cgroup *cont;
> -
> - if (list_empty(queue))
> - return NULL;
> -
> - cp = list_first_entry(queue, struct cpuset, stack_list);
> - list_del(queue->next);
> - rcu_read_lock();
> - cpuset_for_each_child(child, cont, cp)
> - list_add_tail(&child->stack_list, queue);
> - rcu_read_unlock();
> -
> - return cp;
> -}
> -
> /**
> * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset
> * @cs: cpuset in interest
> @@ -2220,12 +2193,12 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
> /* if cpus or mems went down, we need to propagate to descendants */
> if (cpus_offlined || mems_offlined) {
> struct cpuset *cs;
> - LIST_HEAD(queue);
> + struct cgroup *pos_cgrp;
>
> - list_add_tail(&top_cpuset.stack_list, &queue);
> - while ((cs = cpuset_next(&queue)))
> - if (cs != &top_cpuset)
> - schedule_cpuset_propagate_hotplug(cs);
> + rcu_read_lock();
> + cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset)
> + schedule_cpuset_propagate_hotplug(cs);
> + rcu_read_unlock();
> }
>
> mutex_unlock(&cpuset_mutex);
> --
> 1.7.11.7
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 9+ messages in thread