All of lore.kernel.org
 help / color / mirror / Atom feed
* [patch] mm: memcontrol: convert reclaim iterator to simple css refcounting
@ 2014-09-19 21:10 ` Johannes Weiner
  0 siblings, 0 replies; 15+ messages in thread
From: Johannes Weiner @ 2014-09-19 21:10 UTC (permalink / raw)
  To: linux-mm; +Cc: Michal Hocko, Tejun Heo, cgroups, linux-kernel

The memcg reclaim iterators use a complicated weak reference scheme to
prevent pinning cgroups indefinitely in the absence of memory pressure.

However, during the ongoing cgroup core rework, css lifetime has been
decoupled such that a pinned css no longer interferes with removal of
the user-visible cgroup, and all this complexity is now unnecessary.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 mm/memcontrol.c | 200 ++++++++++----------------------------------------------
 1 file changed, 33 insertions(+), 167 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dfd3b15a57e8..5daa1d3dd9d5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -253,18 +253,6 @@ struct mem_cgroup_stat_cpu {
 	unsigned long targets[MEM_CGROUP_NTARGETS];
 };
 
-struct mem_cgroup_reclaim_iter {
-	/*
-	 * last scanned hierarchy member. Valid only if last_dead_count
-	 * matches memcg->dead_count of the hierarchy root group.
-	 */
-	struct mem_cgroup *last_visited;
-	int last_dead_count;
-
-	/* scan generation, increased every round-trip */
-	unsigned int generation;
-};
-
 /*
  * per-zone information in memory controller.
  */
@@ -272,7 +260,7 @@ struct mem_cgroup_per_zone {
 	struct lruvec		lruvec;
 	unsigned long		lru_size[NR_LRU_LISTS];
 
-	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
+	struct mem_cgroup	*reclaim_iter[DEF_PRIORITY + 1];
 
 	struct rb_node		tree_node;	/* RB tree node */
 	unsigned long		usage_in_excess;/* Set to the value by which */
@@ -1174,111 +1162,6 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 	return memcg;
 }
 
-/*
- * Returns a next (in a pre-order walk) alive memcg (with elevated css
- * ref. count) or NULL if the whole root's subtree has been visited.
- *
- * helper function to be used by mem_cgroup_iter
- */
-static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
-		struct mem_cgroup *last_visited)
-{
-	struct cgroup_subsys_state *prev_css, *next_css;
-
-	prev_css = last_visited ? &last_visited->css : NULL;
-skip_node:
-	next_css = css_next_descendant_pre(prev_css, &root->css);
-
-	/*
-	 * Even if we found a group we have to make sure it is
-	 * alive. css && !memcg means that the groups should be
-	 * skipped and we should continue the tree walk.
-	 * last_visited css is safe to use because it is
-	 * protected by css_get and the tree walk is rcu safe.
-	 *
-	 * We do not take a reference on the root of the tree walk
-	 * because we might race with the root removal when it would
-	 * be the only node in the iterated hierarchy and mem_cgroup_iter
-	 * would end up in an endless loop because it expects that at
-	 * least one valid node will be returned. Root cannot disappear
-	 * because caller of the iterator should hold it already so
-	 * skipping css reference should be safe.
-	 */
-	if (next_css) {
-		if ((next_css == &root->css) ||
-		    ((next_css->flags & CSS_ONLINE) &&
-		     css_tryget_online(next_css)))
-			return mem_cgroup_from_css(next_css);
-
-		prev_css = next_css;
-		goto skip_node;
-	}
-
-	return NULL;
-}
-
-static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
-{
-	/*
-	 * When a group in the hierarchy below root is destroyed, the
-	 * hierarchy iterator can no longer be trusted since it might
-	 * have pointed to the destroyed group.  Invalidate it.
-	 */
-	atomic_inc(&root->dead_count);
-}
-
-static struct mem_cgroup *
-mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
-		     struct mem_cgroup *root,
-		     int *sequence)
-{
-	struct mem_cgroup *position = NULL;
-	/*
-	 * A cgroup destruction happens in two stages: offlining and
-	 * release.  They are separated by a RCU grace period.
-	 *
-	 * If the iterator is valid, we may still race with an
-	 * offlining.  The RCU lock ensures the object won't be
-	 * released, tryget will fail if we lost the race.
-	 */
-	*sequence = atomic_read(&root->dead_count);
-	if (iter->last_dead_count == *sequence) {
-		smp_rmb();
-		position = iter->last_visited;
-
-		/*
-		 * We cannot take a reference to root because we might race
-		 * with root removal and returning NULL would end up in
-		 * an endless loop on the iterator user level when root
-		 * would be returned all the time.
-		 */
-		if (position && position != root &&
-		    !css_tryget_online(&position->css))
-			position = NULL;
-	}
-	return position;
-}
-
-static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
-				   struct mem_cgroup *last_visited,
-				   struct mem_cgroup *new_position,
-				   struct mem_cgroup *root,
-				   int sequence)
-{
-	/* root reference counting symmetric to mem_cgroup_iter_load */
-	if (last_visited && last_visited != root)
-		css_put(&last_visited->css);
-	/*
-	 * We store the sequence count from the time @last_visited was
-	 * loaded successfully instead of rereading it here so that we
-	 * don't lose destruction events in between.  We could have
-	 * raced with the destruction of @new_position after all.
-	 */
-	iter->last_visited = new_position;
-	smp_wmb();
-	iter->last_dead_count = sequence;
-}
-
 /**
  * mem_cgroup_iter - iterate over memory cgroup hierarchy
  * @root: hierarchy root
@@ -1300,8 +1183,11 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 				   struct mem_cgroup *prev,
 				   struct mem_cgroup_reclaim_cookie *reclaim)
 {
+	struct mem_cgroup_per_zone *uninitialized_var(mz);
+	struct cgroup_subsys_state *css = NULL;
+	int uninitialized_var(priority);
 	struct mem_cgroup *memcg = NULL;
-	struct mem_cgroup *last_visited = NULL;
+	struct mem_cgroup *pos = NULL;
 
 	if (mem_cgroup_disabled())
 		return NULL;
@@ -1310,50 +1196,50 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 		root = root_mem_cgroup;
 
 	if (prev && !reclaim)
-		last_visited = prev;
+		pos = prev;
 
 	if (!root->use_hierarchy && root != root_mem_cgroup) {
 		if (prev)
-			goto out_css_put;
+			goto out;
 		return root;
 	}
 
 	rcu_read_lock();
-	while (!memcg) {
-		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
-		int uninitialized_var(seq);
 
-		if (reclaim) {
-			struct mem_cgroup_per_zone *mz;
+	if (reclaim) {
+		mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
+		priority = reclaim->priority;
 
-			mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
-			iter = &mz->reclaim_iter[reclaim->priority];
-			if (prev && reclaim->generation != iter->generation) {
-				iter->last_visited = NULL;
-				goto out_unlock;
-			}
-
-			last_visited = mem_cgroup_iter_load(iter, root, &seq);
-		}
-
-		memcg = __mem_cgroup_iter_next(root, last_visited);
+		do {
+			pos = ACCESS_ONCE(mz->reclaim_iter[priority]);
+		} while (!css_tryget(&pos->css));
+	}
 
-		if (reclaim) {
-			mem_cgroup_iter_update(iter, last_visited, memcg, root,
-					seq);
+	if (pos)
+		css = &pos->css;
 
-			if (!memcg)
-				iter->generation++;
-			else if (!prev && memcg)
-				reclaim->generation = iter->generation;
+	for (;;) {
+		css = css_next_descendant_pre(css, &root->css);
+		if (!css) {
+			if (prev)
+				goto out_unlock;
+			continue;
+		}
+		if (css == &root->css || css_tryget_online(css)) {
+			memcg = mem_cgroup_from_css(css);
+			break;
 		}
+	}
 
-		if (prev && !memcg)
-			goto out_unlock;
+	if (reclaim) {
+		if (cmpxchg(&mz->reclaim_iter[priority], pos, memcg) == pos)
+			css_get(&memcg->css);
+		css_put(&pos->css);
 	}
+
 out_unlock:
 	rcu_read_unlock();
-out_css_put:
+out:
 	if (prev && prev != root)
 		css_put(&prev->css);
 
@@ -5526,24 +5412,6 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
 	return memcg_init_kmem(memcg, &memory_cgrp_subsys);
 }
 
-/*
- * Announce all parents that a group from their hierarchy is gone.
- */
-static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *parent = memcg;
-
-	while ((parent = parent_mem_cgroup(parent)))
-		mem_cgroup_iter_invalidate(parent);
-
-	/*
-	 * if the root memcg is not hierarchical we have to check it
-	 * explicitely.
-	 */
-	if (!root_mem_cgroup->use_hierarchy)
-		mem_cgroup_iter_invalidate(root_mem_cgroup);
-}
-
 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -5564,8 +5432,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 
 	kmem_cgroup_css_offline(memcg);
 
-	mem_cgroup_invalidate_reclaim_iterators(memcg);
-
 	/*
 	 * This requires that offlining is serialized.  Right now that is
 	 * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
-- 
2.1.0


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [patch] mm: memcontrol: convert reclaim iterator to simple css refcounting
@ 2014-09-19 21:10 ` Johannes Weiner
  0 siblings, 0 replies; 15+ messages in thread
From: Johannes Weiner @ 2014-09-19 21:10 UTC (permalink / raw)
  To: linux-mm; +Cc: Michal Hocko, Tejun Heo, cgroups, linux-kernel

The memcg reclaim iterators use a complicated weak reference scheme to
prevent pinning cgroups indefinitely in the absence of memory pressure.

However, during the ongoing cgroup core rework, css lifetime has been
decoupled such that a pinned css no longer interferes with removal of
the user-visible cgroup, and all this complexity is now unnecessary.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 mm/memcontrol.c | 200 ++++++++++----------------------------------------------
 1 file changed, 33 insertions(+), 167 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dfd3b15a57e8..5daa1d3dd9d5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -253,18 +253,6 @@ struct mem_cgroup_stat_cpu {
 	unsigned long targets[MEM_CGROUP_NTARGETS];
 };
 
-struct mem_cgroup_reclaim_iter {
-	/*
-	 * last scanned hierarchy member. Valid only if last_dead_count
-	 * matches memcg->dead_count of the hierarchy root group.
-	 */
-	struct mem_cgroup *last_visited;
-	int last_dead_count;
-
-	/* scan generation, increased every round-trip */
-	unsigned int generation;
-};
-
 /*
  * per-zone information in memory controller.
  */
@@ -272,7 +260,7 @@ struct mem_cgroup_per_zone {
 	struct lruvec		lruvec;
 	unsigned long		lru_size[NR_LRU_LISTS];
 
-	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
+	struct mem_cgroup	*reclaim_iter[DEF_PRIORITY + 1];
 
 	struct rb_node		tree_node;	/* RB tree node */
 	unsigned long		usage_in_excess;/* Set to the value by which */
@@ -1174,111 +1162,6 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 	return memcg;
 }
 
-/*
- * Returns a next (in a pre-order walk) alive memcg (with elevated css
- * ref. count) or NULL if the whole root's subtree has been visited.
- *
- * helper function to be used by mem_cgroup_iter
- */
-static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
-		struct mem_cgroup *last_visited)
-{
-	struct cgroup_subsys_state *prev_css, *next_css;
-
-	prev_css = last_visited ? &last_visited->css : NULL;
-skip_node:
-	next_css = css_next_descendant_pre(prev_css, &root->css);
-
-	/*
-	 * Even if we found a group we have to make sure it is
-	 * alive. css && !memcg means that the groups should be
-	 * skipped and we should continue the tree walk.
-	 * last_visited css is safe to use because it is
-	 * protected by css_get and the tree walk is rcu safe.
-	 *
-	 * We do not take a reference on the root of the tree walk
-	 * because we might race with the root removal when it would
-	 * be the only node in the iterated hierarchy and mem_cgroup_iter
-	 * would end up in an endless loop because it expects that at
-	 * least one valid node will be returned. Root cannot disappear
-	 * because caller of the iterator should hold it already so
-	 * skipping css reference should be safe.
-	 */
-	if (next_css) {
-		if ((next_css == &root->css) ||
-		    ((next_css->flags & CSS_ONLINE) &&
-		     css_tryget_online(next_css)))
-			return mem_cgroup_from_css(next_css);
-
-		prev_css = next_css;
-		goto skip_node;
-	}
-
-	return NULL;
-}
-
-static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
-{
-	/*
-	 * When a group in the hierarchy below root is destroyed, the
-	 * hierarchy iterator can no longer be trusted since it might
-	 * have pointed to the destroyed group.  Invalidate it.
-	 */
-	atomic_inc(&root->dead_count);
-}
-
-static struct mem_cgroup *
-mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
-		     struct mem_cgroup *root,
-		     int *sequence)
-{
-	struct mem_cgroup *position = NULL;
-	/*
-	 * A cgroup destruction happens in two stages: offlining and
-	 * release.  They are separated by a RCU grace period.
-	 *
-	 * If the iterator is valid, we may still race with an
-	 * offlining.  The RCU lock ensures the object won't be
-	 * released, tryget will fail if we lost the race.
-	 */
-	*sequence = atomic_read(&root->dead_count);
-	if (iter->last_dead_count == *sequence) {
-		smp_rmb();
-		position = iter->last_visited;
-
-		/*
-		 * We cannot take a reference to root because we might race
-		 * with root removal and returning NULL would end up in
-		 * an endless loop on the iterator user level when root
-		 * would be returned all the time.
-		 */
-		if (position && position != root &&
-		    !css_tryget_online(&position->css))
-			position = NULL;
-	}
-	return position;
-}
-
-static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
-				   struct mem_cgroup *last_visited,
-				   struct mem_cgroup *new_position,
-				   struct mem_cgroup *root,
-				   int sequence)
-{
-	/* root reference counting symmetric to mem_cgroup_iter_load */
-	if (last_visited && last_visited != root)
-		css_put(&last_visited->css);
-	/*
-	 * We store the sequence count from the time @last_visited was
-	 * loaded successfully instead of rereading it here so that we
-	 * don't lose destruction events in between.  We could have
-	 * raced with the destruction of @new_position after all.
-	 */
-	iter->last_visited = new_position;
-	smp_wmb();
-	iter->last_dead_count = sequence;
-}
-
 /**
  * mem_cgroup_iter - iterate over memory cgroup hierarchy
  * @root: hierarchy root
@@ -1300,8 +1183,11 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 				   struct mem_cgroup *prev,
 				   struct mem_cgroup_reclaim_cookie *reclaim)
 {
+	struct mem_cgroup_per_zone *uninitialized_var(mz);
+	struct cgroup_subsys_state *css = NULL;
+	int uninitialized_var(priority);
 	struct mem_cgroup *memcg = NULL;
-	struct mem_cgroup *last_visited = NULL;
+	struct mem_cgroup *pos = NULL;
 
 	if (mem_cgroup_disabled())
 		return NULL;
@@ -1310,50 +1196,50 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 		root = root_mem_cgroup;
 
 	if (prev && !reclaim)
-		last_visited = prev;
+		pos = prev;
 
 	if (!root->use_hierarchy && root != root_mem_cgroup) {
 		if (prev)
-			goto out_css_put;
+			goto out;
 		return root;
 	}
 
 	rcu_read_lock();
-	while (!memcg) {
-		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
-		int uninitialized_var(seq);
 
-		if (reclaim) {
-			struct mem_cgroup_per_zone *mz;
+	if (reclaim) {
+		mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
+		priority = reclaim->priority;
 
-			mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
-			iter = &mz->reclaim_iter[reclaim->priority];
-			if (prev && reclaim->generation != iter->generation) {
-				iter->last_visited = NULL;
-				goto out_unlock;
-			}
-
-			last_visited = mem_cgroup_iter_load(iter, root, &seq);
-		}
-
-		memcg = __mem_cgroup_iter_next(root, last_visited);
+		do {
+			pos = ACCESS_ONCE(mz->reclaim_iter[priority]);
+		} while (!css_tryget(&pos->css));
+	}
 
-		if (reclaim) {
-			mem_cgroup_iter_update(iter, last_visited, memcg, root,
-					seq);
+	if (pos)
+		css = &pos->css;
 
-			if (!memcg)
-				iter->generation++;
-			else if (!prev && memcg)
-				reclaim->generation = iter->generation;
+	for (;;) {
+		css = css_next_descendant_pre(css, &root->css);
+		if (!css) {
+			if (prev)
+				goto out_unlock;
+			continue;
+		}
+		if (css == &root->css || css_tryget_online(css)) {
+			memcg = mem_cgroup_from_css(css);
+			break;
 		}
+	}
 
-		if (prev && !memcg)
-			goto out_unlock;
+	if (reclaim) {
+		if (cmpxchg(&mz->reclaim_iter[priority], pos, memcg) == pos)
+			css_get(&memcg->css);
+		css_put(&pos->css);
 	}
+
 out_unlock:
 	rcu_read_unlock();
-out_css_put:
+out:
 	if (prev && prev != root)
 		css_put(&prev->css);
 
@@ -5526,24 +5412,6 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
 	return memcg_init_kmem(memcg, &memory_cgrp_subsys);
 }
 
-/*
- * Announce all parents that a group from their hierarchy is gone.
- */
-static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *parent = memcg;
-
-	while ((parent = parent_mem_cgroup(parent)))
-		mem_cgroup_iter_invalidate(parent);
-
-	/*
-	 * if the root memcg is not hierarchical we have to check it
-	 * explicitely.
-	 */
-	if (!root_mem_cgroup->use_hierarchy)
-		mem_cgroup_iter_invalidate(root_mem_cgroup);
-}
-
 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -5564,8 +5432,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 
 	kmem_cgroup_css_offline(memcg);
 
-	mem_cgroup_invalidate_reclaim_iterators(memcg);
-
 	/*
 	 * This requires that offlining is serialized.  Right now that is
 	 * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
-- 
2.1.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [patch v2] mm: memcontrol: convert reclaim iterator to simple css refcounting
  2014-09-19 21:10 ` Johannes Weiner
@ 2014-09-19 21:28   ` Johannes Weiner
  -1 siblings, 0 replies; 15+ messages in thread
From: Johannes Weiner @ 2014-09-19 21:28 UTC (permalink / raw)
  To: linux-mm; +Cc: Michal Hocko, Tejun Heo, cgroups, linux-kernel

Eek, sent a stale version of this.  Here is the right one:

---
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Fri, 19 Sep 2014 12:39:18 -0400
Subject: [patch v2] mm: memcontrol: convert reclaim iterator to simple css
 refcounting

The memcg reclaim iterators use a complicated weak reference scheme to
prevent pinning cgroups indefinitely in the absence of memory pressure.

However, during the ongoing cgroup core rework, css lifetime has been
decoupled such that a pinned css no longer interferes with removal of
the user-visible cgroup, and all this complexity is now unnecessary.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 mm/memcontrol.c | 201 ++++++++++----------------------------------------------
 1 file changed, 34 insertions(+), 167 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dfd3b15a57e8..154161bb7d4c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -253,18 +253,6 @@ struct mem_cgroup_stat_cpu {
 	unsigned long targets[MEM_CGROUP_NTARGETS];
 };
 
-struct mem_cgroup_reclaim_iter {
-	/*
-	 * last scanned hierarchy member. Valid only if last_dead_count
-	 * matches memcg->dead_count of the hierarchy root group.
-	 */
-	struct mem_cgroup *last_visited;
-	int last_dead_count;
-
-	/* scan generation, increased every round-trip */
-	unsigned int generation;
-};
-
 /*
  * per-zone information in memory controller.
  */
@@ -272,7 +260,7 @@ struct mem_cgroup_per_zone {
 	struct lruvec		lruvec;
 	unsigned long		lru_size[NR_LRU_LISTS];
 
-	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
+	struct mem_cgroup	*reclaim_iter[DEF_PRIORITY + 1];
 
 	struct rb_node		tree_node;	/* RB tree node */
 	unsigned long		usage_in_excess;/* Set to the value by which */
@@ -1174,111 +1162,6 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 	return memcg;
 }
 
-/*
- * Returns a next (in a pre-order walk) alive memcg (with elevated css
- * ref. count) or NULL if the whole root's subtree has been visited.
- *
- * helper function to be used by mem_cgroup_iter
- */
-static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
-		struct mem_cgroup *last_visited)
-{
-	struct cgroup_subsys_state *prev_css, *next_css;
-
-	prev_css = last_visited ? &last_visited->css : NULL;
-skip_node:
-	next_css = css_next_descendant_pre(prev_css, &root->css);
-
-	/*
-	 * Even if we found a group we have to make sure it is
-	 * alive. css && !memcg means that the groups should be
-	 * skipped and we should continue the tree walk.
-	 * last_visited css is safe to use because it is
-	 * protected by css_get and the tree walk is rcu safe.
-	 *
-	 * We do not take a reference on the root of the tree walk
-	 * because we might race with the root removal when it would
-	 * be the only node in the iterated hierarchy and mem_cgroup_iter
-	 * would end up in an endless loop because it expects that at
-	 * least one valid node will be returned. Root cannot disappear
-	 * because caller of the iterator should hold it already so
-	 * skipping css reference should be safe.
-	 */
-	if (next_css) {
-		if ((next_css == &root->css) ||
-		    ((next_css->flags & CSS_ONLINE) &&
-		     css_tryget_online(next_css)))
-			return mem_cgroup_from_css(next_css);
-
-		prev_css = next_css;
-		goto skip_node;
-	}
-
-	return NULL;
-}
-
-static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
-{
-	/*
-	 * When a group in the hierarchy below root is destroyed, the
-	 * hierarchy iterator can no longer be trusted since it might
-	 * have pointed to the destroyed group.  Invalidate it.
-	 */
-	atomic_inc(&root->dead_count);
-}
-
-static struct mem_cgroup *
-mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
-		     struct mem_cgroup *root,
-		     int *sequence)
-{
-	struct mem_cgroup *position = NULL;
-	/*
-	 * A cgroup destruction happens in two stages: offlining and
-	 * release.  They are separated by a RCU grace period.
-	 *
-	 * If the iterator is valid, we may still race with an
-	 * offlining.  The RCU lock ensures the object won't be
-	 * released, tryget will fail if we lost the race.
-	 */
-	*sequence = atomic_read(&root->dead_count);
-	if (iter->last_dead_count == *sequence) {
-		smp_rmb();
-		position = iter->last_visited;
-
-		/*
-		 * We cannot take a reference to root because we might race
-		 * with root removal and returning NULL would end up in
-		 * an endless loop on the iterator user level when root
-		 * would be returned all the time.
-		 */
-		if (position && position != root &&
-		    !css_tryget_online(&position->css))
-			position = NULL;
-	}
-	return position;
-}
-
-static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
-				   struct mem_cgroup *last_visited,
-				   struct mem_cgroup *new_position,
-				   struct mem_cgroup *root,
-				   int sequence)
-{
-	/* root reference counting symmetric to mem_cgroup_iter_load */
-	if (last_visited && last_visited != root)
-		css_put(&last_visited->css);
-	/*
-	 * We store the sequence count from the time @last_visited was
-	 * loaded successfully instead of rereading it here so that we
-	 * don't lose destruction events in between.  We could have
-	 * raced with the destruction of @new_position after all.
-	 */
-	iter->last_visited = new_position;
-	smp_wmb();
-	iter->last_dead_count = sequence;
-}
-
 /**
  * mem_cgroup_iter - iterate over memory cgroup hierarchy
  * @root: hierarchy root
@@ -1300,8 +1183,11 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 				   struct mem_cgroup *prev,
 				   struct mem_cgroup_reclaim_cookie *reclaim)
 {
+	struct mem_cgroup_per_zone *uninitialized_var(mz);
+	struct cgroup_subsys_state *css = NULL;
+	int uninitialized_var(priority);
 	struct mem_cgroup *memcg = NULL;
-	struct mem_cgroup *last_visited = NULL;
+	struct mem_cgroup *pos = NULL;
 
 	if (mem_cgroup_disabled())
 		return NULL;
@@ -1310,50 +1196,51 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 		root = root_mem_cgroup;
 
 	if (prev && !reclaim)
-		last_visited = prev;
+		pos = prev;
 
 	if (!root->use_hierarchy && root != root_mem_cgroup) {
 		if (prev)
-			goto out_css_put;
+			goto out;
 		return root;
 	}
 
 	rcu_read_lock();
-	while (!memcg) {
-		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
-		int uninitialized_var(seq);
 
-		if (reclaim) {
-			struct mem_cgroup_per_zone *mz;
+	if (reclaim) {
+		mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
+		priority = reclaim->priority;
 
-			mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
-			iter = &mz->reclaim_iter[reclaim->priority];
-			if (prev && reclaim->generation != iter->generation) {
-				iter->last_visited = NULL;
-				goto out_unlock;
-			}
-
-			last_visited = mem_cgroup_iter_load(iter, root, &seq);
-		}
-
-		memcg = __mem_cgroup_iter_next(root, last_visited);
+		do {
+			pos = ACCESS_ONCE(mz->reclaim_iter[priority]);
+		} while (pos && !css_tryget(&pos->css));
+	}
 
-		if (reclaim) {
-			mem_cgroup_iter_update(iter, last_visited, memcg, root,
-					seq);
+	if (pos)
+		css = &pos->css;
 
-			if (!memcg)
-				iter->generation++;
-			else if (!prev && memcg)
-				reclaim->generation = iter->generation;
+	for (;;) {
+		css = css_next_descendant_pre(css, &root->css);
+		if (!css) {
+			if (prev)
+				goto out_unlock;
+			continue;
+		}
+		if (css == &root->css || css_tryget_online(css)) {
+			memcg = mem_cgroup_from_css(css);
+			break;
 		}
+	}
 
-		if (prev && !memcg)
-			goto out_unlock;
+	if (reclaim) {
+		if (cmpxchg(&mz->reclaim_iter[priority], pos, memcg) == pos)
+			css_get(&memcg->css);
+		if (pos)
+			css_put(&pos->css);
 	}
+
 out_unlock:
 	rcu_read_unlock();
-out_css_put:
+out:
 	if (prev && prev != root)
 		css_put(&prev->css);
 
@@ -5526,24 +5413,6 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
 	return memcg_init_kmem(memcg, &memory_cgrp_subsys);
 }
 
-/*
- * Announce all parents that a group from their hierarchy is gone.
- */
-static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *parent = memcg;
-
-	while ((parent = parent_mem_cgroup(parent)))
-		mem_cgroup_iter_invalidate(parent);
-
-	/*
-	 * if the root memcg is not hierarchical we have to check it
-	 * explicitely.
-	 */
-	if (!root_mem_cgroup->use_hierarchy)
-		mem_cgroup_iter_invalidate(root_mem_cgroup);
-}
-
 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -5564,8 +5433,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 
 	kmem_cgroup_css_offline(memcg);
 
-	mem_cgroup_invalidate_reclaim_iterators(memcg);
-
 	/*
 	 * This requires that offlining is serialized.  Right now that is
 	 * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
-- 
2.1.0



^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [patch v2] mm: memcontrol: convert reclaim iterator to simple css refcounting
@ 2014-09-19 21:28   ` Johannes Weiner
  0 siblings, 0 replies; 15+ messages in thread
From: Johannes Weiner @ 2014-09-19 21:28 UTC (permalink / raw)
  To: linux-mm; +Cc: Michal Hocko, Tejun Heo, cgroups, linux-kernel

Eek, sent a stale version of this.  Here is the right one:

---
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Fri, 19 Sep 2014 12:39:18 -0400
Subject: [patch v2] mm: memcontrol: convert reclaim iterator to simple css
 refcounting

The memcg reclaim iterators use a complicated weak reference scheme to
prevent pinning cgroups indefinitely in the absence of memory pressure.

However, during the ongoing cgroup core rework, css lifetime has been
decoupled such that a pinned css no longer interferes with removal of
the user-visible cgroup, and all this complexity is now unnecessary.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 mm/memcontrol.c | 201 ++++++++++----------------------------------------------
 1 file changed, 34 insertions(+), 167 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dfd3b15a57e8..154161bb7d4c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -253,18 +253,6 @@ struct mem_cgroup_stat_cpu {
 	unsigned long targets[MEM_CGROUP_NTARGETS];
 };
 
-struct mem_cgroup_reclaim_iter {
-	/*
-	 * last scanned hierarchy member. Valid only if last_dead_count
-	 * matches memcg->dead_count of the hierarchy root group.
-	 */
-	struct mem_cgroup *last_visited;
-	int last_dead_count;
-
-	/* scan generation, increased every round-trip */
-	unsigned int generation;
-};
-
 /*
  * per-zone information in memory controller.
  */
@@ -272,7 +260,7 @@ struct mem_cgroup_per_zone {
 	struct lruvec		lruvec;
 	unsigned long		lru_size[NR_LRU_LISTS];
 
-	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
+	struct mem_cgroup	*reclaim_iter[DEF_PRIORITY + 1];
 
 	struct rb_node		tree_node;	/* RB tree node */
 	unsigned long		usage_in_excess;/* Set to the value by which */
@@ -1174,111 +1162,6 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 	return memcg;
 }
 
-/*
- * Returns a next (in a pre-order walk) alive memcg (with elevated css
- * ref. count) or NULL if the whole root's subtree has been visited.
- *
- * helper function to be used by mem_cgroup_iter
- */
-static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
-		struct mem_cgroup *last_visited)
-{
-	struct cgroup_subsys_state *prev_css, *next_css;
-
-	prev_css = last_visited ? &last_visited->css : NULL;
-skip_node:
-	next_css = css_next_descendant_pre(prev_css, &root->css);
-
-	/*
-	 * Even if we found a group we have to make sure it is
-	 * alive. css && !memcg means that the groups should be
-	 * skipped and we should continue the tree walk.
-	 * last_visited css is safe to use because it is
-	 * protected by css_get and the tree walk is rcu safe.
-	 *
-	 * We do not take a reference on the root of the tree walk
-	 * because we might race with the root removal when it would
-	 * be the only node in the iterated hierarchy and mem_cgroup_iter
-	 * would end up in an endless loop because it expects that at
-	 * least one valid node will be returned. Root cannot disappear
-	 * because caller of the iterator should hold it already so
-	 * skipping css reference should be safe.
-	 */
-	if (next_css) {
-		if ((next_css == &root->css) ||
-		    ((next_css->flags & CSS_ONLINE) &&
-		     css_tryget_online(next_css)))
-			return mem_cgroup_from_css(next_css);
-
-		prev_css = next_css;
-		goto skip_node;
-	}
-
-	return NULL;
-}
-
-static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
-{
-	/*
-	 * When a group in the hierarchy below root is destroyed, the
-	 * hierarchy iterator can no longer be trusted since it might
-	 * have pointed to the destroyed group.  Invalidate it.
-	 */
-	atomic_inc(&root->dead_count);
-}
-
-static struct mem_cgroup *
-mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
-		     struct mem_cgroup *root,
-		     int *sequence)
-{
-	struct mem_cgroup *position = NULL;
-	/*
-	 * A cgroup destruction happens in two stages: offlining and
-	 * release.  They are separated by a RCU grace period.
-	 *
-	 * If the iterator is valid, we may still race with an
-	 * offlining.  The RCU lock ensures the object won't be
-	 * released, tryget will fail if we lost the race.
-	 */
-	*sequence = atomic_read(&root->dead_count);
-	if (iter->last_dead_count == *sequence) {
-		smp_rmb();
-		position = iter->last_visited;
-
-		/*
-		 * We cannot take a reference to root because we might race
-		 * with root removal and returning NULL would end up in
-		 * an endless loop on the iterator user level when root
-		 * would be returned all the time.
-		 */
-		if (position && position != root &&
-		    !css_tryget_online(&position->css))
-			position = NULL;
-	}
-	return position;
-}
-
-static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
-				   struct mem_cgroup *last_visited,
-				   struct mem_cgroup *new_position,
-				   struct mem_cgroup *root,
-				   int sequence)
-{
-	/* root reference counting symmetric to mem_cgroup_iter_load */
-	if (last_visited && last_visited != root)
-		css_put(&last_visited->css);
-	/*
-	 * We store the sequence count from the time @last_visited was
-	 * loaded successfully instead of rereading it here so that we
-	 * don't lose destruction events in between.  We could have
-	 * raced with the destruction of @new_position after all.
-	 */
-	iter->last_visited = new_position;
-	smp_wmb();
-	iter->last_dead_count = sequence;
-}
-
 /**
  * mem_cgroup_iter - iterate over memory cgroup hierarchy
  * @root: hierarchy root
@@ -1300,8 +1183,11 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 				   struct mem_cgroup *prev,
 				   struct mem_cgroup_reclaim_cookie *reclaim)
 {
+	struct mem_cgroup_per_zone *uninitialized_var(mz);
+	struct cgroup_subsys_state *css = NULL;
+	int uninitialized_var(priority);
 	struct mem_cgroup *memcg = NULL;
-	struct mem_cgroup *last_visited = NULL;
+	struct mem_cgroup *pos = NULL;
 
 	if (mem_cgroup_disabled())
 		return NULL;
@@ -1310,50 +1196,51 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 		root = root_mem_cgroup;
 
 	if (prev && !reclaim)
-		last_visited = prev;
+		pos = prev;
 
 	if (!root->use_hierarchy && root != root_mem_cgroup) {
 		if (prev)
-			goto out_css_put;
+			goto out;
 		return root;
 	}
 
 	rcu_read_lock();
-	while (!memcg) {
-		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
-		int uninitialized_var(seq);
 
-		if (reclaim) {
-			struct mem_cgroup_per_zone *mz;
+	if (reclaim) {
+		mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
+		priority = reclaim->priority;
 
-			mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
-			iter = &mz->reclaim_iter[reclaim->priority];
-			if (prev && reclaim->generation != iter->generation) {
-				iter->last_visited = NULL;
-				goto out_unlock;
-			}
-
-			last_visited = mem_cgroup_iter_load(iter, root, &seq);
-		}
-
-		memcg = __mem_cgroup_iter_next(root, last_visited);
+		do {
+			pos = ACCESS_ONCE(mz->reclaim_iter[priority]);
+		} while (pos && !css_tryget(&pos->css));
+	}
 
-		if (reclaim) {
-			mem_cgroup_iter_update(iter, last_visited, memcg, root,
-					seq);
+	if (pos)
+		css = &pos->css;
 
-			if (!memcg)
-				iter->generation++;
-			else if (!prev && memcg)
-				reclaim->generation = iter->generation;
+	for (;;) {
+		css = css_next_descendant_pre(css, &root->css);
+		if (!css) {
+			if (prev)
+				goto out_unlock;
+			continue;
+		}
+		if (css == &root->css || css_tryget_online(css)) {
+			memcg = mem_cgroup_from_css(css);
+			break;
 		}
+	}
 
-		if (prev && !memcg)
-			goto out_unlock;
+	if (reclaim) {
+		if (cmpxchg(&mz->reclaim_iter[priority], pos, memcg) == pos)
+			css_get(&memcg->css);
+		if (pos)
+			css_put(&pos->css);
 	}
+
 out_unlock:
 	rcu_read_unlock();
-out_css_put:
+out:
 	if (prev && prev != root)
 		css_put(&prev->css);
 
@@ -5526,24 +5413,6 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
 	return memcg_init_kmem(memcg, &memory_cgrp_subsys);
 }
 
-/*
- * Announce all parents that a group from their hierarchy is gone.
- */
-static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *parent = memcg;
-
-	while ((parent = parent_mem_cgroup(parent)))
-		mem_cgroup_iter_invalidate(parent);
-
-	/*
-	 * if the root memcg is not hierarchical we have to check it
-	 * explicitely.
-	 */
-	if (!root_mem_cgroup->use_hierarchy)
-		mem_cgroup_iter_invalidate(root_mem_cgroup);
-}
-
 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -5564,8 +5433,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 
 	kmem_cgroup_css_offline(memcg);
 
-	mem_cgroup_invalidate_reclaim_iterators(memcg);
-
 	/*
 	 * This requires that offlining is serialized.  Right now that is
 	 * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
-- 
2.1.0


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [patch v2] mm: memcontrol: convert reclaim iterator to simple css refcounting
  2014-09-19 21:28   ` Johannes Weiner
  (?)
@ 2014-09-22  9:53     ` Vladimir Davydov
  -1 siblings, 0 replies; 15+ messages in thread
From: Vladimir Davydov @ 2014-09-22  9:53 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: linux-mm, Michal Hocko, Tejun Heo, cgroups, linux-kernel

On Fri, Sep 19, 2014 at 05:28:43PM -0400, Johannes Weiner wrote:
> The memcg reclaim iterators use a complicated weak reference scheme to
> prevent pinning cgroups indefinitely in the absence of memory pressure.
> 
> However, during the ongoing cgroup core rework, css lifetime has been
> decoupled such that a pinned css no longer interferes with removal of
> the user-visible cgroup, and all this complexity is now unnecessary.
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> ---
>  mm/memcontrol.c | 201 ++++++++++----------------------------------------------
>  1 file changed, 34 insertions(+), 167 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
[...]
> -static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
> -{
> -	/*
> -	 * When a group in the hierarchy below root is destroyed, the
> -	 * hierarchy iterator can no longer be trusted since it might
> -	 * have pointed to the destroyed group.  Invalidate it.
> -	 */
> -	atomic_inc(&root->dead_count);

After your patch is applied, mem_cgroup->dead_count is not used any
more. Please remove it.

[...]
> @@ -1300,8 +1183,11 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>  				   struct mem_cgroup *prev,
>  				   struct mem_cgroup_reclaim_cookie *reclaim)
>  {
> +	struct mem_cgroup_per_zone *uninitialized_var(mz);
> +	struct cgroup_subsys_state *css = NULL;
> +	int uninitialized_var(priority);
>  	struct mem_cgroup *memcg = NULL;
> -	struct mem_cgroup *last_visited = NULL;
> +	struct mem_cgroup *pos = NULL;
>  
>  	if (mem_cgroup_disabled())
>  		return NULL;
> @@ -1310,50 +1196,51 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>  		root = root_mem_cgroup;
>  
>  	if (prev && !reclaim)
> -		last_visited = prev;
> +		pos = prev;
>  
>  	if (!root->use_hierarchy && root != root_mem_cgroup) {
>  		if (prev)
> -			goto out_css_put;
> +			goto out;
>  		return root;
>  	}
>  
>  	rcu_read_lock();
> -	while (!memcg) {
> -		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
> -		int uninitialized_var(seq);
>  
> -		if (reclaim) {
> -			struct mem_cgroup_per_zone *mz;
> +	if (reclaim) {
> +		mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
> +		priority = reclaim->priority;
>  
> -			mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
> -			iter = &mz->reclaim_iter[reclaim->priority];
> -			if (prev && reclaim->generation != iter->generation) {

Again, you are removing all generation checks, but leaving
mem_cgroup_reclaim_cookie->generation defined. Please remove it too.

BTW, don't we still need the generation check to eliminate the
possibility of a process iterating infinitely over a memory cgroup tree
in case of concurrent reclaim?

Thanks,
Vladimir

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [patch v2] mm: memcontrol: convert reclaim iterator to simple css refcounting
@ 2014-09-22  9:53     ` Vladimir Davydov
  0 siblings, 0 replies; 15+ messages in thread
From: Vladimir Davydov @ 2014-09-22  9:53 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: linux-mm, Michal Hocko, Tejun Heo, cgroups, linux-kernel

On Fri, Sep 19, 2014 at 05:28:43PM -0400, Johannes Weiner wrote:
> The memcg reclaim iterators use a complicated weak reference scheme to
> prevent pinning cgroups indefinitely in the absence of memory pressure.
> 
> However, during the ongoing cgroup core rework, css lifetime has been
> decoupled such that a pinned css no longer interferes with removal of
> the user-visible cgroup, and all this complexity is now unnecessary.
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> ---
>  mm/memcontrol.c | 201 ++++++++++----------------------------------------------
>  1 file changed, 34 insertions(+), 167 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
[...]
> -static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
> -{
> -	/*
> -	 * When a group in the hierarchy below root is destroyed, the
> -	 * hierarchy iterator can no longer be trusted since it might
> -	 * have pointed to the destroyed group.  Invalidate it.
> -	 */
> -	atomic_inc(&root->dead_count);

After your patch is applied, mem_cgroup->dead_count is not used any
more. Please remove it.

[...]
> @@ -1300,8 +1183,11 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>  				   struct mem_cgroup *prev,
>  				   struct mem_cgroup_reclaim_cookie *reclaim)
>  {
> +	struct mem_cgroup_per_zone *uninitialized_var(mz);
> +	struct cgroup_subsys_state *css = NULL;
> +	int uninitialized_var(priority);
>  	struct mem_cgroup *memcg = NULL;
> -	struct mem_cgroup *last_visited = NULL;
> +	struct mem_cgroup *pos = NULL;
>  
>  	if (mem_cgroup_disabled())
>  		return NULL;
> @@ -1310,50 +1196,51 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>  		root = root_mem_cgroup;
>  
>  	if (prev && !reclaim)
> -		last_visited = prev;
> +		pos = prev;
>  
>  	if (!root->use_hierarchy && root != root_mem_cgroup) {
>  		if (prev)
> -			goto out_css_put;
> +			goto out;
>  		return root;
>  	}
>  
>  	rcu_read_lock();
> -	while (!memcg) {
> -		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
> -		int uninitialized_var(seq);
>  
> -		if (reclaim) {
> -			struct mem_cgroup_per_zone *mz;
> +	if (reclaim) {
> +		mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
> +		priority = reclaim->priority;
>  
> -			mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
> -			iter = &mz->reclaim_iter[reclaim->priority];
> -			if (prev && reclaim->generation != iter->generation) {

Again, you are removing all generation checks, but leaving
mem_cgroup_reclaim_cookie->generation defined. Please remove it too.

BTW, don't we still need the generation check to eliminate the
possibility of a process iterating infinitely over a memory cgroup tree
in case of concurrent reclaim?

Thanks,
Vladimir

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [patch v2] mm: memcontrol: convert reclaim iterator to simple css refcounting
@ 2014-09-22  9:53     ` Vladimir Davydov
  0 siblings, 0 replies; 15+ messages in thread
From: Vladimir Davydov @ 2014-09-22  9:53 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg, Michal Hocko, Tejun Heo,
	cgroups-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

On Fri, Sep 19, 2014 at 05:28:43PM -0400, Johannes Weiner wrote:
> The memcg reclaim iterators use a complicated weak reference scheme to
> prevent pinning cgroups indefinitely in the absence of memory pressure.
> 
> However, during the ongoing cgroup core rework, css lifetime has been
> decoupled such that a pinned css no longer interferes with removal of
> the user-visible cgroup, and all this complexity is now unnecessary.
> 
> Signed-off-by: Johannes Weiner <hannes-druUgvl0LCNAfugRpC6u6w@public.gmane.org>
> ---
>  mm/memcontrol.c | 201 ++++++++++----------------------------------------------
>  1 file changed, 34 insertions(+), 167 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
[...]
> -static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
> -{
> -	/*
> -	 * When a group in the hierarchy below root is destroyed, the
> -	 * hierarchy iterator can no longer be trusted since it might
> -	 * have pointed to the destroyed group.  Invalidate it.
> -	 */
> -	atomic_inc(&root->dead_count);

After your patch is applied, mem_cgroup->dead_count is not used any
more. Please remove it.

[...]
> @@ -1300,8 +1183,11 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>  				   struct mem_cgroup *prev,
>  				   struct mem_cgroup_reclaim_cookie *reclaim)
>  {
> +	struct mem_cgroup_per_zone *uninitialized_var(mz);
> +	struct cgroup_subsys_state *css = NULL;
> +	int uninitialized_var(priority);
>  	struct mem_cgroup *memcg = NULL;
> -	struct mem_cgroup *last_visited = NULL;
> +	struct mem_cgroup *pos = NULL;
>  
>  	if (mem_cgroup_disabled())
>  		return NULL;
> @@ -1310,50 +1196,51 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
>  		root = root_mem_cgroup;
>  
>  	if (prev && !reclaim)
> -		last_visited = prev;
> +		pos = prev;
>  
>  	if (!root->use_hierarchy && root != root_mem_cgroup) {
>  		if (prev)
> -			goto out_css_put;
> +			goto out;
>  		return root;
>  	}
>  
>  	rcu_read_lock();
> -	while (!memcg) {
> -		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
> -		int uninitialized_var(seq);
>  
> -		if (reclaim) {
> -			struct mem_cgroup_per_zone *mz;
> +	if (reclaim) {
> +		mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
> +		priority = reclaim->priority;
>  
> -			mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
> -			iter = &mz->reclaim_iter[reclaim->priority];
> -			if (prev && reclaim->generation != iter->generation) {

Again, you are removing all generation checks, but leaving
mem_cgroup_reclaim_cookie->generation defined. Please remove it too.

BTW, don't we still need the generation check to eliminate the
possibility of a process iterating infinitely over a memory cgroup tree
in case of concurrent reclaim?

Thanks,
Vladimir

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [patch v2] mm: memcontrol: convert reclaim iterator to simple css refcounting
  2014-09-19 21:28   ` Johannes Weiner
@ 2014-09-24 16:47     ` Michal Hocko
  -1 siblings, 0 replies; 15+ messages in thread
From: Michal Hocko @ 2014-09-24 16:47 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: linux-mm, Tejun Heo, cgroups, linux-kernel

On Fri 19-09-14 17:28:43, Johannes Weiner wrote:
> From: Johannes Weiner <hannes@cmpxchg.org>
> Date: Fri, 19 Sep 2014 12:39:18 -0400
> Subject: [patch v2] mm: memcontrol: convert reclaim iterator to simple css
>  refcounting
> 
> The memcg reclaim iterators use a complicated weak reference scheme to
> prevent pinning cgroups indefinitely in the absence of memory pressure.
> 
> However, during the ongoing cgroup core rework, css lifetime has been
> decoupled such that a pinned css no longer interferes with removal of
> the user-visible cgroup, and all this complexity is now unnecessary.

I very much welcome simplification in this area but I would also very much
appreciate more details why some checks are no longer needed. Why don't
we need ->generation or (next_css->flags & CSS_ONLINE) check anymore?

> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> ---
>  mm/memcontrol.c | 201 ++++++++++----------------------------------------------
>  1 file changed, 34 insertions(+), 167 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index dfd3b15a57e8..154161bb7d4c 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
[...]
>  	rcu_read_lock();
> -	while (!memcg) {
> -		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
> -		int uninitialized_var(seq);
>  
> -		if (reclaim) {
> -			struct mem_cgroup_per_zone *mz;
> +	if (reclaim) {
> +		mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
> +		priority = reclaim->priority;
>  
> -			mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
> -			iter = &mz->reclaim_iter[reclaim->priority];
> -			if (prev && reclaim->generation != iter->generation) {
> -				iter->last_visited = NULL;
> -				goto out_unlock;
> -			}
> -
> -			last_visited = mem_cgroup_iter_load(iter, root, &seq);
> -		}
> -
> -		memcg = __mem_cgroup_iter_next(root, last_visited);
> +		do {
> +			pos = ACCESS_ONCE(mz->reclaim_iter[priority]);
> +		} while (pos && !css_tryget(&pos->css));

This is a bit confusing. AFAIU css_tryget fails only when the current
ref count is zero already. When do we keep cached memcg with zero count
behind? We always do css_get after cmpxchg.

Hmm, there is a small window between cmpxchg and css_get when we store
the current memcg into the reclaim_iter[priority]. If the current memcg
is root then we do not take any css reference before cmpxchg and so it
might drop down to zero in the mean time so other CPU might see zero I
guess. But I do not see how css_get after cmpxchg on such css works.
I guess I should go and check the css reference counting again.

Anyway this would deserve a comment.

> +	}
>  
> -		if (reclaim) {
> -			mem_cgroup_iter_update(iter, last_visited, memcg, root,
> -					seq);
> +	if (pos)
> +		css = &pos->css;
>  
> -			if (!memcg)
> -				iter->generation++;
> -			else if (!prev && memcg)
> -				reclaim->generation = iter->generation;
> +	for (;;) {
> +		css = css_next_descendant_pre(css, &root->css);
> +		if (!css) {
> +			if (prev)
> +				goto out_unlock;
> +			continue;
> +		}
> +		if (css == &root->css || css_tryget_online(css)) {
> +			memcg = mem_cgroup_from_css(css);
> +			break;
>  		}
> +	}
>  
> -		if (prev && !memcg)
> -			goto out_unlock;
> +	if (reclaim) {
> +		if (cmpxchg(&mz->reclaim_iter[priority], pos, memcg) == pos)
> +			css_get(&memcg->css);
> +		if (pos)
> +			css_put(&pos->css);
>  	}
> +
>  out_unlock:
>  	rcu_read_unlock();
> -out_css_put:
> +out:
>  	if (prev && prev != root)
>  		css_put(&prev->css);
>  
[...]
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [patch v2] mm: memcontrol: convert reclaim iterator to simple css refcounting
@ 2014-09-24 16:47     ` Michal Hocko
  0 siblings, 0 replies; 15+ messages in thread
From: Michal Hocko @ 2014-09-24 16:47 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: linux-mm, Tejun Heo, cgroups, linux-kernel

On Fri 19-09-14 17:28:43, Johannes Weiner wrote:
> From: Johannes Weiner <hannes@cmpxchg.org>
> Date: Fri, 19 Sep 2014 12:39:18 -0400
> Subject: [patch v2] mm: memcontrol: convert reclaim iterator to simple css
>  refcounting
> 
> The memcg reclaim iterators use a complicated weak reference scheme to
> prevent pinning cgroups indefinitely in the absence of memory pressure.
> 
> However, during the ongoing cgroup core rework, css lifetime has been
> decoupled such that a pinned css no longer interferes with removal of
> the user-visible cgroup, and all this complexity is now unnecessary.

I very much welcome simplification in this area but I would also very much
appreciate more details why some checks are no longer needed. Why don't
we need ->generation or (next_css->flags & CSS_ONLINE) check anymore?

> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> ---
>  mm/memcontrol.c | 201 ++++++++++----------------------------------------------
>  1 file changed, 34 insertions(+), 167 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index dfd3b15a57e8..154161bb7d4c 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
[...]
>  	rcu_read_lock();
> -	while (!memcg) {
> -		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
> -		int uninitialized_var(seq);
>  
> -		if (reclaim) {
> -			struct mem_cgroup_per_zone *mz;
> +	if (reclaim) {
> +		mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
> +		priority = reclaim->priority;
>  
> -			mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
> -			iter = &mz->reclaim_iter[reclaim->priority];
> -			if (prev && reclaim->generation != iter->generation) {
> -				iter->last_visited = NULL;
> -				goto out_unlock;
> -			}
> -
> -			last_visited = mem_cgroup_iter_load(iter, root, &seq);
> -		}
> -
> -		memcg = __mem_cgroup_iter_next(root, last_visited);
> +		do {
> +			pos = ACCESS_ONCE(mz->reclaim_iter[priority]);
> +		} while (pos && !css_tryget(&pos->css));

This is a bit confusing. AFAIU css_tryget fails only when the current
ref count is zero already. When do we keep cached memcg with zero count
behind? We always do css_get after cmpxchg.

Hmm, there is a small window between cmpxchg and css_get when we store
the current memcg into the reclaim_iter[priority]. If the current memcg
is root then we do not take any css reference before cmpxchg and so it
might drop down to zero in the mean time so other CPU might see zero I
guess. But I do not see how css_get after cmpxchg on such css works.
I guess I should go and check the css reference counting again.

Anyway this would deserve a comment.

> +	}
>  
> -		if (reclaim) {
> -			mem_cgroup_iter_update(iter, last_visited, memcg, root,
> -					seq);
> +	if (pos)
> +		css = &pos->css;
>  
> -			if (!memcg)
> -				iter->generation++;
> -			else if (!prev && memcg)
> -				reclaim->generation = iter->generation;
> +	for (;;) {
> +		css = css_next_descendant_pre(css, &root->css);
> +		if (!css) {
> +			if (prev)
> +				goto out_unlock;
> +			continue;
> +		}
> +		if (css == &root->css || css_tryget_online(css)) {
> +			memcg = mem_cgroup_from_css(css);
> +			break;
>  		}
> +	}
>  
> -		if (prev && !memcg)
> -			goto out_unlock;
> +	if (reclaim) {
> +		if (cmpxchg(&mz->reclaim_iter[priority], pos, memcg) == pos)
> +			css_get(&memcg->css);
> +		if (pos)
> +			css_put(&pos->css);
>  	}
> +
>  out_unlock:
>  	rcu_read_unlock();
> -out_css_put:
> +out:
>  	if (prev && prev != root)
>  		css_put(&prev->css);
>  
[...]
-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [patch v2] mm: memcontrol: convert reclaim iterator to simple css refcounting
  2014-09-24 16:47     ` Michal Hocko
@ 2014-09-24 17:16       ` Johannes Weiner
  -1 siblings, 0 replies; 15+ messages in thread
From: Johannes Weiner @ 2014-09-24 17:16 UTC (permalink / raw)
  To: Michal Hocko; +Cc: linux-mm, Tejun Heo, cgroups, linux-kernel

On Wed, Sep 24, 2014 at 06:47:39PM +0200, Michal Hocko wrote:
> On Fri 19-09-14 17:28:43, Johannes Weiner wrote:
> > From: Johannes Weiner <hannes@cmpxchg.org>
> > Date: Fri, 19 Sep 2014 12:39:18 -0400
> > Subject: [patch v2] mm: memcontrol: convert reclaim iterator to simple css
> >  refcounting
> > 
> > The memcg reclaim iterators use a complicated weak reference scheme to
> > prevent pinning cgroups indefinitely in the absence of memory pressure.
> > 
> > However, during the ongoing cgroup core rework, css lifetime has been
> > decoupled such that a pinned css no longer interferes with removal of
> > the user-visible cgroup, and all this complexity is now unnecessary.
> 
> I very much welcome simplification in this area but I would also very much
> appreciate more details why some checks are no longer needed. Why don't
> we need ->generation or (next_css->flags & CSS_ONLINE) check anymore?

Vladimir pointed out that the generation was still needed, I added it
back and will submit version 2 after the lockless counters have been
sorted out.

Argh, I thought CSS_ONLINE was an artifact obsoleted by the
css_tryget_online() conversion.  That's quite the handgrenade.

Tejun, should maybe the iterators not return css before they have
CSS_ONLINE set?  It seems odd to have memcg reach into cgroup like
that to check if published objects are actually fully initialized.
Background is this patch:

commit d8ad30559715ce97afb7d1a93a12fd90e8fff312
Author: Hugh Dickins <hughd@google.com>
Date:   Thu Jan 23 15:53:32 2014 -0800

    mm/memcg: iteration skip memcgs not yet fully initialized
    
    It is surprising that the mem_cgroup iterator can return memcgs which
    have not yet been fully initialized.  By accident (or trial and error?)
    this appears not to present an actual problem; but it may be better to
    prevent such surprises, by skipping memcgs not yet online.
    
    Signed-off-by: Hugh Dickins <hughd@google.com>
    Cc: Tejun Heo <tj@kernel.org>
    Acked-by: Michal Hocko <mhocko@suse.cz>
    Cc: Johannes Weiner <hannes@cmpxchg.org>
    Cc: <stable@vger.kernel.org>        [3.12+]
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

> >  	rcu_read_lock();
> > -	while (!memcg) {
> > -		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
> > -		int uninitialized_var(seq);
> >  
> > -		if (reclaim) {
> > -			struct mem_cgroup_per_zone *mz;
> > +	if (reclaim) {
> > +		mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
> > +		priority = reclaim->priority;
> >  
> > -			mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
> > -			iter = &mz->reclaim_iter[reclaim->priority];
> > -			if (prev && reclaim->generation != iter->generation) {
> > -				iter->last_visited = NULL;
> > -				goto out_unlock;
> > -			}
> > -
> > -			last_visited = mem_cgroup_iter_load(iter, root, &seq);
> > -		}
> > -
> > -		memcg = __mem_cgroup_iter_next(root, last_visited);
> > +		do {
> > +			pos = ACCESS_ONCE(mz->reclaim_iter[priority]);
> > +		} while (pos && !css_tryget(&pos->css));
> 
> This is a bit confusing. AFAIU css_tryget fails only when the current
> ref count is zero already. When do we keep cached memcg with zero count
> behind? We always do css_get after cmpxchg.
> 
> Hmm, there is a small window between cmpxchg and css_get when we store
> the current memcg into the reclaim_iter[priority]. If the current memcg
> is root then we do not take any css reference before cmpxchg and so it
> might drop down to zero in the mean time so other CPU might see zero I
> guess. But I do not see how css_get after cmpxchg on such css works.
> I guess I should go and check the css reference counting again.

It's not about root or the newly stored memcg, it's that you might
read the position right before it's replaced and css_put(), at which
point the css_tryget() may fail and you have to reload the position.

I'll add a comment.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [patch v2] mm: memcontrol: convert reclaim iterator to simple css refcounting
@ 2014-09-24 17:16       ` Johannes Weiner
  0 siblings, 0 replies; 15+ messages in thread
From: Johannes Weiner @ 2014-09-24 17:16 UTC (permalink / raw)
  To: Michal Hocko; +Cc: linux-mm, Tejun Heo, cgroups, linux-kernel

On Wed, Sep 24, 2014 at 06:47:39PM +0200, Michal Hocko wrote:
> On Fri 19-09-14 17:28:43, Johannes Weiner wrote:
> > From: Johannes Weiner <hannes@cmpxchg.org>
> > Date: Fri, 19 Sep 2014 12:39:18 -0400
> > Subject: [patch v2] mm: memcontrol: convert reclaim iterator to simple css
> >  refcounting
> > 
> > The memcg reclaim iterators use a complicated weak reference scheme to
> > prevent pinning cgroups indefinitely in the absence of memory pressure.
> > 
> > However, during the ongoing cgroup core rework, css lifetime has been
> > decoupled such that a pinned css no longer interferes with removal of
> > the user-visible cgroup, and all this complexity is now unnecessary.
> 
> I very much welcome simplification in this area but I would also very much
> appreciate more details why some checks are no longer needed. Why don't
> we need ->generation or (next_css->flags & CSS_ONLINE) check anymore?

Vladimir pointed out that the generation was still needed, I added it
back and will submit version 2 after the lockless counters have been
sorted out.

Argh, I thought CSS_ONLINE was an artifact obsoleted by the
css_tryget_online() conversion.  That's quite the handgrenade.

Tejun, should maybe the iterators not return css before they have
CSS_ONLINE set?  It seems odd to have memcg reach into cgroup like
that to check if published objects are actually fully initialized.
Background is this patch:

commit d8ad30559715ce97afb7d1a93a12fd90e8fff312
Author: Hugh Dickins <hughd@google.com>
Date:   Thu Jan 23 15:53:32 2014 -0800

    mm/memcg: iteration skip memcgs not yet fully initialized
    
    It is surprising that the mem_cgroup iterator can return memcgs which
    have not yet been fully initialized.  By accident (or trial and error?)
    this appears not to present an actual problem; but it may be better to
    prevent such surprises, by skipping memcgs not yet online.
    
    Signed-off-by: Hugh Dickins <hughd@google.com>
    Cc: Tejun Heo <tj@kernel.org>
    Acked-by: Michal Hocko <mhocko@suse.cz>
    Cc: Johannes Weiner <hannes@cmpxchg.org>
    Cc: <stable@vger.kernel.org>        [3.12+]
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

> >  	rcu_read_lock();
> > -	while (!memcg) {
> > -		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
> > -		int uninitialized_var(seq);
> >  
> > -		if (reclaim) {
> > -			struct mem_cgroup_per_zone *mz;
> > +	if (reclaim) {
> > +		mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
> > +		priority = reclaim->priority;
> >  
> > -			mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
> > -			iter = &mz->reclaim_iter[reclaim->priority];
> > -			if (prev && reclaim->generation != iter->generation) {
> > -				iter->last_visited = NULL;
> > -				goto out_unlock;
> > -			}
> > -
> > -			last_visited = mem_cgroup_iter_load(iter, root, &seq);
> > -		}
> > -
> > -		memcg = __mem_cgroup_iter_next(root, last_visited);
> > +		do {
> > +			pos = ACCESS_ONCE(mz->reclaim_iter[priority]);
> > +		} while (pos && !css_tryget(&pos->css));
> 
> This is a bit confusing. AFAIU css_tryget fails only when the current
> ref count is zero already. When do we keep cached memcg with zero count
> behind? We always do css_get after cmpxchg.
> 
> Hmm, there is a small window between cmpxchg and css_get when we store
> the current memcg into the reclaim_iter[priority]. If the current memcg
> is root then we do not take any css reference before cmpxchg and so it
> might drop down to zero in the mean time so other CPU might see zero I
> guess. But I do not see how css_get after cmpxchg on such css works.
> I guess I should go and check the css reference counting again.

It's not about root or the newly stored memcg, it's that you might
read the position right before it's replaced and css_put(), at which
point the css_tryget() may fail and you have to reload the position.

I'll add a comment.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [patch v2] mm: memcontrol: convert reclaim iterator to simple css refcounting
  2014-09-24 17:16       ` Johannes Weiner
@ 2014-09-25  1:29         ` Tejun Heo
  -1 siblings, 0 replies; 15+ messages in thread
From: Tejun Heo @ 2014-09-25  1:29 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: Michal Hocko, linux-mm, cgroups, linux-kernel

Hello,

On Wed, Sep 24, 2014 at 01:16:53PM -0400, Johannes Weiner wrote:
> Tejun, should maybe the iterators not return css before they have
> CSS_ONLINE set?  It seems odd to have memcg reach into cgroup like
> that to check if published objects are actually fully initialized.
> Background is this patch:

So, memcontrol shouldn't be doing that but at the same time cgroup
core can't do it for any controller.  One of the requirements of the
iterations is that it shouldn't miss any css which has been onlined by
its controller; however, because each controller defines its own
locking, cgroup core has no way knowing when a css finishes
initialization.  If it includes after ->css_online() is complete, the
iteration may happen between when the controller considers the css
fully initialized and cgroup core sets CSS_ONLINE.  The only thing
cgroup core can do is including all csses which *could* be considered
online by the controller and let it filter accordingly.

IOW, the precise moment a css becomes online is not known to the
cgroup core as cgroup core locking and controller locking are
completely independent.  The current memcg implementation is broken in
that memcg iterator is testing CSS_ONLINE which is set *after*
->css_online() is complete and iterations can happen inbetween where
the online css may be omitted because it's not marked online by cgroup
core yet.  This may be okay for memcg's use of iterators but for
things like freezer, for example, this can break things horribly.

Anyways, the right thing to do is removing the CSS_ONLINE testing and
implementing memcg's own way of marking an object as fully initialized
according to its synchronization and iteration requirements.  So,
yeah, it's a glaring layering violation which should be removed.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [patch v2] mm: memcontrol: convert reclaim iterator to simple css refcounting
@ 2014-09-25  1:29         ` Tejun Heo
  0 siblings, 0 replies; 15+ messages in thread
From: Tejun Heo @ 2014-09-25  1:29 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: Michal Hocko, linux-mm, cgroups, linux-kernel

Hello,

On Wed, Sep 24, 2014 at 01:16:53PM -0400, Johannes Weiner wrote:
> Tejun, should maybe the iterators not return css before they have
> CSS_ONLINE set?  It seems odd to have memcg reach into cgroup like
> that to check if published objects are actually fully initialized.
> Background is this patch:

So, memcontrol shouldn't be doing that but at the same time cgroup
core can't do it for any controller.  One of the requirements of the
iterations is that it shouldn't miss any css which has been onlined by
its controller; however, because each controller defines its own
locking, cgroup core has no way knowing when a css finishes
initialization.  If it includes after ->css_online() is complete, the
iteration may happen between when the controller considers the css
fully initialized and cgroup core sets CSS_ONLINE.  The only thing
cgroup core can do is including all csses which *could* be considered
online by the controller and let it filter accordingly.

IOW, the precise moment a css becomes online is not known to the
cgroup core as cgroup core locking and controller locking are
completely independent.  The current memcg implementation is broken in
that memcg iterator is testing CSS_ONLINE which is set *after*
->css_online() is complete and iterations can happen inbetween where
the online css may be omitted because it's not marked online by cgroup
core yet.  This may be okay for memcg's use of iterators but for
things like freezer, for example, this can break things horribly.

Anyways, the right thing to do is removing the CSS_ONLINE testing and
implementing memcg's own way of marking an object as fully initialized
according to its synchronization and iteration requirements.  So,
yeah, it's a glaring layering violation which should be removed.

Thanks.

-- 
tejun

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [patch v2] mm: memcontrol: convert reclaim iterator to simple css refcounting
  2014-09-24 17:16       ` Johannes Weiner
@ 2014-09-25 10:01         ` Michal Hocko
  -1 siblings, 0 replies; 15+ messages in thread
From: Michal Hocko @ 2014-09-25 10:01 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: linux-mm, Tejun Heo, cgroups, linux-kernel

On Wed 24-09-14 13:16:53, Johannes Weiner wrote:
> On Wed, Sep 24, 2014 at 06:47:39PM +0200, Michal Hocko wrote:
> > On Fri 19-09-14 17:28:43, Johannes Weiner wrote:
[...]
> > > -		memcg = __mem_cgroup_iter_next(root, last_visited);
> > > +		do {
> > > +			pos = ACCESS_ONCE(mz->reclaim_iter[priority]);
> > > +		} while (pos && !css_tryget(&pos->css));
> > 
> > This is a bit confusing. AFAIU css_tryget fails only when the current
> > ref count is zero already. When do we keep cached memcg with zero count
> > behind? We always do css_get after cmpxchg.
> > 
> > Hmm, there is a small window between cmpxchg and css_get when we store
> > the current memcg into the reclaim_iter[priority]. If the current memcg
> > is root then we do not take any css reference before cmpxchg and so it
> > might drop down to zero in the mean time so other CPU might see zero I
> > guess. But I do not see how css_get after cmpxchg on such css works.
> > I guess I should go and check the css reference counting again.
> 
> It's not about root or the newly stored memcg, it's that you might
> read the position right before it's replaced and css_put(), at which

OK, got it

	CPU0					CPU1
pos = reclaim_iter[priority]
					cmpxchg(reclaim_iter[priority], pos, memcg)
					css_put(pos)	# -> 0
css_tryget(pos)

Thanks!
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [patch v2] mm: memcontrol: convert reclaim iterator to simple css refcounting
@ 2014-09-25 10:01         ` Michal Hocko
  0 siblings, 0 replies; 15+ messages in thread
From: Michal Hocko @ 2014-09-25 10:01 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: linux-mm, Tejun Heo, cgroups, linux-kernel

On Wed 24-09-14 13:16:53, Johannes Weiner wrote:
> On Wed, Sep 24, 2014 at 06:47:39PM +0200, Michal Hocko wrote:
> > On Fri 19-09-14 17:28:43, Johannes Weiner wrote:
[...]
> > > -		memcg = __mem_cgroup_iter_next(root, last_visited);
> > > +		do {
> > > +			pos = ACCESS_ONCE(mz->reclaim_iter[priority]);
> > > +		} while (pos && !css_tryget(&pos->css));
> > 
> > This is a bit confusing. AFAIU css_tryget fails only when the current
> > ref count is zero already. When do we keep cached memcg with zero count
> > behind? We always do css_get after cmpxchg.
> > 
> > Hmm, there is a small window between cmpxchg and css_get when we store
> > the current memcg into the reclaim_iter[priority]. If the current memcg
> > is root then we do not take any css reference before cmpxchg and so it
> > might drop down to zero in the mean time so other CPU might see zero I
> > guess. But I do not see how css_get after cmpxchg on such css works.
> > I guess I should go and check the css reference counting again.
> 
> It's not about root or the newly stored memcg, it's that you might
> read the position right before it's replaced and css_put(), at which

OK, got it

	CPU0					CPU1
pos = reclaim_iter[priority]
					cmpxchg(reclaim_iter[priority], pos, memcg)
					css_put(pos)	# -> 0
css_tryget(pos)

Thanks!
-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2014-09-25 10:01 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-09-19 21:10 [patch] mm: memcontrol: convert reclaim iterator to simple css refcounting Johannes Weiner
2014-09-19 21:10 ` Johannes Weiner
2014-09-19 21:28 ` [patch v2] " Johannes Weiner
2014-09-19 21:28   ` Johannes Weiner
2014-09-22  9:53   ` Vladimir Davydov
2014-09-22  9:53     ` Vladimir Davydov
2014-09-22  9:53     ` Vladimir Davydov
2014-09-24 16:47   ` Michal Hocko
2014-09-24 16:47     ` Michal Hocko
2014-09-24 17:16     ` Johannes Weiner
2014-09-24 17:16       ` Johannes Weiner
2014-09-25  1:29       ` Tejun Heo
2014-09-25  1:29         ` Tejun Heo
2014-09-25 10:01       ` Michal Hocko
2014-09-25 10:01         ` Michal Hocko

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.