linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/6] slab: cleanup kmem_cache_create_memcg()
@ 2013-12-18 13:16 Vladimir Davydov
  2013-12-18 13:16 ` [PATCH 2/6] memcg, slab: kmem_cache_create_memcg(): free memcg params on error Vladimir Davydov
                   ` (7 more replies)
  0 siblings, 8 replies; 42+ messages in thread
From: Vladimir Davydov @ 2013-12-18 13:16 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
 mm/slab_common.c |   66 +++++++++++++++++++++++++++---------------------------
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/mm/slab_common.c b/mm/slab_common.c
index 0b7bb39..5d6f743 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -176,8 +176,9 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
 	get_online_cpus();
 	mutex_lock(&slab_mutex);
 
-	if (!kmem_cache_sanity_check(memcg, name, size) == 0)
-		goto out_locked;
+	err = kmem_cache_sanity_check(memcg, name, size);
+	if (err)
+		goto out_unlock;
 
 	/*
 	 * Some allocators will constraint the set of valid flags to a subset
@@ -189,45 +190,41 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
 
 	s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
 	if (s)
-		goto out_locked;
+		goto out_unlock;
 
 	s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
-	if (s) {
-		s->object_size = s->size = size;
-		s->align = calculate_alignment(flags, align, size);
-		s->ctor = ctor;
-
-		if (memcg_register_cache(memcg, s, parent_cache)) {
-			kmem_cache_free(kmem_cache, s);
-			err = -ENOMEM;
-			goto out_locked;
-		}
+	if (!s) {
+		err = -ENOMEM;
+		goto out_unlock;
+	}
 
-		s->name = kstrdup(name, GFP_KERNEL);
-		if (!s->name) {
-			kmem_cache_free(kmem_cache, s);
-			err = -ENOMEM;
-			goto out_locked;
-		}
+	s->object_size = s->size = size;
+	s->align = calculate_alignment(flags, align, size);
+	s->ctor = ctor;
 
-		err = __kmem_cache_create(s, flags);
-		if (!err) {
-			s->refcount = 1;
-			list_add(&s->list, &slab_caches);
-			memcg_cache_list_add(memcg, s);
-		} else {
-			kfree(s->name);
-			kmem_cache_free(kmem_cache, s);
-		}
-	} else
+	s->name = kstrdup(name, GFP_KERNEL);
+	if (!s->name) {
 		err = -ENOMEM;
+		goto out_free_cache;
+	}
+
+	err = memcg_register_cache(memcg, s, parent_cache);
+	if (err)
+		goto out_free_cache;
 
-out_locked:
+	err = __kmem_cache_create(s, flags);
+	if (err)
+		goto out_free_cache;
+
+	s->refcount = 1;
+	list_add(&s->list, &slab_caches);
+	memcg_cache_list_add(memcg, s);
+
+out_unlock:
 	mutex_unlock(&slab_mutex);
 	put_online_cpus();
 
 	if (err) {
-
 		if (flags & SLAB_PANIC)
 			panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
 				name, err);
@@ -236,11 +233,14 @@ out_locked:
 				name, err);
 			dump_stack();
 		}
-
 		return NULL;
 	}
-
 	return s;
+
+out_free_cache:
+	kfree(s->name);
+	kmem_cache_free(kmem_cache, s);
+	goto out_unlock;
 }
 
 struct kmem_cache *
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 2/6] memcg, slab: kmem_cache_create_memcg(): free memcg params on error
  2013-12-18 13:16 [PATCH 1/6] slab: cleanup kmem_cache_create_memcg() Vladimir Davydov
@ 2013-12-18 13:16 ` Vladimir Davydov
  2013-12-18 17:06   ` Michal Hocko
  2013-12-18 13:16 ` [PATCH 3/6] memcg, slab: cleanup barrier usage when accessing memcg_caches Vladimir Davydov
                   ` (6 subsequent siblings)
  7 siblings, 1 reply; 42+ messages in thread
From: Vladimir Davydov @ 2013-12-18 13:16 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

Plus, rename memcg_register_cache() to memcg_init_cache_params(),
because it actually does not register the cache anywhere, but simply
initializes kmem_cache::memcg_params.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h |   13 +++++++++----
 mm/memcontrol.c            |    9 +++++++--
 mm/slab_common.c           |    3 ++-
 3 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b3e7a66..b357ae3 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -497,8 +497,9 @@ void __memcg_kmem_commit_charge(struct page *page,
 void __memcg_kmem_uncharge_pages(struct page *page, int order);
 
 int memcg_cache_id(struct mem_cgroup *memcg);
-int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
-			 struct kmem_cache *root_cache);
+int memcg_init_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
+			    struct kmem_cache *root_cache);
+void memcg_free_cache_params(struct kmem_cache *s);
 void memcg_release_cache(struct kmem_cache *cachep);
 void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep);
 
@@ -641,12 +642,16 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
 }
 
 static inline int
-memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
-		     struct kmem_cache *root_cache)
+memcg_init_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
+			struct kmem_cache *root_cache)
 {
 	return 0;
 }
 
+static inline void memcg_free_cache_params(struct kmem_cache *s)
+{
+}
+
 static inline void memcg_release_cache(struct kmem_cache *cachep)
 {
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bf5e894..e6ad6ff 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3195,8 +3195,8 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
 	return 0;
 }
 
-int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
-			 struct kmem_cache *root_cache)
+int memcg_init_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
+			    struct kmem_cache *root_cache)
 {
 	size_t size;
 
@@ -3224,6 +3224,11 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
 	return 0;
 }
 
+void memcg_free_cache_params(struct kmem_cache *s)
+{
+	kfree(s->memcg_params);
+}
+
 void memcg_release_cache(struct kmem_cache *s)
 {
 	struct kmem_cache *root;
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 5d6f743..62712fe 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -208,7 +208,7 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
 		goto out_free_cache;
 	}
 
-	err = memcg_register_cache(memcg, s, parent_cache);
+	err = memcg_init_cache_params(memcg, s, parent_cache);
 	if (err)
 		goto out_free_cache;
 
@@ -238,6 +238,7 @@ out_unlock:
 	return s;
 
 out_free_cache:
+	memcg_free_cache_params(s);
 	kfree(s->name);
 	kmem_cache_free(kmem_cache, s);
 	goto out_unlock;
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 3/6] memcg, slab: cleanup barrier usage when accessing memcg_caches
  2013-12-18 13:16 [PATCH 1/6] slab: cleanup kmem_cache_create_memcg() Vladimir Davydov
  2013-12-18 13:16 ` [PATCH 2/6] memcg, slab: kmem_cache_create_memcg(): free memcg params on error Vladimir Davydov
@ 2013-12-18 13:16 ` Vladimir Davydov
  2013-12-18 17:14   ` Michal Hocko
  2013-12-18 13:16 ` [PATCH 4/6] memcg, slab: check and init memcg_caches under slab_mutex Vladimir Davydov
                   ` (5 subsequent siblings)
  7 siblings, 1 reply; 42+ messages in thread
From: Vladimir Davydov @ 2013-12-18 13:16 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

First, in memcg_create_kmem_cache() we should issue the write barrier
after the kmem_cache is initialized, but before storing the pointer to
it in its parent's memcg_params.

Second, we should always issue the read barrier after
cache_from_memcg_idx() to conform with the write barrier.

Third, it's better to use smp_* versions of barriers, because we don't
need them on UP systems.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
 mm/memcontrol.c |   24 ++++++++++--------------
 mm/slab.h       |    6 +++++-
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e6ad6ff..e37fdb5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3429,12 +3429,14 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
 
 	atomic_set(&new_cachep->memcg_params->nr_pages , 0);
 
-	cachep->memcg_params->memcg_caches[idx] = new_cachep;
 	/*
-	 * the readers won't lock, make sure everybody sees the updated value,
-	 * so they won't put stuff in the queue again for no reason
+	 * Since readers won't lock (see cache_from_memcg_idx()), we need a
+	 * barrier here to ensure nobody will see the kmem_cache partially
+	 * initialized.
 	 */
-	wmb();
+	smp_wmb();
+
+	cachep->memcg_params->memcg_caches[idx] = new_cachep;
 out:
 	mutex_unlock(&memcg_cache_mutex);
 	return new_cachep;
@@ -3573,7 +3575,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
 					  gfp_t gfp)
 {
 	struct mem_cgroup *memcg;
-	int idx;
+	struct kmem_cache *memcg_cachep;
 
 	VM_BUG_ON(!cachep->memcg_params);
 	VM_BUG_ON(!cachep->memcg_params->is_root_cache);
@@ -3587,15 +3589,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
 	if (!memcg_can_account_kmem(memcg))
 		goto out;
 
-	idx = memcg_cache_id(memcg);
-
-	/*
-	 * barrier to mare sure we're always seeing the up to date value.  The
-	 * code updating memcg_caches will issue a write barrier to match this.
-	 */
-	read_barrier_depends();
-	if (likely(cache_from_memcg_idx(cachep, idx))) {
-		cachep = cache_from_memcg_idx(cachep, idx);
+	memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
+	if (likely(memcg_cachep)) {
+		cachep = memcg_cachep;
 		goto out;
 	}
 
diff --git a/mm/slab.h b/mm/slab.h
index 0859c42..1d8b53f 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -163,9 +163,13 @@ static inline const char *cache_name(struct kmem_cache *s)
 static inline struct kmem_cache *
 cache_from_memcg_idx(struct kmem_cache *s, int idx)
 {
+	struct kmem_cache *cachep;
+
 	if (!s->memcg_params)
 		return NULL;
-	return s->memcg_params->memcg_caches[idx];
+	cachep = s->memcg_params->memcg_caches[idx];
+	smp_read_barrier_depends();	/* see memcg_register_cache() */
+	return cachep;
 }
 
 static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 4/6] memcg, slab: check and init memcg_caches under slab_mutex
  2013-12-18 13:16 [PATCH 1/6] slab: cleanup kmem_cache_create_memcg() Vladimir Davydov
  2013-12-18 13:16 ` [PATCH 2/6] memcg, slab: kmem_cache_create_memcg(): free memcg params on error Vladimir Davydov
  2013-12-18 13:16 ` [PATCH 3/6] memcg, slab: cleanup barrier usage when accessing memcg_caches Vladimir Davydov
@ 2013-12-18 13:16 ` Vladimir Davydov
  2013-12-18 17:41   ` Michal Hocko
  2013-12-18 13:16 ` [PATCH 5/6] memcg: clear memcg_params after removing cache from memcg_slab_caches list Vladimir Davydov
                   ` (4 subsequent siblings)
  7 siblings, 1 reply; 42+ messages in thread
From: Vladimir Davydov @ 2013-12-18 13:16 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

The memcg_params::memcg_caches array can be updated concurrently from
memcg_update_cache_size() and memcg_create_kmem_cache(). Although both
of these functions take the slab_mutex during their operation, the
latter checks if memcg's cache has already been allocated w/o taking the
mutex. This can result in a race as described below.

Assume two threads schedule kmem_cache creation works for the same
kmem_cache of the same memcg from __memcg_kmem_get_cache(). One of the
works successfully creates it. Another work should fail then, but if it
interleaves with memcg_update_cache_size() as follows, it does not:

  memcg_create_kmem_cache()                     memcg_update_cache_size()
  (called w/o mutexes held)                     (called with slab_mutex held)
  -------------------------                     -------------------------
  mutex_lock(&memcg_cache_mutex)
                                                s->memcg_params=kzalloc(...)
  new_cachep=cache_from_memcg_idx(cachep,idx)
  // new_cachep==NULL => proceed to creation
                                                s->memcg_params->memcg_caches[i]
                                                    =cur_params->memcg_caches[i]
  // kmem_cache_dup takes slab_mutex so we will
  // hang around here until memcg_update_cache_size()
  // finishes, but ...
  new_cachep = kmem_cache_dup(memcg, cachep)
  // nothing will prevent kmem_cache_dup from
  // succeeding so ...
  cachep->memcg_params->memcg_caches[idx]=new_cachep
  // we've overwritten an existing cache ptr!

Let's fix this by moving both the check and the update of
memcg_params::memcg_caches from memcg_create_kmem_cache() to
kmem_cache_create_memcg() to be called under the slab_mutex.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h |    9 ++--
 mm/memcontrol.c            |   98 +++++++++++++++-----------------------------
 mm/slab_common.c           |    8 +++-
 3 files changed, 44 insertions(+), 71 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b357ae3..fdd3f30 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -500,8 +500,8 @@ int memcg_cache_id(struct mem_cgroup *memcg);
 int memcg_init_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
 			    struct kmem_cache *root_cache);
 void memcg_free_cache_params(struct kmem_cache *s);
-void memcg_release_cache(struct kmem_cache *cachep);
-void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep);
+void memcg_register_cache(struct kmem_cache *s);
+void memcg_release_cache(struct kmem_cache *s);
 
 int memcg_update_cache_size(struct kmem_cache *s, int num_groups);
 void memcg_update_array_size(int num_groups);
@@ -652,12 +652,11 @@ static inline void memcg_free_cache_params(struct kmem_cache *s);
 {
 }
 
-static inline void memcg_release_cache(struct kmem_cache *cachep)
+static inline void memcg_register_cache(struct kmem_cache *s)
 {
 }
 
-static inline void memcg_cache_list_add(struct mem_cgroup *memcg,
-					struct kmem_cache *s)
+static inline void memcg_release_cache(struct kmem_cache *s)
 {
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e37fdb5..62b9991 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3059,16 +3059,6 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
 		css_put(&memcg->css);
 }
 
-void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
-{
-	if (!memcg)
-		return;
-
-	mutex_lock(&memcg->slab_caches_mutex);
-	list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
-	mutex_unlock(&memcg->slab_caches_mutex);
-}
-
 /*
  * helper for acessing a memcg's index. It will be used as an index in the
  * child cache array in kmem_cache, and also to derive its name. This function
@@ -3229,6 +3219,35 @@ void memcg_free_cache_params(struct kmem_cache *s)
 	kfree(s->memcg_params);
 }
 
+void memcg_register_cache(struct kmem_cache *s)
+{
+	struct kmem_cache *root;
+	struct mem_cgroup *memcg;
+	int id;
+
+	if (is_root_cache(s))
+		return;
+
+	memcg = s->memcg_params->memcg;
+	id = memcg_cache_id(memcg);
+	root = s->memcg_params->root_cache;
+
+	css_get(&memcg->css);
+
+	/*
+	 * Since readers won't lock (see cache_from_memcg_idx()), we need a
+	 * barrier here to ensure nobody will see the kmem_cache partially
+	 * initialized.
+	 */
+	smp_wmb();
+
+	root->memcg_params->memcg_caches[id] = s;
+
+	mutex_lock(&memcg->slab_caches_mutex);
+	list_add(&s->memcg_params->list, &memcg->memcg_slab_caches);
+	mutex_unlock(&memcg->slab_caches_mutex);
+}
+
 void memcg_release_cache(struct kmem_cache *s)
 {
 	struct kmem_cache *root;
@@ -3356,26 +3375,13 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
 	schedule_work(&cachep->memcg_params->destroy);
 }
 
-/*
- * This lock protects updaters, not readers. We want readers to be as fast as
- * they can, and they will either see NULL or a valid cache value. Our model
- * allow them to see NULL, in which case the root memcg will be selected.
- *
- * We need this lock because multiple allocations to the same cache from a non
- * will span more than one worker. Only one of them can create the cache.
- */
-static DEFINE_MUTEX(memcg_cache_mutex);
-
-/*
- * Called with memcg_cache_mutex held
- */
-static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
-					 struct kmem_cache *s)
+static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
+						  struct kmem_cache *s)
 {
 	struct kmem_cache *new;
 	static char *tmp_name = NULL;
 
-	lockdep_assert_held(&memcg_cache_mutex);
+	BUG_ON(!memcg_can_account_kmem(memcg));
 
 	/*
 	 * kmem_cache_create_memcg duplicates the given name and
@@ -3403,45 +3409,6 @@ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
 	return new;
 }
 
-static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
-						  struct kmem_cache *cachep)
-{
-	struct kmem_cache *new_cachep;
-	int idx;
-
-	BUG_ON(!memcg_can_account_kmem(memcg));
-
-	idx = memcg_cache_id(memcg);
-
-	mutex_lock(&memcg_cache_mutex);
-	new_cachep = cache_from_memcg_idx(cachep, idx);
-	if (new_cachep) {
-		css_put(&memcg->css);
-		goto out;
-	}
-
-	new_cachep = kmem_cache_dup(memcg, cachep);
-	if (new_cachep == NULL) {
-		new_cachep = cachep;
-		css_put(&memcg->css);
-		goto out;
-	}
-
-	atomic_set(&new_cachep->memcg_params->nr_pages , 0);
-
-	/*
-	 * Since readers won't lock (see cache_from_memcg_idx()), we need a
-	 * barrier here to ensure nobody will see the kmem_cache partially
-	 * initialized.
-	 */
-	smp_wmb();
-
-	cachep->memcg_params->memcg_caches[idx] = new_cachep;
-out:
-	mutex_unlock(&memcg_cache_mutex);
-	return new_cachep;
-}
-
 void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
 {
 	struct kmem_cache *c;
@@ -3516,6 +3483,7 @@ static void memcg_create_cache_work_func(struct work_struct *w)
 
 	cw = container_of(w, struct create_work, work);
 	memcg_create_kmem_cache(cw->memcg, cw->cachep);
+	css_put(&cw->memcg->css);
 	kfree(cw);
 }
 
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 62712fe..51dc106 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -176,6 +176,12 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
 	get_online_cpus();
 	mutex_lock(&slab_mutex);
 
+	if (memcg) {
+		s = cache_from_memcg_idx(parent_cache, memcg_cache_id(memcg));
+		if (s)
+			goto out_unlock;
+	}
+
 	err = kmem_cache_sanity_check(memcg, name, size);
 	if (err)
 		goto out_unlock;
@@ -218,7 +224,7 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
 
 	s->refcount = 1;
 	list_add(&s->list, &slab_caches);
-	memcg_cache_list_add(memcg, s);
+	memcg_register_cache(s);
 
 out_unlock:
 	mutex_unlock(&slab_mutex);
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 5/6] memcg: clear memcg_params after removing cache from memcg_slab_caches list
  2013-12-18 13:16 [PATCH 1/6] slab: cleanup kmem_cache_create_memcg() Vladimir Davydov
                   ` (2 preceding siblings ...)
  2013-12-18 13:16 ` [PATCH 4/6] memcg, slab: check and init memcg_caches under slab_mutex Vladimir Davydov
@ 2013-12-18 13:16 ` Vladimir Davydov
  2013-12-18 13:16 ` [PATCH 6/6] memcg, slab: RCU protect memcg_params for root caches Vladimir Davydov
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 42+ messages in thread
From: Vladimir Davydov @ 2013-12-18 13:16 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

All caches of the same memory cgroup are linked in the memcg_slab_caches
list via kmem_cache::memcg_params::list. This list is traversed when we
read memory.kmem.slabinfo. Since the list actually consists of
memcg_cache_params objects, to convert an element of the list to a
kmem_cache object, we use memcg_params_to_cache(), which obtains the
pointer to the cache from the memcg_params::memcg_caches array of the
root cache, but on cache destruction this pointer is cleared before the
removal of the cache from the list, which potentially can result in a
NULL ptr dereference. Let's fix this by clearing the pointer to a cache
in the memcg_params::memcg_caches array of its parent only after it
cannot be accessed by the memcg_slab_caches list.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
 mm/memcontrol.c |   16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 62b9991..ad8de6a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3241,6 +3241,11 @@ void memcg_register_cache(struct kmem_cache *s)
 	 */
 	smp_wmb();
 
+	/*
+	 * Initialize the pointer to this cache in its parent's memcg_params
+	 * before adding it to the memcg_slab_caches list, otherwise we can
+	 * fail to convert memcg_params_to_cache() while traversing the list.
+	 */
 	root->memcg_params->memcg_caches[id] = s;
 
 	mutex_lock(&memcg->slab_caches_mutex);
@@ -3265,15 +3270,20 @@ void memcg_release_cache(struct kmem_cache *s)
 		goto out;
 
 	memcg = s->memcg_params->memcg;
-	id  = memcg_cache_id(memcg);
-
+	id = memcg_cache_id(memcg);
 	root = s->memcg_params->root_cache;
-	root->memcg_params->memcg_caches[id] = NULL;
 
 	mutex_lock(&memcg->slab_caches_mutex);
 	list_del(&s->memcg_params->list);
 	mutex_unlock(&memcg->slab_caches_mutex);
 
+	/*
+	 * Clear the pointer to this cache in its parent's memcg_params only
+	 * after removing it from the memcg_slab_caches list, otherwise we can
+	 * fail to convert memcg_params_to_cache() while traversing the list.
+	 */
+	root->memcg_params->memcg_caches[id] = NULL;
+
 	css_put(&memcg->css);
 out:
 	kfree(s->memcg_params);
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH 6/6] memcg, slab: RCU protect memcg_params for root caches
  2013-12-18 13:16 [PATCH 1/6] slab: cleanup kmem_cache_create_memcg() Vladimir Davydov
                   ` (3 preceding siblings ...)
  2013-12-18 13:16 ` [PATCH 5/6] memcg: clear memcg_params after removing cache from memcg_slab_caches list Vladimir Davydov
@ 2013-12-18 13:16 ` Vladimir Davydov
  2013-12-19  9:28   ` Michal Hocko
  2013-12-18 16:56 ` [PATCH 1/6] slab: cleanup kmem_cache_create_memcg() Michal Hocko
                   ` (2 subsequent siblings)
  7 siblings, 1 reply; 42+ messages in thread
From: Vladimir Davydov @ 2013-12-18 13:16 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

We update root cache's memcg_params whenever we need to grow the
memcg_caches array to accommodate all kmem-active memory cgroups.
Currently we free the old version immediately then, which can lead to
use-after-free, because the memcg_caches array is accessed lock-free.
This patch fixes this by making memcg_params RCU-protected.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/slab.h |    5 ++++-
 mm/memcontrol.c      |   15 ++++++++-------
 mm/slab.h            |    8 +++++++-
 3 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 1e2f4fe..f7e5649 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -528,7 +528,10 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
 struct memcg_cache_params {
 	bool is_root_cache;
 	union {
-		struct kmem_cache *memcg_caches[0];
+		struct {
+			struct rcu_head rcu_head;
+			struct kmem_cache *memcg_caches[0];
+		};
 		struct {
 			struct mem_cgroup *memcg;
 			struct list_head list;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ad8de6a..379fc5f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3142,18 +3142,17 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
 
 	if (num_groups > memcg_limited_groups_array_size) {
 		int i;
+		struct memcg_cache_params *new_params;
 		ssize_t size = memcg_caches_array_size(num_groups);
 
 		size *= sizeof(void *);
 		size += offsetof(struct memcg_cache_params, memcg_caches);
 
-		s->memcg_params = kzalloc(size, GFP_KERNEL);
-		if (!s->memcg_params) {
-			s->memcg_params = cur_params;
+		new_params = kzalloc(size, GFP_KERNEL);
+		if (!new_params)
 			return -ENOMEM;
-		}
 
-		s->memcg_params->is_root_cache = true;
+		new_params->is_root_cache = true;
 
 		/*
 		 * There is the chance it will be bigger than
@@ -3167,7 +3166,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
 		for (i = 0; i < memcg_limited_groups_array_size; i++) {
 			if (!cur_params->memcg_caches[i])
 				continue;
-			s->memcg_params->memcg_caches[i] =
+			new_params->memcg_caches[i] =
 						cur_params->memcg_caches[i];
 		}
 
@@ -3180,7 +3179,9 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
 		 * bigger than the others. And all updates will reset this
 		 * anyway.
 		 */
-		kfree(cur_params);
+		rcu_assign_pointer(s->memcg_params, new_params);
+		if (cur_params)
+			kfree_rcu(cur_params, rcu_head);
 	}
 	return 0;
 }
diff --git a/mm/slab.h b/mm/slab.h
index 1d8b53f..53b81a9 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -164,10 +164,16 @@ static inline struct kmem_cache *
 cache_from_memcg_idx(struct kmem_cache *s, int idx)
 {
 	struct kmem_cache *cachep;
+	struct memcg_cache_params *params;
 
 	if (!s->memcg_params)
 		return NULL;
-	cachep = s->memcg_params->memcg_caches[idx];
+
+	rcu_read_lock();
+	params = rcu_dereference(s->memcg_params);
+	cachep = params->memcg_caches[idx];
+	rcu_read_unlock();
+
 	smp_read_barrier_depends();	/* see memcg_register_cache() */
 	return cachep;
 }
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/6] slab: cleanup kmem_cache_create_memcg()
  2013-12-18 13:16 [PATCH 1/6] slab: cleanup kmem_cache_create_memcg() Vladimir Davydov
                   ` (4 preceding siblings ...)
  2013-12-18 13:16 ` [PATCH 6/6] memcg, slab: RCU protect memcg_params for root caches Vladimir Davydov
@ 2013-12-18 16:56 ` Michal Hocko
  2013-12-19  6:31   ` Vladimir Davydov
  2013-12-19  7:27 ` Pekka Enberg
  2013-12-19  8:17 ` [Devel] " Vasily Averin
  7 siblings, 1 reply; 42+ messages in thread
From: Michal Hocko @ 2013-12-18 16:56 UTC (permalink / raw)
  To: Vladimir Davydov
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

On Wed 18-12-13 17:16:52, Vladimir Davydov wrote:
> Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
> Cc: Michal Hocko <mhocko@suse.cz>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Glauber Costa <glommer@gmail.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Pekka Enberg <penberg@kernel.org>
> Cc: Andrew Morton <akpm@linux-foundation.org>

Dunno, is this really better to be worth the code churn?

It even makes the generated code tiny bit bigger:
text    data     bss     dec     hex filename
4355     171     236    4762    129a mm/slab_common.o.after
4342     171     236    4749    128d mm/slab_common.o.before

Or does it make the further changes much more easier? Be explicit in the
patch description if so.

> ---
>  mm/slab_common.c |   66 +++++++++++++++++++++++++++---------------------------
>  1 file changed, 33 insertions(+), 33 deletions(-)
> 
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 0b7bb39..5d6f743 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -176,8 +176,9 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
>  	get_online_cpus();
>  	mutex_lock(&slab_mutex);
>  
> -	if (!kmem_cache_sanity_check(memcg, name, size) == 0)
> -		goto out_locked;
> +	err = kmem_cache_sanity_check(memcg, name, size);
> +	if (err)
> +		goto out_unlock;
>  
>  	/*
>  	 * Some allocators will constraint the set of valid flags to a subset
> @@ -189,45 +190,41 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
>  
>  	s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
>  	if (s)
> -		goto out_locked;
> +		goto out_unlock;
>  
>  	s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
> -	if (s) {
> -		s->object_size = s->size = size;
> -		s->align = calculate_alignment(flags, align, size);
> -		s->ctor = ctor;
> -
> -		if (memcg_register_cache(memcg, s, parent_cache)) {
> -			kmem_cache_free(kmem_cache, s);
> -			err = -ENOMEM;
> -			goto out_locked;
> -		}
> +	if (!s) {
> +		err = -ENOMEM;
> +		goto out_unlock;
> +	}
>  
> -		s->name = kstrdup(name, GFP_KERNEL);
> -		if (!s->name) {
> -			kmem_cache_free(kmem_cache, s);
> -			err = -ENOMEM;
> -			goto out_locked;
> -		}
> +	s->object_size = s->size = size;
> +	s->align = calculate_alignment(flags, align, size);
> +	s->ctor = ctor;
>  
> -		err = __kmem_cache_create(s, flags);
> -		if (!err) {
> -			s->refcount = 1;
> -			list_add(&s->list, &slab_caches);
> -			memcg_cache_list_add(memcg, s);
> -		} else {
> -			kfree(s->name);
> -			kmem_cache_free(kmem_cache, s);
> -		}
> -	} else
> +	s->name = kstrdup(name, GFP_KERNEL);
> +	if (!s->name) {
>  		err = -ENOMEM;
> +		goto out_free_cache;
> +	}
> +
> +	err = memcg_register_cache(memcg, s, parent_cache);
> +	if (err)
> +		goto out_free_cache;
>  
> -out_locked:
> +	err = __kmem_cache_create(s, flags);
> +	if (err)
> +		goto out_free_cache;
> +
> +	s->refcount = 1;
> +	list_add(&s->list, &slab_caches);
> +	memcg_cache_list_add(memcg, s);
> +
> +out_unlock:
>  	mutex_unlock(&slab_mutex);
>  	put_online_cpus();
>  
>  	if (err) {
> -
>  		if (flags & SLAB_PANIC)
>  			panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
>  				name, err);
> @@ -236,11 +233,14 @@ out_locked:
>  				name, err);
>  			dump_stack();
>  		}
> -
>  		return NULL;
>  	}
> -
>  	return s;
> +
> +out_free_cache:
> +	kfree(s->name);
> +	kmem_cache_free(kmem_cache, s);
> +	goto out_unlock;
>  }
>  
>  struct kmem_cache *
> -- 
> 1.7.10.4
> 

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 2/6] memcg, slab: kmem_cache_create_memcg(): free memcg params on error
  2013-12-18 13:16 ` [PATCH 2/6] memcg, slab: kmem_cache_create_memcg(): free memcg params on error Vladimir Davydov
@ 2013-12-18 17:06   ` Michal Hocko
  2013-12-19  6:32     ` Vladimir Davydov
  0 siblings, 1 reply; 42+ messages in thread
From: Michal Hocko @ 2013-12-18 17:06 UTC (permalink / raw)
  To: Vladimir Davydov
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

On Wed 18-12-13 17:16:53, Vladimir Davydov wrote:
> Plus, rename memcg_register_cache() to memcg_init_cache_params(),
> because it actually does not register the cache anywhere, but simply
> initialize kmem_cache::memcg_params.

I've almost missed this is a memory leak fix.
I do not mind renaming and the name but wouldn't
memcg_alloc_cache_params suit better?

> Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
> Cc: Michal Hocko <mhocko@suse.cz>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Glauber Costa <glommer@gmail.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Pekka Enberg <penberg@kernel.org>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> ---
>  include/linux/memcontrol.h |   13 +++++++++----
>  mm/memcontrol.c            |    9 +++++++--
>  mm/slab_common.c           |    3 ++-
>  3 files changed, 18 insertions(+), 7 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index b3e7a66..b357ae3 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -497,8 +497,9 @@ void __memcg_kmem_commit_charge(struct page *page,
>  void __memcg_kmem_uncharge_pages(struct page *page, int order);
>  
>  int memcg_cache_id(struct mem_cgroup *memcg);
> -int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
> -			 struct kmem_cache *root_cache);
> +int memcg_init_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
> +			    struct kmem_cache *root_cache);
> +void memcg_free_cache_params(struct kmem_cache *s);
>  void memcg_release_cache(struct kmem_cache *cachep);
>  void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep);
>  
> @@ -641,12 +642,16 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
>  }
>  
>  static inline int
> -memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
> -		     struct kmem_cache *root_cache)
> +memcg_init_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
> +			struct kmem_cache *root_cache)
>  {
>  	return 0;
>  }
>  
> +static inline void memcg_free_cache_params(struct kmem_cache *s);
> +{
> +}
> +
>  static inline void memcg_release_cache(struct kmem_cache *cachep)
>  {
>  }
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index bf5e894..e6ad6ff 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3195,8 +3195,8 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
>  	return 0;
>  }
>  
> -int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
> -			 struct kmem_cache *root_cache)
> +int memcg_init_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
> +			    struct kmem_cache *root_cache)
>  {
>  	size_t size;
>  
> @@ -3224,6 +3224,11 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
>  	return 0;
>  }
>  
> +void memcg_free_cache_params(struct kmem_cache *s)
> +{
> +	kfree(s->memcg_params);
> +}
> +
>  void memcg_release_cache(struct kmem_cache *s)
>  {
>  	struct kmem_cache *root;
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 5d6f743..62712fe 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -208,7 +208,7 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
>  		goto out_free_cache;
>  	}
>  
> -	err = memcg_register_cache(memcg, s, parent_cache);
> +	err = memcg_init_cache_params(memcg, s, parent_cache);
>  	if (err)
>  		goto out_free_cache;
>  
> @@ -238,6 +238,7 @@ out_unlock:
>  	return s;
>  
>  out_free_cache:
> +	memcg_free_cache_params(s);
>  	kfree(s->name);
>  	kmem_cache_free(kmem_cache, s);
>  	goto out_unlock;
> -- 
> 1.7.10.4
> 

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 3/6] memcg, slab: cleanup barrier usage when accessing memcg_caches
  2013-12-18 13:16 ` [PATCH 3/6] memcg, slab: cleanup barrier usage when accessing memcg_caches Vladimir Davydov
@ 2013-12-18 17:14   ` Michal Hocko
  2013-12-19  6:37     ` Vladimir Davydov
  0 siblings, 1 reply; 42+ messages in thread
From: Michal Hocko @ 2013-12-18 17:14 UTC (permalink / raw)
  To: Vladimir Davydov
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

On Wed 18-12-13 17:16:54, Vladimir Davydov wrote:
> First, in memcg_create_kmem_cache() we should issue the write barrier
> after the kmem_cache is initialized, but before storing the pointer to
> it in its parent's memcg_params.
> 
> Second, we should always issue the read barrier after
> cache_from_memcg_idx() to conform with the write barrier.
> 
> Third, its better to use smp_* versions of barriers, because we don't
> need them on UP systems.

Please be (much) more verbose on why. Barriers are tricky and should be
documented accordingly. So if you say that we should issue a barrier,
always be specific about why we should do it.

> Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
> Cc: Michal Hocko <mhocko@suse.cz>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Glauber Costa <glommer@gmail.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Pekka Enberg <penberg@kernel.org>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> ---
>  mm/memcontrol.c |   24 ++++++++++--------------
>  mm/slab.h       |    6 +++++-
>  2 files changed, 15 insertions(+), 15 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e6ad6ff..e37fdb5 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3429,12 +3429,14 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
>  
>  	atomic_set(&new_cachep->memcg_params->nr_pages , 0);
>  
> -	cachep->memcg_params->memcg_caches[idx] = new_cachep;
>  	/*
> -	 * the readers won't lock, make sure everybody sees the updated value,
> -	 * so they won't put stuff in the queue again for no reason
> +	 * Since readers won't lock (see cache_from_memcg_idx()), we need a
> +	 * barrier here to ensure nobody will see the kmem_cache partially
> +	 * initialized.
>  	 */
> -	wmb();
> +	smp_wmb();
> +
> +	cachep->memcg_params->memcg_caches[idx] = new_cachep;
>  out:
>  	mutex_unlock(&memcg_cache_mutex);
>  	return new_cachep;
> @@ -3573,7 +3575,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
>  					  gfp_t gfp)
>  {
>  	struct mem_cgroup *memcg;
> -	int idx;
> +	struct kmem_cache *memcg_cachep;
>  
>  	VM_BUG_ON(!cachep->memcg_params);
>  	VM_BUG_ON(!cachep->memcg_params->is_root_cache);
> @@ -3587,15 +3589,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
>  	if (!memcg_can_account_kmem(memcg))
>  		goto out;
>  
> -	idx = memcg_cache_id(memcg);
> -
> -	/*
> -	 * barrier to mare sure we're always seeing the up to date value.  The
> -	 * code updating memcg_caches will issue a write barrier to match this.
> -	 */
> -	read_barrier_depends();
> -	if (likely(cache_from_memcg_idx(cachep, idx))) {
> -		cachep = cache_from_memcg_idx(cachep, idx);
> +	memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
> +	if (likely(memcg_cachep)) {
> +		cachep = memcg_cachep;
>  		goto out;
>  	}
>  
> diff --git a/mm/slab.h b/mm/slab.h
> index 0859c42..1d8b53f 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -163,9 +163,13 @@ static inline const char *cache_name(struct kmem_cache *s)
>  static inline struct kmem_cache *
>  cache_from_memcg_idx(struct kmem_cache *s, int idx)
>  {
> +	struct kmem_cache *cachep;
> +
>  	if (!s->memcg_params)
>  		return NULL;
> -	return s->memcg_params->memcg_caches[idx];
> +	cachep = s->memcg_params->memcg_caches[idx];
> +	smp_read_barrier_depends();	/* see memcg_register_cache() */
> +	return cachep;
>  }
>  
>  static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
> -- 
> 1.7.10.4
> 

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 4/6] memcg, slab: check and init memcg_cahes under slab_mutex
  2013-12-18 13:16 ` [PATCH 4/6] memcg, slab: check and init memcg_cahes under slab_mutex Vladimir Davydov
@ 2013-12-18 17:41   ` Michal Hocko
  2013-12-19  7:07     ` Vladimir Davydov
  0 siblings, 1 reply; 42+ messages in thread
From: Michal Hocko @ 2013-12-18 17:41 UTC (permalink / raw)
  To: Vladimir Davydov
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

On Wed 18-12-13 17:16:55, Vladimir Davydov wrote:
> The memcg_params::memcg_caches array can be updated concurrently from
> memcg_update_cache_size() and memcg_create_kmem_cache(). Although both
> of these functions take the slab_mutex during their operation, the
> latter checks if memcg's cache has already been allocated w/o taking the
> mutex. This can result in a race as described below.
> 
> Asume two threads schedule kmem_cache creation works for the same
> kmem_cache of the same memcg from __memcg_kmem_get_cache(). One of the
> works successfully creates it. Another work should fail then, but if it
> interleaves with memcg_update_cache_size() as follows, it does not:

I am not sure I understand the race. memcg_update_cache_size is called
when we start accounting a new memcg or a child is created and it
inherits accounting from the parent. memcg_create_kmem_cache is called
when a new cache is first allocated from, right?

Why cannot we simply take slab_mutex inside memcg_create_kmem_cache?
It is running from the workqueue context, so it should not clash with other
locks.

> 
>   memcg_create_kmem_cache()                     memcg_update_cache_size()
>   (called w/o mutexes held)                     (called with slab_mutex held)
>   -------------------------                     -------------------------
>   mutex_lock(&memcg_cache_mutex)
>                                                 s->memcg_params=kzalloc(...)
>   new_cachep=cache_from_memcg_idx(cachep,idx)
>   // new_cachep==NULL => proceed to creation
>                                                 s->memcg_params->memcg_caches[i]
>                                                     =cur_params->memcg_caches[i]
>   // kmem_cache_dup takes slab_mutex so we will
>   // hang around here until memcg_update_cache_size()
>   // finishes, but ...
>   new_cachep = kmem_cache_dup(memcg, cachep)
>   // nothing will prevent kmem_cache_dup from
>   // succeeding so ...
>   cachep->memcg_params->memcg_caches[idx]=new_cachep
>   // we've overwritten an existing cache ptr!
> 
> Let's fix this by moving both the check and the update of
> memcg_params::memcg_caches from memcg_create_kmem_cache() to
> kmem_cache_create_memcg() to be called under the slab_mutex.
> 
> Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
> Cc: Michal Hocko <mhocko@suse.cz>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Glauber Costa <glommer@gmail.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Pekka Enberg <penberg@kernel.org>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> ---
>  include/linux/memcontrol.h |    9 ++--
>  mm/memcontrol.c            |   98 +++++++++++++++-----------------------------
>  mm/slab_common.c           |    8 +++-
>  3 files changed, 44 insertions(+), 71 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index b357ae3..fdd3f30 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -500,8 +500,8 @@ int memcg_cache_id(struct mem_cgroup *memcg);
>  int memcg_init_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
>  			    struct kmem_cache *root_cache);
>  void memcg_free_cache_params(struct kmem_cache *s);
> -void memcg_release_cache(struct kmem_cache *cachep);
> -void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep);
> +void memcg_register_cache(struct kmem_cache *s);
> +void memcg_release_cache(struct kmem_cache *s);
>  
>  int memcg_update_cache_size(struct kmem_cache *s, int num_groups);
>  void memcg_update_array_size(int num_groups);
> @@ -652,12 +652,11 @@ static inline void memcg_free_cache_params(struct kmem_cache *s);
>  {
>  }
>  
> -static inline void memcg_release_cache(struct kmem_cache *cachep)
> +static inline void memcg_register_cache(struct kmem_cache *s)
>  {
>  }
>  
> -static inline void memcg_cache_list_add(struct mem_cgroup *memcg,
> -					struct kmem_cache *s)
> +static inline void memcg_release_cache(struct kmem_cache *s)
>  {
>  }
>  
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e37fdb5..62b9991 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3059,16 +3059,6 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
>  		css_put(&memcg->css);
>  }
>  
> -void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
> -{
> -	if (!memcg)
> -		return;
> -
> -	mutex_lock(&memcg->slab_caches_mutex);
> -	list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
> -	mutex_unlock(&memcg->slab_caches_mutex);
> -}
> -
>  /*
>   * helper for acessing a memcg's index. It will be used as an index in the
>   * child cache array in kmem_cache, and also to derive its name. This function
> @@ -3229,6 +3219,35 @@ void memcg_free_cache_params(struct kmem_cache *s)
>  	kfree(s->memcg_params);
>  }
>  
> +void memcg_register_cache(struct kmem_cache *s)
> +{
> +	struct kmem_cache *root;
> +	struct mem_cgroup *memcg;
> +	int id;
> +
> +	if (is_root_cache(s))
> +		return;
> +
> +	memcg = s->memcg_params->memcg;
> +	id = memcg_cache_id(memcg);
> +	root = s->memcg_params->root_cache;
> +
> +	css_get(&memcg->css);
> +
> +	/*
> +	 * Since readers won't lock (see cache_from_memcg_idx()), we need a
> +	 * barrier here to ensure nobody will see the kmem_cache partially
> +	 * initialized.
> +	 */
> +	smp_wmb();
> +
> +	root->memcg_params->memcg_caches[id] = s;
> +
> +	mutex_lock(&memcg->slab_caches_mutex);
> +	list_add(&s->memcg_params->list, &memcg->memcg_slab_caches);
> +	mutex_unlock(&memcg->slab_caches_mutex);
> +}
> +
>  void memcg_release_cache(struct kmem_cache *s)
>  {
>  	struct kmem_cache *root;
> @@ -3356,26 +3375,13 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
>  	schedule_work(&cachep->memcg_params->destroy);
>  }
>  
> -/*
> - * This lock protects updaters, not readers. We want readers to be as fast as
> - * they can, and they will either see NULL or a valid cache value. Our model
> - * allow them to see NULL, in which case the root memcg will be selected.
> - *
> - * We need this lock because multiple allocations to the same cache from a non
> - * will span more than one worker. Only one of them can create the cache.
> - */
> -static DEFINE_MUTEX(memcg_cache_mutex);
> -
> -/*
> - * Called with memcg_cache_mutex held
> - */
> -static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
> -					 struct kmem_cache *s)
> +static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
> +						  struct kmem_cache *s)
>  {
>  	struct kmem_cache *new;
>  	static char *tmp_name = NULL;
>  
> -	lockdep_assert_held(&memcg_cache_mutex);
> +	BUG_ON(!memcg_can_account_kmem(memcg));
>  
>  	/*
>  	 * kmem_cache_create_memcg duplicates the given name and
> @@ -3403,45 +3409,6 @@ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
>  	return new;
>  }
>  
> -static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
> -						  struct kmem_cache *cachep)
> -{
> -	struct kmem_cache *new_cachep;
> -	int idx;
> -
> -	BUG_ON(!memcg_can_account_kmem(memcg));
> -
> -	idx = memcg_cache_id(memcg);
> -
> -	mutex_lock(&memcg_cache_mutex);
> -	new_cachep = cache_from_memcg_idx(cachep, idx);
> -	if (new_cachep) {
> -		css_put(&memcg->css);
> -		goto out;
> -	}
> -
> -	new_cachep = kmem_cache_dup(memcg, cachep);
> -	if (new_cachep == NULL) {
> -		new_cachep = cachep;
> -		css_put(&memcg->css);
> -		goto out;
> -	}
> -
> -	atomic_set(&new_cachep->memcg_params->nr_pages , 0);
> -
> -	/*
> -	 * Since readers won't lock (see cache_from_memcg_idx()), we need a
> -	 * barrier here to ensure nobody will see the kmem_cache partially
> -	 * initialized.
> -	 */
> -	smp_wmb();
> -
> -	cachep->memcg_params->memcg_caches[idx] = new_cachep;
> -out:
> -	mutex_unlock(&memcg_cache_mutex);
> -	return new_cachep;
> -}
> -
>  void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
>  {
>  	struct kmem_cache *c;
> @@ -3516,6 +3483,7 @@ static void memcg_create_cache_work_func(struct work_struct *w)
>  
>  	cw = container_of(w, struct create_work, work);
>  	memcg_create_kmem_cache(cw->memcg, cw->cachep);
> +	css_put(&cw->memcg->css);
>  	kfree(cw);
>  }
>  
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 62712fe..51dc106 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -176,6 +176,12 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
>  	get_online_cpus();
>  	mutex_lock(&slab_mutex);
>  
> +	if (memcg) {
> +		s = cache_from_memcg_idx(parent_cache, memcg_cache_id(memcg));
> +		if (s)
> +			goto out_unlock;
> +	}
> +
>  	err = kmem_cache_sanity_check(memcg, name, size);
>  	if (err)
>  		goto out_unlock;
> @@ -218,7 +224,7 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
>  
>  	s->refcount = 1;
>  	list_add(&s->list, &slab_caches);
> -	memcg_cache_list_add(memcg, s);
> +	memcg_register_cache(s);
>  
>  out_unlock:
>  	mutex_unlock(&slab_mutex);
> -- 
> 1.7.10.4
> 

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/6] slab: cleanup kmem_cache_create_memcg()
  2013-12-18 16:56 ` [PATCH 1/6] slab: cleanup kmem_cache_create_memcg() Michal Hocko
@ 2013-12-19  6:31   ` Vladimir Davydov
  2013-12-19  8:44     ` Michal Hocko
  0 siblings, 1 reply; 42+ messages in thread
From: Vladimir Davydov @ 2013-12-19  6:31 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

On 12/18/2013 08:56 PM, Michal Hocko wrote:
> On Wed 18-12-13 17:16:52, Vladimir Davydov wrote:
>> Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
>> Cc: Michal Hocko <mhocko@suse.cz>
>> Cc: Johannes Weiner <hannes@cmpxchg.org>
>> Cc: Glauber Costa <glommer@gmail.com>
>> Cc: Christoph Lameter <cl@linux.com>
>> Cc: Pekka Enberg <penberg@kernel.org>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
> Dunno, is this really enough of an improvement to be worth the code churn?
>
> It even makes the generated code tiny bit bigger:
> text    data     bss     dec     hex filename
> 4355     171     236    4762    129a mm/slab_common.o.after
> 4342     171     236    4749    128d mm/slab_common.o.before
>
> Or does it make the further changes much more easier? Be explicit in the
> patch description if so.

Hi, Michal

IMO, undoing under labels looks better than inside conditionals, because
we don't have to repeat the same deinitialization code then, like this
(note three calls to kmem_cache_free()):

    s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
    if (s) {
        s->object_size = s->size = size;
        s->align = calculate_alignment(flags, align, size);
        s->ctor = ctor;

        if (memcg_register_cache(memcg, s, parent_cache)) {
            kmem_cache_free(kmem_cache, s);
            err = -ENOMEM;
            goto out_locked;
        }

        s->name = kstrdup(name, GFP_KERNEL);
        if (!s->name) {
            kmem_cache_free(kmem_cache, s);
            err = -ENOMEM;
            goto out_locked;
        }

        err = __kmem_cache_create(s, flags);
        if (!err) {
            s->refcount = 1;
            list_add(&s->list, &slab_caches);
            memcg_cache_list_add(memcg, s);
        } else {
            kfree(s->name);
            kmem_cache_free(kmem_cache, s);
        }
    } else
        err = -ENOMEM;

The next patch, which fixes the memcg_params leakage on error, would
make it even worse introducing two calls to memcg_free_cache_params()
after kstrdup and __kmem_cache_create.

If you think it isn't worthwhile applying this patch, just let me know,
I don't mind dropping it.

Anyway, I'll improve the comment and resend.

Thanks.

>
>> ---
>>  mm/slab_common.c |   66 +++++++++++++++++++++++++++---------------------------
>>  1 file changed, 33 insertions(+), 33 deletions(-)
>>
>> diff --git a/mm/slab_common.c b/mm/slab_common.c
>> index 0b7bb39..5d6f743 100644
>> --- a/mm/slab_common.c
>> +++ b/mm/slab_common.c
>> @@ -176,8 +176,9 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
>>  	get_online_cpus();
>>  	mutex_lock(&slab_mutex);
>>  
>> -	if (!kmem_cache_sanity_check(memcg, name, size) == 0)
>> -		goto out_locked;
>> +	err = kmem_cache_sanity_check(memcg, name, size);
>> +	if (err)
>> +		goto out_unlock;
>>  
>>  	/*
>>  	 * Some allocators will constraint the set of valid flags to a subset
>> @@ -189,45 +190,41 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
>>  
>>  	s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
>>  	if (s)
>> -		goto out_locked;
>> +		goto out_unlock;
>>  
>>  	s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
>> -	if (s) {
>> -		s->object_size = s->size = size;
>> -		s->align = calculate_alignment(flags, align, size);
>> -		s->ctor = ctor;
>> -
>> -		if (memcg_register_cache(memcg, s, parent_cache)) {
>> -			kmem_cache_free(kmem_cache, s);
>> -			err = -ENOMEM;
>> -			goto out_locked;
>> -		}
>> +	if (!s) {
>> +		err = -ENOMEM;
>> +		goto out_unlock;
>> +	}
>>  
>> -		s->name = kstrdup(name, GFP_KERNEL);
>> -		if (!s->name) {
>> -			kmem_cache_free(kmem_cache, s);
>> -			err = -ENOMEM;
>> -			goto out_locked;
>> -		}
>> +	s->object_size = s->size = size;
>> +	s->align = calculate_alignment(flags, align, size);
>> +	s->ctor = ctor;
>>  
>> -		err = __kmem_cache_create(s, flags);
>> -		if (!err) {
>> -			s->refcount = 1;
>> -			list_add(&s->list, &slab_caches);
>> -			memcg_cache_list_add(memcg, s);
>> -		} else {
>> -			kfree(s->name);
>> -			kmem_cache_free(kmem_cache, s);
>> -		}
>> -	} else
>> +	s->name = kstrdup(name, GFP_KERNEL);
>> +	if (!s->name) {
>>  		err = -ENOMEM;
>> +		goto out_free_cache;
>> +	}
>> +
>> +	err = memcg_register_cache(memcg, s, parent_cache);
>> +	if (err)
>> +		goto out_free_cache;
>>  
>> -out_locked:
>> +	err = __kmem_cache_create(s, flags);
>> +	if (err)
>> +		goto out_free_cache;
>> +
>> +	s->refcount = 1;
>> +	list_add(&s->list, &slab_caches);
>> +	memcg_cache_list_add(memcg, s);
>> +
>> +out_unlock:
>>  	mutex_unlock(&slab_mutex);
>>  	put_online_cpus();
>>  
>>  	if (err) {
>> -
>>  		if (flags & SLAB_PANIC)
>>  			panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
>>  				name, err);
>> @@ -236,11 +233,14 @@ out_locked:
>>  				name, err);
>>  			dump_stack();
>>  		}
>> -
>>  		return NULL;
>>  	}
>> -
>>  	return s;
>> +
>> +out_free_cache:
>> +	kfree(s->name);
>> +	kmem_cache_free(kmem_cache, s);
>> +	goto out_unlock;
>>  }
>>  
>>  struct kmem_cache *
>> -- 
>> 1.7.10.4
>>


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 2/6] memcg, slab: kmem_cache_create_memcg(): free memcg params on error
  2013-12-18 17:06   ` Michal Hocko
@ 2013-12-19  6:32     ` Vladimir Davydov
  2013-12-19  8:48       ` Michal Hocko
  0 siblings, 1 reply; 42+ messages in thread
From: Vladimir Davydov @ 2013-12-19  6:32 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

On 12/18/2013 09:06 PM, Michal Hocko wrote:
> On Wed 18-12-13 17:16:53, Vladimir Davydov wrote:
>> Plus, rename memcg_register_cache() to memcg_init_cache_params(),
>> because it actually does not register the cache anywhere, but simply
>> initialize kmem_cache::memcg_params.
> I've almost missed this is a memory leak fix.

Yeah, the comment is poor, sorry about that. Will fix it.

> I do not mind renaming and the name but wouldn't
> memcg_alloc_cache_params suit better?

As you wish. I don't have a strong preference for memcg_init_cache_params.

Thanks.

>
>> Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
>> Cc: Michal Hocko <mhocko@suse.cz>
>> Cc: Johannes Weiner <hannes@cmpxchg.org>
>> Cc: Glauber Costa <glommer@gmail.com>
>> Cc: Christoph Lameter <cl@linux.com>
>> Cc: Pekka Enberg <penberg@kernel.org>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> ---
>>  include/linux/memcontrol.h |   13 +++++++++----
>>  mm/memcontrol.c            |    9 +++++++--
>>  mm/slab_common.c           |    3 ++-
>>  3 files changed, 18 insertions(+), 7 deletions(-)
>>
>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>> index b3e7a66..b357ae3 100644
>> --- a/include/linux/memcontrol.h
>> +++ b/include/linux/memcontrol.h
>> @@ -497,8 +497,9 @@ void __memcg_kmem_commit_charge(struct page *page,
>>  void __memcg_kmem_uncharge_pages(struct page *page, int order);
>>  
>>  int memcg_cache_id(struct mem_cgroup *memcg);
>> -int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
>> -			 struct kmem_cache *root_cache);
>> +int memcg_init_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
>> +			    struct kmem_cache *root_cache);
>> +void memcg_free_cache_params(struct kmem_cache *s);
>>  void memcg_release_cache(struct kmem_cache *cachep);
>>  void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep);
>>  
>> @@ -641,12 +642,16 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
>>  }
>>  
>>  static inline int
>> -memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
>> -		     struct kmem_cache *root_cache)
>> +memcg_init_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
>> +			struct kmem_cache *root_cache)
>>  {
>>  	return 0;
>>  }
>>  
>> +static inline void memcg_free_cache_params(struct kmem_cache *s);
>> +{
>> +}
>> +
>>  static inline void memcg_release_cache(struct kmem_cache *cachep)
>>  {
>>  }
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index bf5e894..e6ad6ff 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -3195,8 +3195,8 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
>>  	return 0;
>>  }
>>  
>> -int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
>> -			 struct kmem_cache *root_cache)
>> +int memcg_init_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
>> +			    struct kmem_cache *root_cache)
>>  {
>>  	size_t size;
>>  
>> @@ -3224,6 +3224,11 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
>>  	return 0;
>>  }
>>  
>> +void memcg_free_cache_params(struct kmem_cache *s)
>> +{
>> +	kfree(s->memcg_params);
>> +}
>> +
>>  void memcg_release_cache(struct kmem_cache *s)
>>  {
>>  	struct kmem_cache *root;
>> diff --git a/mm/slab_common.c b/mm/slab_common.c
>> index 5d6f743..62712fe 100644
>> --- a/mm/slab_common.c
>> +++ b/mm/slab_common.c
>> @@ -208,7 +208,7 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
>>  		goto out_free_cache;
>>  	}
>>  
>> -	err = memcg_register_cache(memcg, s, parent_cache);
>> +	err = memcg_init_cache_params(memcg, s, parent_cache);
>>  	if (err)
>>  		goto out_free_cache;
>>  
>> @@ -238,6 +238,7 @@ out_unlock:
>>  	return s;
>>  
>>  out_free_cache:
>> +	memcg_free_cache_params(s);
>>  	kfree(s->name);
>>  	kmem_cache_free(kmem_cache, s);
>>  	goto out_unlock;
>> -- 
>> 1.7.10.4
>>


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 3/6] memcg, slab: cleanup barrier usage when accessing memcg_caches
  2013-12-18 17:14   ` Michal Hocko
@ 2013-12-19  6:37     ` Vladimir Davydov
  2013-12-19  9:10       ` Michal Hocko
  0 siblings, 1 reply; 42+ messages in thread
From: Vladimir Davydov @ 2013-12-19  6:37 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

On 12/18/2013 09:14 PM, Michal Hocko wrote:
> On Wed 18-12-13 17:16:54, Vladimir Davydov wrote:
>> First, in memcg_create_kmem_cache() we should issue the write barrier
>> after the kmem_cache is initialized, but before storing the pointer to
>> it in its parent's memcg_params.
>>
>> Second, we should always issue the read barrier after
>> cache_from_memcg_idx() to conform with the write barrier.
>>
>> Third, its better to use smp_* versions of barriers, because we don't
>> need them on UP systems.
> Please be (much) more verbose on Why. Barriers are tricky and should be
> documented accordingly. So if you say that we should issue a barrier
> always be specific why we should do it.

In short, kmem_cache::memcg_params::memcg_caches is an array of
pointers to per-memcg caches. We access it lock-free, so we should use
memory barriers during initialization. Obviously, we should place a write
barrier just before we set the pointer, in order to make sure nobody will
see a partially initialized structure. Besides, there must be a read
barrier between reading the pointer and accessing the structure, to
pair with the write barrier. It's all similar to rcu_assign_pointer()
and rcu_dereference(). Currently the barrier usage looks rather strange:

memcg_create_kmem_cache:
    initialize kmem
    set the pointer in memcg_caches
    wmb() // ???

__memcg_kmem_get_cache:
    <...>
    read_barrier_depends() // ???
    cachep = root_cache->memcg_params->memcg_caches[memcg_id]
    <...>

Nothing prevents some archs from moving initialization after setting the
pointer, or reading data before reading the pointer to it.

Of course, I will include a detailed description in the next version of
this patch.

Thanks.

>> Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
>> Cc: Michal Hocko <mhocko@suse.cz>
>> Cc: Johannes Weiner <hannes@cmpxchg.org>
>> Cc: Glauber Costa <glommer@gmail.com>
>> Cc: Christoph Lameter <cl@linux.com>
>> Cc: Pekka Enberg <penberg@kernel.org>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> ---
>>  mm/memcontrol.c |   24 ++++++++++--------------
>>  mm/slab.h       |    6 +++++-
>>  2 files changed, 15 insertions(+), 15 deletions(-)
>>
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index e6ad6ff..e37fdb5 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -3429,12 +3429,14 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
>>  
>>  	atomic_set(&new_cachep->memcg_params->nr_pages , 0);
>>  
>> -	cachep->memcg_params->memcg_caches[idx] = new_cachep;
>>  	/*
>> -	 * the readers won't lock, make sure everybody sees the updated value,
>> -	 * so they won't put stuff in the queue again for no reason
>> +	 * Since readers won't lock (see cache_from_memcg_idx()), we need a
>> +	 * barrier here to ensure nobody will see the kmem_cache partially
>> +	 * initialized.
>>  	 */
>> -	wmb();
>> +	smp_wmb();
>> +
>> +	cachep->memcg_params->memcg_caches[idx] = new_cachep;
>>  out:
>>  	mutex_unlock(&memcg_cache_mutex);
>>  	return new_cachep;
>> @@ -3573,7 +3575,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
>>  					  gfp_t gfp)
>>  {
>>  	struct mem_cgroup *memcg;
>> -	int idx;
>> +	struct kmem_cache *memcg_cachep;
>>  
>>  	VM_BUG_ON(!cachep->memcg_params);
>>  	VM_BUG_ON(!cachep->memcg_params->is_root_cache);
>> @@ -3587,15 +3589,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
>>  	if (!memcg_can_account_kmem(memcg))
>>  		goto out;
>>  
>> -	idx = memcg_cache_id(memcg);
>> -
>> -	/*
>> -	 * barrier to mare sure we're always seeing the up to date value.  The
>> -	 * code updating memcg_caches will issue a write barrier to match this.
>> -	 */
>> -	read_barrier_depends();
>> -	if (likely(cache_from_memcg_idx(cachep, idx))) {
>> -		cachep = cache_from_memcg_idx(cachep, idx);
>> +	memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
>> +	if (likely(memcg_cachep)) {
>> +		cachep = memcg_cachep;
>>  		goto out;
>>  	}
>>  
>> diff --git a/mm/slab.h b/mm/slab.h
>> index 0859c42..1d8b53f 100644
>> --- a/mm/slab.h
>> +++ b/mm/slab.h
>> @@ -163,9 +163,13 @@ static inline const char *cache_name(struct kmem_cache *s)
>>  static inline struct kmem_cache *
>>  cache_from_memcg_idx(struct kmem_cache *s, int idx)
>>  {
>> +	struct kmem_cache *cachep;
>> +
>>  	if (!s->memcg_params)
>>  		return NULL;
>> -	return s->memcg_params->memcg_caches[idx];
>> +	cachep = s->memcg_params->memcg_caches[idx];
>> +	smp_read_barrier_depends();	/* see memcg_register_cache() */
>> +	return cachep;
>>  }
>>  
>>  static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
>> -- 
>> 1.7.10.4
>>


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 4/6] memcg, slab: check and init memcg_cahes under slab_mutex
  2013-12-18 17:41   ` Michal Hocko
@ 2013-12-19  7:07     ` Vladimir Davydov
  2013-12-19  8:00       ` Glauber Costa
  0 siblings, 1 reply; 42+ messages in thread
From: Vladimir Davydov @ 2013-12-19  7:07 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

On 12/18/2013 09:41 PM, Michal Hocko wrote:
> On Wed 18-12-13 17:16:55, Vladimir Davydov wrote:
>> The memcg_params::memcg_caches array can be updated concurrently from
>> memcg_update_cache_size() and memcg_create_kmem_cache(). Although both
>> of these functions take the slab_mutex during their operation, the
>> latter checks if memcg's cache has already been allocated w/o taking the
>> mutex. This can result in a race as described below.
>>
>> Assume two threads schedule kmem_cache creation works for the same
>> kmem_cache of the same memcg from __memcg_kmem_get_cache(). One of the
>> works successfully creates it. Another work should fail then, but if it
>> interleaves with memcg_update_cache_size() as follows, it does not:
> I am not sure I understand the race. memcg_update_cache_size is called
> when we start accounting a new memcg or a child is created and it
> inherits accounting from the parent. memcg_create_kmem_cache is called
> when a new cache is first allocated from, right?

memcg_update_cache_size() is called when kmem accounting is activated
for a memcg, no matter how.

memcg_create_kmem_cache() is scheduled from __memcg_kmem_get_cache().
It's OK to have a bunch of such methods trying to create the same memcg
cache concurrently, but only one of them should succeed.

> Why cannot we simply take slab_mutex inside memcg_create_kmem_cache?
> it is running from the workqueue context so it should not clash with
> other locks.

Hmm, Glauber's code never takes the slab_mutex inside memcontrol.c. I
have always been wondering why, because it could simplify flow paths
significantly (e.g. update_cache_sizes() -> update_all_caches() ->
update_cache_size() - from memcontrol.c to slab_common.c and back again
just to take the mutex).

I don't see any reason preventing us from taking the mutex in
memcontrol.c. This would allow us to move all memcg-related kmem cache
initialization (addition to the memcg slab list, initialization of the
pointer in memcg_caches) to memcg_kmem_cache_create() and remove a bunch
of public functions. I guess I'll do this in my next iteration.

Thanks.

>
>>   memcg_create_kmem_cache()                     memcg_update_cache_size()
>>   (called w/o mutexes held)                     (called with slab_mutex held)
>>   -------------------------                     -------------------------
>>   mutex_lock(&memcg_cache_mutex)
>>                                                 s->memcg_params=kzalloc(...)
>>   new_cachep=cache_from_memcg_idx(cachep,idx)
>>   // new_cachep==NULL => proceed to creation
>>                                                 s->memcg_params->memcg_caches[i]
>>                                                     =cur_params->memcg_caches[i]
>>   // kmem_cache_dup takes slab_mutex so we will
>>   // hang around here until memcg_update_cache_size()
>>   // finishes, but ...
>>   new_cachep = kmem_cache_dup(memcg, cachep)
>>   // nothing will prevent kmem_cache_dup from
>>   // succeeding so ...
>>   cachep->memcg_params->memcg_caches[idx]=new_cachep
>>   // we've overwritten an existing cache ptr!
>>
>> Let's fix this by moving both the check and the update of
>> memcg_params::memcg_caches from memcg_create_kmem_cache() to
>> kmem_cache_create_memcg() to be called under the slab_mutex.
>>
>> Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
>> Cc: Michal Hocko <mhocko@suse.cz>
>> Cc: Johannes Weiner <hannes@cmpxchg.org>
>> Cc: Glauber Costa <glommer@gmail.com>
>> Cc: Christoph Lameter <cl@linux.com>
>> Cc: Pekka Enberg <penberg@kernel.org>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> ---
>>  include/linux/memcontrol.h |    9 ++--
>>  mm/memcontrol.c            |   98 +++++++++++++++-----------------------------
>>  mm/slab_common.c           |    8 +++-
>>  3 files changed, 44 insertions(+), 71 deletions(-)
>>
>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>> index b357ae3..fdd3f30 100644
>> --- a/include/linux/memcontrol.h
>> +++ b/include/linux/memcontrol.h
>> @@ -500,8 +500,8 @@ int memcg_cache_id(struct mem_cgroup *memcg);
>>  int memcg_init_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
>>  			    struct kmem_cache *root_cache);
>>  void memcg_free_cache_params(struct kmem_cache *s);
>> -void memcg_release_cache(struct kmem_cache *cachep);
>> -void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep);
>> +void memcg_register_cache(struct kmem_cache *s);
>> +void memcg_release_cache(struct kmem_cache *s);
>>  
>>  int memcg_update_cache_size(struct kmem_cache *s, int num_groups);
>>  void memcg_update_array_size(int num_groups);
>> @@ -652,12 +652,11 @@ static inline void memcg_free_cache_params(struct kmem_cache *s);
>>  {
>>  }
>>  
>> -static inline void memcg_release_cache(struct kmem_cache *cachep)
>> +static inline void memcg_register_cache(struct kmem_cache *s)
>>  {
>>  }
>>  
>> -static inline void memcg_cache_list_add(struct mem_cgroup *memcg,
>> -					struct kmem_cache *s)
>> +static inline void memcg_release_cache(struct kmem_cache *s)
>>  {
>>  }
>>  
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index e37fdb5..62b9991 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -3059,16 +3059,6 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
>>  		css_put(&memcg->css);
>>  }
>>  
>> -void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
>> -{
>> -	if (!memcg)
>> -		return;
>> -
>> -	mutex_lock(&memcg->slab_caches_mutex);
>> -	list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
>> -	mutex_unlock(&memcg->slab_caches_mutex);
>> -}
>> -
>>  /*
>>   * helper for acessing a memcg's index. It will be used as an index in the
>>   * child cache array in kmem_cache, and also to derive its name. This function
>> @@ -3229,6 +3219,35 @@ void memcg_free_cache_params(struct kmem_cache *s)
>>  	kfree(s->memcg_params);
>>  }
>>  
>> +void memcg_register_cache(struct kmem_cache *s)
>> +{
>> +	struct kmem_cache *root;
>> +	struct mem_cgroup *memcg;
>> +	int id;
>> +
>> +	if (is_root_cache(s))
>> +		return;
>> +
>> +	memcg = s->memcg_params->memcg;
>> +	id = memcg_cache_id(memcg);
>> +	root = s->memcg_params->root_cache;
>> +
>> +	css_get(&memcg->css);
>> +
>> +	/*
>> +	 * Since readers won't lock (see cache_from_memcg_idx()), we need a
>> +	 * barrier here to ensure nobody will see the kmem_cache partially
>> +	 * initialized.
>> +	 */
>> +	smp_wmb();
>> +
>> +	root->memcg_params->memcg_caches[id] = s;
>> +
>> +	mutex_lock(&memcg->slab_caches_mutex);
>> +	list_add(&s->memcg_params->list, &memcg->memcg_slab_caches);
>> +	mutex_unlock(&memcg->slab_caches_mutex);
>> +}
>> +
>>  void memcg_release_cache(struct kmem_cache *s)
>>  {
>>  	struct kmem_cache *root;
>> @@ -3356,26 +3375,13 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
>>  	schedule_work(&cachep->memcg_params->destroy);
>>  }
>>  
>> -/*
>> - * This lock protects updaters, not readers. We want readers to be as fast as
>> - * they can, and they will either see NULL or a valid cache value. Our model
>> - * allow them to see NULL, in which case the root memcg will be selected.
>> - *
>> - * We need this lock because multiple allocations to the same cache from a non
>> - * will span more than one worker. Only one of them can create the cache.
>> - */
>> -static DEFINE_MUTEX(memcg_cache_mutex);
>> -
>> -/*
>> - * Called with memcg_cache_mutex held
>> - */
>> -static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
>> -					 struct kmem_cache *s)
>> +static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
>> +						  struct kmem_cache *s)
>>  {
>>  	struct kmem_cache *new;
>>  	static char *tmp_name = NULL;
>>  
>> -	lockdep_assert_held(&memcg_cache_mutex);
>> +	BUG_ON(!memcg_can_account_kmem(memcg));
>>  
>>  	/*
>>  	 * kmem_cache_create_memcg duplicates the given name and
>> @@ -3403,45 +3409,6 @@ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
>>  	return new;
>>  }
>>  
>> -static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
>> -						  struct kmem_cache *cachep)
>> -{
>> -	struct kmem_cache *new_cachep;
>> -	int idx;
>> -
>> -	BUG_ON(!memcg_can_account_kmem(memcg));
>> -
>> -	idx = memcg_cache_id(memcg);
>> -
>> -	mutex_lock(&memcg_cache_mutex);
>> -	new_cachep = cache_from_memcg_idx(cachep, idx);
>> -	if (new_cachep) {
>> -		css_put(&memcg->css);
>> -		goto out;
>> -	}
>> -
>> -	new_cachep = kmem_cache_dup(memcg, cachep);
>> -	if (new_cachep == NULL) {
>> -		new_cachep = cachep;
>> -		css_put(&memcg->css);
>> -		goto out;
>> -	}
>> -
>> -	atomic_set(&new_cachep->memcg_params->nr_pages , 0);
>> -
>> -	/*
>> -	 * Since readers won't lock (see cache_from_memcg_idx()), we need a
>> -	 * barrier here to ensure nobody will see the kmem_cache partially
>> -	 * initialized.
>> -	 */
>> -	smp_wmb();
>> -
>> -	cachep->memcg_params->memcg_caches[idx] = new_cachep;
>> -out:
>> -	mutex_unlock(&memcg_cache_mutex);
>> -	return new_cachep;
>> -}
>> -
>>  void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
>>  {
>>  	struct kmem_cache *c;
>> @@ -3516,6 +3483,7 @@ static void memcg_create_cache_work_func(struct work_struct *w)
>>  
>>  	cw = container_of(w, struct create_work, work);
>>  	memcg_create_kmem_cache(cw->memcg, cw->cachep);
>> +	css_put(&cw->memcg->css);
>>  	kfree(cw);
>>  }
>>  
>> diff --git a/mm/slab_common.c b/mm/slab_common.c
>> index 62712fe..51dc106 100644
>> --- a/mm/slab_common.c
>> +++ b/mm/slab_common.c
>> @@ -176,6 +176,12 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
>>  	get_online_cpus();
>>  	mutex_lock(&slab_mutex);
>>  
>> +	if (memcg) {
>> +		s = cache_from_memcg_idx(parent_cache, memcg_cache_id(memcg));
>> +		if (s)
>> +			goto out_unlock;
>> +	}
>> +
>>  	err = kmem_cache_sanity_check(memcg, name, size);
>>  	if (err)
>>  		goto out_unlock;
>> @@ -218,7 +224,7 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
>>  
>>  	s->refcount = 1;
>>  	list_add(&s->list, &slab_caches);
>> -	memcg_cache_list_add(memcg, s);
>> +	memcg_register_cache(s);
>>  
>>  out_unlock:
>>  	mutex_unlock(&slab_mutex);
>> -- 
>> 1.7.10.4
>>


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/6] slab: cleanup kmem_cache_create_memcg()
  2013-12-18 13:16 [PATCH 1/6] slab: cleanup kmem_cache_create_memcg() Vladimir Davydov
                   ` (5 preceding siblings ...)
  2013-12-18 16:56 ` [PATCH 1/6] slab: cleanup kmem_cache_create_memcg() Michal Hocko
@ 2013-12-19  7:27 ` Pekka Enberg
  2013-12-19  8:17 ` [Devel] " Vasily Averin
  7 siblings, 0 replies; 42+ messages in thread
From: Pekka Enberg @ 2013-12-19  7:27 UTC (permalink / raw)
  To: Vladimir Davydov, Michal Hocko
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

On 12/18/2013 03:16 PM, Vladimir Davydov wrote:
> Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
> Cc: Michal Hocko <mhocko@suse.cz>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Glauber Costa <glommer@gmail.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Pekka Enberg <penberg@kernel.org>
> Cc: Andrew Morton <akpm@linux-foundation.org>

Looks good to me.  Even though this patch just touches slab, I think it 
should go through the memcg tree.

Reviewed-by: Pekka Enberg <penberg@kernel.org>

                         Pekka

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 4/6] memcg, slab: check and init memcg_cahes under slab_mutex
  2013-12-19  7:07     ` Vladimir Davydov
@ 2013-12-19  8:00       ` Glauber Costa
  2013-12-19  9:12         ` Michal Hocko
  2013-12-19  9:21         ` Vladimir Davydov
  0 siblings, 2 replies; 42+ messages in thread
From: Glauber Costa @ 2013-12-19  8:00 UTC (permalink / raw)
  To: Vladimir Davydov
  Cc: Michal Hocko, LKML, linux-mm, cgroups, devel, Johannes Weiner,
	Christoph Lameter, Pekka Enberg, Andrew Morton

On Thu, Dec 19, 2013 at 11:07 AM, Vladimir Davydov
<vdavydov@parallels.com> wrote:
> On 12/18/2013 09:41 PM, Michal Hocko wrote:
>> On Wed 18-12-13 17:16:55, Vladimir Davydov wrote:
>>> The memcg_params::memcg_caches array can be updated concurrently from
>>> memcg_update_cache_size() and memcg_create_kmem_cache(). Although both
>>> of these functions take the slab_mutex during their operation, the
>>> latter checks if memcg's cache has already been allocated w/o taking the
>>> mutex. This can result in a race as described below.
>>>
>>> Assume two threads schedule kmem_cache creation works for the same
>>> kmem_cache of the same memcg from __memcg_kmem_get_cache(). One of the
>>> works successfully creates it. Another work should fail then, but if it
>>> interleaves with memcg_update_cache_size() as follows, it does not:
>> I am not sure I understand the race. memcg_update_cache_size is called
>> when we start accounting a new memcg or a child is created and it
>> inherits accounting from the parent. memcg_create_kmem_cache is called
>> when a new cache is first allocated from, right?
>
> memcg_update_cache_size() is called when kmem accounting is activated
> for a memcg, no matter how.
>
> memcg_create_kmem_cache() is scheduled from __memcg_kmem_get_cache().
> It's OK to have a bunch of such methods trying to create the same memcg
> cache concurrently, but only one of them should succeed.
>
>> Why cannot we simply take slab_mutex inside memcg_create_kmem_cache?
>> it is running from the workqueue context so it should not clash with
>> other locks.
>
> Hmm, Glauber's code never takes the slab_mutex inside memcontrol.c. I
> have always been wondering why, because it could simplify flow paths
> significantly (e.g. update_cache_sizes() -> update_all_caches() ->
> update_cache_size() - from memcontrol.c to slab_common.c and back again
> just to take the mutex).
>

Because that is a layering violation and exposes implementation
details of the slab to
the outside world. I agree this would make things a lot simpler, but
please check with Christoph
if this is acceptable before going forward.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Devel] [PATCH 1/6] slab: cleanup kmem_cache_create_memcg()
  2013-12-18 13:16 [PATCH 1/6] slab: cleanup kmem_cache_create_memcg() Vladimir Davydov
                   ` (6 preceding siblings ...)
  2013-12-19  7:27 ` Pekka Enberg
@ 2013-12-19  8:17 ` Vasily Averin
  2013-12-19  8:39   ` Vladimir Davydov
  7 siblings, 1 reply; 42+ messages in thread
From: Vasily Averin @ 2013-12-19  8:17 UTC (permalink / raw)
  To: Vladimir Davydov
  Cc: Michal Hocko, Glauber Costa, linux-kernel, Pekka Enberg,
	linux-mm, Johannes Weiner, cgroups, Christoph Lameter,
	Andrew Morton, devel

On 12/18/2013 05:16 PM, Vladimir Davydov wrote:
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -176,8 +176,9 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
>  	get_online_cpus();
>  	mutex_lock(&slab_mutex);
>  
> -	if (!kmem_cache_sanity_check(memcg, name, size) == 0)
> -		goto out_locked;
> +	err = kmem_cache_sanity_check(memcg, name, size);
> +	if (err)
> +		goto out_unlock;
>  
>  	/*
>  	 * Some allocators will constraint the set of valid flags to a subset

Theoretically in future kmem_cache_sanity_check() can return positive value.
Probably it's better to check (err < 0) in caller ?

Thank you,
	Vasily Averin

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Devel] [PATCH 1/6] slab: cleanup kmem_cache_create_memcg()
  2013-12-19  8:17 ` [Devel] " Vasily Averin
@ 2013-12-19  8:39   ` Vladimir Davydov
  2013-12-19  9:26     ` Vasily Averin
  0 siblings, 1 reply; 42+ messages in thread
From: Vladimir Davydov @ 2013-12-19  8:39 UTC (permalink / raw)
  To: Vasily Averin
  Cc: Michal Hocko, Glauber Costa, linux-kernel, Pekka Enberg,
	linux-mm, Johannes Weiner, cgroups, Christoph Lameter,
	Andrew Morton, devel

On 12/19/2013 12:17 PM, Vasily Averin wrote:
> On 12/18/2013 05:16 PM, Vladimir Davydov wrote:
>> --- a/mm/slab_common.c
>> +++ b/mm/slab_common.c
>> @@ -176,8 +176,9 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
>>  	get_online_cpus();
>>  	mutex_lock(&slab_mutex);
>>  
>> -	if (!kmem_cache_sanity_check(memcg, name, size) == 0)
>> -		goto out_locked;
>> +	err = kmem_cache_sanity_check(memcg, name, size);
>> +	if (err)
>> +		goto out_unlock;
>>  
>>  	/*
>>  	 * Some allocators will constraint the set of valid flags to a subset
> Theoretically in future kmem_cache_sanity_check() can return positive value.
> Probably it's better to check (err < 0) in caller ?

Hmm, why? What information could positive retval carry here? We have
plenty of places throughout the code where we check for (err), not
(err<0), simply because it looks clearer, e.g. look at
__kmem_cache_create() calls. If it returns a positive value one day, we
will have to parse every place where it's called. Anyway, if someone
wants to change a function behavior, he must check every place where
this function is called and fix them accordingly.

Thanks.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/6] slab: cleanup kmem_cache_create_memcg()
  2013-12-19  6:31   ` Vladimir Davydov
@ 2013-12-19  8:44     ` Michal Hocko
  2013-12-19  8:51       ` Vladimir Davydov
  0 siblings, 1 reply; 42+ messages in thread
From: Michal Hocko @ 2013-12-19  8:44 UTC (permalink / raw)
  To: Vladimir Davydov
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

On Thu 19-12-13 10:31:43, Vladimir Davydov wrote:
> On 12/18/2013 08:56 PM, Michal Hocko wrote:
> > On Wed 18-12-13 17:16:52, Vladimir Davydov wrote:
> >> Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
> >> Cc: Michal Hocko <mhocko@suse.cz>
> >> Cc: Johannes Weiner <hannes@cmpxchg.org>
> >> Cc: Glauber Costa <glommer@gmail.com>
> >> Cc: Christoph Lameter <cl@linux.com>
> >> Cc: Pekka Enberg <penberg@kernel.org>
> >> Cc: Andrew Morton <akpm@linux-foundation.org>
> > Dunno, is this really better to be worth the code churn?
> >
> > It even makes the generated code a tiny bit bigger:
> > text    data     bss     dec     hex filename
> > 4355     171     236    4762    129a mm/slab_common.o.after
> > 4342     171     236    4749    128d mm/slab_common.o.before
> >
> > Or does it make the further changes much more easier? Be explicit in the
> > patch description if so.
> 
> Hi, Michal
> 
> IMO, undoing under labels looks better than inside conditionals, because
> we don't have to repeat the same deinitialization code then, like this
> (note three calls to kmem_cache_free()):

Agreed but the resulting code is far from doing nice undo on different
conditions. You have out_free_cache which frees everything regardless
whether name or cache registration failed. So it doesn't help with
readability much IMO.

> 
>     s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
>     if (s) {
>         s->object_size = s->size = size;
>         s->align = calculate_alignment(flags, align, size);
>         s->ctor = ctor;
> 
>         if (memcg_register_cache(memcg, s, parent_cache)) {
>             kmem_cache_free(kmem_cache, s);
>             err = -ENOMEM;
>             goto out_locked;
>         }
> 
>         s->name = kstrdup(name, GFP_KERNEL);
>         if (!s->name) {
>             kmem_cache_free(kmem_cache, s);
>             err = -ENOMEM;
>             goto out_locked;
>         }
> 
>         err = __kmem_cache_create(s, flags);
>         if (!err) {
>             s->refcount = 1;
>             list_add(&s->list, &slab_caches);
>             memcg_cache_list_add(memcg, s);
>         } else {
>             kfree(s->name);
>             kmem_cache_free(kmem_cache, s);
>         }
>     } else
>         err = -ENOMEM;
> 
> The next patch, which fixes the memcg_params leakage on error, would
> make it even worse introducing two calls to memcg_free_cache_params()
> after kstrdup and __kmem_cache_create.
> 
> If you think it isn't worthwhile applying this patch, just let me know,
> I don't mind dropping it.

As I've said if it helps with the later patches then I do not mind but
on its own it doesn't sound like a huge improvement.

Btw. you do not have to set err = -ENOMEM before goto out_locked. Just
set before kmem_cache_zalloc. You also do not need to initialize it to 0
because kmem_cache_sanity_check will set it.

> Anyway, I'll improve the comment and resend.

Thanks!

> Thanks.
> 
> >
> >> ---
> >>  mm/slab_common.c |   66 +++++++++++++++++++++++++++---------------------------
> >>  1 file changed, 33 insertions(+), 33 deletions(-)
> >>
> >> diff --git a/mm/slab_common.c b/mm/slab_common.c
> >> index 0b7bb39..5d6f743 100644
> >> --- a/mm/slab_common.c
> >> +++ b/mm/slab_common.c
> >> @@ -176,8 +176,9 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
> >>  	get_online_cpus();
> >>  	mutex_lock(&slab_mutex);
> >>  
> >> -	if (!kmem_cache_sanity_check(memcg, name, size) == 0)
> >> -		goto out_locked;
> >> +	err = kmem_cache_sanity_check(memcg, name, size);
> >> +	if (err)
> >> +		goto out_unlock;
> >>  
> >>  	/*
> >>  	 * Some allocators will constraint the set of valid flags to a subset
> >> @@ -189,45 +190,41 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
> >>  
> >>  	s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
> >>  	if (s)
> >> -		goto out_locked;
> >> +		goto out_unlock;
> >>  
> >>  	s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
> >> -	if (s) {
> >> -		s->object_size = s->size = size;
> >> -		s->align = calculate_alignment(flags, align, size);
> >> -		s->ctor = ctor;
> >> -
> >> -		if (memcg_register_cache(memcg, s, parent_cache)) {
> >> -			kmem_cache_free(kmem_cache, s);
> >> -			err = -ENOMEM;
> >> -			goto out_locked;
> >> -		}
> >> +	if (!s) {
> >> +		err = -ENOMEM;
> >> +		goto out_unlock;
> >> +	}
> >>  
> >> -		s->name = kstrdup(name, GFP_KERNEL);
> >> -		if (!s->name) {
> >> -			kmem_cache_free(kmem_cache, s);
> >> -			err = -ENOMEM;
> >> -			goto out_locked;
> >> -		}
> >> +	s->object_size = s->size = size;
> >> +	s->align = calculate_alignment(flags, align, size);
> >> +	s->ctor = ctor;
> >>  
> >> -		err = __kmem_cache_create(s, flags);
> >> -		if (!err) {
> >> -			s->refcount = 1;
> >> -			list_add(&s->list, &slab_caches);
> >> -			memcg_cache_list_add(memcg, s);
> >> -		} else {
> >> -			kfree(s->name);
> >> -			kmem_cache_free(kmem_cache, s);
> >> -		}
> >> -	} else
> >> +	s->name = kstrdup(name, GFP_KERNEL);
> >> +	if (!s->name) {
> >>  		err = -ENOMEM;
> >> +		goto out_free_cache;
> >> +	}
> >> +
> >> +	err = memcg_register_cache(memcg, s, parent_cache);
> >> +	if (err)
> >> +		goto out_free_cache;
> >>  
> >> -out_locked:
> >> +	err = __kmem_cache_create(s, flags);
> >> +	if (err)
> >> +		goto out_free_cache;
> >> +
> >> +	s->refcount = 1;
> >> +	list_add(&s->list, &slab_caches);
> >> +	memcg_cache_list_add(memcg, s);
> >> +
> >> +out_unlock:
> >>  	mutex_unlock(&slab_mutex);
> >>  	put_online_cpus();
> >>  
> >>  	if (err) {
> >> -
> >>  		if (flags & SLAB_PANIC)
> >>  			panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
> >>  				name, err);
> >> @@ -236,11 +233,14 @@ out_locked:
> >>  				name, err);
> >>  			dump_stack();
> >>  		}
> >> -
> >>  		return NULL;
> >>  	}
> >> -
> >>  	return s;
> >> +
> >> +out_free_cache:
> >> +	kfree(s->name);
> >> +	kmem_cache_free(kmem_cache, s);
> >> +	goto out_unlock;
> >>  }
> >>  
> >>  struct kmem_cache *
> >> -- 
> >> 1.7.10.4
> >>
> 

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 2/6] memcg, slab: kmem_cache_create_memcg(): free memcg params on error
  2013-12-19  6:32     ` Vladimir Davydov
@ 2013-12-19  8:48       ` Michal Hocko
  2013-12-19  9:01         ` Vladimir Davydov
  0 siblings, 1 reply; 42+ messages in thread
From: Michal Hocko @ 2013-12-19  8:48 UTC (permalink / raw)
  To: Vladimir Davydov
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

On Thu 19-12-13 10:32:29, Vladimir Davydov wrote:
> On 12/18/2013 09:06 PM, Michal Hocko wrote:
> > On Wed 18-12-13 17:16:53, Vladimir Davydov wrote:
> >> Plus, rename memcg_register_cache() to memcg_init_cache_params(),
> >> because it actually does not register the cache anywhere, but simply
> >> initialize kmem_cache::memcg_params.
> > I've almost missed this is a memory leak fix.
> 
> Yeah, the comment is poor, sorry about that. Will fix it.
> 
> > I do not mind renaming and the name but wouldn't
> > memcg_alloc_cache_params suit better?
> 
> As you wish. I don't have a strong preference for memcg_init_cache_params.

I really hate naming... but it seems that alloc is a better fit. _init_
would expect an already allocated object.

Btw. memcg_free_cache_params is called only once which sounds
suspicious. The regular destroy path should use it as well?
[...]
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/6] slab: cleanup kmem_cache_create_memcg()
  2013-12-19  8:44     ` Michal Hocko
@ 2013-12-19  8:51       ` Vladimir Davydov
  2013-12-19  9:16         ` Michal Hocko
  0 siblings, 1 reply; 42+ messages in thread
From: Vladimir Davydov @ 2013-12-19  8:51 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

On 12/19/2013 12:44 PM, Michal Hocko wrote:
> On Thu 19-12-13 10:31:43, Vladimir Davydov wrote:
>> On 12/18/2013 08:56 PM, Michal Hocko wrote:
>>> On Wed 18-12-13 17:16:52, Vladimir Davydov wrote:
>>>> Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
>>>> Cc: Michal Hocko <mhocko@suse.cz>
>>>> Cc: Johannes Weiner <hannes@cmpxchg.org>
>>>> Cc: Glauber Costa <glommer@gmail.com>
>>>> Cc: Christoph Lameter <cl@linux.com>
>>>> Cc: Pekka Enberg <penberg@kernel.org>
>>>> Cc: Andrew Morton <akpm@linux-foundation.org>
>>> Dunno, is this really better to be worth the code churn?
>>>
>>> It even makes the generated code tiny bit bigger:
>>> text    data     bss     dec     hex filename
>>> 4355     171     236    4762    129a mm/slab_common.o.after
>>> 4342     171     236    4749    128d mm/slab_common.o.before
>>>
>>> Or does it make the further changes much more easier? Be explicit in the
>>> patch description if so.
>> Hi, Michal
>>
>> IMO, undoing under labels looks better than inside conditionals, because
>> we don't have to repeat the same deinitialization code then, like this
>> (note three calls to kmem_cache_free()):
> Agreed but the resulting code is far from doing nice undo on different
> conditions. You have out_free_cache which frees everything regardless
> whether name or cache registration failed. So it doesn't help with
> readability much IMO.

AFAIK it's common practice not to split kfree's to be called under
different labels on fail paths, because kfree(NULL) results in a no-op.
Since on undo, we only call kfree, I introduce the only label. Of course
I could do something like

    s->name=...
    if (!s->name)
        goto out_free_name;
    err = __kmem_new_cache(...)
    if (err)
        goto out_free_name;
<...>
out_free_name:
    kfree(s->name);
out_free_cache:
    kfree(s);
    goto out_unlock;

But I think using only out_free_cache makes the code look clearer.

>
>>     s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
>>     if (s) {
>>         s->object_size = s->size = size;
>>         s->align = calculate_alignment(flags, align, size);
>>         s->ctor = ctor;
>>
>>         if (memcg_register_cache(memcg, s, parent_cache)) {
>>             kmem_cache_free(kmem_cache, s);
>>             err = -ENOMEM;
>>             goto out_locked;
>>         }
>>
>>         s->name = kstrdup(name, GFP_KERNEL);
>>         if (!s->name) {
>>             kmem_cache_free(kmem_cache, s);
>>             err = -ENOMEM;
>>             goto out_locked;
>>         }
>>
>>         err = __kmem_cache_create(s, flags);
>>         if (!err) {
>>             s->refcount = 1;
>>             list_add(&s->list, &slab_caches);
>>             memcg_cache_list_add(memcg, s);
>>         } else {
>>             kfree(s->name);
>>             kmem_cache_free(kmem_cache, s);
>>         }
>>     } else
>>         err = -ENOMEM;
>>
>> The next patch, which fixes the memcg_params leakage on error, would
>> make it even worse introducing two calls to memcg_free_cache_params()
>> after kstrdup and __kmem_cache_create.
>>
>> If you think it isn't worthwhile applying this patch, just let me know,
>> I don't mind dropping it.
> As I've said if it helps with the later patches then I do not mind but
> on its own it doesn't sound like a huge improvement.
>
> Btw. you do not have to set err = -ENOMEM before goto out_locked. Just
> set before kmem_cache_zalloc. You also do not need to initialize it to 0
> because kmem_cache_sanity_check will set it.

OK, thanks.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 2/6] memcg, slab: kmem_cache_create_memcg(): free memcg params on error
  2013-12-19  8:48       ` Michal Hocko
@ 2013-12-19  9:01         ` Vladimir Davydov
  2013-12-19  9:19           ` Michal Hocko
  0 siblings, 1 reply; 42+ messages in thread
From: Vladimir Davydov @ 2013-12-19  9:01 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

On 12/19/2013 12:48 PM, Michal Hocko wrote:
> On Thu 19-12-13 10:32:29, Vladimir Davydov wrote:
>> On 12/18/2013 09:06 PM, Michal Hocko wrote:
>>> On Wed 18-12-13 17:16:53, Vladimir Davydov wrote:
>>>> Plus, rename memcg_register_cache() to memcg_init_cache_params(),
>>>> because it actually does not register the cache anywhere, but simply
>>>> initialize kmem_cache::memcg_params.
>>> I've almost missed this is a memory leak fix.
>> Yeah, the comment is poor, sorry about that. Will fix it.
>>
>>> I do not mind renaming and the name but wouldn't
>>> memcg_alloc_cache_params suit better?
>> As you wish. I don't have a strong preference for memcg_init_cache_params.
> I really hate naming... but it seems that alloc is a better fit. _init_
> would expect an already allocated object.
>
> Btw. memcg_free_cache_params is called only once which sounds
> suspicious. The regular destroy path should use it as well?
> [...]

The usual destroy path uses memcg_release_cache(), which does the trick.
Plus, it actually "unregisters" the cache. BTW, I forgot to substitute
kfree(s->memcg_params) with the new memcg_free_cache_params() there.
Although it currently does not break anything, better to fix it in case
new memcg_free_cache_params() will have to do something else.

And you're right about the naming is not good.

Currently we have:

  on create:
    memcg_register_cache()
    memcg_cache_list_add()
  on destroy:
    memcg_release_cache()

After this patch we would have:

  on create:
    memcg_alloc_cache_params()
    memcg_register_cache()
  on destroy:
    memcg_release_cache()

Still not perfect: "alloc" does not have corresponding "free", while
"register" does not have corresponding "unregister", everything is done
by "release".

What do you think about splitting memcg_release_cache() into two functions:

    memcg_unregister_cache()
    memcg_free_cache_params()

?

Thanks.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 3/6] memcg, slab: cleanup barrier usage when accessing memcg_caches
  2013-12-19  6:37     ` Vladimir Davydov
@ 2013-12-19  9:10       ` Michal Hocko
  2013-12-19  9:16         ` Vladimir Davydov
  0 siblings, 1 reply; 42+ messages in thread
From: Michal Hocko @ 2013-12-19  9:10 UTC (permalink / raw)
  To: Vladimir Davydov
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

On Thu 19-12-13 10:37:27, Vladimir Davydov wrote:
> On 12/18/2013 09:14 PM, Michal Hocko wrote:
> > On Wed 18-12-13 17:16:54, Vladimir Davydov wrote:
> >> First, in memcg_create_kmem_cache() we should issue the write barrier
> >> after the kmem_cache is initialized, but before storing the pointer to
> >> it in its parent's memcg_params.
> >>
> >> Second, we should always issue the read barrier after
> >> cache_from_memcg_idx() to conform with the write barrier.
> >>
> >> Third, its better to use smp_* versions of barriers, because we don't
> >> need them on UP systems.
> > Please be (much) more verbose on Why. Barriers are tricky and should be
> > documented accordingly. So if you say that we should issue a barrier
> > always be specific why we should do it.
> 
> In short, we have kmem_cache::memcg_params::memcg_caches is an array of
> pointers to per-memcg caches. We access it lock-free so we should use
> memory barriers during initialization. Obviously we should place a write
> barrier just before we set the pointer in order to make sure nobody will
> see a partially initialized structure. Besides there must be a read
> barrier between reading the pointer and accessing the structure, to
> conform with the write barrier. It's all that similar to rcu_assign and
> rcu_deref. Currently the barrier usage looks rather strange:
> 
> memcg_create_kmem_cache:
>     initialize kmem
>     set the pointer in memcg_caches
>     wmb() // ???
> 
> __memcg_kmem_get_cache:
>     <...>
>     read_barrier_depends() // ???
>     cachep = root_cache->memcg_params->memcg_caches[memcg_id]
>     <...>

Why do we need explicit memory barriers when we can use RCU?
__memcg_kmem_get_cache already dereferences within rcu_read_lock.

Btw. cache_from_memcg_idx is desperately asking for a comment about
required locking.

> Nothing prevents some archs from moving initialization after setting the
> pointer, or reading data before reading the pointer to it.
> 
> Of course, I will include a detailed description in the next version of
> this patch.
> 
> Thanks.
> 
> >> Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
> >> Cc: Michal Hocko <mhocko@suse.cz>
> >> Cc: Johannes Weiner <hannes@cmpxchg.org>
> >> Cc: Glauber Costa <glommer@gmail.com>
> >> Cc: Christoph Lameter <cl@linux.com>
> >> Cc: Pekka Enberg <penberg@kernel.org>
> >> Cc: Andrew Morton <akpm@linux-foundation.org>
> >> ---
> >>  mm/memcontrol.c |   24 ++++++++++--------------
> >>  mm/slab.h       |    6 +++++-
> >>  2 files changed, 15 insertions(+), 15 deletions(-)
> >>
> >> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> >> index e6ad6ff..e37fdb5 100644
> >> --- a/mm/memcontrol.c
> >> +++ b/mm/memcontrol.c
> >> @@ -3429,12 +3429,14 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
> >>  
> >>  	atomic_set(&new_cachep->memcg_params->nr_pages , 0);
> >>  
> >> -	cachep->memcg_params->memcg_caches[idx] = new_cachep;
> >>  	/*
> >> -	 * the readers won't lock, make sure everybody sees the updated value,
> >> -	 * so they won't put stuff in the queue again for no reason
> >> +	 * Since readers won't lock (see cache_from_memcg_idx()), we need a
> >> +	 * barrier here to ensure nobody will see the kmem_cache partially
> >> +	 * initialized.
> >>  	 */
> >> -	wmb();
> >> +	smp_wmb();
> >> +
> >> +	cachep->memcg_params->memcg_caches[idx] = new_cachep;
> >>  out:
> >>  	mutex_unlock(&memcg_cache_mutex);
> >>  	return new_cachep;
> >> @@ -3573,7 +3575,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
> >>  					  gfp_t gfp)
> >>  {
> >>  	struct mem_cgroup *memcg;
> >> -	int idx;
> >> +	struct kmem_cache *memcg_cachep;
> >>  
> >>  	VM_BUG_ON(!cachep->memcg_params);
> >>  	VM_BUG_ON(!cachep->memcg_params->is_root_cache);
> >> @@ -3587,15 +3589,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
> >>  	if (!memcg_can_account_kmem(memcg))
> >>  		goto out;
> >>  
> >> -	idx = memcg_cache_id(memcg);
> >> -
> >> -	/*
> >> -	 * barrier to mare sure we're always seeing the up to date value.  The
> >> -	 * code updating memcg_caches will issue a write barrier to match this.
> >> -	 */
> >> -	read_barrier_depends();
> >> -	if (likely(cache_from_memcg_idx(cachep, idx))) {
> >> -		cachep = cache_from_memcg_idx(cachep, idx);
> >> +	memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
> >> +	if (likely(memcg_cachep)) {
> >> +		cachep = memcg_cachep;
> >>  		goto out;
> >>  	}
> >>  
> >> diff --git a/mm/slab.h b/mm/slab.h
> >> index 0859c42..1d8b53f 100644
> >> --- a/mm/slab.h
> >> +++ b/mm/slab.h
> >> @@ -163,9 +163,13 @@ static inline const char *cache_name(struct kmem_cache *s)
> >>  static inline struct kmem_cache *
> >>  cache_from_memcg_idx(struct kmem_cache *s, int idx)
> >>  {
> >> +	struct kmem_cache *cachep;
> >> +
> >>  	if (!s->memcg_params)
> >>  		return NULL;
> >> -	return s->memcg_params->memcg_caches[idx];
> >> +	cachep = s->memcg_params->memcg_caches[idx];
> >> +	smp_read_barrier_depends();	/* see memcg_register_cache() */
> >> +	return cachep;
> >>  }
> >>  
> >>  static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
> >> -- 
> >> 1.7.10.4
> >>
> 

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 4/6] memcg, slab: check and init memcg_cahes under slab_mutex
  2013-12-19  8:00       ` Glauber Costa
@ 2013-12-19  9:12         ` Michal Hocko
  2013-12-19  9:17           ` Vladimir Davydov
  2013-12-19  9:21         ` Vladimir Davydov
  1 sibling, 1 reply; 42+ messages in thread
From: Michal Hocko @ 2013-12-19  9:12 UTC (permalink / raw)
  To: Glauber Costa
  Cc: Vladimir Davydov, LKML, linux-mm, cgroups, devel,
	Johannes Weiner, Christoph Lameter, Pekka Enberg, Andrew Morton

On Thu 19-12-13 12:00:58, Glauber Costa wrote:
> On Thu, Dec 19, 2013 at 11:07 AM, Vladimir Davydov
> <vdavydov@parallels.com> wrote:
> > On 12/18/2013 09:41 PM, Michal Hocko wrote:
> >> On Wed 18-12-13 17:16:55, Vladimir Davydov wrote:
> >>> The memcg_params::memcg_caches array can be updated concurrently from
> >>> memcg_update_cache_size() and memcg_create_kmem_cache(). Although both
> >>> of these functions take the slab_mutex during their operation, the
> >>> latter checks if memcg's cache has already been allocated w/o taking the
> >>> mutex. This can result in a race as described below.
> >>>
> >>> Assume two threads schedule kmem_cache creation works for the same
> >>> kmem_cache of the same memcg from __memcg_kmem_get_cache(). One of the
> >>> works successfully creates it. Another work should fail then, but if it
> >>> interleaves with memcg_update_cache_size() as follows, it does not:
> >> I am not sure I understand the race. memcg_update_cache_size is called
> >> when we start accounting a new memcg or a child is created and it
> >> inherits accounting from the parent. memcg_create_kmem_cache is called
> >> when a new cache is first allocated from, right?
> >
> > memcg_update_cache_size() is called when kmem accounting is activated
> > for a memcg, no matter how.
> >
> > memcg_create_kmem_cache() is scheduled from __memcg_kmem_get_cache().
> > It's OK to have a bunch of such methods trying to create the same memcg
> > cache concurrently, but only one of them should succeed.
> >
> >> Why cannot we simply take slab_mutex inside memcg_create_kmem_cache?
> >> it is running from the workqueue context so it shouldn't clash with other
> >> locks.
> >
> > Hmm, Glauber's code never takes the slab_mutex inside memcontrol.c. I
> > have always been wondering why, because it could simplify flow paths
> > significantly (e.g. update_cache_sizes() -> update_all_caches() ->
> > update_cache_size() - from memcontrol.c to slab_common.c and back again
> > just to take the mutex).
> >
> 
> Because that is a layering violation and exposes implementation
> details of the slab to
> the outside world. I agree this would make things a lot simpler, but
> please check with Christoph
> if this is acceptable before going forward.

We do not have to expose the lock directly. We can hide it behind a
helper function. Relying on the lock silently at many places is worse
than exposing it IMHO.

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 3/6] memcg, slab: cleanup barrier usage when accessing memcg_caches
  2013-12-19  9:10       ` Michal Hocko
@ 2013-12-19  9:16         ` Vladimir Davydov
  2013-12-19  9:21           ` Michal Hocko
  0 siblings, 1 reply; 42+ messages in thread
From: Vladimir Davydov @ 2013-12-19  9:16 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

On 12/19/2013 01:10 PM, Michal Hocko wrote:
> On Thu 19-12-13 10:37:27, Vladimir Davydov wrote:
>> On 12/18/2013 09:14 PM, Michal Hocko wrote:
>>> On Wed 18-12-13 17:16:54, Vladimir Davydov wrote:
>>>> First, in memcg_create_kmem_cache() we should issue the write barrier
>>>> after the kmem_cache is initialized, but before storing the pointer to
>>>> it in its parent's memcg_params.
>>>>
>>>> Second, we should always issue the read barrier after
>>>> cache_from_memcg_idx() to conform with the write barrier.
>>>>
>>>> Third, its better to use smp_* versions of barriers, because we don't
>>>> need them on UP systems.
>>> Please be (much) more verbose on Why. Barriers are tricky and should be
>>> documented accordingly. So if you say that we should issue a barrier
>>> always be specific why we should do it.
>> In short, we have kmem_cache::memcg_params::memcg_caches is an array of
>> pointers to per-memcg caches. We access it lock-free so we should use
>> memory barriers during initialization. Obviously we should place a write
>> barrier just before we set the pointer in order to make sure nobody will
>> see a partially initialized structure. Besides there must be a read
>> barrier between reading the pointer and accessing the structure, to
>> conform with the write barrier. It's all that similar to rcu_assign and
>> rcu_deref. Currently the barrier usage looks rather strange:
>>
>> memcg_create_kmem_cache:
>>     initialize kmem
>>     set the pointer in memcg_caches
>>     wmb() // ???
>>
>> __memcg_kmem_get_cache:
>>     <...>
>>     read_barrier_depends() // ???
>>     cachep = root_cache->memcg_params->memcg_caches[memcg_id]
>>     <...>
> Why do we need explicit memory barriers when we can use RCU?
> __memcg_kmem_get_cache already dereferences within rcu_read_lock.

Because it's not RCU, IMO. RCU implies freeing the old version after a
grace period, while kmem_caches are freed immediately. We simply want to
be sure the kmem_cache is fully initialized. And we do not require
calling this in an RCU critical section.

> Btw. cache_from_memcg_idx is desperately asking for a comment about
> required locking.

Actually, I placed a reference to the comment there ;-) but no problem,
I move it to cache_from_memcg_idx().

Thanks.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 1/6] slab: cleanup kmem_cache_create_memcg()
  2013-12-19  8:51       ` Vladimir Davydov
@ 2013-12-19  9:16         ` Michal Hocko
  0 siblings, 0 replies; 42+ messages in thread
From: Michal Hocko @ 2013-12-19  9:16 UTC (permalink / raw)
  To: Vladimir Davydov
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

On Thu 19-12-13 12:51:38, Vladimir Davydov wrote:
> On 12/19/2013 12:44 PM, Michal Hocko wrote:
> > On Thu 19-12-13 10:31:43, Vladimir Davydov wrote:
> >> On 12/18/2013 08:56 PM, Michal Hocko wrote:
> >>> On Wed 18-12-13 17:16:52, Vladimir Davydov wrote:
> >>>> Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
> >>>> Cc: Michal Hocko <mhocko@suse.cz>
> >>>> Cc: Johannes Weiner <hannes@cmpxchg.org>
> >>>> Cc: Glauber Costa <glommer@gmail.com>
> >>>> Cc: Christoph Lameter <cl@linux.com>
> >>>> Cc: Pekka Enberg <penberg@kernel.org>
> >>>> Cc: Andrew Morton <akpm@linux-foundation.org>
> >>> Dunno, is this really better to be worth the code churn?
> >>>
> >>> It even makes the generated code tiny bit bigger:
> >>> text    data     bss     dec     hex filename
> >>> 4355     171     236    4762    129a mm/slab_common.o.after
> >>> 4342     171     236    4749    128d mm/slab_common.o.before
> >>>
> >>> Or does it make the further changes much more easier? Be explicit in the
> >>> patch description if so.
> >> Hi, Michal
> >>
> >> IMO, undoing under labels looks better than inside conditionals, because
> >> we don't have to repeat the same deinitialization code then, like this
> >> (note three calls to kmem_cache_free()):
> > Agreed but the resulting code is far from doing nice undo on different
> > conditions. You have out_free_cache which frees everything regardless
> > whether name or cache registration failed. So it doesn't help with
> > readability much IMO.
> 
> AFAIK it's common practice not to split kfree's to be called under
> different labels on fail paths, because kfree(NULL) results in a no-op.
> Since on undo, we only call kfree, I introduce the only label. Of course
> I could do something like
> 
>     s->name=...
>     if (!s->name)
>         goto out_free_name;
>     err = __kmem_new_cache(...)
>     if (err)
>         goto out_free_name;
> <...>
> out_free_name:
>     kfree(s->name);
> out_free_cache:
>     kfree(s);
>     goto out_unlock;
> 
> But I think using only out_free_cache makes the code look clearer.

I disagree. It is much easier to review code for mem leaks when you have
explicit cleanup gotos. But this is a matter of taste I guess.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 4/6] memcg, slab: check and init memcg_cahes under slab_mutex
  2013-12-19  9:12         ` Michal Hocko
@ 2013-12-19  9:17           ` Vladimir Davydov
  0 siblings, 0 replies; 42+ messages in thread
From: Vladimir Davydov @ 2013-12-19  9:17 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Glauber Costa, LKML, linux-mm, cgroups, devel, Johannes Weiner,
	Christoph Lameter, Pekka Enberg, Andrew Morton

On 12/19/2013 01:12 PM, Michal Hocko wrote:
> On Thu 19-12-13 12:00:58, Glauber Costa wrote:
>> On Thu, Dec 19, 2013 at 11:07 AM, Vladimir Davydov
>> <vdavydov@parallels.com> wrote:
>>> On 12/18/2013 09:41 PM, Michal Hocko wrote:
>>>> On Wed 18-12-13 17:16:55, Vladimir Davydov wrote:
>>>>> The memcg_params::memcg_caches array can be updated concurrently from
>>>>> memcg_update_cache_size() and memcg_create_kmem_cache(). Although both
>>>>> of these functions take the slab_mutex during their operation, the
>>>>> latter checks if memcg's cache has already been allocated w/o taking the
>>>>> mutex. This can result in a race as described below.
>>>>>
> >>>>> Assume two threads schedule kmem_cache creation works for the same
>>>>> kmem_cache of the same memcg from __memcg_kmem_get_cache(). One of the
>>>>> works successfully creates it. Another work should fail then, but if it
>>>>> interleaves with memcg_update_cache_size() as follows, it does not:
>>>> I am not sure I understand the race. memcg_update_cache_size is called
>>>> when we start accounting a new memcg or a child is created and it
>>>> inherits accounting from the parent. memcg_create_kmem_cache is called
>>>> when a new cache is first allocated from, right?
>>> memcg_update_cache_size() is called when kmem accounting is activated
>>> for a memcg, no matter how.
>>>
>>> memcg_create_kmem_cache() is scheduled from __memcg_kmem_get_cache().
>>> It's OK to have a bunch of such methods trying to create the same memcg
>>> cache concurrently, but only one of them should succeed.
>>>
>>>> Why cannot we simply take slab_mutex inside memcg_create_kmem_cache?
>>>> it is running from the workqueue context so it shouldn't clash with other
>>>> locks.
>>> Hmm, Glauber's code never takes the slab_mutex inside memcontrol.c. I
>>> have always been wondering why, because it could simplify flow paths
>>> significantly (e.g. update_cache_sizes() -> update_all_caches() ->
>>> update_cache_size() - from memcontrol.c to slab_common.c and back again
>>> just to take the mutex).
>>>
>> Because that is a layering violation and exposes implementation
>> details of the slab to
>> the outside world. I agree this would make things a lot simpler, but
>> please check with Christoph
>> if this is acceptable before going forward.
> We do not have to expose the lock directly. We can hide it behind a
> helper function. Relying on the lock silently at many places is worse
> than exposing it IMHO.

BTW, the lock is already exposed by mm/slab.h, which is included into
mm/memcontrol.c :-) So we have immediate access to the lock right now.

Thanks.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 2/6] memcg, slab: kmem_cache_create_memcg(): free memcg params on error
  2013-12-19  9:01         ` Vladimir Davydov
@ 2013-12-19  9:19           ` Michal Hocko
  0 siblings, 0 replies; 42+ messages in thread
From: Michal Hocko @ 2013-12-19  9:19 UTC (permalink / raw)
  To: Vladimir Davydov
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

On Thu 19-12-13 13:01:28, Vladimir Davydov wrote:
> On 12/19/2013 12:48 PM, Michal Hocko wrote:
> > On Thu 19-12-13 10:32:29, Vladimir Davydov wrote:
> >> On 12/18/2013 09:06 PM, Michal Hocko wrote:
> >>> On Wed 18-12-13 17:16:53, Vladimir Davydov wrote:
> >>>> Plus, rename memcg_register_cache() to memcg_init_cache_params(),
> >>>> because it actually does not register the cache anywhere, but simply
> >>>> initialize kmem_cache::memcg_params.
> >>> I've almost missed this is a memory leak fix.
> >> Yeah, the comment is poor, sorry about that. Will fix it.
> >>
> >>> I do not mind renaming and the name but wouldn't
> >>> memcg_alloc_cache_params suit better?
> >> As you wish. I don't have a strong preference for memcg_init_cache_params.
> > I really hate naming... but it seems that alloc is a better fit. _init_
> > would expect an already allocated object.
> >
> > Btw. memcg_free_cache_params is called only once which sounds
> > suspicious. The regular destroy path should use it as well?
> > [...]
> 
> The usual destroy path uses memcg_release_cache(), which does the trick.
> Plus, it actually "unregisters" the cache. BTW, I forgot to substitute
> kfree(s->memcg_params) with the new memcg_free_cache_params() there.
> Although it currently does not break anything, better to fix it in case
> new memcg_free_cache_params() will have to do something else.
> 
> And you're right about the naming is not good.
> 
> Currently we have:
> 
>   on create:
>     memcg_register_cache()
>     memcg_cache_list_add()
>   on destroy:
>     memcg_release_cache()
> 
> After this patch we would have:
> 
>   on create:
>     memcg_alloc_cache_params()
>     memcg_register_cache()
>   on destroy:
>     memcg_release_cache()
> 
> Still not perfect: "alloc" does not have corresponding "free", while
> "register" does not have corresponding "unregister", everything is done
> by "release".
> 
> What do you think about splitting memcg_release_cache() into two functions:
> 
>     memcg_unregister_cache()
>     memcg_free_cache_params()

yes I am all for cleaning up this mess. I am still trying to wrap my
head around what is each of this function responsible for.
Absolute lack of documentation is not helping at all...

> 
> ?
> 
> Thanks.

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 3/6] memcg, slab: cleanup barrier usage when accessing memcg_caches
  2013-12-19  9:16         ` Vladimir Davydov
@ 2013-12-19  9:21           ` Michal Hocko
  2013-12-19  9:29             ` Vladimir Davydov
  0 siblings, 1 reply; 42+ messages in thread
From: Michal Hocko @ 2013-12-19  9:21 UTC (permalink / raw)
  To: Vladimir Davydov
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

On Thu 19-12-13 13:16:01, Vladimir Davydov wrote:
> On 12/19/2013 01:10 PM, Michal Hocko wrote:
> > On Thu 19-12-13 10:37:27, Vladimir Davydov wrote:
> >> On 12/18/2013 09:14 PM, Michal Hocko wrote:
> >>> On Wed 18-12-13 17:16:54, Vladimir Davydov wrote:
> >>>> First, in memcg_create_kmem_cache() we should issue the write barrier
> >>>> after the kmem_cache is initialized, but before storing the pointer to
> >>>> it in its parent's memcg_params.
> >>>>
> >>>> Second, we should always issue the read barrier after
> >>>> cache_from_memcg_idx() to conform with the write barrier.
> >>>>
> >>>> Third, its better to use smp_* versions of barriers, because we don't
> >>>> need them on UP systems.
> >>> Please be (much) more verbose on Why. Barriers are tricky and should be
> >>> documented accordingly. So if you say that we should issue a barrier
> >>> always be specific why we should do it.
> >> In short, we have kmem_cache::memcg_params::memcg_caches is an array of
> >> pointers to per-memcg caches. We access it lock-free so we should use
> >> memory barriers during initialization. Obviously we should place a write
> >> barrier just before we set the pointer in order to make sure nobody will
> >> see a partially initialized structure. Besides there must be a read
> >> barrier between reading the pointer and accessing the structure, to
> >> conform with the write barrier. It's all that similar to rcu_assign and
> >> rcu_deref. Currently the barrier usage looks rather strange:
> >>
> >> memcg_create_kmem_cache:
> >>     initialize kmem
> >>     set the pointer in memcg_caches
> >>     wmb() // ???
> >>
> >> __memcg_kmem_get_cache:
> >>     <...>
> >>     read_barrier_depends() // ???
> >>     cachep = root_cache->memcg_params->memcg_caches[memcg_id]
> >>     <...>
> > Why do we need explicit memory barriers when we can use RCU?
> > __memcg_kmem_get_cache already dereferences within rcu_read_lock.
> 
> Because it's not RCU, IMO. RCU implies freeing the old version after a
> grace period, while kmem_caches are freed immediately. We simply want to
> be sure the kmem_cache is fully initialized. And we do not require
> calling this in an RCU critical section.

And you can use rcu_dereference and rcu_assign for that as well. It
hides all the juicy details about memory barriers. Besides that nothing
prevents us from freeing from rcu callback. Or?
 
> > Btw. cache_from_memcg_idx is desperately asking for a comment about
> > required locking.
> 
> Actually, I placed a reference to the comment there ;-) but no problem,
> I move it to cache_from_memcg_idx().
> 
> Thanks.

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 4/6] memcg, slab: check and init memcg_cahes under slab_mutex
  2013-12-19  8:00       ` Glauber Costa
  2013-12-19  9:12         ` Michal Hocko
@ 2013-12-19  9:21         ` Vladimir Davydov
  1 sibling, 0 replies; 42+ messages in thread
From: Vladimir Davydov @ 2013-12-19  9:21 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Glauber Costa, Michal Hocko, LKML, linux-mm, cgroups, devel,
	Johannes Weiner, Pekka Enberg, Andrew Morton

Hi, Christoph

We have a problem with memcg-vs-slab interactions. Currently we set the
pointer to a new kmem_cache in its parent's memcg_caches array inside
memcg_create_kmem_cache() (mm/memcontrol.c):

memcg_create_kmem_cache():
    new_cachep = cache_from_memcg_idx(cachep, idx);
    if (new_cachep)
        goto out;
    new_cachep = kmem_cache_dup(memcg, cachep);
    cachep->memcg_params->memcg_caches[idx] = new_cachep;

It seems to be prone to a race as explained in the comment to this
patch. To fix the race, we need to move the assignment of new_cachep to
memcg_caches[idx] to be called under the slab_mutex protection.

There are basically two ways of doing this:

1. Move the assignment to kmem_cache_create_memcg() defined in
mm/slab.c. This is how this patch handles it.
2. Move taking of the slab_mutex, along with some memcg-specific
initialization bits, from kmem_cache_create_memcg() to
memcg_create_kmem_cache().

The second way, although it looks clearer, will break the convention not to
take the slab_mutex inside mm/memcontrol.c, which Glauber tried to observe
while implementing kmemcg.

So the question is: what do you think about taking the slab_mutex
directly from mm/memcontrol.c w/o using helper functions (confusing call
paths, IMO)?

Thanks.

On 12/19/2013 12:00 PM, Glauber Costa wrote:
> On Thu, Dec 19, 2013 at 11:07 AM, Vladimir Davydov
> <vdavydov@parallels.com> wrote:
>> On 12/18/2013 09:41 PM, Michal Hocko wrote:
>>> On Wed 18-12-13 17:16:55, Vladimir Davydov wrote:
>>>> The memcg_params::memcg_caches array can be updated concurrently from
>>>> memcg_update_cache_size() and memcg_create_kmem_cache(). Although both
>>>> of these functions take the slab_mutex during their operation, the
>>>> latter checks if memcg's cache has already been allocated w/o taking the
>>>> mutex. This can result in a race as described below.
>>>>
>>>> Assume two threads schedule kmem_cache creation works for the same
>>>> kmem_cache of the same memcg from __memcg_kmem_get_cache(). One of the
>>>> works successfully creates it. Another work should fail then, but if it
>>>> interleaves with memcg_update_cache_size() as follows, it does not:
>>> I am not sure I understand the race. memcg_update_cache_size is called
>>> when we start accounting a new memcg or a child is created and it
>>> inherits accounting from the parent. memcg_create_kmem_cache is called
>>> when a new cache is first allocated from, right?
>> memcg_update_cache_size() is called when kmem accounting is activated
>> for a memcg, no matter how.
>>
>> memcg_create_kmem_cache() is scheduled from __memcg_kmem_get_cache().
>> It's OK to have a bunch of such methods trying to create the same memcg
>> cache concurrently, but only one of them should succeed.
>>
>>> Why cannot we simply take slab_mutex inside memcg_create_kmem_cache?
>>> it is running from the workqueue context so it shouldn't clash with other
>>> locks.
>> Hmm, Glauber's code never takes the slab_mutex inside memcontrol.c. I
>> have always been wondering why, because it could simplify flow paths
>> significantly (e.g. update_cache_sizes() -> update_all_caches() ->
>> update_cache_size() - from memcontrol.c to slab_common.c and back again
>> just to take the mutex).
>>
> Because that is a layering violation and exposes implementation
> details of the slab to
> the outside world. I agree this would make things a lot simpler, but
> please check with Christoph
> if this is acceptable before going forward.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Devel] [PATCH 1/6] slab: cleanup kmem_cache_create_memcg()
  2013-12-19  8:39   ` Vladimir Davydov
@ 2013-12-19  9:26     ` Vasily Averin
  2013-12-19  9:42       ` Vladimir Davydov
                         ` (2 more replies)
  0 siblings, 3 replies; 42+ messages in thread
From: Vasily Averin @ 2013-12-19  9:26 UTC (permalink / raw)
  To: Vladimir Davydov
  Cc: Michal Hocko, Glauber Costa, linux-kernel, Pekka Enberg,
	linux-mm, Johannes Weiner, cgroups, Christoph Lameter,
	Andrew Morton, devel

On 12/19/2013 12:39 PM, Vladimir Davydov wrote:
> On 12/19/2013 12:17 PM, Vasily Averin wrote:
>> On 12/18/2013 05:16 PM, Vladimir Davydov wrote:
>>> --- a/mm/slab_common.c
>>> +++ b/mm/slab_common.c
>>> @@ -176,8 +176,9 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
>>>  	get_online_cpus();
>>>  	mutex_lock(&slab_mutex);
>>>  
>>> -	if (!kmem_cache_sanity_check(memcg, name, size) == 0)
>>> -		goto out_locked;
>>> +	err = kmem_cache_sanity_check(memcg, name, size);
>>> +	if (err)
>>> +		goto out_unlock;
>>>  
>>>  	/*
>>>  	 * Some allocators will constraint the set of valid flags to a subset
>> Theoretically in future kmem_cache_sanity_check() can return positive value.
>> Probably it's better to check (err < 0) in caller ?
> 
> Hmm, why? What information could positive retval carry here? We have
> plenty of places throughout the code where we check for (err), not
> (err<0), simply because it looks clearer, e.g. look at
> __kmem_cache_create() calls. If it returns a positive value one day, we
> will have to parse every place where it's called. Anyway, if someone
> wants to change a function behavior, he must check every place where
> this function is called and fix them accordingly.

I believe the expected semantics of the function are to return a negative value
in case of error. So the correct error check should be (err < 0).
The (err) check is semantically incorrect, and it can lead to trouble in the future.


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 6/6] memcg, slab: RCU protect memcg_params for root caches
  2013-12-18 13:16 ` [PATCH 6/6] memcg, slab: RCU protect memcg_params for root caches Vladimir Davydov
@ 2013-12-19  9:28   ` Michal Hocko
  2013-12-19  9:36     ` Vladimir Davydov
  0 siblings, 1 reply; 42+ messages in thread
From: Michal Hocko @ 2013-12-19  9:28 UTC (permalink / raw)
  To: Vladimir Davydov
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

On Wed 18-12-13 17:16:57, Vladimir Davydov wrote:
> We update root cache's memcg_params whenever we need to grow the
> memcg_caches array to accommodate all kmem-active memory cgroups.
> Currently we free the old version immediately then, which can lead to
> use-after-free, because the memcg_caches array is accessed lock-free.
> This patch fixes this by making memcg_params RCU-protected.

yes, I was thinking about something like this when talking about RCU
usage.

> Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
> Cc: Michal Hocko <mhocko@suse.cz>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Glauber Costa <glommer@gmail.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Pekka Enberg <penberg@kernel.org>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> ---
>  include/linux/slab.h |    5 ++++-
>  mm/memcontrol.c      |   15 ++++++++-------
>  mm/slab.h            |    8 +++++++-
>  3 files changed, 19 insertions(+), 9 deletions(-)
> 
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> index 1e2f4fe..f7e5649 100644
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -528,7 +528,10 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
>  struct memcg_cache_params {
>  	bool is_root_cache;
>  	union {
> -		struct kmem_cache *memcg_caches[0];
> +		struct {
> +			struct rcu_head rcu_head;
> +			struct kmem_cache *memcg_caches[0];
> +		};
>  		struct {
>  			struct mem_cgroup *memcg;
>  			struct list_head list;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index ad8de6a..379fc5f 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3142,18 +3142,17 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
>  
>  	if (num_groups > memcg_limited_groups_array_size) {
>  		int i;
> +		struct memcg_cache_params *new_params;
>  		ssize_t size = memcg_caches_array_size(num_groups);
>  
>  		size *= sizeof(void *);
>  		size += offsetof(struct memcg_cache_params, memcg_caches);
>  
> -		s->memcg_params = kzalloc(size, GFP_KERNEL);
> -		if (!s->memcg_params) {
> -			s->memcg_params = cur_params;
> +		new_params = kzalloc(size, GFP_KERNEL);
> +		if (!new_params)
>  			return -ENOMEM;
> -		}
>  
> -		s->memcg_params->is_root_cache = true;
> +		new_params->is_root_cache = true;
>  
>  		/*
>  		 * There is the chance it will be bigger than
> @@ -3167,7 +3166,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
>  		for (i = 0; i < memcg_limited_groups_array_size; i++) {
>  			if (!cur_params->memcg_caches[i])
>  				continue;
> -			s->memcg_params->memcg_caches[i] =
> +			new_params->memcg_caches[i] =
>  						cur_params->memcg_caches[i];
>  		}
>  
> @@ -3180,7 +3179,9 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
>  		 * bigger than the others. And all updates will reset this
>  		 * anyway.
>  		 */
> -		kfree(cur_params);
> +		rcu_assign_pointer(s->memcg_params, new_params);
> +		if (cur_params)
> +			kfree_rcu(cur_params, rcu_head);
>  	}
>  	return 0;
>  }
> diff --git a/mm/slab.h b/mm/slab.h
> index 1d8b53f..53b81a9 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -164,10 +164,16 @@ static inline struct kmem_cache *
>  cache_from_memcg_idx(struct kmem_cache *s, int idx)
>  {
>  	struct kmem_cache *cachep;
> +	struct memcg_cache_params *params;
>  
>  	if (!s->memcg_params)
>  		return NULL;
> -	cachep = s->memcg_params->memcg_caches[idx];
> +
> +	rcu_read_lock();
> +	params = rcu_dereference(s->memcg_params);
> +	cachep = params->memcg_caches[idx];
> +	rcu_read_unlock();
> +

Consumer has to be covered by the same rcu section otherwise
memcg_params might be freed right after rcu unlock here.

>  	smp_read_barrier_depends();	/* see memcg_register_cache() */
>  	return cachep;
>  }
> -- 
> 1.7.10.4
> 

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 3/6] memcg, slab: cleanup barrier usage when accessing memcg_caches
  2013-12-19  9:21           ` Michal Hocko
@ 2013-12-19  9:29             ` Vladimir Davydov
  2013-12-19  9:36               ` Michal Hocko
  0 siblings, 1 reply; 42+ messages in thread
From: Vladimir Davydov @ 2013-12-19  9:29 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

On 12/19/2013 01:21 PM, Michal Hocko wrote:
> On Thu 19-12-13 13:16:01, Vladimir Davydov wrote:
>> On 12/19/2013 01:10 PM, Michal Hocko wrote:
>>> On Thu 19-12-13 10:37:27, Vladimir Davydov wrote:
>>>> On 12/18/2013 09:14 PM, Michal Hocko wrote:
>>>>> On Wed 18-12-13 17:16:54, Vladimir Davydov wrote:
>>>>>> First, in memcg_create_kmem_cache() we should issue the write barrier
>>>>>> after the kmem_cache is initialized, but before storing the pointer to
>>>>>> it in its parent's memcg_params.
>>>>>>
>>>>>> Second, we should always issue the read barrier after
>>>>>> cache_from_memcg_idx() to conform with the write barrier.
>>>>>>
>>>>>> Third, its better to use smp_* versions of barriers, because we don't
>>>>>> need them on UP systems.
>>>>> Please be (much) more verbose on Why. Barriers are tricky and should be
>>>>> documented accordingly. So if you say that we should issue a barrier
>>>>> always be specific why we should do it.
>>>> In short, we have kmem_cache::memcg_params::memcg_caches is an array of
>>>> pointers to per-memcg caches. We access it lock-free so we should use
>>>> memory barriers during initialization. Obviously we should place a write
>>>> barrier just before we set the pointer in order to make sure nobody will
>>>> see a partially initialized structure. Besides there must be a read
>>>> barrier between reading the pointer and accessing the structure, to
>>>> conform with the write barrier. It's all that similar to rcu_assign and
>>>> rcu_deref. Currently the barrier usage looks rather strange:
>>>>
>>>> memcg_create_kmem_cache:
>>>>     initialize kmem
>>>>     set the pointer in memcg_caches
>>>>     wmb() // ???
>>>>
>>>> __memcg_kmem_get_cache:
>>>>     <...>
>>>>     read_barrier_depends() // ???
>>>>     cachep = root_cache->memcg_params->memcg_caches[memcg_id]
>>>>     <...>
>>> Why do we need explicit memory barriers when we can use RCU?
>>> __memcg_kmem_get_cache already dereferences within rcu_read_lock.
>> Because it's not RCU, IMO. RCU implies freeing the old version after a
>> grace period, while kmem_caches are freed immediately. We simply want to
>> be sure the kmem_cache is fully initialized. And we do not require
>> calling this in an RCU critical section.
> And you can use rcu_dereference and rcu_assign for that as well.

rcu_dereference() will complain if called outside an RCU critical
section, while cache_from_memcg_idx() is called w/o RCU protection from
some places.

> It hides all the juicy details about memory barriers.

IMO, a memory barrier with a good comment looks better than an
rcu_dereference() without RCU protection :-)

> Besides that nothing prevents us from freeing from rcu callback. Or?

It's an overhead we can live without there. The point is that we can
access a cache only if it is active. I mean no allocation can go from a
cache that has already been destroyed. It would be a bug. So there is no
point in introducing RCU-protection for kmem_caches there. It would only
confuse, IMO.

Thanks.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 3/6] memcg, slab: cleanup barrier usage when accessing memcg_caches
  2013-12-19  9:29             ` Vladimir Davydov
@ 2013-12-19  9:36               ` Michal Hocko
  2013-12-19  9:53                 ` Vladimir Davydov
  0 siblings, 1 reply; 42+ messages in thread
From: Michal Hocko @ 2013-12-19  9:36 UTC (permalink / raw)
  To: Vladimir Davydov
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

On Thu 19-12-13 13:29:59, Vladimir Davydov wrote:
> On 12/19/2013 01:21 PM, Michal Hocko wrote:
> > On Thu 19-12-13 13:16:01, Vladimir Davydov wrote:
> >> On 12/19/2013 01:10 PM, Michal Hocko wrote:
> >>> On Thu 19-12-13 10:37:27, Vladimir Davydov wrote:
> >>>> On 12/18/2013 09:14 PM, Michal Hocko wrote:
> >>>>> On Wed 18-12-13 17:16:54, Vladimir Davydov wrote:
> >>>>>> First, in memcg_create_kmem_cache() we should issue the write barrier
> >>>>>> after the kmem_cache is initialized, but before storing the pointer to
> >>>>>> it in its parent's memcg_params.
> >>>>>>
> >>>>>> Second, we should always issue the read barrier after
> >>>>>> cache_from_memcg_idx() to conform with the write barrier.
> >>>>>>
> >>>>>> Third, its better to use smp_* versions of barriers, because we don't
> >>>>>> need them on UP systems.
> >>>>> Please be (much) more verbose on Why. Barriers are tricky and should be
> >>>>> documented accordingly. So if you say that we should issue a barrier
> >>>>> always be specific why we should do it.
> >>>> In short, we have kmem_cache::memcg_params::memcg_caches is an array of
> >>>> pointers to per-memcg caches. We access it lock-free so we should use
> >>>> memory barriers during initialization. Obviously we should place a write
> >>>> barrier just before we set the pointer in order to make sure nobody will
> >>>> see a partially initialized structure. Besides there must be a read
> >>>> barrier between reading the pointer and accessing the structure, to
> >>>> conform with the write barrier. It's all that similar to rcu_assign and
> >>>> rcu_deref. Currently the barrier usage looks rather strange:
> >>>>
> >>>> memcg_create_kmem_cache:
> >>>>     initialize kmem
> >>>>     set the pointer in memcg_caches
> >>>>     wmb() // ???
> >>>>
> >>>> __memcg_kmem_get_cache:
> >>>>     <...>
> >>>>     read_barrier_depends() // ???
> >>>>     cachep = root_cache->memcg_params->memcg_caches[memcg_id]
> >>>>     <...>
> >>> Why do we need explicit memory barriers when we can use RCU?
> >>> __memcg_kmem_get_cache already dereferences within rcu_read_lock.
> >> Because it's not RCU, IMO. RCU implies freeing the old version after a
> >> grace period, while kmem_caches are freed immediately. We simply want to
> >> be sure the kmem_cache is fully initialized. And we do not require
> >> calling this in an RCU critical section.
> > And you can use rcu_dereference and rcu_assign for that as well.
> 
> rcu_dereference() will complain if called outside an RCU critical
> section, while cache_from_memcg_idx() is called w/o RCU protection from
> some places.

Does anything prevents us from using RCU from those callers as well?

> > It hides all the juicy details about memory barriers.
> 
> IMO, a memory barrier with a good comment looks better than an
> rcu_dereference() without RCU protection :-)

OK, let's wait for a good comment then ;)

> > Besides that nothing prevents us from freeing from rcu callback. Or?
> 
> It's an overhead we can live without there. The point is that we can
> access a cache only if it is active. I mean no allocation can go from a
> cache that has already been destroyed. It would be a bug. So there is no
> point in introducing RCU-protection for kmem_caches there. It would only
> confuse, IMO.

My point was that the current state is a disaster. Implicit assumptions
on different locking with memory barriers to make it even more juicy.
This should be cleaned up really. Replacing explicit memory barriers by
RCU sounds like a straightforward and much easier to follow for many
people (unlike memory barriers).

I do not insist on RCU but please make this code comprehensible. My head
is spinning anytime I look down there and try to find out which locks
are actually held and whether that is safe.

> 
> Thanks.

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 6/6] memcg, slab: RCU protect memcg_params for root caches
  2013-12-19  9:28   ` Michal Hocko
@ 2013-12-19  9:36     ` Vladimir Davydov
  2013-12-19  9:43       ` Michal Hocko
  0 siblings, 1 reply; 42+ messages in thread
From: Vladimir Davydov @ 2013-12-19  9:36 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

On 12/19/2013 01:28 PM, Michal Hocko wrote:
> On Wed 18-12-13 17:16:57, Vladimir Davydov wrote:
>> We update root cache's memcg_params whenever we need to grow the
>> memcg_caches array to accommodate all kmem-active memory cgroups.
>> Currently we free the old version immediately then, which can lead to
>> use-after-free, because the memcg_caches array is accessed lock-free.
>> This patch fixes this by making memcg_params RCU-protected.
> yes, I was thinking about something like this when talking about RCU
> usage.

Not exactly (if you mean your replies to this series). We do not protect
kmem_caches, but we do protect the memcg_caches array, which can grow.

>
>> Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
>> Cc: Michal Hocko <mhocko@suse.cz>
>> Cc: Johannes Weiner <hannes@cmpxchg.org>
>> Cc: Glauber Costa <glommer@gmail.com>
>> Cc: Christoph Lameter <cl@linux.com>
>> Cc: Pekka Enberg <penberg@kernel.org>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> ---
>>  include/linux/slab.h |    5 ++++-
>>  mm/memcontrol.c      |   15 ++++++++-------
>>  mm/slab.h            |    8 +++++++-
>>  3 files changed, 19 insertions(+), 9 deletions(-)
>>
>> diff --git a/include/linux/slab.h b/include/linux/slab.h
>> index 1e2f4fe..f7e5649 100644
>> --- a/include/linux/slab.h
>> +++ b/include/linux/slab.h
>> @@ -528,7 +528,10 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
>>  struct memcg_cache_params {
>>  	bool is_root_cache;
>>  	union {
>> -		struct kmem_cache *memcg_caches[0];
>> +		struct {
>> +			struct rcu_head rcu_head;
>> +			struct kmem_cache *memcg_caches[0];
>> +		};
>>  		struct {
>>  			struct mem_cgroup *memcg;
>>  			struct list_head list;
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index ad8de6a..379fc5f 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -3142,18 +3142,17 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
>>  
>>  	if (num_groups > memcg_limited_groups_array_size) {
>>  		int i;
>> +		struct memcg_cache_params *new_params;
>>  		ssize_t size = memcg_caches_array_size(num_groups);
>>  
>>  		size *= sizeof(void *);
>>  		size += offsetof(struct memcg_cache_params, memcg_caches);
>>  
>> -		s->memcg_params = kzalloc(size, GFP_KERNEL);
>> -		if (!s->memcg_params) {
>> -			s->memcg_params = cur_params;
>> +		new_params = kzalloc(size, GFP_KERNEL);
>> +		if (!new_params)
>>  			return -ENOMEM;
>> -		}
>>  
>> -		s->memcg_params->is_root_cache = true;
>> +		new_params->is_root_cache = true;
>>  
>>  		/*
>>  		 * There is the chance it will be bigger than
>> @@ -3167,7 +3166,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
>>  		for (i = 0; i < memcg_limited_groups_array_size; i++) {
>>  			if (!cur_params->memcg_caches[i])
>>  				continue;
>> -			s->memcg_params->memcg_caches[i] =
>> +			new_params->memcg_caches[i] =
>>  						cur_params->memcg_caches[i];
>>  		}
>>  
>> @@ -3180,7 +3179,9 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
>>  		 * bigger than the others. And all updates will reset this
>>  		 * anyway.
>>  		 */
>> -		kfree(cur_params);
>> +		rcu_assign_pointer(s->memcg_params, new_params);
>> +		if (cur_params)
>> +			kfree_rcu(cur_params, rcu_head);
>>  	}
>>  	return 0;
>>  }
>> diff --git a/mm/slab.h b/mm/slab.h
>> index 1d8b53f..53b81a9 100644
>> --- a/mm/slab.h
>> +++ b/mm/slab.h
>> @@ -164,10 +164,16 @@ static inline struct kmem_cache *
>>  cache_from_memcg_idx(struct kmem_cache *s, int idx)
>>  {
>>  	struct kmem_cache *cachep;
>> +	struct memcg_cache_params *params;
>>  
>>  	if (!s->memcg_params)
>>  		return NULL;
>> -	cachep = s->memcg_params->memcg_caches[idx];
>> +
>> +	rcu_read_lock();
>> +	params = rcu_dereference(s->memcg_params);
>> +	cachep = params->memcg_caches[idx];
>> +	rcu_read_unlock();
>> +
> Consumer has to be covered by the same rcu section otherwise
> memcg_params might be freed right after rcu unlock here.

No. We protect only accesses to kmem_cache::memcg_params, which can
potentially be relocated for root caches. But as soon as we get the
pointer to a kmem_cache from this array, we can freely dereference it,
because the cache cannot be freed when we use it. This is, because we
access a kmem_cache either under the slab_mutex or
memcg->slab_caches_mutex, or when we allocate/free from it. While doing
the latter, the cache can't go away, it would be a bug. IMO.

Thanks.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Devel] [PATCH 1/6] slab: cleanup kmem_cache_create_memcg()
  2013-12-19  9:26     ` Vasily Averin
@ 2013-12-19  9:42       ` Vladimir Davydov
  2013-12-19  9:45       ` Michal Hocko
  2013-12-19 10:23       ` Pekka Enberg
  2 siblings, 0 replies; 42+ messages in thread
From: Vladimir Davydov @ 2013-12-19  9:42 UTC (permalink / raw)
  To: Vasily Averin
  Cc: Michal Hocko, Glauber Costa, linux-kernel, Pekka Enberg,
	linux-mm, Johannes Weiner, cgroups, Christoph Lameter,
	Andrew Morton, devel

On 12/19/2013 01:26 PM, Vasily Averin wrote:
> On 12/19/2013 12:39 PM, Vladimir Davydov wrote:
>> On 12/19/2013 12:17 PM, Vasily Averin wrote:
>>> On 12/18/2013 05:16 PM, Vladimir Davydov wrote:
>>>> --- a/mm/slab_common.c
>>>> +++ b/mm/slab_common.c
>>>> @@ -176,8 +176,9 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
>>>>  	get_online_cpus();
>>>>  	mutex_lock(&slab_mutex);
>>>>  
>>>> -	if (!kmem_cache_sanity_check(memcg, name, size) == 0)
>>>> -		goto out_locked;
>>>> +	err = kmem_cache_sanity_check(memcg, name, size);
>>>> +	if (err)
>>>> +		goto out_unlock;
>>>>  
>>>>  	/*
>>>>  	 * Some allocators will constraint the set of valid flags to a subset
>>> Theoretically in future kmem_cache_sanity_check() can return positive value.
>>> Probably it's better to check (err < 0) in caller ?
>> Hmm, why? What information could positive retval carry here? We have
>> plenty of places throughout the code where we check for (err), not
>> (err<0), simply because it looks clearer, e.g. look at
>> __kmem_cache_create() calls. If it returns a positive value one day, we
>> will have to parse every place where it's called. Anyway, if someone
>> wants to change a function behavior, he must check every place where
>> this function is called and fix them accordingly.
> I believe expected semantic of function -- return negative in case of error.
> So the correct error check should be (err < 0).
> (err) check is semantically incorrect, and it can lead to troubles in future.

You are free to use the "correct" check then, but making everyone do so
would be too painful ;-)

linux-tip$ grep -rI '^\s*if (err)' . | wc -l
13631
linux-tip$ grep -rI '^\s*if (err\s*<\s*0)' . | wc -l
5449

Thanks.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 6/6] memcg, slab: RCU protect memcg_params for root caches
  2013-12-19  9:36     ` Vladimir Davydov
@ 2013-12-19  9:43       ` Michal Hocko
  2013-12-19  9:47         ` Vladimir Davydov
  0 siblings, 1 reply; 42+ messages in thread
From: Michal Hocko @ 2013-12-19  9:43 UTC (permalink / raw)
  To: Vladimir Davydov
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

On Thu 19-12-13 13:36:42, Vladimir Davydov wrote:
> On 12/19/2013 01:28 PM, Michal Hocko wrote:
> > On Wed 18-12-13 17:16:57, Vladimir Davydov wrote:
[...]
> >> diff --git a/mm/slab.h b/mm/slab.h
> >> index 1d8b53f..53b81a9 100644
> >> --- a/mm/slab.h
> >> +++ b/mm/slab.h
> >> @@ -164,10 +164,16 @@ static inline struct kmem_cache *
> >>  cache_from_memcg_idx(struct kmem_cache *s, int idx)
> >>  {
> >>  	struct kmem_cache *cachep;
> >> +	struct memcg_cache_params *params;
> >>  
> >>  	if (!s->memcg_params)
> >>  		return NULL;
> >> -	cachep = s->memcg_params->memcg_caches[idx];
> >> +
> >> +	rcu_read_lock();
> >> +	params = rcu_dereference(s->memcg_params);
> >> +	cachep = params->memcg_caches[idx];
> >> +	rcu_read_unlock();
> >> +
> > Consumer has to be covered by the same rcu section otherwise
> > memcg_params might be freed right after rcu unlock here.
> 
> No. We protect only accesses to kmem_cache::memcg_params, which can
> potentially be relocated for root caches.

Hmm, ok. So memcg_params might change (a new memcg is accounted) but
pointers at idx will be same, right?

> But as soon as we get the
> pointer to a kmem_cache from this array, we can freely dereference it,
> because the cache cannot be freed when we use it. This is, because we
> access a kmem_cache either under the slab_mutex or
> memcg->slab_caches_mutex, or when we allocate/free from it. While doing
> the latter, the cache can't go away, it would be a bug. IMO.

That expects that cache_from_memcg_idx is always called with slab_mutex
or slab_caches_mutex held, right? Please document it.

> 
> Thanks.

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Devel] [PATCH 1/6] slab: cleanup kmem_cache_create_memcg()
  2013-12-19  9:26     ` Vasily Averin
  2013-12-19  9:42       ` Vladimir Davydov
@ 2013-12-19  9:45       ` Michal Hocko
  2013-12-19 10:23       ` Pekka Enberg
  2 siblings, 0 replies; 42+ messages in thread
From: Michal Hocko @ 2013-12-19  9:45 UTC (permalink / raw)
  To: Vasily Averin
  Cc: Vladimir Davydov, Glauber Costa, linux-kernel, Pekka Enberg,
	linux-mm, Johannes Weiner, cgroups, Christoph Lameter,
	Andrew Morton, devel

On Thu 19-12-13 13:26:12, Vasily Averin wrote:
> On 12/19/2013 12:39 PM, Vladimir Davydov wrote:
> > On 12/19/2013 12:17 PM, Vasily Averin wrote:
> >> On 12/18/2013 05:16 PM, Vladimir Davydov wrote:
> >>> --- a/mm/slab_common.c
> >>> +++ b/mm/slab_common.c
> >>> @@ -176,8 +176,9 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
> >>>  	get_online_cpus();
> >>>  	mutex_lock(&slab_mutex);
> >>>  
> >>> -	if (!kmem_cache_sanity_check(memcg, name, size) == 0)
> >>> -		goto out_locked;
> >>> +	err = kmem_cache_sanity_check(memcg, name, size);
> >>> +	if (err)
> >>> +		goto out_unlock;
> >>>  
> >>>  	/*
> >>>  	 * Some allocators will constraint the set of valid flags to a subset
> >> Theoretically in future kmem_cache_sanity_check() can return positive value.
> >> Probably it's better to check (err < 0) in caller ?
> > 
> > Hmm, why? What information could positive retval carry here? We have
> > plenty of places throughout the code where we check for (err), not
> > (err<0), simply because it looks clearer, e.g. look at
> > __kmem_cache_create() calls. If it returns a positive value one day, we
> > will have to parse every place where it's called. Anyway, if someone
> > wants to change a function behavior, he must check every place where
> > this function is called and fix them accordingly.
> 
> I believe expected semantic of function -- return negative in case of error.
> So the correct error check should be (err < 0).
> (err) check is semantically incorrect, and it can lead to troubles in future.

No, this function returns -ERRNO or 0 on success.

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 6/6] memcg, slab: RCU protect memcg_params for root caches
  2013-12-19  9:43       ` Michal Hocko
@ 2013-12-19  9:47         ` Vladimir Davydov
  2013-12-19 10:06           ` Michal Hocko
  0 siblings, 1 reply; 42+ messages in thread
From: Vladimir Davydov @ 2013-12-19  9:47 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

On 12/19/2013 01:43 PM, Michal Hocko wrote:
> On Thu 19-12-13 13:36:42, Vladimir Davydov wrote:
>> On 12/19/2013 01:28 PM, Michal Hocko wrote:
>>> On Wed 18-12-13 17:16:57, Vladimir Davydov wrote:
> [...]
>>>> diff --git a/mm/slab.h b/mm/slab.h
>>>> index 1d8b53f..53b81a9 100644
>>>> --- a/mm/slab.h
>>>> +++ b/mm/slab.h
>>>> @@ -164,10 +164,16 @@ static inline struct kmem_cache *
>>>>  cache_from_memcg_idx(struct kmem_cache *s, int idx)
>>>>  {
>>>>  	struct kmem_cache *cachep;
>>>> +	struct memcg_cache_params *params;
>>>>  
>>>>  	if (!s->memcg_params)
>>>>  		return NULL;
>>>> -	cachep = s->memcg_params->memcg_caches[idx];
>>>> +
>>>> +	rcu_read_lock();
>>>> +	params = rcu_dereference(s->memcg_params);
>>>> +	cachep = params->memcg_caches[idx];
>>>> +	rcu_read_unlock();
>>>> +
>>> Consumer has to be covered by the same rcu section otherwise
>>> memcg_params might be freed right after rcu unlock here.
>> No. We protect only accesses to kmem_cache::memcg_params, which can
>> potentially be relocated for root caches.
> Hmm, ok. So memcg_params might change (a new memcg is accounted) but
> pointers at idx will be same, right?

Yes, that's a classical Read-Copy-Update :-)

>
>> But as soon as we get the
>> pointer to a kmem_cache from this array, we can freely dereference it,
>> because the cache cannot be freed when we use it. This is, because we
>> access a kmem_cache either under the slab_mutex or
>> memcg->slab_caches_mutex, or when we allocate/free from it. While doing
>> the latter, the cache can't go away, it would be a bug. IMO.
> That expects that cache_from_memcg_idx is always called with slab_mutex
> or slab_caches_mutex held, right? Please document it.

Yeah, you're right, this longs for a documentation. I'm going to check
this code a bit more and try to write a good comment about it (although
I'm rather poor at writing comments :-( )

Thanks.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 3/6] memcg, slab: cleanup barrier usage when accessing memcg_caches
  2013-12-19  9:36               ` Michal Hocko
@ 2013-12-19  9:53                 ` Vladimir Davydov
  0 siblings, 0 replies; 42+ messages in thread
From: Vladimir Davydov @ 2013-12-19  9:53 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

On 12/19/2013 01:36 PM, Michal Hocko wrote:
> On Thu 19-12-13 13:29:59, Vladimir Davydov wrote:
>> On 12/19/2013 01:21 PM, Michal Hocko wrote:
>>> On Thu 19-12-13 13:16:01, Vladimir Davydov wrote:
>>>> On 12/19/2013 01:10 PM, Michal Hocko wrote:
>>>>> On Thu 19-12-13 10:37:27, Vladimir Davydov wrote:
>>>>>> On 12/18/2013 09:14 PM, Michal Hocko wrote:
>>>>>>> On Wed 18-12-13 17:16:54, Vladimir Davydov wrote:
>>>>>>>> First, in memcg_create_kmem_cache() we should issue the write barrier
>>>>>>>> after the kmem_cache is initialized, but before storing the pointer to
>>>>>>>> it in its parent's memcg_params.
>>>>>>>>
>>>>>>>> Second, we should always issue the read barrier after
>>>>>>>> cache_from_memcg_idx() to conform with the write barrier.
>>>>>>>>
>>>>>>>> Third, its better to use smp_* versions of barriers, because we don't
>>>>>>>> need them on UP systems.
>>>>>>> Please be (much) more verbose on Why. Barriers are tricky and should be
>>>>>>> documented accordingly. So if you say that we should issue a barrier
>>>>>>> always be specific why we should do it.
>>>>>> In short, we have kmem_cache::memcg_params::memcg_caches is an array of
>>>>>> pointers to per-memcg caches. We access it lock-free so we should use
>>>>>> memory barriers during initialization. Obviously we should place a write
>>>>>> barrier just before we set the pointer in order to make sure nobody will
>>>>>> see a partially initialized structure. Besides there must be a read
>>>>>> barrier between reading the pointer and accessing the structure, to
>>>>>> conform with the write barrier. It's all that similar to rcu_assign and
>>>>>> rcu_deref. Currently the barrier usage looks rather strange:
>>>>>>
>>>>>> memcg_create_kmem_cache:
>>>>>>     initialize kmem
>>>>>>     set the pointer in memcg_caches
>>>>>>     wmb() // ???
>>>>>>
>>>>>> __memcg_kmem_get_cache:
>>>>>>     <...>
>>>>>>     read_barrier_depends() // ???
>>>>>>     cachep = root_cache->memcg_params->memcg_caches[memcg_id]
>>>>>>     <...>
>>>>> Why do we need explicit memory barriers when we can use RCU?
>>>>> __memcg_kmem_get_cache already dereferences within rcu_read_lock.
>>>> Because it's not RCU, IMO. RCU implies freeing the old version after a
>>>> grace period, while kmem_caches are freed immediately. We simply want to
>>>> be sure the kmem_cache is fully initialized. And we do not require
>>>> calling this in an RCU critical section.
>>> And you can use rcu_dereference and rcu_assign for that as well.
>> rcu_dereference() will complain if called outside an RCU critical
>> section, while cache_from_memcg_idx() is called w/o RCU protection from
>> some places.
> Does anything prevents us from using RCU from those callers as well?

Yes, take a look at kmem_cache_destroy_memcg_children(), for instance.
We call cancel_work_sync() there on a cache obtained via
cache_from_memcg_idx().

Thanks.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH 6/6] memcg, slab: RCU protect memcg_params for root caches
  2013-12-19  9:47         ` Vladimir Davydov
@ 2013-12-19 10:06           ` Michal Hocko
  0 siblings, 0 replies; 42+ messages in thread
From: Michal Hocko @ 2013-12-19 10:06 UTC (permalink / raw)
  To: Vladimir Davydov
  Cc: linux-kernel, linux-mm, cgroups, devel, Johannes Weiner,
	Glauber Costa, Christoph Lameter, Pekka Enberg, Andrew Morton

On Thu 19-12-13 13:47:33, Vladimir Davydov wrote:
[...]
> Yeah, you're right, this longs for a documentation. I'm going to check

We desperately need documentation for the life cycle of all involved
objects and description of which locks are used at which stage. 

> this code a bit more and try to write a good comment about it (although
> I'm rather poor at writing comments :-( )

A nice diagram would do as well...

Thanks!
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [Devel] [PATCH 1/6] slab: cleanup kmem_cache_create_memcg()
  2013-12-19  9:26     ` Vasily Averin
  2013-12-19  9:42       ` Vladimir Davydov
  2013-12-19  9:45       ` Michal Hocko
@ 2013-12-19 10:23       ` Pekka Enberg
  2 siblings, 0 replies; 42+ messages in thread
From: Pekka Enberg @ 2013-12-19 10:23 UTC (permalink / raw)
  To: Vasily Averin, Vladimir Davydov
  Cc: Michal Hocko, Glauber Costa, linux-kernel, Pekka Enberg,
	linux-mm, Johannes Weiner, cgroups, Christoph Lameter,
	Andrew Morton, devel

On 12/19/2013 11:26 AM, Vasily Averin wrote:
> On 12/19/2013 12:39 PM, Vladimir Davydov wrote:
>> On 12/19/2013 12:17 PM, Vasily Averin wrote:
>>> On 12/18/2013 05:16 PM, Vladimir Davydov wrote:
>>>> --- a/mm/slab_common.c
>>>> +++ b/mm/slab_common.c
>>>> @@ -176,8 +176,9 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
>>>>   	get_online_cpus();
>>>>   	mutex_lock(&slab_mutex);
>>>>   
>>>> -	if (!kmem_cache_sanity_check(memcg, name, size) == 0)
>>>> -		goto out_locked;
>>>> +	err = kmem_cache_sanity_check(memcg, name, size);
>>>> +	if (err)
>>>> +		goto out_unlock;
>>>>   
>>>>   	/*
>>>>   	 * Some allocators will constraint the set of valid flags to a subset
>>> Theoretically in future kmem_cache_sanity_check() can return positive value.
>>> Probably it's better to check (err < 0) in caller ?
>> Hmm, why? What information could positive retval carry here? We have
>> plenty of places throughout the code where we check for (err), not
>> (err<0), simply because it looks clearer, e.g. look at
>> __kmem_cache_create() calls. If it returns a positive value one day, we
>> will have to parse every place where it's called. Anyway, if someone
>> wants to change a function behavior, he must check every place where
>> this function is called and fix them accordingly.
> I believe expected semantic of function -- return negative in case of error.
> So the correct error check should be (err < 0).
> (err) check is semantically incorrect, and it can lead to troubles in future.

I don't know what semantics you are referring to but a typical 
convention in mm/*.c is to return zero on success and negative on error 
but never positive numbers.

Looking at mm/slab_common.c, "if (err)" is the established convention so 
using "if (err < 0)" just because is pointless here.

                             Pekka

^ permalink raw reply	[flat|nested] 42+ messages in thread

end of thread, other threads:[~2013-12-19 10:23 UTC | newest]

Thread overview: 42+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-12-18 13:16 [PATCH 1/6] slab: cleanup kmem_cache_create_memcg() Vladimir Davydov
2013-12-18 13:16 ` [PATCH 2/6] memcg, slab: kmem_cache_create_memcg(): free memcg params on error Vladimir Davydov
2013-12-18 17:06   ` Michal Hocko
2013-12-19  6:32     ` Vladimir Davydov
2013-12-19  8:48       ` Michal Hocko
2013-12-19  9:01         ` Vladimir Davydov
2013-12-19  9:19           ` Michal Hocko
2013-12-18 13:16 ` [PATCH 3/6] memcg, slab: cleanup barrier usage when accessing memcg_caches Vladimir Davydov
2013-12-18 17:14   ` Michal Hocko
2013-12-19  6:37     ` Vladimir Davydov
2013-12-19  9:10       ` Michal Hocko
2013-12-19  9:16         ` Vladimir Davydov
2013-12-19  9:21           ` Michal Hocko
2013-12-19  9:29             ` Vladimir Davydov
2013-12-19  9:36               ` Michal Hocko
2013-12-19  9:53                 ` Vladimir Davydov
2013-12-18 13:16 ` [PATCH 4/6] memcg, slab: check and init memcg_cahes under slab_mutex Vladimir Davydov
2013-12-18 17:41   ` Michal Hocko
2013-12-19  7:07     ` Vladimir Davydov
2013-12-19  8:00       ` Glauber Costa
2013-12-19  9:12         ` Michal Hocko
2013-12-19  9:17           ` Vladimir Davydov
2013-12-19  9:21         ` Vladimir Davydov
2013-12-18 13:16 ` [PATCH 5/6] memcg: clear memcg_params after removing cache from memcg_slab_caches list Vladimir Davydov
2013-12-18 13:16 ` [PATCH 6/6] memcg, slab: RCU protect memcg_params for root caches Vladimir Davydov
2013-12-19  9:28   ` Michal Hocko
2013-12-19  9:36     ` Vladimir Davydov
2013-12-19  9:43       ` Michal Hocko
2013-12-19  9:47         ` Vladimir Davydov
2013-12-19 10:06           ` Michal Hocko
2013-12-18 16:56 ` [PATCH 1/6] slab: cleanup kmem_cache_create_memcg() Michal Hocko
2013-12-19  6:31   ` Vladimir Davydov
2013-12-19  8:44     ` Michal Hocko
2013-12-19  8:51       ` Vladimir Davydov
2013-12-19  9:16         ` Michal Hocko
2013-12-19  7:27 ` Pekka Enberg
2013-12-19  8:17 ` [Devel] " Vasily Averin
2013-12-19  8:39   ` Vladimir Davydov
2013-12-19  9:26     ` Vasily Averin
2013-12-19  9:42       ` Vladimir Davydov
2013-12-19  9:45       ` Michal Hocko
2013-12-19 10:23       ` Pekka Enberg

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).