All of lore.kernel.org
 help / color / mirror / Atom feed
From: Tejun Heo <tj@kernel.org>
To: vdavydov.dev@gmail.com, cl@linux.com, penberg@kernel.org,
	rientjes@google.com, iamjoonsoo.kim@lge.com,
	akpm@linux-foundation.org
Cc: jsvana@fb.com, hannes@cmpxchg.org, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, cgroups@vger.kernel.org, kernel-team@fb.com,
	Tejun Heo <tj@kernel.org>
Subject: [PATCH 6/9] slab: don't put memcg caches on slab_caches list
Date: Sat, 14 Jan 2017 00:54:46 -0500	[thread overview]
Message-ID: <20170114055449.11044-7-tj@kernel.org> (raw)
In-Reply-To: <20170114055449.11044-1-tj@kernel.org>

With kmem cgroup support enabled, kmem_caches can be created and
destroyed frequently and a great number of near empty kmem_caches can
accumulate if there are a lot of transient cgroups and the system is
not under memory pressure.  When memory reclaim starts under such
conditions, it can lead to consecutive deactivation and destruction of
many kmem_caches, easily hundreds of thousands on moderately large
systems, exposing scalability issues in the current slab management
code.  This is one of the patches to address the issue.

slab_caches currently lists all caches including root and memcg ones.
This is the only data structure which lists the root caches and
iterating root caches can only be done by walking the list while
skipping over memcg caches.  As there can be a huge number of memcg
caches, this can become very expensive.

This also can make /proc/slabinfo behave very badly.  seq_file
processes reads in 4k chunks and seeks to the previous Nth position on
slab_caches list to resume after each chunk.  With a lot of memcg
cache churn on the list, reading /proc/slabinfo can become very slow
and its content often ends up with duplicate and/or missing entries.

As the previous patch made it unnecessary to walk slab_caches to
iterate memcg-specific caches, there is no reason to keep memcg caches
on the list.  This patch makes slab_caches include only the root
caches.  As this makes kmem_cache->list unused for memcg caches,
->memcg_params.children_node is removed and ->list is used instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Jay Vana <jsvana@fb.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/slab.h |  3 ---
 mm/slab.h            |  3 +--
 mm/slab_common.c     | 58 +++++++++++++++++++++++++---------------------------
 3 files changed, 29 insertions(+), 35 deletions(-)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 54ec959..63d543d 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -564,8 +564,6 @@ struct memcg_cache_array {
  *
  * @memcg:	Pointer to the memcg this cache belongs to.
  *
- * @children_node: List node for @root_cache->children list.
- *
  * @kmem_caches_node: List node for @memcg->kmem_caches list.
  */
 struct memcg_cache_params {
@@ -577,7 +575,6 @@ struct memcg_cache_params {
 		};
 		struct {
 			struct mem_cgroup *memcg;
-			struct list_head children_node;
 			struct list_head kmem_caches_node;
 		};
 	};
diff --git a/mm/slab.h b/mm/slab.h
index b5e0040..8f47a44 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -203,8 +203,7 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
  * slab_mutex.
  */
 #define for_each_memcg_cache(iter, root) \
-	list_for_each_entry(iter, &(root)->memcg_params.children, \
-			    memcg_params.children_node)
+	list_for_each_entry(iter, &(root)->memcg_params.children, list)
 
 static inline bool is_root_cache(struct kmem_cache *s)
 {
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 74c36d8..c0d0126 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -68,6 +68,22 @@ unsigned int kmem_cache_size(struct kmem_cache *s)
 EXPORT_SYMBOL(kmem_cache_size);
 
 #ifdef CONFIG_DEBUG_VM
+static void kmem_cache_verify_name(struct kmem_cache *s)
+{
+	char tmp;
+	int res;
+
+	/*
+	 * This happens when the module gets unloaded and doesn't destroy
+	 * its slab cache and no-one else reuses the vmalloc area of the
+	 * module.  Print a warning.
+	 */
+	res = probe_kernel_address(s->name, tmp);
+	if (res)
+		pr_err("Slab cache with size %d has lost its name\n",
+		       s->object_size);
+}
+
 static int kmem_cache_sanity_check(const char *name, size_t size)
 {
 	struct kmem_cache *s = NULL;
@@ -79,20 +95,12 @@ static int kmem_cache_sanity_check(const char *name, size_t size)
 	}
 
 	list_for_each_entry(s, &slab_caches, list) {
-		char tmp;
-		int res;
+		struct kmem_cache *c;
 
-		/*
-		 * This happens when the module gets unloaded and doesn't
-		 * destroy its slab cache and no-one else reuses the vmalloc
-		 * area of the module.  Print a warning.
-		 */
-		res = probe_kernel_address(s->name, tmp);
-		if (res) {
-			pr_err("Slab cache with size %d has lost its name\n",
-			       s->object_size);
-			continue;
-		}
+		kmem_cache_verify_name(s);
+
+		for_each_memcg_cache(c, s)
+			kmem_cache_verify_name(c);
 	}
 
 	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
@@ -148,7 +156,6 @@ static int init_memcg_params(struct kmem_cache *s,
 	if (root_cache) {
 		s->memcg_params.root_cache = root_cache;
 		s->memcg_params.memcg = memcg;
-		INIT_LIST_HEAD(&s->memcg_params.children_node);
 		INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node);
 		return 0;
 	}
@@ -178,9 +185,6 @@ static int update_memcg_params(struct kmem_cache *s, int new_array_size)
 {
 	struct memcg_cache_array *old, *new;
 
-	if (!is_root_cache(s))
-		return 0;
-
 	new = kzalloc(sizeof(struct memcg_cache_array) +
 		      new_array_size * sizeof(void *), GFP_KERNEL);
 	if (!new)
@@ -219,7 +223,6 @@ int memcg_update_all_caches(int num_memcgs)
 
 static void unlink_memcg_cache(struct kmem_cache *s)
 {
-	list_del(&s->memcg_params.children_node);
 	list_del(&s->memcg_params.kmem_caches_node);
 }
 #else
@@ -243,10 +246,10 @@ static inline void unlink_memcg_cache(struct kmem_cache *s)
  */
 int slab_unmergeable(struct kmem_cache *s)
 {
-	if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
+	if (!is_root_cache(s))
 		return 1;
 
-	if (!is_root_cache(s))
+	if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
 		return 1;
 
 	if (s->ctor)
@@ -360,7 +363,8 @@ static struct kmem_cache *create_cache(const char *name,
 		goto out_free_cache;
 
 	s->refcount = 1;
-	list_add(&s->list, &slab_caches);
+	if (is_root_cache(s))
+		list_add(&s->list, &slab_caches);
 out:
 	if (err)
 		return ERR_PTR(err);
@@ -561,8 +565,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
 		goto out_unlock;
 	}
 
-	list_add(&s->memcg_params.children_node,
-		 &root_cache->memcg_params.children);
+	list_add(&s->list, &root_cache->memcg_params.children);
 	list_add(&s->memcg_params.kmem_caches_node, &memcg->kmem_caches);
 
 	/*
@@ -593,9 +596,6 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
 
 	mutex_lock(&slab_mutex);
 	list_for_each_entry(s, &slab_caches, list) {
-		if (!is_root_cache(s))
-			continue;
-
 		arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
 						lockdep_is_held(&slab_mutex));
 		c = arr->entries[idx];
@@ -653,8 +653,7 @@ static int shutdown_memcg_caches(struct kmem_cache *s)
 	/*
 	 * Shutdown all caches.
 	 */
-	list_for_each_entry_safe(c, c2, &s->memcg_params.children,
-				 memcg_params.children_node)
+	list_for_each_entry_safe(c, c2, &s->memcg_params.children, list)
 		shutdown_cache(c);
 
 	/*
@@ -1143,8 +1142,7 @@ static int slab_show(struct seq_file *m, void *p)
 
 	if (p == slab_caches.next)
 		print_slabinfo_header(m);
-	if (is_root_cache(s))
-		cache_show(s, m);
+	cache_show(s, m);
 	return 0;
 }
 
-- 
2.9.3

WARNING: multiple messages have this Message-ID (diff)
From: Tejun Heo <tj@kernel.org>
To: vdavydov.dev@gmail.com, cl@linux.com, penberg@kernel.org,
	rientjes@google.com, iamjoonsoo.kim@lge.com,
	akpm@linux-foundation.org
Cc: jsvana@fb.com, hannes@cmpxchg.org, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, cgroups@vger.kernel.org, kernel-team@fb.com,
	Tejun Heo <tj@kernel.org>
Subject: [PATCH 6/9] slab: don't put memcg caches on slab_caches list
Date: Sat, 14 Jan 2017 00:54:46 -0500	[thread overview]
Message-ID: <20170114055449.11044-7-tj@kernel.org> (raw)
In-Reply-To: <20170114055449.11044-1-tj@kernel.org>

With kmem cgroup support enabled, kmem_caches can be created and
destroyed frequently and a great number of near empty kmem_caches can
accumulate if there are a lot of transient cgroups and the system is
not under memory pressure.  When memory reclaim starts under such
conditions, it can lead to consecutive deactivation and destruction of
many kmem_caches, easily hundreds of thousands on moderately large
systems, exposing scalability issues in the current slab management
code.  This is one of the patches to address the issue.

slab_caches currently lists all caches including root and memcg ones.
This is the only data structure which lists the root caches and
iterating root caches can only be done by walking the list while
skipping over memcg caches.  As there can be a huge number of memcg
caches, this can become very expensive.

This also can make /proc/slabinfo behave very badly.  seq_file
processes reads in 4k chunks and seeks to the previous Nth position on
slab_caches list to resume after each chunk.  With a lot of memcg
cache churn on the list, reading /proc/slabinfo can become very slow
and its content often ends up with duplicate and/or missing entries.

As the previous patch made it unnecessary to walk slab_caches to
iterate memcg-specific caches, there is no reason to keep memcg caches
on the list.  This patch makes slab_caches include only the root
caches.  As this makes kmem_cache->list unused for memcg caches,
->memcg_params.children_node is removed and ->list is used instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Jay Vana <jsvana@fb.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/slab.h |  3 ---
 mm/slab.h            |  3 +--
 mm/slab_common.c     | 58 +++++++++++++++++++++++++---------------------------
 3 files changed, 29 insertions(+), 35 deletions(-)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 54ec959..63d543d 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -564,8 +564,6 @@ struct memcg_cache_array {
  *
  * @memcg:	Pointer to the memcg this cache belongs to.
  *
- * @children_node: List node for @root_cache->children list.
- *
  * @kmem_caches_node: List node for @memcg->kmem_caches list.
  */
 struct memcg_cache_params {
@@ -577,7 +575,6 @@ struct memcg_cache_params {
 		};
 		struct {
 			struct mem_cgroup *memcg;
-			struct list_head children_node;
 			struct list_head kmem_caches_node;
 		};
 	};
diff --git a/mm/slab.h b/mm/slab.h
index b5e0040..8f47a44 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -203,8 +203,7 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
  * slab_mutex.
  */
 #define for_each_memcg_cache(iter, root) \
-	list_for_each_entry(iter, &(root)->memcg_params.children, \
-			    memcg_params.children_node)
+	list_for_each_entry(iter, &(root)->memcg_params.children, list)
 
 static inline bool is_root_cache(struct kmem_cache *s)
 {
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 74c36d8..c0d0126 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -68,6 +68,22 @@ unsigned int kmem_cache_size(struct kmem_cache *s)
 EXPORT_SYMBOL(kmem_cache_size);
 
 #ifdef CONFIG_DEBUG_VM
+static void kmem_cache_verify_name(struct kmem_cache *s)
+{
+	char tmp;
+	int res;
+
+	/*
+	 * This happens when the module gets unloaded and doesn't destroy
+	 * its slab cache and no-one else reuses the vmalloc area of the
+	 * module.  Print a warning.
+	 */
+	res = probe_kernel_address(s->name, tmp);
+	if (res)
+		pr_err("Slab cache with size %d has lost its name\n",
+		       s->object_size);
+}
+
 static int kmem_cache_sanity_check(const char *name, size_t size)
 {
 	struct kmem_cache *s = NULL;
@@ -79,20 +95,12 @@ static int kmem_cache_sanity_check(const char *name, size_t size)
 	}
 
 	list_for_each_entry(s, &slab_caches, list) {
-		char tmp;
-		int res;
+		struct kmem_cache *c;
 
-		/*
-		 * This happens when the module gets unloaded and doesn't
-		 * destroy its slab cache and no-one else reuses the vmalloc
-		 * area of the module.  Print a warning.
-		 */
-		res = probe_kernel_address(s->name, tmp);
-		if (res) {
-			pr_err("Slab cache with size %d has lost its name\n",
-			       s->object_size);
-			continue;
-		}
+		kmem_cache_verify_name(s);
+
+		for_each_memcg_cache(c, s)
+			kmem_cache_verify_name(c);
 	}
 
 	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
@@ -148,7 +156,6 @@ static int init_memcg_params(struct kmem_cache *s,
 	if (root_cache) {
 		s->memcg_params.root_cache = root_cache;
 		s->memcg_params.memcg = memcg;
-		INIT_LIST_HEAD(&s->memcg_params.children_node);
 		INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node);
 		return 0;
 	}
@@ -178,9 +185,6 @@ static int update_memcg_params(struct kmem_cache *s, int new_array_size)
 {
 	struct memcg_cache_array *old, *new;
 
-	if (!is_root_cache(s))
-		return 0;
-
 	new = kzalloc(sizeof(struct memcg_cache_array) +
 		      new_array_size * sizeof(void *), GFP_KERNEL);
 	if (!new)
@@ -219,7 +223,6 @@ int memcg_update_all_caches(int num_memcgs)
 
 static void unlink_memcg_cache(struct kmem_cache *s)
 {
-	list_del(&s->memcg_params.children_node);
 	list_del(&s->memcg_params.kmem_caches_node);
 }
 #else
@@ -243,10 +246,10 @@ static inline void unlink_memcg_cache(struct kmem_cache *s)
  */
 int slab_unmergeable(struct kmem_cache *s)
 {
-	if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
+	if (!is_root_cache(s))
 		return 1;
 
-	if (!is_root_cache(s))
+	if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
 		return 1;
 
 	if (s->ctor)
@@ -360,7 +363,8 @@ static struct kmem_cache *create_cache(const char *name,
 		goto out_free_cache;
 
 	s->refcount = 1;
-	list_add(&s->list, &slab_caches);
+	if (is_root_cache(s))
+		list_add(&s->list, &slab_caches);
 out:
 	if (err)
 		return ERR_PTR(err);
@@ -561,8 +565,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
 		goto out_unlock;
 	}
 
-	list_add(&s->memcg_params.children_node,
-		 &root_cache->memcg_params.children);
+	list_add(&s->list, &root_cache->memcg_params.children);
 	list_add(&s->memcg_params.kmem_caches_node, &memcg->kmem_caches);
 
 	/*
@@ -593,9 +596,6 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
 
 	mutex_lock(&slab_mutex);
 	list_for_each_entry(s, &slab_caches, list) {
-		if (!is_root_cache(s))
-			continue;
-
 		arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
 						lockdep_is_held(&slab_mutex));
 		c = arr->entries[idx];
@@ -653,8 +653,7 @@ static int shutdown_memcg_caches(struct kmem_cache *s)
 	/*
 	 * Shutdown all caches.
 	 */
-	list_for_each_entry_safe(c, c2, &s->memcg_params.children,
-				 memcg_params.children_node)
+	list_for_each_entry_safe(c, c2, &s->memcg_params.children, list)
 		shutdown_cache(c);
 
 	/*
@@ -1143,8 +1142,7 @@ static int slab_show(struct seq_file *m, void *p)
 
 	if (p == slab_caches.next)
 		print_slabinfo_header(m);
-	if (is_root_cache(s))
-		cache_show(s, m);
+	cache_show(s, m);
 	return 0;
 }
 
-- 
2.9.3

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2017-01-14  5:55 UTC|newest]

Thread overview: 56+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-01-14  5:54 [PATCHSET] slab: make memcg slab destruction scalable Tejun Heo
2017-01-14  5:54 ` Tejun Heo
2017-01-14  5:54 ` Tejun Heo
2017-01-14  5:54 ` [PATCH 1/9] Revert "slub: move synchronize_sched out of slab_mutex on shrink" Tejun Heo
2017-01-14  5:54   ` Tejun Heo
2017-01-14  5:54 ` [PATCH 2/9] slab: remove synchronous rcu_barrier() call in memcg cache release path Tejun Heo
2017-01-14  5:54   ` Tejun Heo
2017-01-14 13:19   ` Vladimir Davydov
2017-01-14 13:19     ` Vladimir Davydov
2017-01-14 15:19     ` Tejun Heo
2017-01-14 15:19       ` Tejun Heo
2017-01-17  0:07       ` Joonsoo Kim
2017-01-17  0:07         ` Joonsoo Kim
2017-01-17  0:07         ` Joonsoo Kim
2017-01-17 16:37         ` Tejun Heo
2017-01-17 16:37           ` Tejun Heo
2017-01-17 17:02           ` Tejun Heo
2017-01-17 17:02             ` Tejun Heo
2017-01-14  5:54 ` [PATCH 3/9] slab: simplify shutdown_memcg_caches() Tejun Heo
2017-01-14  5:54   ` Tejun Heo
2017-01-14 13:27   ` Vladimir Davydov
2017-01-14 13:27     ` Vladimir Davydov
2017-01-14 15:38     ` Tejun Heo
2017-01-14 15:38       ` Tejun Heo
2017-01-14 15:53       ` Tejun Heo
2017-01-14 15:53         ` Tejun Heo
2017-01-14  5:54 ` [PATCH 4/9] slab: reorganize memcg_cache_params Tejun Heo
2017-01-14  5:54   ` Tejun Heo
2017-01-14 13:30   ` Vladimir Davydov
2017-01-14 13:30     ` Vladimir Davydov
2017-01-14  5:54 ` [PATCH 5/9] slab: link memcg kmem_caches on their associated memory cgroup Tejun Heo
2017-01-14  5:54   ` Tejun Heo
2017-01-14 13:33   ` Vladimir Davydov
2017-01-14 13:33     ` Vladimir Davydov
2017-01-14  5:54 ` Tejun Heo [this message]
2017-01-14  5:54   ` [PATCH 6/9] slab: don't put memcg caches on slab_caches list Tejun Heo
2017-01-14 13:39   ` Vladimir Davydov
2017-01-14 13:39     ` Vladimir Davydov
2017-01-14 15:39     ` Tejun Heo
2017-01-14 15:39       ` Tejun Heo
2017-01-14 15:39       ` Tejun Heo
2017-01-14  5:54 ` [PATCH 7/9] slab: introduce __kmemcg_cache_deactivate() Tejun Heo
2017-01-14  5:54   ` Tejun Heo
2017-01-14 13:42   ` Vladimir Davydov
2017-01-14 13:42     ` Vladimir Davydov
2017-01-14 15:39     ` Tejun Heo
2017-01-14 15:39       ` Tejun Heo
2017-01-14  5:54 ` [PATCH 8/9] slab: remove synchronous synchronize_sched() from memcg cache deactivation path Tejun Heo
2017-01-14  5:54   ` Tejun Heo
2017-01-14 13:57   ` Vladimir Davydov
2017-01-14 13:57     ` Vladimir Davydov
2017-01-14 13:57     ` Vladimir Davydov
2017-01-14  5:54 ` [PATCH 9/9] slab: remove slub sysfs interface files early for empty memcg caches Tejun Heo
2017-01-14  5:54   ` Tejun Heo
2017-01-14 14:00   ` Vladimir Davydov
2017-01-14 14:00     ` Vladimir Davydov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170114055449.11044-7-tj@kernel.org \
    --to=tj@kernel.org \
    --cc=akpm@linux-foundation.org \
    --cc=cgroups@vger.kernel.org \
    --cc=cl@linux.com \
    --cc=hannes@cmpxchg.org \
    --cc=iamjoonsoo.kim@lge.com \
    --cc=jsvana@fb.com \
    --cc=kernel-team@fb.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=penberg@kernel.org \
    --cc=rientjes@google.com \
    --cc=vdavydov.dev@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.