All of lore.kernel.org
 help / color / mirror / Atom feed
From: Glauber Costa <glommer@parallels.com>
To: <linux-kernel@vger.kernel.org>
Cc: <cgroups@vger.kernel.org>, <linux-mm@kvack.org>,
	<kamezawa.hiroyu@jp.fujitsu.com>, Tejun Heo <tj@kernel.org>,
	Li Zefan <lizefan@huawei.com>, Greg Thelen <gthelen@google.com>,
	Suleiman Souhlal <suleiman@google.com>,
	Michal Hocko <mhocko@suse.cz>,
	Johannes Weiner <hannes@cmpxchg.org>, <devel@openvz.org>,
	Glauber Costa <glommer@parallels.com>,
	Christoph Lameter <cl@linux.com>,
	Pekka Enberg <penberg@cs.helsinki.fi>
Subject: [PATCH v2 23/29] memcg: destroy memcg caches
Date: Fri, 11 May 2012 14:44:25 -0300	[thread overview]
Message-ID: <1336758272-24284-24-git-send-email-glommer@parallels.com> (raw)
In-Reply-To: <1336758272-24284-1-git-send-email-glommer@parallels.com>

This patch implements destruction of memcg caches. Right now,
only caches where our reference counter is the last remaining are
deleted. If there are any other reference counters around, we just
leave the caches lying around until they go away.

When that happens, a destruction function is called from the cache
code. Caches are only destroyed in process context, so we queue them
up for later processing in the general case.

Signed-off-by: Glauber Costa <glommer@parallels.com>
CC: Christoph Lameter <cl@linux.com>
CC: Pekka Enberg <penberg@cs.helsinki.fi>
CC: Michal Hocko <mhocko@suse.cz>
CC: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
CC: Johannes Weiner <hannes@cmpxchg.org>
CC: Suleiman Souhlal <suleiman@google.com>
---
 include/linux/memcontrol.h |    2 +
 include/linux/slab.h       |    1 +
 mm/memcontrol.c            |   91 +++++++++++++++++++++++++++++++++++++++++++-
 mm/slab.c                  |    5 +-
 mm/slub.c                  |    7 ++-
 5 files changed, 101 insertions(+), 5 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 4000798..3e03f26 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -463,6 +463,8 @@ __mem_cgroup_get_kmem_cache(struct kmem_cache *cachep, gfp_t gfp);
 
 extern struct static_key mem_cgroup_kmem_enabled_key;
 #define mem_cgroup_kmem_on static_key_false(&mem_cgroup_kmem_enabled_key)
+
+void mem_cgroup_destroy_cache(struct kmem_cache *cachep);
 #else
 static inline void mem_cgroup_register_cache(struct mem_cgroup *memcg,
 					     struct kmem_cache *s)
diff --git a/include/linux/slab.h b/include/linux/slab.h
index e73ef71..a03a4f2 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -164,6 +164,7 @@ struct mem_cgroup_cache_params {
 	size_t orig_align;
 
 #endif
+	struct list_head destroyed_list; /* Used when deleting memcg cache */
 };
 #endif
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ad60648..1d1a307 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -476,6 +476,11 @@ static void disarm_static_keys(struct mem_cgroup *memcg)
 {
 	if (memcg->kmem_accounted)
 		static_key_slow_dec(&mem_cgroup_kmem_enabled_key);
+	/*
+	 * This check can't live in kmem destruction function,
+	 * since the charges will outlive the cgroup
+	 */
+	BUG_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
 }
 
 #ifdef CONFIG_INET
@@ -540,6 +545,8 @@ void mem_cgroup_register_cache(struct mem_cgroup *memcg,
 	if (!memcg)
 		id = ida_simple_get(&cache_types, 0, MAX_KMEM_CACHE_TYPES,
 				    GFP_KERNEL);
+	else
+		INIT_LIST_HEAD(&cachep->memcg_params.destroyed_list);
 	cachep->memcg_params.id = id;
 }
 
@@ -592,6 +599,53 @@ struct create_work {
 /* Use a single spinlock for destruction and creation, not a frequent op */
 static DEFINE_SPINLOCK(cache_queue_lock);
 static LIST_HEAD(create_queue);
+static LIST_HEAD(destroyed_caches);
+
+static void kmem_cache_destroy_work_func(struct work_struct *w)
+{
+	struct kmem_cache *cachep;
+	struct mem_cgroup_cache_params *p, *tmp;
+	unsigned long flags;
+	LIST_HEAD(del_unlocked);
+
+	spin_lock_irqsave(&cache_queue_lock, flags);
+	list_for_each_entry_safe(p, tmp, &destroyed_caches, destroyed_list) {
+		cachep = container_of(p, struct kmem_cache, memcg_params);
+		list_move(&cachep->memcg_params.destroyed_list, &del_unlocked);
+	}
+	spin_unlock_irqrestore(&cache_queue_lock, flags);
+
+	list_for_each_entry_safe(p, tmp, &del_unlocked, destroyed_list) {
+		cachep = container_of(p, struct kmem_cache, memcg_params);
+		list_del(&cachep->memcg_params.destroyed_list);
+		if (!atomic_read(&cachep->memcg_params.refcnt)) {
+			mem_cgroup_put(cachep->memcg_params.memcg);
+			kmem_cache_destroy(cachep);
+		}
+	}
+}
+static DECLARE_WORK(kmem_cache_destroy_work, kmem_cache_destroy_work_func);
+
+static void __mem_cgroup_destroy_cache(struct kmem_cache *cachep)
+{
+	BUG_ON(cachep->memcg_params.id != -1);
+	list_add(&cachep->memcg_params.destroyed_list, &destroyed_caches);
+}
+
+void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
+{
+	unsigned long flags;
+
+	/*
+	 * We have to defer the actual destroying to a workqueue, because
+	 * we might currently be in a context that cannot sleep.
+	 */
+	spin_lock_irqsave(&cache_queue_lock, flags);
+	__mem_cgroup_destroy_cache(cachep);
+	spin_unlock_irqrestore(&cache_queue_lock, flags);
+
+	schedule_work(&kmem_cache_destroy_work);
+}
 
 /*
  * Flush the queue of kmem_caches to create, because we're creating a cgroup.
@@ -613,6 +667,33 @@ void mem_cgroup_flush_cache_create_queue(void)
 	spin_unlock_irqrestore(&cache_queue_lock, flags);
 }
 
+static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
+{
+	struct kmem_cache *cachep;
+	unsigned long flags;
+	int i;
+
+	/*
+	 * pre_destroy() gets called with no tasks in the cgroup.
+	 * this means that after flushing the create queue, no more caches
+	 * will appear
+	 */
+	mem_cgroup_flush_cache_create_queue();
+
+	spin_lock_irqsave(&cache_queue_lock, flags);
+	for (i = 0; i < MAX_KMEM_CACHE_TYPES; i++) {
+		cachep = memcg->slabs[i];
+		if (!cachep)
+			continue;
+
+		if (atomic_dec_and_test(&cachep->memcg_params.refcnt))
+			__mem_cgroup_destroy_cache(cachep);
+	}
+	spin_unlock_irqrestore(&cache_queue_lock, flags);
+
+	schedule_work(&kmem_cache_destroy_work);
+}
+
 static void memcg_create_cache_work_func(struct work_struct *w)
 {
 	struct create_work *cw, *tmp;
@@ -854,6 +935,10 @@ static void memcg_slab_init(struct mem_cgroup *memcg)
 static inline void disarm_static_keys(struct mem_cgroup *memcg)
 {
 }
+
+static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
+{
+}
 #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
 
 static void drain_all_stock_async(struct mem_cgroup *memcg);
@@ -4133,6 +4218,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
 	int node, zid, shrink;
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct cgroup *cgrp = memcg->css.cgroup;
+	u64 usage;
 
 	css_get(&memcg->css);
 
@@ -4172,8 +4258,10 @@ move_account:
 		if (ret == -ENOMEM)
 			goto try_to_free;
 		cond_resched();
+		usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
+			res_counter_read_u64(&memcg->kmem, RES_USAGE);
 	/* "ret" should also be checked to ensure all lists are empty. */
-	} while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
+	} while (usage > 0 || ret);
 out:
 	css_put(&memcg->css);
 	return ret;
@@ -5518,6 +5606,7 @@ static int mem_cgroup_pre_destroy(struct cgroup *cont)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 
+	mem_cgroup_destroy_all_caches(memcg);
 	return mem_cgroup_force_empty(memcg, false);
 }
 
diff --git a/mm/slab.c b/mm/slab.c
index 7022f86..a6fd82e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1861,8 +1861,9 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
 void kmem_cache_drop_ref(struct kmem_cache *cachep)
 {
-	if (cachep->memcg_params.id == -1)
-		atomic_dec(&cachep->memcg_params.refcnt);
+	if (cachep->memcg_params.id == -1 &&
+	    unlikely(atomic_dec_and_test(&cachep->memcg_params.refcnt)))
+		mem_cgroup_destroy_cache(cachep);
 }
 #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
 
diff --git a/mm/slub.c b/mm/slub.c
index c70db56..02d8f5e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1297,8 +1297,11 @@ static void kmem_cache_inc_ref(struct kmem_cache *s)
 }
 static void kmem_cache_drop_ref(struct kmem_cache *s)
 {
-	if (s->memcg_params.memcg)
-		atomic_dec(&s->memcg_params.refcnt);
+	if (!s->memcg_params.memcg)
+		return;
+
+	if (unlikely(atomic_dec_and_test(&s->memcg_params.refcnt)))
+		mem_cgroup_destroy_cache(s);
 }
 #else
 static inline void kmem_cache_inc_ref(struct kmem_cache *s)
-- 
1.7.7.6


WARNING: multiple messages have this Message-ID (diff)
From: Glauber Costa <glommer@parallels.com>
To: linux-kernel@vger.kernel.org
Cc: cgroups@vger.kernel.org, linux-mm@kvack.org,
	kamezawa.hiroyu@jp.fujitsu.com, Tejun Heo <tj@kernel.org>,
	Li Zefan <lizefan@huawei.com>, Greg Thelen <gthelen@google.com>,
	Suleiman Souhlal <suleiman@google.com>,
	Michal Hocko <mhocko@suse.cz>,
	Johannes Weiner <hannes@cmpxchg.org>,
	devel@openvz.org, Glauber Costa <glommer@parallels.com>,
	Christoph Lameter <cl@linux.com>,
	Pekka Enberg <penberg@cs.helsinki.fi>
Subject: [PATCH v2 23/29] memcg: destroy memcg caches
Date: Fri, 11 May 2012 14:44:25 -0300	[thread overview]
Message-ID: <1336758272-24284-24-git-send-email-glommer@parallels.com> (raw)
In-Reply-To: <1336758272-24284-1-git-send-email-glommer@parallels.com>

This patch implements destruction of memcg caches. Right now,
only caches where our reference counter is the last remaining are
deleted. If there are any other reference counters around, we just
leave the caches lying around until they go away.

When that happens, a destruction function is called from the cache
code. Caches are only destroyed in process context, so we queue them
up for later processing in the general case.

Signed-off-by: Glauber Costa <glommer@parallels.com>
CC: Christoph Lameter <cl@linux.com>
CC: Pekka Enberg <penberg@cs.helsinki.fi>
CC: Michal Hocko <mhocko@suse.cz>
CC: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
CC: Johannes Weiner <hannes@cmpxchg.org>
CC: Suleiman Souhlal <suleiman@google.com>
---
 include/linux/memcontrol.h |    2 +
 include/linux/slab.h       |    1 +
 mm/memcontrol.c            |   91 +++++++++++++++++++++++++++++++++++++++++++-
 mm/slab.c                  |    5 +-
 mm/slub.c                  |    7 ++-
 5 files changed, 101 insertions(+), 5 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 4000798..3e03f26 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -463,6 +463,8 @@ __mem_cgroup_get_kmem_cache(struct kmem_cache *cachep, gfp_t gfp);
 
 extern struct static_key mem_cgroup_kmem_enabled_key;
 #define mem_cgroup_kmem_on static_key_false(&mem_cgroup_kmem_enabled_key)
+
+void mem_cgroup_destroy_cache(struct kmem_cache *cachep);
 #else
 static inline void mem_cgroup_register_cache(struct mem_cgroup *memcg,
 					     struct kmem_cache *s)
diff --git a/include/linux/slab.h b/include/linux/slab.h
index e73ef71..a03a4f2 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -164,6 +164,7 @@ struct mem_cgroup_cache_params {
 	size_t orig_align;
 
 #endif
+	struct list_head destroyed_list; /* Used when deleting memcg cache */
 };
 #endif
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ad60648..1d1a307 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -476,6 +476,11 @@ static void disarm_static_keys(struct mem_cgroup *memcg)
 {
 	if (memcg->kmem_accounted)
 		static_key_slow_dec(&mem_cgroup_kmem_enabled_key);
+	/*
+	 * This check can't live in kmem destruction function,
+	 * since the charges will outlive the cgroup
+	 */
+	BUG_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
 }
 
 #ifdef CONFIG_INET
@@ -540,6 +545,8 @@ void mem_cgroup_register_cache(struct mem_cgroup *memcg,
 	if (!memcg)
 		id = ida_simple_get(&cache_types, 0, MAX_KMEM_CACHE_TYPES,
 				    GFP_KERNEL);
+	else
+		INIT_LIST_HEAD(&cachep->memcg_params.destroyed_list);
 	cachep->memcg_params.id = id;
 }
 
@@ -592,6 +599,53 @@ struct create_work {
 /* Use a single spinlock for destruction and creation, not a frequent op */
 static DEFINE_SPINLOCK(cache_queue_lock);
 static LIST_HEAD(create_queue);
+static LIST_HEAD(destroyed_caches);
+
+static void kmem_cache_destroy_work_func(struct work_struct *w)
+{
+	struct kmem_cache *cachep;
+	struct mem_cgroup_cache_params *p, *tmp;
+	unsigned long flags;
+	LIST_HEAD(del_unlocked);
+
+	spin_lock_irqsave(&cache_queue_lock, flags);
+	list_for_each_entry_safe(p, tmp, &destroyed_caches, destroyed_list) {
+		cachep = container_of(p, struct kmem_cache, memcg_params);
+		list_move(&cachep->memcg_params.destroyed_list, &del_unlocked);
+	}
+	spin_unlock_irqrestore(&cache_queue_lock, flags);
+
+	list_for_each_entry_safe(p, tmp, &del_unlocked, destroyed_list) {
+		cachep = container_of(p, struct kmem_cache, memcg_params);
+		list_del(&cachep->memcg_params.destroyed_list);
+		if (!atomic_read(&cachep->memcg_params.refcnt)) {
+			mem_cgroup_put(cachep->memcg_params.memcg);
+			kmem_cache_destroy(cachep);
+		}
+	}
+}
+static DECLARE_WORK(kmem_cache_destroy_work, kmem_cache_destroy_work_func);
+
+static void __mem_cgroup_destroy_cache(struct kmem_cache *cachep)
+{
+	BUG_ON(cachep->memcg_params.id != -1);
+	list_add(&cachep->memcg_params.destroyed_list, &destroyed_caches);
+}
+
+void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
+{
+	unsigned long flags;
+
+	/*
+	 * We have to defer the actual destroying to a workqueue, because
+	 * we might currently be in a context that cannot sleep.
+	 */
+	spin_lock_irqsave(&cache_queue_lock, flags);
+	__mem_cgroup_destroy_cache(cachep);
+	spin_unlock_irqrestore(&cache_queue_lock, flags);
+
+	schedule_work(&kmem_cache_destroy_work);
+}
 
 /*
  * Flush the queue of kmem_caches to create, because we're creating a cgroup.
@@ -613,6 +667,33 @@ void mem_cgroup_flush_cache_create_queue(void)
 	spin_unlock_irqrestore(&cache_queue_lock, flags);
 }
 
+static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
+{
+	struct kmem_cache *cachep;
+	unsigned long flags;
+	int i;
+
+	/*
+	 * pre_destroy() gets called with no tasks in the cgroup.
+	 * this means that after flushing the create queue, no more caches
+	 * will appear
+	 */
+	mem_cgroup_flush_cache_create_queue();
+
+	spin_lock_irqsave(&cache_queue_lock, flags);
+	for (i = 0; i < MAX_KMEM_CACHE_TYPES; i++) {
+		cachep = memcg->slabs[i];
+		if (!cachep)
+			continue;
+
+		if (atomic_dec_and_test(&cachep->memcg_params.refcnt))
+			__mem_cgroup_destroy_cache(cachep);
+	}
+	spin_unlock_irqrestore(&cache_queue_lock, flags);
+
+	schedule_work(&kmem_cache_destroy_work);
+}
+
 static void memcg_create_cache_work_func(struct work_struct *w)
 {
 	struct create_work *cw, *tmp;
@@ -854,6 +935,10 @@ static void memcg_slab_init(struct mem_cgroup *memcg)
 static inline void disarm_static_keys(struct mem_cgroup *memcg)
 {
 }
+
+static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
+{
+}
 #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
 
 static void drain_all_stock_async(struct mem_cgroup *memcg);
@@ -4133,6 +4218,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
 	int node, zid, shrink;
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct cgroup *cgrp = memcg->css.cgroup;
+	u64 usage;
 
 	css_get(&memcg->css);
 
@@ -4172,8 +4258,10 @@ move_account:
 		if (ret == -ENOMEM)
 			goto try_to_free;
 		cond_resched();
+		usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
+			res_counter_read_u64(&memcg->kmem, RES_USAGE);
 	/* "ret" should also be checked to ensure all lists are empty. */
-	} while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
+	} while (usage > 0 || ret);
 out:
 	css_put(&memcg->css);
 	return ret;
@@ -5518,6 +5606,7 @@ static int mem_cgroup_pre_destroy(struct cgroup *cont)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 
+	mem_cgroup_destroy_all_caches(memcg);
 	return mem_cgroup_force_empty(memcg, false);
 }
 
diff --git a/mm/slab.c b/mm/slab.c
index 7022f86..a6fd82e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1861,8 +1861,9 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
 void kmem_cache_drop_ref(struct kmem_cache *cachep)
 {
-	if (cachep->memcg_params.id == -1)
-		atomic_dec(&cachep->memcg_params.refcnt);
+	if (cachep->memcg_params.id == -1 &&
+	    unlikely(atomic_dec_and_test(&cachep->memcg_params.refcnt)))
+		mem_cgroup_destroy_cache(cachep);
 }
 #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
 
diff --git a/mm/slub.c b/mm/slub.c
index c70db56..02d8f5e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1297,8 +1297,11 @@ static void kmem_cache_inc_ref(struct kmem_cache *s)
 }
 static void kmem_cache_drop_ref(struct kmem_cache *s)
 {
-	if (s->memcg_params.memcg)
-		atomic_dec(&s->memcg_params.refcnt);
+	if (!s->memcg_params.memcg)
+		return;
+
+	if (unlikely(atomic_dec_and_test(&s->memcg_params.refcnt)))
+		mem_cgroup_destroy_cache(s);
 }
 #else
 static inline void kmem_cache_inc_ref(struct kmem_cache *s)
-- 
1.7.7.6

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

WARNING: multiple messages have this Message-ID (diff)
From: Glauber Costa <glommer-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>
To: linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: cgroups-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org,
	kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A@public.gmane.org,
	Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>,
	Li Zefan <lizefan-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>,
	Greg Thelen <gthelen-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>,
	Suleiman Souhlal
	<suleiman-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>,
	Michal Hocko <mhocko-AlSwsSmVLrQ@public.gmane.org>,
	Johannes Weiner <hannes-druUgvl0LCNAfugRpC6u6w@public.gmane.org>,
	devel-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org,
	Glauber Costa <glommer-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>,
	Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org>,
	Pekka Enberg <penberg-bbCR+/B0CizivPeTLB3BmA@public.gmane.org>
Subject: [PATCH v2 23/29] memcg: destroy memcg caches
Date: Fri, 11 May 2012 14:44:25 -0300	[thread overview]
Message-ID: <1336758272-24284-24-git-send-email-glommer@parallels.com> (raw)
In-Reply-To: <1336758272-24284-1-git-send-email-glommer-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>

This patch implements destruction of memcg caches. Right now,
only caches where our reference counter is the last remaining are
deleted. If there are any other reference counters around, we just
leave the caches lying around until they go away.

When that happens, a destruction function is called from the cache
code. Caches are only destroyed in process context, so we queue them
up for later processing in the general case.

Signed-off-by: Glauber Costa <glommer-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>
CC: Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org>
CC: Pekka Enberg <penberg-bbCR+/B0CizivPeTLB3BmA@public.gmane.org>
CC: Michal Hocko <mhocko-AlSwsSmVLrQ@public.gmane.org>
CC: Kamezawa Hiroyuki <kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A@public.gmane.org>
CC: Johannes Weiner <hannes-druUgvl0LCNAfugRpC6u6w@public.gmane.org>
CC: Suleiman Souhlal <suleiman-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
---
 include/linux/memcontrol.h |    2 +
 include/linux/slab.h       |    1 +
 mm/memcontrol.c            |   91 +++++++++++++++++++++++++++++++++++++++++++-
 mm/slab.c                  |    5 +-
 mm/slub.c                  |    7 ++-
 5 files changed, 101 insertions(+), 5 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 4000798..3e03f26 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -463,6 +463,8 @@ __mem_cgroup_get_kmem_cache(struct kmem_cache *cachep, gfp_t gfp);
 
 extern struct static_key mem_cgroup_kmem_enabled_key;
 #define mem_cgroup_kmem_on static_key_false(&mem_cgroup_kmem_enabled_key)
+
+void mem_cgroup_destroy_cache(struct kmem_cache *cachep);
 #else
 static inline void mem_cgroup_register_cache(struct mem_cgroup *memcg,
 					     struct kmem_cache *s)
diff --git a/include/linux/slab.h b/include/linux/slab.h
index e73ef71..a03a4f2 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -164,6 +164,7 @@ struct mem_cgroup_cache_params {
 	size_t orig_align;
 
 #endif
+	struct list_head destroyed_list; /* Used when deleting memcg cache */
 };
 #endif
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ad60648..1d1a307 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -476,6 +476,11 @@ static void disarm_static_keys(struct mem_cgroup *memcg)
 {
 	if (memcg->kmem_accounted)
 		static_key_slow_dec(&mem_cgroup_kmem_enabled_key);
+	/*
+	 * This check can't live in kmem destruction function,
+	 * since the charges will outlive the cgroup
+	 */
+	BUG_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
 }
 
 #ifdef CONFIG_INET
@@ -540,6 +545,8 @@ void mem_cgroup_register_cache(struct mem_cgroup *memcg,
 	if (!memcg)
 		id = ida_simple_get(&cache_types, 0, MAX_KMEM_CACHE_TYPES,
 				    GFP_KERNEL);
+	else
+		INIT_LIST_HEAD(&cachep->memcg_params.destroyed_list);
 	cachep->memcg_params.id = id;
 }
 
@@ -592,6 +599,53 @@ struct create_work {
 /* Use a single spinlock for destruction and creation, not a frequent op */
 static DEFINE_SPINLOCK(cache_queue_lock);
 static LIST_HEAD(create_queue);
+static LIST_HEAD(destroyed_caches);
+
+static void kmem_cache_destroy_work_func(struct work_struct *w)
+{
+	struct kmem_cache *cachep;
+	struct mem_cgroup_cache_params *p, *tmp;
+	unsigned long flags;
+	LIST_HEAD(del_unlocked);
+
+	spin_lock_irqsave(&cache_queue_lock, flags);
+	list_for_each_entry_safe(p, tmp, &destroyed_caches, destroyed_list) {
+		cachep = container_of(p, struct kmem_cache, memcg_params);
+		list_move(&cachep->memcg_params.destroyed_list, &del_unlocked);
+	}
+	spin_unlock_irqrestore(&cache_queue_lock, flags);
+
+	list_for_each_entry_safe(p, tmp, &del_unlocked, destroyed_list) {
+		cachep = container_of(p, struct kmem_cache, memcg_params);
+		list_del(&cachep->memcg_params.destroyed_list);
+		if (!atomic_read(&cachep->memcg_params.refcnt)) {
+			mem_cgroup_put(cachep->memcg_params.memcg);
+			kmem_cache_destroy(cachep);
+		}
+	}
+}
+static DECLARE_WORK(kmem_cache_destroy_work, kmem_cache_destroy_work_func);
+
+static void __mem_cgroup_destroy_cache(struct kmem_cache *cachep)
+{
+	BUG_ON(cachep->memcg_params.id != -1);
+	list_add(&cachep->memcg_params.destroyed_list, &destroyed_caches);
+}
+
+void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
+{
+	unsigned long flags;
+
+	/*
+	 * We have to defer the actual destroying to a workqueue, because
+	 * we might currently be in a context that cannot sleep.
+	 */
+	spin_lock_irqsave(&cache_queue_lock, flags);
+	__mem_cgroup_destroy_cache(cachep);
+	spin_unlock_irqrestore(&cache_queue_lock, flags);
+
+	schedule_work(&kmem_cache_destroy_work);
+}
 
 /*
  * Flush the queue of kmem_caches to create, because we're creating a cgroup.
@@ -613,6 +667,33 @@ void mem_cgroup_flush_cache_create_queue(void)
 	spin_unlock_irqrestore(&cache_queue_lock, flags);
 }
 
+static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
+{
+	struct kmem_cache *cachep;
+	unsigned long flags;
+	int i;
+
+	/*
+	 * pre_destroy() gets called with no tasks in the cgroup.
+	 * this means that after flushing the create queue, no more caches
+	 * will appear
+	 */
+	mem_cgroup_flush_cache_create_queue();
+
+	spin_lock_irqsave(&cache_queue_lock, flags);
+	for (i = 0; i < MAX_KMEM_CACHE_TYPES; i++) {
+		cachep = memcg->slabs[i];
+		if (!cachep)
+			continue;
+
+		if (atomic_dec_and_test(&cachep->memcg_params.refcnt))
+			__mem_cgroup_destroy_cache(cachep);
+	}
+	spin_unlock_irqrestore(&cache_queue_lock, flags);
+
+	schedule_work(&kmem_cache_destroy_work);
+}
+
 static void memcg_create_cache_work_func(struct work_struct *w)
 {
 	struct create_work *cw, *tmp;
@@ -854,6 +935,10 @@ static void memcg_slab_init(struct mem_cgroup *memcg)
 static inline void disarm_static_keys(struct mem_cgroup *memcg)
 {
 }
+
+static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
+{
+}
 #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
 
 static void drain_all_stock_async(struct mem_cgroup *memcg);
@@ -4133,6 +4218,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
 	int node, zid, shrink;
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct cgroup *cgrp = memcg->css.cgroup;
+	u64 usage;
 
 	css_get(&memcg->css);
 
@@ -4172,8 +4258,10 @@ move_account:
 		if (ret == -ENOMEM)
 			goto try_to_free;
 		cond_resched();
+		usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
+			res_counter_read_u64(&memcg->kmem, RES_USAGE);
 	/* "ret" should also be checked to ensure all lists are empty. */
-	} while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
+	} while (usage > 0 || ret);
 out:
 	css_put(&memcg->css);
 	return ret;
@@ -5518,6 +5606,7 @@ static int mem_cgroup_pre_destroy(struct cgroup *cont)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 
+	mem_cgroup_destroy_all_caches(memcg);
 	return mem_cgroup_force_empty(memcg, false);
 }
 
diff --git a/mm/slab.c b/mm/slab.c
index 7022f86..a6fd82e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1861,8 +1861,9 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
 void kmem_cache_drop_ref(struct kmem_cache *cachep)
 {
-	if (cachep->memcg_params.id == -1)
-		atomic_dec(&cachep->memcg_params.refcnt);
+	if (cachep->memcg_params.id == -1 &&
+	    unlikely(atomic_dec_and_test(&cachep->memcg_params.refcnt)))
+		mem_cgroup_destroy_cache(cachep);
 }
 #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
 
diff --git a/mm/slub.c b/mm/slub.c
index c70db56..02d8f5e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1297,8 +1297,11 @@ static void kmem_cache_inc_ref(struct kmem_cache *s)
 }
 static void kmem_cache_drop_ref(struct kmem_cache *s)
 {
-	if (s->memcg_params.memcg)
-		atomic_dec(&s->memcg_params.refcnt);
+	if (!s->memcg_params.memcg)
+		return;
+
+	if (unlikely(atomic_dec_and_test(&s->memcg_params.refcnt)))
+		mem_cgroup_destroy_cache(s);
 }
 #else
 static inline void kmem_cache_inc_ref(struct kmem_cache *s)
-- 
1.7.7.6

  parent reply	other threads:[~2012-05-11 17:52 UTC|newest]

Thread overview: 167+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-05-11 17:44 [PATCH v2 00/29] kmem limitation for memcg Glauber Costa
2012-05-11 17:44 ` Glauber Costa
2012-05-11 17:44 ` Glauber Costa
2012-05-11 17:44 ` [PATCH v2 01/29] slab: dup name string Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-15 22:04   ` David Rientjes
2012-05-15 22:04     ` David Rientjes
2012-05-16  6:12     ` Glauber Costa
2012-05-16  6:12       ` Glauber Costa
2012-05-16  6:12       ` Glauber Costa
2012-05-11 17:44 ` [PATCH v2 02/29] slub: fix slab_state for slub Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:51   ` Christoph Lameter
2012-05-11 17:51     ` Christoph Lameter
2012-05-15 21:55   ` David Rientjes
2012-05-15 21:55     ` David Rientjes
2012-05-15 21:55     ` David Rientjes
2012-05-16  6:10     ` Glauber Costa
2012-05-16  6:10       ` Glauber Costa
2012-05-16  6:10       ` Glauber Costa
2012-05-17 10:14     ` Glauber Costa
2012-05-17 10:14       ` Glauber Costa
2012-05-17 10:14       ` Glauber Costa
2012-05-11 17:44 ` [PATCH v2 03/29] memcg: Always free struct memcg through schedule_work() Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44 ` [PATCH v2 04/29] slub: always get the cache from its page in kfree Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:53   ` Christoph Lameter
2012-05-11 17:53     ` Christoph Lameter
2012-05-11 17:57     ` Glauber Costa
2012-05-11 17:57       ` Glauber Costa
2012-05-11 17:57       ` Glauber Costa
2012-05-11 18:06       ` Christoph Lameter
2012-05-11 18:06         ` Christoph Lameter
2012-05-11 18:11         ` Glauber Costa
2012-05-11 18:11           ` Glauber Costa
2012-05-11 18:11           ` Glauber Costa
2012-05-11 18:17           ` Christoph Lameter
2012-05-11 18:17             ` Christoph Lameter
2012-05-11 18:20             ` Glauber Costa
2012-05-11 18:20               ` Glauber Costa
2012-05-11 18:20               ` Glauber Costa
2012-05-11 18:32               ` Christoph Lameter
2012-05-11 18:32                 ` Christoph Lameter
2012-05-11 18:32                 ` Christoph Lameter
2012-05-11 18:42                 ` Glauber Costa
2012-05-11 18:42                   ` Glauber Costa
2012-05-11 18:42                   ` Glauber Costa
2012-05-11 18:56                   ` Christoph Lameter
2012-05-11 18:56                     ` Christoph Lameter
2012-05-11 18:56                     ` Christoph Lameter
2012-05-11 18:58                     ` Glauber Costa
2012-05-11 18:58                       ` Glauber Costa
2012-05-11 18:58                       ` Glauber Costa
2012-05-11 19:09                       ` Christoph Lameter
2012-05-11 19:09                         ` Christoph Lameter
2012-05-11 19:11                         ` Glauber Costa
2012-05-11 19:11                           ` Glauber Costa
2012-05-11 19:11                           ` Glauber Costa
2012-05-11 19:20                           ` Christoph Lameter
2012-05-11 19:20                             ` Christoph Lameter
2012-05-11 19:24                             ` Glauber Costa
2012-05-11 19:24                               ` Glauber Costa
2012-05-11 19:24                               ` Glauber Costa
2012-05-11 17:44 ` [PATCH v2 05/29] slab: rename gfpflags to allocflags Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:54   ` Christoph Lameter
2012-05-11 17:54     ` Christoph Lameter
2012-05-15 21:57   ` David Rientjes
2012-05-15 21:57     ` David Rientjes
2012-05-11 17:44 ` [PATCH v2 06/29] memcg: Make it possible to use the stock for more than one page Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44 ` [PATCH v2 07/29] memcg: Reclaim when more than one page needed Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44 ` [PATCH v2 08/29] slab: use obj_size field of struct kmem_cache when not debugging Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44 ` [PATCH v2 09/29] memcg: change defines to an enum Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44 ` [PATCH v2 10/29] res_counter: don't force return value checking in res_counter_charge_nofail Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44 ` [PATCH v2 11/29] cgroups: ability to stop res charge propagation on bounded ancestor Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-15  2:59   ` KAMEZAWA Hiroyuki
2012-05-15  2:59     ` KAMEZAWA Hiroyuki
2012-05-16  6:16     ` Glauber Costa
2012-05-16  6:16       ` Glauber Costa
2012-05-16  6:16       ` Glauber Costa
2012-05-11 17:44 ` [PATCH v2 12/29] kmem slab accounting basic infrastructure Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44 ` [PATCH v2 13/29] slab/slub: struct memcg_params Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44 ` [PATCH v2 14/29] slub: consider a memcg parameter in kmem_create_cache Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44 ` [PATCH v2 15/29] slab: pass memcg parameter to kmem_cache_create Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44 ` [PATCH v2 16/29] slub: create duplicate cache Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44 ` [PATCH v2 17/29] slab: " Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44 ` [PATCH v2 18/29] memcg: kmem controller charge/uncharge infrastructure Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-15  2:57   ` KAMEZAWA Hiroyuki
2012-05-15  2:57     ` KAMEZAWA Hiroyuki
2012-05-16  6:42     ` Glauber Costa
2012-05-16  6:42       ` Glauber Costa
2012-05-16  8:18       ` KAMEZAWA Hiroyuki
2012-05-16  8:18         ` KAMEZAWA Hiroyuki
2012-05-16  8:25         ` Glauber Costa
2012-05-16  8:25           ` Glauber Costa
2012-05-16  8:25           ` Glauber Costa
2012-05-16  9:15           ` KAMEZAWA Hiroyuki
2012-05-16  9:15             ` KAMEZAWA Hiroyuki
2012-05-16  9:15             ` KAMEZAWA Hiroyuki
2012-05-11 17:44 ` [PATCH v2 19/29] skip memcg kmem allocations in specified code regions Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-15  2:46   ` KAMEZAWA Hiroyuki
2012-05-15  2:46     ` KAMEZAWA Hiroyuki
2012-05-16  6:19     ` Glauber Costa
2012-05-16  6:19       ` Glauber Costa
2012-05-16  6:19       ` Glauber Costa
2012-05-16  7:55       ` KAMEZAWA Hiroyuki
2012-05-16  7:55         ` KAMEZAWA Hiroyuki
2012-05-16  7:55         ` KAMEZAWA Hiroyuki
2012-05-11 17:44 ` [PATCH v2 20/29] slub: charge allocation to a memcg Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44 ` [PATCH v2 21/29] slab: per-memcg accounting of slab caches Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44 ` [PATCH v2 22/29] memcg: disable kmem code when not in use Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44 ` Glauber Costa [this message]
2012-05-11 17:44   ` [PATCH v2 23/29] memcg: destroy memcg caches Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44 ` [PATCH v2 24/29] memcg/slub: shrink dead caches Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44 ` [PATCH v2 25/29] memcg: Track all the memcg children of a kmem_cache Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44 ` [PATCH v2 26/29] memcg: Per-memcg memory.kmem.slabinfo file Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44 ` [PATCH v2 27/29] slub: create slabinfo file for memcg Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44 ` [PATCH v2 28/29] slub: track all children of a kmem cache Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 17:44 ` [PATCH v2 29/29] Documentation: add documentation for slab tracker for memcg Glauber Costa
2012-05-11 17:44   ` Glauber Costa
2012-05-11 18:05 ` [PATCH v2 00/29] kmem limitation " Glauber Costa
2012-05-11 18:05   ` Glauber Costa

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1336758272-24284-24-git-send-email-glommer@parallels.com \
    --to=glommer@parallels.com \
    --cc=cgroups@vger.kernel.org \
    --cc=cl@linux.com \
    --cc=devel@openvz.org \
    --cc=gthelen@google.com \
    --cc=hannes@cmpxchg.org \
    --cc=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=lizefan@huawei.com \
    --cc=mhocko@suse.cz \
    --cc=penberg@cs.helsinki.fi \
    --cc=suleiman@google.com \
    --cc=tj@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.