* [PATCH RFC 0/3] kmemcg slab reparenting
@ 2014-05-13 13:48 Vladimir Davydov
  2014-05-13 13:48 ` [PATCH RFC 1/3] slub: keep full slabs on list for per memcg caches Vladimir Davydov
                   ` (2 more replies)
  0 siblings, 3 replies; 31+ messages in thread
From: Vladimir Davydov @ 2014-05-13 13:48 UTC (permalink / raw)
  To: hannes, mhocko, cl; +Cc: akpm, linux-kernel, linux-mm

Hi Johannes, Michal, Christoph,

Recently I posted my thoughts on how we can handle kmem caches of dead
memcgs:

https://lkml.org/lkml/2014/4/20/38

The only feedback I got then was from Johannes, who voted for migrating
slabs of such caches to the parent memcg's cache (so-called
reparenting), so in this RFC I'd like to propose a draft implementation
of slab reparenting. I'd appreciate it if you could look through it and
say whether it's worth developing in this direction or not.

The implementation of reparenting is given in patch 3, which is the most
important part of this set. Patch 1 just makes slub keep full slabs on a
list, and patch 2 slightly extends the percpu-refcount interface.

NOTE: the implementation is given only for slub, though it should be easy
to implement the same hack for slab.

Thanks,

Vladimir Davydov (3):
  slub: keep full slabs on list for per memcg caches
  percpu-refcount: allow to get dead reference
  slub: reparent memcg caches' slabs on memcg offline

 include/linux/memcontrol.h      |    4 +-
 include/linux/percpu-refcount.h |   11 +-
 include/linux/slab.h            |    7 +-
 mm/memcontrol.c                 |   54 ++++---
 mm/slab.h                       |    7 +-
 mm/slub.c                       |  299 ++++++++++++++++++++++++++++++++++-----
 6 files changed, 318 insertions(+), 64 deletions(-)

-- 
1.7.10.4



* [PATCH RFC 1/3] slub: keep full slabs on list for per memcg caches
  2014-05-13 13:48 [PATCH RFC 0/3] kmemcg slab reparenting Vladimir Davydov
@ 2014-05-13 13:48 ` Vladimir Davydov
  2014-05-14 16:16   ` Christoph Lameter
  2014-05-13 13:48 ` [PATCH RFC 2/3] percpu-refcount: allow to get dead reference Vladimir Davydov
  2014-05-13 13:48 ` [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline Vladimir Davydov
  2 siblings, 1 reply; 31+ messages in thread
From: Vladimir Davydov @ 2014-05-13 13:48 UTC (permalink / raw)
  To: hannes, mhocko, cl; +Cc: akpm, linux-kernel, linux-mm

Currently full slabs are only kept on per-node lists for debugging, but
we need this feature to reparent per memcg caches, so let's enable it
for them too.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
---
 mm/slab.h |    2 ++
 mm/slub.c |   91 +++++++++++++++++++++++++++++++++++++++++--------------------
 2 files changed, 63 insertions(+), 30 deletions(-)

diff --git a/mm/slab.h b/mm/slab.h
index 961a3fb1f5a2..0eca922ed7a0 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -288,6 +288,8 @@ struct kmem_cache_node {
 #ifdef CONFIG_SLUB_DEBUG
 	atomic_long_t nr_slabs;
 	atomic_long_t total_objects;
+#endif
+#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_SLUB_DEBUG)
 	struct list_head full;
 #endif
 #endif
diff --git a/mm/slub.c b/mm/slub.c
index 4d5002f518b1..6019c315a2f9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -132,6 +132,11 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
 #endif
 }
 
+static inline bool kmem_cache_tracks_full(struct kmem_cache *s)
+{
+	return !is_root_cache(s) || kmem_cache_debug(s);
+}
+
 /*
  * Issues still to be resolved:
  *
@@ -998,28 +1003,6 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
 		debug_check_no_obj_freed(x, s->object_size);
 }
 
-/*
- * Tracking of fully allocated slabs for debugging purposes.
- */
-static void add_full(struct kmem_cache *s,
-	struct kmem_cache_node *n, struct page *page)
-{
-	if (!(s->flags & SLAB_STORE_USER))
-		return;
-
-	lockdep_assert_held(&n->list_lock);
-	list_add(&page->lru, &n->full);
-}
-
-static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
-{
-	if (!(s->flags & SLAB_STORE_USER))
-		return;
-
-	lockdep_assert_held(&n->list_lock);
-	list_del(&page->lru);
-}
-
 /* Tracking of the number of slabs for debugging purposes */
 static inline unsigned long slabs_node(struct kmem_cache *s, int node)
 {
@@ -1259,10 +1242,6 @@ static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
 			{ return 1; }
 static inline int check_object(struct kmem_cache *s, struct page *page,
 			void *object, u8 val) { return 1; }
-static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
-					struct page *page) {}
-static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
-					struct page *page) {}
 static inline unsigned long kmem_cache_flags(unsigned long object_size,
 	unsigned long flags, const char *name,
 	void (*ctor)(void *))
@@ -1557,6 +1536,33 @@ static inline void remove_partial(struct kmem_cache_node *n,
 	__remove_partial(n, page);
 }
 
+#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_MEMCG_KMEM)
+static inline void add_full(struct kmem_cache *s,
+			    struct kmem_cache_node *n, struct page *page)
+{
+	if (is_root_cache(s) && !(s->flags & SLAB_STORE_USER))
+		return;
+
+	lockdep_assert_held(&n->list_lock);
+	list_add(&page->lru, &n->full);
+}
+
+static inline void remove_full(struct kmem_cache *s,
+			       struct kmem_cache_node *n, struct page *page)
+{
+	if (is_root_cache(s) && !(s->flags & SLAB_STORE_USER))
+		return;
+
+	lockdep_assert_held(&n->list_lock);
+	list_del(&page->lru);
+}
+#else
+static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
+					struct page *page) {}
+static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
+					struct page *page) {}
+#endif
+
 /*
  * Remove slab from the partial list, freeze it and
  * return the pointer to the freelist.
@@ -1896,7 +1902,7 @@ redo:
 		}
 	} else {
 		m = M_FULL;
-		if (kmem_cache_debug(s) && !lock) {
+		if (kmem_cache_tracks_full(s) && !lock) {
 			lock = 1;
 			/*
 			 * This also ensures that the scanning of full
@@ -2257,8 +2263,14 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
 	struct page new;
 	unsigned long counters;
 	void *freelist;
+	struct kmem_cache_node *n = NULL;
 
 	do {
+		if (unlikely(n)) {
+			spin_unlock(&n->list_lock);
+			n = NULL;
+		}
+
 		freelist = page->freelist;
 		counters = page->counters;
 
@@ -2268,11 +2280,21 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
 		new.inuse = page->objects;
 		new.frozen = freelist != NULL;
 
+		if (kmem_cache_tracks_full(s) && !new.frozen) {
+			n = get_node(s, page_to_nid(page));
+			spin_lock(&n->list_lock);
+		}
+
 	} while (!__cmpxchg_double_slab(s, page,
 		freelist, counters,
 		NULL, new.counters,
 		"get_freelist"));
 
+	if (n) {
+		add_full(s, n, page);
+		spin_unlock(&n->list_lock);
+	}
+
 	return freelist;
 }
 
@@ -2575,7 +2597,8 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 		new.inuse--;
 		if ((!new.inuse || !prior) && !was_frozen) {
 
-			if (kmem_cache_has_cpu_partial(s) && !prior) {
+			if (kmem_cache_has_cpu_partial(s) &&
+			    !kmem_cache_tracks_full(s) && !prior) {
 
 				/*
 				 * Slab was on no list before and will be
@@ -2587,6 +2610,9 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 
 			} else { /* Needs to be taken off a list */
 
+				if (kmem_cache_has_cpu_partial(s) && !prior)
+					new.frozen = 1;
+
 	                        n = get_node(s, page_to_nid(page));
 				/*
 				 * Speculatively acquire the list_lock.
@@ -2606,6 +2632,12 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 		object, new.counters,
 		"__slab_free"));
 
+	if (unlikely(n) && new.frozen && !was_frozen) {
+		remove_full(s, n, page);
+		spin_unlock_irqrestore(&n->list_lock, flags);
+		n = NULL;
+	}
+
 	if (likely(!n)) {
 
 		/*
@@ -2633,8 +2665,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 	 * then add it.
 	 */
 	if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
-		if (kmem_cache_debug(s))
-			remove_full(s, n, page);
+		remove_full(s, n, page);
 		add_partial(n, page, DEACTIVATE_TO_TAIL);
 		stat(s, FREE_ADD_PARTIAL);
 	}
-- 
1.7.10.4



* [PATCH RFC 2/3] percpu-refcount: allow to get dead reference
  2014-05-13 13:48 [PATCH RFC 0/3] kmemcg slab reparenting Vladimir Davydov
  2014-05-13 13:48 ` [PATCH RFC 1/3] slub: keep full slabs on list for per memcg caches Vladimir Davydov
@ 2014-05-13 13:48 ` Vladimir Davydov
  2014-05-13 13:48 ` [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline Vladimir Davydov
  2 siblings, 0 replies; 31+ messages in thread
From: Vladimir Davydov @ 2014-05-13 13:48 UTC (permalink / raw)
  To: hannes, mhocko, cl; +Cc: akpm, linux-kernel, linux-mm

Currently percpu_ref_tryget fails after percpu_ref_kill is called, even
if refcnt != 0. In the next patch I need a method to get a ref that will
only fail if refcnt == 0, so let's extend the percpu-ref interface a
bit to allow that.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
---
 include/linux/percpu-refcount.h |   11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 95961f0bf62d..7729e59b9cb7 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -129,7 +129,8 @@ static inline void percpu_ref_get(struct percpu_ref *ref)
  * used.  After the confirm_kill callback is invoked, it's guaranteed that
  * no new reference will be given out by percpu_ref_tryget().
  */
-static inline bool percpu_ref_tryget(struct percpu_ref *ref)
+static inline bool __percpu_ref_tryget(struct percpu_ref *ref,
+				       bool maybe_dead)
 {
 	unsigned __percpu *pcpu_count;
 	int ret = false;
@@ -141,13 +142,19 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref)
 	if (likely(REF_STATUS(pcpu_count) == PCPU_REF_PTR)) {
 		__this_cpu_inc(*pcpu_count);
 		ret = true;
-	}
+	} else if (maybe_dead && unlikely(atomic_read(&ref->count)))
+		ret = atomic_inc_not_zero(&ref->count);
 
 	rcu_read_unlock_sched();
 
 	return ret;
 }
 
+static inline bool percpu_ref_tryget(struct percpu_ref *ref)
+{
+	return __percpu_ref_tryget(ref, false);
+}
+
 /**
  * percpu_ref_put - decrement a percpu refcount
  * @ref: percpu_ref to put
-- 
1.7.10.4



* [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline
  2014-05-13 13:48 [PATCH RFC 0/3] kmemcg slab reparenting Vladimir Davydov
  2014-05-13 13:48 ` [PATCH RFC 1/3] slub: keep full slabs on list for per memcg caches Vladimir Davydov
  2014-05-13 13:48 ` [PATCH RFC 2/3] percpu-refcount: allow to get dead reference Vladimir Davydov
@ 2014-05-13 13:48 ` Vladimir Davydov
  2014-05-14 16:20   ` Christoph Lameter
  2 siblings, 1 reply; 31+ messages in thread
From: Vladimir Davydov @ 2014-05-13 13:48 UTC (permalink / raw)
  To: hannes, mhocko, cl; +Cc: akpm, linux-kernel, linux-mm

As there won't be any allocations from kmem caches that belong to a dead
memcg, it's better to move their slab pages to the parent cache on css
offline instead of having them hang around for an indefinite time and
worrying about reaping them periodically or on vmpressure.

A tricky thing about it is that there may still be free's to those
caches, so we should synchronize with them somehow. This is difficult,
because the slub implementation is mostly lockless, and there is no
magic lock that can be used for this.

This patch solves this problem by switching all free's to dead caches to
the "slow" mode while we are migrating their slab pages. The "slow" mode
uses the PG_locked bit of the slab for synchronization. Under this lock
it just puts the object being freed onto the page's freelist, without
bothering about the per cpu/node slab lists, which will be handled by
the reparenting procedure. This gives us a clear synchronization point
between kfree's and the reparenting procedure - the PG_locked bit spin
lock, which we take both for freeing an object and for changing a slab's
slab_cache ptr on reparenting. Although the "slow" mode free is really
slow, there shouldn't be many of them, because they can only happen
while reparenting is in progress, which shouldn't take long.

Since the "slow" and the "normal" free's can't coexist at the same time,
we must assure all conventional free's have finished before switching
all further free's to the "slow" mode and starting reparenting. To
achieve that, a percpu refcounter is used. It is taken and held during
each "normal" free. The refcounter is killed on memcg offline, and the
cache's pages migration is initiated from the refcounter's release
function. If we fail to take a ref on kfree, it means all "normal"
free's have been completed and the cache is being reparented right now,
so we should free the object using the "slow" mode.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
---
 include/linux/memcontrol.h |    4 +-
 include/linux/slab.h       |    7 +-
 mm/memcontrol.c            |   54 +++++++-----
 mm/slab.h                  |    5 +-
 mm/slub.c                  |  208 +++++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 246 insertions(+), 32 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index aa429de275cc..ce6bb47d59e5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -511,8 +511,8 @@ void memcg_update_array_size(int num_groups);
 struct kmem_cache *
 __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp);
 
-int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order);
-void __memcg_uncharge_slab(struct kmem_cache *cachep, int order);
+int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size);
+void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size);
 
 int __kmem_cache_destroy_memcg_children(struct kmem_cache *s);
 
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 86e5b26fbdab..ce2189ac4899 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -14,6 +14,8 @@
 #include <linux/gfp.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
+#include <linux/rcupdate.h>
+#include <linux/percpu-refcount.h>
 
 
 /*
@@ -526,7 +528,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
  * @memcg: pointer to the memcg this cache belongs to
  * @list: list_head for the list of all caches in this memcg
  * @root_cache: pointer to the global, root cache, this cache was derived from
- * @nr_pages: number of pages that belongs to this cache.
  */
 struct memcg_cache_params {
 	bool is_root_cache;
@@ -539,7 +540,9 @@ struct memcg_cache_params {
 			struct mem_cgroup *memcg;
 			struct list_head list;
 			struct kmem_cache *root_cache;
-			atomic_t nr_pages;
+
+			struct percpu_ref refcnt;
+			struct work_struct destroy_work;
 		};
 	};
 };
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f5ea266f4d9a..4156010ee5a1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2942,7 +2942,7 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
 }
 #endif
 
-static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
+int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
 {
 	struct res_counter *fail_res;
 	int ret = 0;
@@ -2980,7 +2980,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
 	return ret;
 }
 
-static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
+void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
 {
 	res_counter_uncharge(&memcg->res, size);
 	if (do_swap_account)
@@ -3090,9 +3090,12 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
 	return 0;
 }
 
+static void memcg_kmem_cache_release_func(struct percpu_ref *ref);
+
 int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
 			     struct kmem_cache *root_cache)
 {
+	int err;
 	size_t size;
 
 	if (!memcg_kmem_enabled())
@@ -3109,6 +3112,12 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
 		return -ENOMEM;
 
 	if (memcg) {
+		err = percpu_ref_init(&s->memcg_params->refcnt,
+				      memcg_kmem_cache_release_func);
+		if (err) {
+			kfree(s->memcg_params);
+			return err;
+		}
 		s->memcg_params->memcg = memcg;
 		s->memcg_params->root_cache = root_cache;
 		css_get(&memcg->css);
@@ -3192,6 +3201,26 @@ static void memcg_kmem_destroy_cache(struct kmem_cache *cachep)
 	kmem_cache_destroy(cachep);
 }
 
+static void memcg_kmem_cache_destroy_func(struct work_struct *work)
+{
+	struct memcg_cache_params *params =
+		container_of(work, struct memcg_cache_params, destroy_work);
+	struct kmem_cache *cachep = memcg_params_to_cache(params);
+
+	mutex_lock(&memcg_slab_mutex);
+	memcg_kmem_destroy_cache(cachep);
+	mutex_unlock(&memcg_slab_mutex);
+}
+
+static void memcg_kmem_cache_release_func(struct percpu_ref *ref)
+{
+	struct memcg_cache_params *params =
+		container_of(ref, struct memcg_cache_params, refcnt);
+
+	INIT_WORK(&params->destroy_work, memcg_kmem_cache_destroy_func);
+	schedule_work(&params->destroy_work);
+}
+
 /*
  * During the creation a new cache, we need to disable our accounting mechanism
  * altogether. This is true even if we are not creating, but rather just
@@ -3254,9 +3283,7 @@ static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
 	mutex_lock(&memcg_slab_mutex);
 	list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
 		cachep = memcg_params_to_cache(params);
-		kmem_cache_shrink(cachep);
-		if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
-			memcg_kmem_destroy_cache(cachep);
+		percpu_ref_kill(&cachep->memcg_params->refcnt);
 	}
 	mutex_unlock(&memcg_slab_mutex);
 }
@@ -3321,23 +3348,6 @@ static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
 	memcg_resume_kmem_account();
 }
 
-int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
-{
-	int res;
-
-	res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp,
-				PAGE_SIZE << order);
-	if (!res)
-		atomic_add(1 << order, &cachep->memcg_params->nr_pages);
-	return res;
-}
-
-void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
-{
-	memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order);
-	atomic_sub(1 << order, &cachep->memcg_params->nr_pages);
-}
-
 /*
  * Return the kmem_cache we're supposed to use for a slab allocation.
  * We try to use the current memcg's version of the cache.
diff --git a/mm/slab.h b/mm/slab.h
index 0eca922ed7a0..c084935a1c29 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -186,7 +186,8 @@ static __always_inline int memcg_charge_slab(struct kmem_cache *s,
 		return 0;
 	if (is_root_cache(s))
 		return 0;
-	return __memcg_charge_slab(s, gfp, order);
+	return memcg_charge_kmem(s->memcg_params->memcg, gfp,
+				 PAGE_SIZE << order);
 }
 
 static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
@@ -195,7 +196,7 @@ static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
 		return;
 	if (is_root_cache(s))
 		return;
-	__memcg_uncharge_slab(s, order);
+	memcg_uncharge_kmem(s->memcg_params->memcg, PAGE_SIZE << order);
 }
 #else
 static inline bool is_root_cache(struct kmem_cache *s)
diff --git a/mm/slub.c b/mm/slub.c
index 6019c315a2f9..dfe7d3695a9e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2700,7 +2700,7 @@ slab_empty:
  * If fastpath is not possible then fall back to __slab_free where we deal
  * with all sorts of special processing.
  */
-static __always_inline void slab_free(struct kmem_cache *s,
+static __always_inline void do_slab_free(struct kmem_cache *s,
 			struct page *page, void *x, unsigned long addr)
 {
 	void **object = (void *)x;
@@ -2736,19 +2736,218 @@ redo:
 		stat(s, FREE_FASTPATH);
 	} else
 		__slab_free(s, page, x, addr);
+}
+
+#ifdef CONFIG_MEMCG_KMEM
+static __always_inline void slab_free(struct kmem_cache *s,
+			struct page *page, void *x, unsigned long addr)
+{
+	void **object = (void *)x;
+	unsigned long flags;
+
+	if (!memcg_kmem_enabled())
+		return do_slab_free(s, page, x, addr);
+
+retry:
+	/* page->slab_cache is RCU-protected */
+	rcu_read_lock_sched();
+
+	s = ACCESS_ONCE(page->slab_cache);
+	if (is_root_cache(s)) {
+		rcu_read_unlock_sched();
+		return do_slab_free(s, page, x, addr);
+	}
+
+	/*
+	 * Usual percpu_ref_tryget() will fail after percpu_ref_kill() is
+	 * called, even if refcnt != 0, which means there may be free's
+	 * operating in the "normal" mode in flight. To avoid races with them,
+	 * switch to the "slow" mode only if refcnt == 0.
+	 */
+	if (__percpu_ref_tryget(&s->memcg_params->refcnt, true)) {
+		rcu_read_unlock_sched();
+		do_slab_free(s, page, x, addr);
+		percpu_ref_put(&s->memcg_params->refcnt);
+		return;
+	}
+
+	/*
+	 * This is the "slow" mode, which locks the slab to avoid races with
+	 * slab_reparent(). Note, there is no need bothering about per cpu/node
+	 * list consistency, because this will be handled by slab_reparent().
+	 */
+	local_irq_save(flags);
+	slab_lock(page);
+
+	if (unlikely(s != page->slab_cache)) {
+		/* the slab was reparented while we were trying to lock it */
+		slab_unlock(page);
+		local_irq_restore(flags);
+		rcu_read_unlock_sched();
+		goto retry;
+	}
+
+	slab_free_hook(s, x);
+
+	set_freepointer(s, object, page->freelist);
+	page->freelist = object;
+	page->inuse--;
 
+	slab_unlock(page);
+	local_irq_restore(flags);
+	rcu_read_unlock_sched();
 }
+#else
+static __always_inline void slab_free(struct kmem_cache *s,
+			struct page *page, void *x, unsigned long addr)
+{
+	do_slab_free(s, page, x, addr);
+}
+#endif /* CONFIG_MEMCG_KMEM */
 
 void kmem_cache_free(struct kmem_cache *s, void *x)
 {
-	s = cache_from_obj(s, x);
-	if (!s)
-		return;
 	slab_free(s, virt_to_head_page(x), x, _RET_IP_);
 	trace_kmem_cache_free(_RET_IP_, x);
 }
 EXPORT_SYMBOL(kmem_cache_free);
 
+#ifdef CONFIG_MEMCG_KMEM
+static void __slab_reparent(struct kmem_cache *from, struct kmem_cache *to,
+			    u64 *size)
+{
+	int node, cpu;
+	struct page *page;
+	LIST_HEAD(slabs);
+	LIST_HEAD(slabs_tofree);
+
+	*size = 0;
+
+	/*
+	 * All free's to the @from cache are now operating in the "slow" mode,
+	 * which means we only need to take the slab lock to safely modify per
+	 * slab data.
+	 */
+
+	for_each_possible_cpu(cpu) {
+		struct kmem_cache_cpu *c = per_cpu_ptr(from->cpu_slab, cpu);
+
+		page = c->page;
+		if (page) {
+			void *lastfree = NULL, *freelist = c->freelist;
+			int nr_objects = 0;
+
+			/* drain per-cpu freelist */
+			while (freelist) {
+				nr_objects++;
+				lastfree = freelist;
+				freelist = get_freepointer(from, freelist);
+			}
+
+			if (lastfree) {
+				local_irq_disable();
+				slab_lock(page);
+
+				set_freepointer(from, lastfree, page->freelist);
+				page->freelist = c->freelist;
+				VM_BUG_ON(page->inuse < nr_objects);
+				page->inuse -= nr_objects;
+
+				slab_unlock(page);
+				local_irq_enable();
+			}
+
+			list_add(&page->lru, &slabs);
+			c->page = NULL;
+			c->freelist = NULL;
+		}
+
+		/* drain per-cpu list of partial slabs */
+		while ((page = c->partial) != NULL) {
+			c->partial = page->next;
+			list_add(&page->lru, &slabs);
+		}
+	}
+
+	for_each_node_state(node, N_POSSIBLE) {
+		struct kmem_cache_node *n = get_node(from, node);
+
+		if (!n)
+			continue;
+
+		list_splice_init(&n->partial, &slabs);
+		list_splice_init(&n->full, &slabs);
+		n->nr_partial = 0;
+
+#ifdef CONFIG_SLUB_DEBUG
+		atomic_long_set(&n->nr_slabs, 0);
+		atomic_long_set(&n->total_objects, 0);
+#endif
+	}
+
+	/* insert @from's slabs to the @to cache */
+	while (!list_empty(&slabs)) {
+		struct kmem_cache_node *n;
+
+		page = list_first_entry(&slabs, struct page, lru);
+		list_del(&page->lru);
+
+		node = page_to_nid(page);
+		n = get_node(to, node);
+		spin_lock_irq(&n->list_lock);
+
+		slab_lock(page);
+		page->frozen = 0;
+		if (!page->inuse) {
+			list_add(&page->lru, &slabs_tofree);
+		} else {
+			page->slab_cache = to;
+			inc_slabs_node(to, node, page->objects);
+			if (!page->freelist)
+				add_full(to, n, page);
+			else
+				add_partial(n, page, DEACTIVATE_TO_TAIL);
+			*size += PAGE_SIZE << compound_order(page);
+		}
+		slab_unlock(page);
+
+		spin_unlock_irq(&n->list_lock);
+	}
+
+	while (!list_empty(&slabs_tofree)) {
+		page = list_first_entry(&slabs_tofree, struct page, lru);
+		list_del(&page->lru);
+		free_slab(from, page);
+	}
+
+	/*
+	 * The @from cache can be safely destroyed now, because no free can see
+	 * the slab_cache == from after this point.
+	 */
+	synchronize_sched();
+}
+
+static void slab_reparent(struct kmem_cache *cachep)
+{
+	struct kmem_cache *root_cache;
+	struct mem_cgroup *memcg;
+	u64 size;
+
+	if (is_root_cache(cachep))
+		return;
+
+	root_cache = cachep->memcg_params->root_cache;
+	memcg = cachep->memcg_params->memcg;
+
+	__slab_reparent(cachep, root_cache, &size);
+	memcg_uncharge_kmem(memcg, size);
+}
+#else
+static void slab_reparent(struct kmem_cache *cachep)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
 /*
  * Object placement in a slab is made very easy because we always start at
  * offset 0. If we tune the size of the object to the alignment then we can
@@ -3275,6 +3474,7 @@ static inline int kmem_cache_close(struct kmem_cache *s)
 
 int __kmem_cache_shutdown(struct kmem_cache *s)
 {
+	slab_reparent(s);
 	return kmem_cache_close(s);
 }
 
-- 
1.7.10.4



* Re: [PATCH RFC 1/3] slub: keep full slabs on list for per memcg caches
  2014-05-13 13:48 ` [PATCH RFC 1/3] slub: keep full slabs on list for per memcg caches Vladimir Davydov
@ 2014-05-14 16:16   ` Christoph Lameter
  2014-05-15  6:34     ` Vladimir Davydov
  0 siblings, 1 reply; 31+ messages in thread
From: Christoph Lameter @ 2014-05-14 16:16 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: hannes, mhocko, akpm, linux-kernel, linux-mm

On Tue, 13 May 2014, Vladimir Davydov wrote:

> Currently full slabs are only kept on per-node lists for debugging, but
> we need this feature to reparent per memcg caches, so let's enable it
> for them too.

That will significantly impact the fastpaths for alloc and free.

Also a pretty significant change to the logic of the fastpaths since they
were not designed to handle the full lists. In debug mode all operations
were only performed by the slow paths and only the slow paths so far
supported tracking full slabs.

> @@ -2587,6 +2610,9 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
>
>  			} else { /* Needs to be taken off a list */
>
> +				if (kmem_cache_has_cpu_partial(s) && !prior)
> +					new.frozen = 1;
> +
>  	                        n = get_node(s, page_to_nid(page));

Make this code conditional?

>  				/*
>  				 * Speculatively acquire the list_lock.
> @@ -2606,6 +2632,12 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
>  		object, new.counters,
>  		"__slab_free"));
>
> +	if (unlikely(n) && new.frozen && !was_frozen) {
> +		remove_full(s, n, page);
> +		spin_unlock_irqrestore(&n->list_lock, flags);
> +		n = NULL;
> +	}
> +
>  	if (likely(!n)) {

Here too.



* Re: [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline
  2014-05-13 13:48 ` [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline Vladimir Davydov
@ 2014-05-14 16:20   ` Christoph Lameter
  2014-05-15  7:16     ` Vladimir Davydov
  0 siblings, 1 reply; 31+ messages in thread
From: Christoph Lameter @ 2014-05-14 16:20 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: hannes, mhocko, akpm, linux-kernel, linux-mm

On Tue, 13 May 2014, Vladimir Davydov wrote:

> Since the "slow" and the "normal" free's can't coexist at the same time,
> we must assure all conventional free's have finished before switching
> all further free's to the "slow" mode and starting reparenting. To
> achieve that, a percpu refcounter is used. It is taken and held during
> each "normal" free. The refcounter is killed on memcg offline, and the
> cache's pages migration is initiated from the refcounter's release
> function. If we fail to take a ref on kfree, it means all "normal"
> free's have been completed and the cache is being reparented right now,
> so we should free the object using the "slow" mode.

Argh adding more code to the free path touching more cachelines in the
process.



* Re: [PATCH RFC 1/3] slub: keep full slabs on list for per memcg caches
  2014-05-14 16:16   ` Christoph Lameter
@ 2014-05-15  6:34     ` Vladimir Davydov
  2014-05-15 15:15       ` Christoph Lameter
  0 siblings, 1 reply; 31+ messages in thread
From: Vladimir Davydov @ 2014-05-15  6:34 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: hannes, mhocko, akpm, linux-kernel, linux-mm

On Wed, May 14, 2014 at 11:16:36AM -0500, Christoph Lameter wrote:
> On Tue, 13 May 2014, Vladimir Davydov wrote:
> 
> > Currently full slabs are only kept on per-node lists for debugging, but
> > we need this feature to reparent per memcg caches, so let's enable it
> > for them too.
> 
> That will significantly impact the fastpaths for alloc and free.
> 
> Also a pretty significant change to the logic of the fastpaths since they
> were not designed to handle the full lists. In debug mode all operations
> were only performed by the slow paths and only the slow paths so far
> supported tracking full slabs.

That's the minimal price we have to pay for slab re-parenting, because
w/o it we won't be able to look up all slabs of a particular per
memcg cache. The question is, can it be tolerated, or should I try some
other way?

> 
> > @@ -2587,6 +2610,9 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
> >
> >  			} else { /* Needs to be taken off a list */
> >
> > +				if (kmem_cache_has_cpu_partial(s) && !prior)
> > +					new.frozen = 1;
> > +
> >  	                        n = get_node(s, page_to_nid(page));
> 
> Make this code conditional?

No problem, this patch is just a draft. Thanks to static keys, it won't
be difficult to eliminate any overhead if there are no kmem-active
memcgs.
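
For example, the helper from patch 1 could be gated behind the existing
memcg_kmem_enabled() static key, roughly like the sketch below (this is
only an illustration, not part of the posted patches):

	static inline bool kmem_cache_tracks_full(struct kmem_cache *s)
	{
		/*
		 * With no kmem-active memcgs the static key inside
		 * memcg_kmem_enabled() is patched out, so the memcg part
		 * of the check costs a single nop in the hot path.
		 */
		return (memcg_kmem_enabled() && !is_root_cache(s)) ||
		       kmem_cache_debug(s);
	}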

Thanks.

> 
> >  				/*
> >  				 * Speculatively acquire the list_lock.
> > @@ -2606,6 +2632,12 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
> >  		object, new.counters,
> >  		"__slab_free"));
> >
> > +	if (unlikely(n) && new.frozen && !was_frozen) {
> > +		remove_full(s, n, page);
> > +		spin_unlock_irqrestore(&n->list_lock, flags);
> > +		n = NULL;
> > +	}
> > +
> >  	if (likely(!n)) {
> 
> Here too.


* Re: [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline
  2014-05-14 16:20   ` Christoph Lameter
@ 2014-05-15  7:16     ` Vladimir Davydov
  2014-05-15 15:16       ` Christoph Lameter
  0 siblings, 1 reply; 31+ messages in thread
From: Vladimir Davydov @ 2014-05-15  7:16 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: hannes, mhocko, akpm, linux-kernel, linux-mm

On Wed, May 14, 2014 at 11:20:51AM -0500, Christoph Lameter wrote:
> On Tue, 13 May 2014, Vladimir Davydov wrote:
> 
> > Since the "slow" and the "normal" free's can't coexist at the same time,
> > we must assure all conventional free's have finished before switching
> > all further free's to the "slow" mode and starting reparenting. To
> > achieve that, a percpu refcounter is used. It is taken and held during
> > each "normal" free. The refcounter is killed on memcg offline, and the
> > cache's pages migration is initiated from the refcounter's release
> > function. If we fail to take a ref on kfree, it means all "normal"
> > free's have been completed and the cache is being reparented right now,
> > so we should free the object using the "slow" mode.
> 
> Argh adding more code to the free path touching more cachelines in the
> process.

Actually, there is not that much active code added, IMO. In fact, it's
only percpu ref get/put for per memcg caches plus a couple of
conditionals. The "slow" mode code is meant to be executed very rarely,
so we can move it to a separate function under unlikely optimization.
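
I.e. something along these lines (slab_free_slow() below is a
hypothetical out-of-line helper containing the PG_locked-based path,
shown only to illustrate the idea):

	if (likely(__percpu_ref_tryget(&s->memcg_params->refcnt, true))) {
		/* common case: cache is alive, take the normal path */
		do_slab_free(s, page, x, addr);
		percpu_ref_put(&s->memcg_params->refcnt);
		return;
	}
	/* rare case: the cache is being reparented right now */
	slab_free_slow(s, page, x);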

I admit that's far from perfect, because kfree is really a hot path,
where every byte of code matters, but unfortunately I don't see how we
can avoid this in case we want slab re-parenting.

Again, I'd like to hear from you whether there is any point in moving in this
direction, or I should give up and concentrate on some other approach,
because you'll never accept it.

Thanks.


* Re: [PATCH RFC 1/3] slub: keep full slabs on list for per memcg caches
  2014-05-15  6:34     ` Vladimir Davydov
@ 2014-05-15 15:15       ` Christoph Lameter
  2014-05-16 13:06         ` Vladimir Davydov
  0 siblings, 1 reply; 31+ messages in thread
From: Christoph Lameter @ 2014-05-15 15:15 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: hannes, mhocko, akpm, linux-kernel, linux-mm

On Thu, 15 May 2014, Vladimir Davydov wrote:

> > That will significantly impact the fastpaths for alloc and free.
> >
> > Also a pretty significant change to the logic of the fastpaths since they
> > were not designed to handle the full lists. In debug mode all operations
> > were only performed by the slow paths and only the slow paths so far
> > supported tracking full slabs.
>
> That's the minimal price we have to pay for slab re-parenting, because
> w/o it we won't be able to look up all slabs of a particular per
> memcg cache. The question is, can it be tolerated, or should I try some
> other way?

AFAICT these modifications taken together will have a significant impact
on performance.

You could avoid the refcounting on free relying on the atomic nature of
cmpxchg operations. If you zap the per cpu slab then the fast path will be
forced to fall back to the slowpaths where you could do what you need to
do.

There is no tracking of full slabs without adding much more logic to the
fastpath. You could force any operation that affects the full list into
the slow path. But that also would have an impact.
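
A rough sketch of the "zap the per cpu slab" idea (flush_all() is the
existing slub helper; the wrapper below and its comments are purely
illustrative):

	static void force_frees_into_slowpath(struct kmem_cache *s)
	{
		/*
		 * Deactivate c->page and drain c->partial on every cpu.
		 * With no per-cpu slab, the cmpxchg fast path in
		 * slab_free() sees page != c->page and falls back to
		 * __slab_free(), which takes the node list_lock and can
		 * be synchronized against reparenting there.
		 */
		flush_all(s);
	}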



* Re: [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline
  2014-05-15  7:16     ` Vladimir Davydov
@ 2014-05-15 15:16       ` Christoph Lameter
  2014-05-16 13:22         ` Vladimir Davydov
  0 siblings, 1 reply; 31+ messages in thread
From: Christoph Lameter @ 2014-05-15 15:16 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: hannes, mhocko, akpm, linux-kernel, linux-mm

On Thu, 15 May 2014, Vladimir Davydov wrote:

> I admit that's far from perfect, because kfree is really a hot path,
> where every byte of code matters, but unfortunately I don't see how we
> can avoid this in case we want slab re-parenting.

Do we even know that all objects in that slab belong to a certain cgroup?
AFAICT the fastpath currently does not allow making that distinction.

> Again, I'd like to hear from you whether there is any point in moving in this
> direction, or I should give up and concentrate on some other approach,
> because you'll never accept it.

I wish you would find some other way to do this.



* Re: [PATCH RFC 1/3] slub: keep full slabs on list for per memcg caches
  2014-05-15 15:15       ` Christoph Lameter
@ 2014-05-16 13:06         ` Vladimir Davydov
  2014-05-16 15:05           ` Christoph Lameter
  0 siblings, 1 reply; 31+ messages in thread
From: Vladimir Davydov @ 2014-05-16 13:06 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: hannes, mhocko, akpm, linux-kernel, linux-mm

On Thu, May 15, 2014 at 10:15:10AM -0500, Christoph Lameter wrote:
> On Thu, 15 May 2014, Vladimir Davydov wrote:
> 
> > > That will significantly impact the fastpaths for alloc and free.
> > >
> > > Also a pretty significant change to the logic of the fastpaths since they
> > > were not designed to handle the full lists. In debug mode all operations
> > > were only performed by the slow paths and only the slow paths so far
> > > supported tracking full slabs.
> >
> > That's the minimal price we have to pay for slab re-parenting, because
> > w/o it we won't be able to look up all slabs of a particular per
> > memcg cache. The question is, can it be tolerated, or should I try some
> > other way?
> 
> AFAICT these modifications taken together will have a significant impact
> on performance.
> 
> You could avoid the refcounting on free relying on the atomic nature of
> cmpxchg operations. If you zap the per cpu slab then the fast path will be
> forced to fall back to the slowpaths where you could do what you need to
> do.

Hmm, looking at __slab_free once again, I tend to agree that we could
rely on cmpxchg to do re-parenting: we could freeze all slabs of the
cache being re-parented, forcing every on-going kfree to do only a
cmpxchg w/o touching any lists or taking any locks, and then unfreeze
all the frozen slabs into the target cache. The ugly "slow mode" I
introduced in this patch set wouldn't be necessary then.

But w/o ref-counting how can we make sure that all kfrees to the cache
we are going to re-parent have been completed so that it can be safely
destroyed? An example:

  CPU0:                                 CPU1:
  -----                                 -----
  kfree(obj):
    page = virt_to_head_page(obj)
    s = page->slab_cache
    slab_free(s, page, obj):
      <<< gets preempted here

                                        reparent_slab_cache:
                                          for each slab page
                                            [...]
                                            page->slab_cache = target_cache;

                                          kmem_cache_destroy(old_cache)

      <<< continues execution
      c = s->cpu_slab /* s points to the previous owner cache,
                         so we use-after-free here */

If kfree were not preemptable, we could make reparent_slab_cache wait
for all cpus to schedule() before destroying the cache to avoid this,
but since it is, we need ref-counting...

Thanks.

> There is no tracking of full slabs without adding much more logic to the
> fastpath. You could force any operation that affects the full list into
> the slow path. But that also would have an impact.


* Re: [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline
  2014-05-15 15:16       ` Christoph Lameter
@ 2014-05-16 13:22         ` Vladimir Davydov
  2014-05-16 15:03           ` Christoph Lameter
  0 siblings, 1 reply; 31+ messages in thread
From: Vladimir Davydov @ 2014-05-16 13:22 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: hannes, mhocko, akpm, linux-kernel, linux-mm

On Thu, May 15, 2014 at 10:16:39AM -0500, Christoph Lameter wrote:
> On Thu, 15 May 2014, Vladimir Davydov wrote:
> 
> > I admit that's far not perfect, because kfree is really a hot path,
> > where every byte of code matters, but unfortunately I don't see how we
> > can avoid this in case we want slab re-parenting.
> 
> Do we even know that all objects in that slab belong to a certain cgroup?
> AFAICT the fastpath currently does not allow making that distinction.

All allocations from a memcg's cache are accounted to the owner memcg,
so that all objects on the same slab belong to the same memcg, a pointer
to which can be obtained from the page->slab_cache->memcg_params. At
least, this is true since commit faebbfe10ec1 ("sl[au]b: charge slabs to
kmemcg explicitly").

> > Again, I'd like to hear from you whether there is any point in moving in this
> > direction, or I should give up and concentrate on some other approach,
> > because you'll never accept it.
> 
> I wish you would find some other way to do this.

The only practical alternative to re-parenting I see right now is
periodic reaping, but Johannes isn't very fond of it, and his opinion is
quite justified, because having caches that will never be allocated from
hanging around indefinitely, only because they have a couple of active
objects to be freed, doesn't look very good.

Thanks.


* Re: [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline
  2014-05-16 13:22         ` Vladimir Davydov
@ 2014-05-16 15:03           ` Christoph Lameter
  2014-05-19 15:24             ` Vladimir Davydov
  0 siblings, 1 reply; 31+ messages in thread
From: Christoph Lameter @ 2014-05-16 15:03 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: hannes, mhocko, akpm, linux-kernel, linux-mm

On Fri, 16 May 2014, Vladimir Davydov wrote:

> > Do we even know that all objects in that slab belong to a certain cgroup?
> > AFAICT the fastpath currently do not allow to make that distinction.
>
> All allocations from a memcg's cache are accounted to the owner memcg,
> so that all objects on the same slab belong to the same memcg, a pointer
> to which can be obtained from the page->slab_cache->memcg_params. At
> least, this is true since commit faebbfe10ec1 ("sl[au]b: charge slabs to
> kmemcg explicitly").

I doubt that. The accounting occurs when a new cpu slab page is allocated.
But the individual allocations in the fastpath are not accounted to a
specific group. Thus allocation in a slab page can belong to various
cgroups.

> > I wish you would find some other way to do this.
>
> The only practical alternative to re-parenting I see right now is
> periodic reaping, but Johannes isn't very fond of it, and his opinion is
> quite justified, because having caches that will never be allocated from
> hanging around indefinitely, only because they have a couple of active
> objects to be freed, doesn't look very good.

If all objects in the cache are in use then the slab page needs to hang
around since the objects' presence is required. You may not know exactly
which cgroups these objects belong to. The only thing that you may know (if
you keep a list of full slabs) is which cgroup was in use when the
slab page was initially allocated.

Isn't it sufficient to add a counter of full slabs to a cgroup? When you
allocate a new slab page, add to the counter. When an object in a slab page
is freed and the slab page goes on a partial list, decrement the counter.

That way you can avoid tracking full slabs.
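
A minimal sketch of that bookkeeping (the nr_full field and the helpers
are made up for illustration; they don't exist in the tree):

	/* called where a slab transitions to "no free objects left" */
	static inline void memcg_slab_became_full(struct kmem_cache *s)
	{
		if (!is_root_cache(s))
			atomic_inc(&s->memcg_params->nr_full);
	}

	/* called where a free puts the slab back on a partial list */
	static inline void memcg_slab_left_full(struct kmem_cache *s)
	{
		if (!is_root_cache(s))
			atomic_dec(&s->memcg_params->nr_full);
	}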



* Re: [PATCH RFC 1/3] slub: keep full slabs on list for per memcg caches
  2014-05-16 13:06         ` Vladimir Davydov
@ 2014-05-16 15:05           ` Christoph Lameter
  0 siblings, 0 replies; 31+ messages in thread
From: Christoph Lameter @ 2014-05-16 15:05 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: hannes, mhocko, akpm, linux-kernel, linux-mm

On Fri, 16 May 2014, Vladimir Davydov wrote:

> But w/o ref-counting how can we make sure that all kfrees to the cache
> we are going to re-parent have been completed so that it can be safely
> destroyed? An example:

Keep the old structure around until the counter of slabs (partial, full)
of that old structure is zero? Move individual slab pages until you reach
zero.

One can lock out frees by setting c->page = NULL, zapping the partial list
and taking the per node list lock.



* Re: [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline
  2014-05-16 15:03           ` Christoph Lameter
@ 2014-05-19 15:24             ` Vladimir Davydov
  2014-05-19 16:03               ` Christoph Lameter
  0 siblings, 1 reply; 31+ messages in thread
From: Vladimir Davydov @ 2014-05-19 15:24 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: hannes, mhocko, akpm, linux-kernel, linux-mm

On Fri, May 16, 2014 at 10:03:22AM -0500, Christoph Lameter wrote:
> On Fri, 16 May 2014, Vladimir Davydov wrote:
> 
> > > Do we even know that all objects in that slab belong to a certain cgroup?
> > > AFAICT the fastpath currently do not allow to make that distinction.
> >
> > All allocations from a memcg's cache are accounted to the owner memcg,
> > so that all objects on the same slab belong to the same memcg, a pointer
> > to which can be obtained from the page->slab_cache->memcg_params. At
> > least, this is true since commit faebbfe10ec1 ("sl[au]b: charge slabs to
> > kmemcg explicitly").
> 
> I doubt that. The accounting occurs when a new cpu slab page is allocated.
> But the individual allocations in the fastpath are not accounted to a
> specific group. Thus allocation in a slab page can belong to various
> cgroups.

On each kmalloc, we pick the cache that belongs to the current memcg,
and allocate objects from that cache (see memcg_kmem_get_cache()). And
all slab pages allocated for a per memcg cache are accounted to the
memcg the cache belongs to (see memcg_charge_slab). So currently, each
kmem cache, i.e. each slab of it, can only have objects of one cgroup,
namely its owner.
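
Condensed from the existing code paths (nothing new here), the two steps
look roughly like this:

	/* slab_alloc_node(): switch to the current memcg's copy of the cache */
	s = memcg_kmem_get_cache(s, gfpflags);

	/*
	 * When that cache needs a new slab page, the page is charged to
	 * the cache's owner memcg (see memcg_charge_slab()):
	 */
	if (memcg_charge_slab(s, flags, order))
		return NULL;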

> > > I wish you would find some other way to do this.
> >
> > The only practical alternative to re-parenting I see right now is
> > periodic reaping, but Johannes isn't very fond of it, and his opinion is
> > quite justified, because having caches that will never be allocated from
> > hanging around indefinitely, only because they have a couple of active
> > objects to be freed, doesn't look very good.
> 
> If all objects in the cache are in use then the slab page needs to hang
> around since the objects' presence is required. You may not know exactly
> which cgroups these objects belong to. The only thing that you may know (if
> you keep a list of full slabs) is which cgroup was in use when the
> slab page was initially allocated.
> 
> Isn't it sufficient to add a counter of full slabs to a cgroup? When you
> allocate a new slab page, add to the counter. When an object in a slab page
> is freed and the slab page goes on a partial list, decrement the counter.
> 
> That way you can avoid tracking full slabs.

The idea was to destroy all memcg's caches on css offline no matter
whether there are active objects on them or not. For that, we have to
migrate *all* slabs to the parent cache, which means, at least, changing
page->slab_cache ptr on them. For the latter, we have to walk through
all slabs (including full ones), which means we have to track them all.

Anyway, you said we shouldn't track full slabs, because it defeats the
cpu-partial/frozen list optimization. Actually, I agree with you on
this point: I did some testing and found that contended kfrees to the
same NUMA node on a 2-node machine are slowed down by up to 25% due to
full slab tracking - obviously not an option.

OK, it seems we have no choice but to keep dead caches left after memcg
offline around for as long as they have active slabs. How can we get rid
of them then? Simply counting slabs in a cache and destroying the cache
when the count goes to 0 isn't enough, because slub may keep some free
slabs by default (if they are frozen, e.g.). Reaping them periodically
doesn't look nice.

What if we modify __slab_free so that it won't keep empty slabs for dead
caches? That way we would only have to count slabs allocated to a cache,
and destroy caches as soon as the counter drops to 0. No
periodic/vmpressure reaping would be necessary. I attached the patch
that does the trick below. The changes it introduces to __slab_free do
not look very intrusive to me. Could you please take a look at it (to
diff slub.c primarily) when you have time, and say if, in your opinion,
the changes to __slab_free are acceptable or not?
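
To make the life cycle of the biased counter in the attached patch
easier to follow, here is a worked trace (assuming a cache that has
three slab pages when its memcg goes offline):

	/*
	 * count = BIAS         cache created for an online memcg
	 * count = BIAS + 3     three slab pages charged (atomic_inc each)
	 * count = 4            memcg offline: atomic_sub(BIAS - 1), i.e.
	 *                      3 slabs + the offline path's own reference;
	 *                      count < BIAS now means "the cache is dead"
	 * count = 2            kmem_cache_shrink() released two empty slabs
	 * count = 1            the offline path drops its reference
	 * count = 0            last slab freed -> unregister work scheduled
	 */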

Thank you.

--
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 1d9abb7d22a0..9ad536a756ea 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -526,7 +526,8 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
  * @memcg: pointer to the memcg this cache belongs to
  * @list: list_head for the list of all caches in this memcg
  * @root_cache: pointer to the global, root cache, this cache was derived from
- * @nr_pages: number of pages that belongs to this cache.
+ * @count: the ref count; the cache is destroyed as soon as it reaches 0
+ * @unregister_work: the cache destruction work
  */
 struct memcg_cache_params {
 	bool is_root_cache;
@@ -539,11 +540,32 @@ struct memcg_cache_params {
 			struct mem_cgroup *memcg;
 			struct list_head list;
 			struct kmem_cache *root_cache;
-			atomic_t nr_pages;
+			atomic_t count;
+			struct work_struct unregister_work;
 		};
 	};
 };
 
+/*
+ * Each active slab increments the cache's memcg_params->count, and the owner
+ * memcg, while it's online, adds MEMCG_PARAMS_COUNT_BIAS to the count so that
+ * the cache is dead (i.e. belongs to a memcg that was turned offline) iff
+ * memcg_params->count < MEMCG_PARAMS_COUNT_BIAS.
+ */
+#define MEMCG_PARAMS_COUNT_BIAS		(1U << 31)
+
+/* Returns true if the cache belongs to a memcg that was turned offline. */
+static inline bool memcg_cache_dead(struct kmem_cache *s)
+{
+	bool ret = false;
+
+#ifdef CONFIG_MEMCG_KMEM
+	if (atomic_read(&s->memcg_params->count) < MEMCG_PARAMS_COUNT_BIAS)
+		ret = true;
+#endif
+	return ret;
+}
+
 int memcg_update_all_caches(int num_memcgs);
 
 struct seq_file;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0fb108e5b905..2b076f17eae7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3090,6 +3090,8 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
 	return 0;
 }
 
+static void memcg_unregister_cache_func(struct work_struct *w);
+
 int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
 			     struct kmem_cache *root_cache)
 {
@@ -3111,6 +3113,9 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
 	if (memcg) {
 		s->memcg_params->memcg = memcg;
 		s->memcg_params->root_cache = root_cache;
+		atomic_set(&s->memcg_params->count, MEMCG_PARAMS_COUNT_BIAS);
+		INIT_WORK(&s->memcg_params->unregister_work,
+			  memcg_unregister_cache_func);
 		css_get(&memcg->css);
 	} else
 		s->memcg_params->is_root_cache = true;
@@ -3192,6 +3197,17 @@ static void memcg_unregister_cache(struct kmem_cache *cachep)
 	kmem_cache_destroy(cachep);
 }
 
+static void memcg_unregister_cache_func(struct work_struct *w)
+{
+	struct memcg_cache_params *params =
+		container_of(w, struct memcg_cache_params, unregister_work);
+	struct kmem_cache *cachep = memcg_params_to_cache(params);
+
+	mutex_lock(&memcg_slab_mutex);
+	memcg_unregister_cache(cachep);
+	mutex_unlock(&memcg_slab_mutex);
+}
+
 /*
  * During the creation a new cache, we need to disable our accounting mechanism
  * altogether. This is true even if we are not creating, but rather just
@@ -3254,8 +3270,21 @@ static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
 	mutex_lock(&memcg_slab_mutex);
 	list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
 		cachep = memcg_params_to_cache(params);
+
+		/* mark the cache as dead while still holding a ref to it */
+		atomic_sub(MEMCG_PARAMS_COUNT_BIAS - 1, &params->count);
+
+		/*
+		 * Make sure that all ongoing free's see the cache as dead and
+		 * won't do unnecessary slab caching (freezing in case of
+		 * slub, see comment to __slab_free).
+		 */
+		synchronize_sched();
+
 		kmem_cache_shrink(cachep);
-		if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
+
+		/* if nobody except us uses the cache, destroy it immediately */
+		if (atomic_dec_and_test(&params->count))
 			memcg_unregister_cache(cachep);
 	}
 	mutex_unlock(&memcg_slab_mutex);
@@ -3329,14 +3358,15 @@ int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
 	res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp,
 				PAGE_SIZE << order);
 	if (!res)
-		atomic_add(1 << order, &cachep->memcg_params->nr_pages);
+		atomic_inc(&cachep->memcg_params->count);
 	return res;
 }
 
 void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
 {
 	memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order);
-	atomic_sub(1 << order, &cachep->memcg_params->nr_pages);
+	if (atomic_dec_and_test(&cachep->memcg_params->count))
+		schedule_work(&cachep->memcg_params->unregister_work);
 }
 
 /*
diff --git a/mm/slub.c b/mm/slub.c
index fdf0fe4da9a9..da7a3edde2cd 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2550,6 +2550,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 	unsigned long counters;
 	struct kmem_cache_node *n = NULL;
 	unsigned long uninitialized_var(flags);
+	int cache_dead = 0;
 
 	stat(s, FREE_SLOWPATH);
 
@@ -2557,6 +2558,32 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 		!(n = free_debug_processing(s, page, x, addr, &flags)))
 		return;
 
+	if (!is_root_cache(s)) {
+		/*
+		 * We must not keep free slabs after the memcg the cache
+		 * belongs to is turned offline, otherwise the cache will be
+		 * hanging around for ever. For the slub implementation that
+		 * means we must not freeze slabs and we must ignore the
+		 * min_partial limit for dead caches.
+		 *
+		 * There is one subtle thing, however. If we get preempted
+		 * after we see the cache alive and before we try to
+		 * cmpxchg-free the object to it, the cache may be killed in
+		 * between and we may occasionally freeze a slab for a dead
+		 * cache.
+		 *
+		 * We avoid this by disabling preemption before the check if
+		 * the cache is dead and re-enabling it after cmpxchg-free,
+		 * where we can freeze a slab. Then, to assure a dead cache
+		 * won't have got frozen slabs it's enough to
+		 * synchronize_sched() after marking the cache dead and before
+		 * shrinking it (see memcg_unregister_all_caches()).
+		 */
+		preempt_disable();
+		if (memcg_cache_dead(s))
+			cache_dead = 1;
+	}
+
 	do {
 		if (unlikely(n)) {
 			spin_unlock_irqrestore(&n->list_lock, flags);
@@ -2570,7 +2597,8 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 		new.inuse--;
 		if ((!new.inuse || !prior) && !was_frozen) {
 
-			if (kmem_cache_has_cpu_partial(s) && !prior) {
+			if (kmem_cache_has_cpu_partial(s) && !prior &&
+			    !cache_dead) {
 
 				/*
 				 * Slab was on no list before and will be
@@ -2601,6 +2629,9 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 		object, new.counters,
 		"__slab_free"));
 
+	if (!is_root_cache(s))
+		preempt_enable();
+
 	if (likely(!n)) {
 
 		/*
@@ -2620,14 +2651,16 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
                 return;
         }
 
-	if (unlikely(!new.inuse && n->nr_partial > s->min_partial))
+	if (unlikely(!new.inuse &&
+		     (n->nr_partial > s->min_partial || cache_dead)))
 		goto slab_empty;
 
 	/*
 	 * Objects left in the slab. If it was not on the partial list before
 	 * then add it.
 	 */
-	if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
+	if ((!kmem_cache_has_cpu_partial(s) || cache_dead) &&
+	    unlikely(!prior)) {
 		if (kmem_cache_debug(s))
 			remove_full(s, n, page);
 		add_partial(n, page, DEACTIVATE_TO_TAIL);


* Re: [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline
  2014-05-19 15:24             ` Vladimir Davydov
@ 2014-05-19 16:03               ` Christoph Lameter
  2014-05-19 18:27                 ` Vladimir Davydov
  0 siblings, 1 reply; 31+ messages in thread
From: Christoph Lameter @ 2014-05-19 16:03 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: hannes, mhocko, akpm, linux-kernel, linux-mm

On Mon, 19 May 2014, Vladimir Davydov wrote:

> > I doubt that. The accounting occurs when a new cpu slab page is allocated.
> > But the individual allocations in the fastpath are not accounted to a
> > specific group. Thus allocation in a slab page can belong to various
> > cgroups.
>
> On each kmalloc, we pick the cache that belongs to the current memcg,
> and allocate objects from that cache (see memcg_kmem_get_cache()). And
> all slab pages allocated for a per memcg cache are accounted to the
> memcg the cache belongs to (see memcg_charge_slab). So currently, each
> kmem cache, i.e. each slab of it, can only have objects of one cgroup,
> namely its owner.

Ok that works for kmalloc. What about dentry/inodes and so on?

> OK, it seems we have no choice but keeping dead caches left after memcg
> offline until they have active slabs. How can we get rid of them then?

Then they are moved to a list and therefore you can move them to yours I
think.

> Simply counting slabs on cache and destroying cache when the count goes
> to 0 isn't enough, because slub may keep some free slabs by default (if
> they are frozen e.g.) Reaping them periodically doesn't look nice.

But those are limited to one slab per cpu (plus possibly the cpu
partial ones, but you can switch that feature off).

> What if we modify __slab_free so that it won't keep empty slabs for dead
> caches? That way we would only have to count slabs allocated to a cache,
> and destroy caches as soon as the counter drops to 0. No

Well, that should already be in there. Set s->min_partial to zero?

> periodic/vmpressure reaping would be necessary. I attached the patch
> that does the trick below. The changes it introduces to __slab_free do
> not look very intrusive to me. Could you please take a look at it (to
> diff slub.c primarily) when you have time, and say if, in your opinion,
> the changes to __slab_free are acceptable or not?

Looking now.

> @@ -2620,14 +2651,16 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
>                  return;
>          }
>
> -	if (unlikely(!new.inuse && n->nr_partial > s->min_partial))
> +	if (unlikely(!new.inuse &&
> +		     (n->nr_partial > s->min_partial || cache_dead)))
>  		goto slab_empty;

Could you set s->min_partial = 0 to avoid this?

>
>  	/*
>  	 * Objects left in the slab. If it was not on the partial list before
>  	 * then add it.
>  	 */
> -	if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
> +	if ((!kmem_cache_has_cpu_partial(s) || cache_dead) &&
> +	    unlikely(!prior)) {
>  		if (kmem_cache_debug(s))
>  			remove_full(s, n, page);
>  		add_partial(n, page, DEACTIVATE_TO_TAIL);

Not sure why we need this and the other stuff.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline
  2014-05-19 16:03               ` Christoph Lameter
@ 2014-05-19 18:27                 ` Vladimir Davydov
  2014-05-21 13:58                   ` Vladimir Davydov
  2014-05-21 14:41                   ` Christoph Lameter
  0 siblings, 2 replies; 31+ messages in thread
From: Vladimir Davydov @ 2014-05-19 18:27 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: hannes, mhocko, akpm, linux-kernel, linux-mm

19.05.2014 20:03, Christoph Lameter:
> On Mon, 19 May 2014, Vladimir Davydov wrote:
>
>>> I doubt that. The accounting occurs when a new cpu slab page is allocated.
>>> But the individual allocations in the fastpath are not accounted to a
>>> specific group. Thus allocation in a slab page can belong to various
>>> cgroups.
>>
>> On each kmalloc, we pick the cache that belongs to the current memcg,
>> and allocate objects from that cache (see memcg_kmem_get_cache()). And
>> all slab pages allocated for a per memcg cache are accounted to the
>> memcg the cache belongs to (see memcg_charge_slab). So currently, each
>> kmem cache, i.e. each slab of it, can only have objects of one cgroup,
>> namely its owner.
>
> Ok that works for kmalloc. What about dentry/inodes and so on?

The same applies. Actually, by kmalloc I meant kmem_cache_alloc or, to
be more exact, slab_alloc_node.
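
For reference, here is a rough sketch of how the per-memcg cache gets
picked on the allocation path (simplified from slab_alloc_node() and
memcg_kmem_get_cache(); the wrapper function below is made up purely
for illustration):

	/*
	 * Illustration only: every accounted allocation is first redirected
	 * to the copy of the cache owned by the current memcg, which is why
	 * all objects in a given slab page belong to a single memcg.
	 */
	static void *alloc_from_current_memcg(struct kmem_cache *s, gfp_t gfp)
	{
		s = memcg_kmem_get_cache(s, gfp);	/* may return s itself */
		return slab_alloc(s, gfp, _RET_IP_);
	}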

>> OK, it seems we have no choice but keeping dead caches left after memcg
>> offline until they have active slabs. How can we get rid of them then?
>
> Then they are moved to a list and therefore you can move them to yours I
> think.
>
>> Simply counting slabs on cache and destroying cache when the count goes
>> to 0 isn't enough, because slub may keep some free slabs by default (if
>> they are frozen e.g.) Reaping them periodically doesn't look nice.
>
> But those are only limited to one slab per cpu ( plus eventual cpu partial
> ones but you can switch that feature off).

AFAIU, slub can keep free slabs in three places:

1) One per cpu slab. This is the easiest thing to handle - we only need
to shrink the cache, because this slab can only be added on allocation,
not on free.

2) Per node partial slab lists. Free slabs can be added there on frees,
but only if min_partial > 0, so setting min_partial = 0 would solve
that, just as you pointed out.

3) Per cpu partial slabs. We can disable this feature for dead caches by
adding an appropriate check to kmem_cache_has_cpu_partial.

So far, everything looks very simple - it seems we don't have to modify
__slab_free at all if we follow the steps above (see the sketch below).
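
On the memcg offline side this would boil down to something like the
following sketch (memcg_mark_cache_dead()/memcg_kill_cache() are
invented names used only to illustrate the steps above):

	/* Sketch only: make a dead cache unable to cache free slabs, then
	 * drop whatever it has already cached. */
	static void memcg_mark_cache_dead(struct kmem_cache *s)
	{
		s->min_partial = 0;	/* (2) no empty slabs on node partial lists */
		/* (3) make kmem_cache_has_cpu_partial(s) return false,
		 * e.g. via a per-cache flag checked there */
	}

	static void memcg_kill_cache(struct kmem_cache *s)
	{
		memcg_mark_cache_dead(s);
		kmem_cache_shrink(s);	/* (1) flush cpu slabs, free empty ones */
	}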

However, there is one thing regarding preemptible kernels. The problem
is that, after forbidding the cache to store free slabs on per-cpu/node
partial lists by setting min_partial=0 and making
kmem_cache_has_cpu_partial return false (i.e. marking the cache as
dead), we have to make sure that all frees that saw the cache as alive
are over; otherwise they can still add a free slab to a per-cpu/node
partial list *after* the cache was marked dead. For instance,

CPU0                            CPU1
----                            ----
memcg offline:                  __slab_free:
                                   // assume the slab we're freeing the obj
                                   // to is full, so we are considering
                                   // freezing it;
                                   // since kmem_cache_has_cpu_partial
                                   // returns true, we'll try to freeze
                                   // it;
                                   new.frozen=1

                                   // but before proceeding to cmpxchg
                                   // we get preempted
   mark cache s dead:
     s->min_partial=0
     make kmem_cache_has_cpu_partial return false

   kmem_cache_shrink(s) // this removes all empty slabs from
                        // per cpu/node partial lists

                                   // when this thread continues to
                                   // __slab_free, the cache is dead
                                   // and no slabs must be added to
                                   // per-cpu partial list, but the
                                   // following cmpxchg may succeed
                                   // in freezing the slab
                                   cmpxchg_double_slab(s, page,
                                       prior, counters,
                                       object, new.counters)

As a result, we'll eventually end up with a free slab on a per-cpu
partial list, so that the cache refcounter will never drop to zero and
the cache leaks.

This is very unlikely, but still possible. To avoid it, we should ensure
that all __slab_free's like the one running on CPU1 are over before
proceeding to kmem_cache_shrink on memcg offline. Currently, it is
impossible to achieve that on fully preemptible kernels w/o modifying
__slab_free, AFAIU. That's what all that trickery in __slab_free is about.
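
In code, this means the memcg_kill_cache() sketch above needs a
synchronize_sched() between marking the cache dead and shrinking it
(still just a sketch, not the literal patch):

	memcg_mark_cache_dead(s);	/* min_partial = 0, no cpu partials */

	/*
	 * __slab_free() checks for a dead cache and does its cmpxchg with
	 * preemption disabled, so after synchronize_sched() no free that
	 * still saw the cache as alive can be in flight.
	 */
	synchronize_sched();

	kmem_cache_shrink(s);		/* now it reliably drops all empty slabs */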

Makes sense or am I missing something?

Thanks.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline
  2014-05-19 18:27                 ` Vladimir Davydov
@ 2014-05-21 13:58                   ` Vladimir Davydov
  2014-05-21 14:45                     ` Christoph Lameter
  2014-05-21 14:41                   ` Christoph Lameter
  1 sibling, 1 reply; 31+ messages in thread
From: Vladimir Davydov @ 2014-05-21 13:58 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: hannes, mhocko, akpm, linux-kernel, linux-mm

On Mon, May 19, 2014 at 10:27:51PM +0400, Vladimir Davydov wrote:
[...]
> However, there is one thing regarding preemptable kernels. The problem
> is after forbidding the cache store free slabs in per-cpu/node partial
> lists by setting min_partial=0 and kmem_cache_has_cpu_partial=false
> (i.e. marking the cache as dead), we have to make sure that all frees
> that saw the cache as alive are over, otherwise they can occasionally
> add a free slab to a per-cpu/node partial list *after* the cache was
> marked dead.

It seems I've found a better way to avoid this race, one that does not
involve messing with the free hot paths. The idea is to explicitly zap
each per-cpu partial list by setting it to point to an invalid pointer.
Since put_cpu_partial(), which is called from __slab_free(), uses an
atomic cmpxchg to add a new partial slab to a per-cpu partial list, it
is enough to check there whether the partial list has been zapped and
bail out if so.

The patch doing the trick is attached. Could you please take a look at
it once time permits?

Thank you.

--
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 1d9abb7d22a0..c1e318247248 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -526,7 +526,8 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
  * @memcg: pointer to the memcg this cache belongs to
  * @list: list_head for the list of all caches in this memcg
  * @root_cache: pointer to the global, root cache, this cache was derived from
- * @nr_pages: number of pages that belongs to this cache.
+ * @count: the ref count; the cache is destroyed as soon as it reaches 0
+ * @unregister_work: the cache destruction work
  */
 struct memcg_cache_params {
 	bool is_root_cache;
@@ -539,11 +540,20 @@ struct memcg_cache_params {
 			struct mem_cgroup *memcg;
 			struct list_head list;
 			struct kmem_cache *root_cache;
-			atomic_t nr_pages;
+			atomic_t count;
+			struct work_struct unregister_work;
 		};
 	};
 };
 
+/*
+ * Each active slab increments the cache's memcg_params->count, and the owner
+ * memcg, while it's online, adds MEMCG_PARAMS_COUNT_BIAS to the count so that
+ * the cache is dead (i.e. belongs to a memcg that was turned offline) iff
+ * memcg_params->count < MEMCG_PARAMS_COUNT_BIAS.
+ */
+#define MEMCG_PARAMS_COUNT_BIAS		(1U << 31)
+
 int memcg_update_all_caches(int num_memcgs);
 
 struct seq_file;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0fb108e5b905..6c922dd96fd5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3090,6 +3090,8 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
 	return 0;
 }
 
+static void memcg_unregister_cache_func(struct work_struct *w);
+
 int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
 			     struct kmem_cache *root_cache)
 {
@@ -3111,6 +3113,9 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
 	if (memcg) {
 		s->memcg_params->memcg = memcg;
 		s->memcg_params->root_cache = root_cache;
+		atomic_set(&s->memcg_params->count, MEMCG_PARAMS_COUNT_BIAS);
+		INIT_WORK(&s->memcg_params->unregister_work,
+			  memcg_unregister_cache_func);
 		css_get(&memcg->css);
 	} else
 		s->memcg_params->is_root_cache = true;
@@ -3192,6 +3197,17 @@ static void memcg_unregister_cache(struct kmem_cache *cachep)
 	kmem_cache_destroy(cachep);
 }
 
+static void memcg_unregister_cache_func(struct work_struct *w)
+{
+	struct memcg_cache_params *params =
+		container_of(w, struct memcg_cache_params, unregister_work);
+	struct kmem_cache *cachep = memcg_params_to_cache(params);
+
+	mutex_lock(&memcg_slab_mutex);
+	memcg_unregister_cache(cachep);
+	mutex_unlock(&memcg_slab_mutex);
+}
+
 /*
  * During the creation a new cache, we need to disable our accounting mechanism
  * altogether. This is true even if we are not creating, but rather just
@@ -3254,8 +3270,14 @@ static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
 	mutex_lock(&memcg_slab_mutex);
 	list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
 		cachep = memcg_params_to_cache(params);
+
+		/* mark the cache as dead while still holding a ref to it */
+		atomic_sub(MEMCG_PARAMS_COUNT_BIAS - 1, &params->count);
+
 		kmem_cache_shrink(cachep);
-		if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
+
+		/* if nobody except us uses the cache, destroy it immediately */
+		if (atomic_dec_and_test(&params->count))
 			memcg_unregister_cache(cachep);
 	}
 	mutex_unlock(&memcg_slab_mutex);
@@ -3329,14 +3351,15 @@ int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
 	res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp,
 				PAGE_SIZE << order);
 	if (!res)
-		atomic_add(1 << order, &cachep->memcg_params->nr_pages);
+		atomic_inc(&cachep->memcg_params->count);
 	return res;
 }
 
 void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
 {
 	memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order);
-	atomic_sub(1 << order, &cachep->memcg_params->nr_pages);
+	if (atomic_dec_and_test(&cachep->memcg_params->count))
+		schedule_work(&cachep->memcg_params->unregister_work);
 }
 
 /*
diff --git a/mm/slab.h b/mm/slab.h
index 961a3fb1f5a2..996968c55222 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -179,6 +179,14 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
 	return s->memcg_params->root_cache;
 }
 
+/* Returns true if the cache belongs to a memcg that was turned offline. */
+static inline bool memcg_cache_dead(struct kmem_cache *s)
+{
+	return !is_root_cache(s) &&
+		atomic_read(&s->memcg_params->count) < MEMCG_PARAMS_COUNT_BIAS;
+}
+
+
 static __always_inline int memcg_charge_slab(struct kmem_cache *s,
 					     gfp_t gfp, int order)
 {
@@ -225,6 +233,11 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
 	return s;
 }
 
+static inline bool memcg_cache_dead(struct kmem_cache *s)
+{
+	return false;
+}
+
 static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
 {
 	return 0;
diff --git a/mm/slub.c b/mm/slub.c
index fdf0fe4da9a9..21ffae4766e5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -123,15 +123,6 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
 #endif
 }
 
-static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
-{
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-	return !kmem_cache_debug(s);
-#else
-	return false;
-#endif
-}
-
 /*
  * Issues still to be resolved:
  *
@@ -186,6 +177,24 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
 /* Internal SLUB flags */
 #define __OBJECT_POISON		0x80000000UL /* Poison object */
 #define __CMPXCHG_DOUBLE	0x40000000UL /* Use cmpxchg_double */
+#define __DISABLE_CPU_PARTIAL	0x20000000UL
+
+static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
+{
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+	return !kmem_cache_debug(s) &&
+		likely(!(s->flags & __DISABLE_CPU_PARTIAL));
+#else
+	return false;
+#endif
+}
+
+#define CPU_PARTIAL_ZAPPED			((struct page *)~0UL)
+
+static inline bool CPU_PARTIAL_VALID(struct page *page)
+{
+	return page && page != CPU_PARTIAL_ZAPPED;
+}
 
 #ifdef CONFIG_SMP
 static struct notifier_block slab_notifier;
@@ -1947,6 +1956,59 @@ redo:
 	}
 }
 
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+/*
+ * Unfreeze partial slab. Called with n->list_lock held.
+ */
+static void __unfreeze_partial(struct kmem_cache *s, struct kmem_cache_node *n,
+			       struct page *page, struct page **discard_page)
+{
+	struct page new, old;
+
+	do {
+
+		old.freelist = page->freelist;
+		old.counters = page->counters;
+		VM_BUG_ON(!old.frozen);
+
+		new.counters = old.counters;
+		new.freelist = old.freelist;
+
+		new.frozen = 0;
+
+	} while (!__cmpxchg_double_slab(s, page,
+			old.freelist, old.counters,
+			new.freelist, new.counters,
+			"unfreezing slab"));
+
+	if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) {
+		page->next = *discard_page;
+		*discard_page = page;
+	} else {
+		add_partial(n, page, DEACTIVATE_TO_TAIL);
+		stat(s, FREE_ADD_PARTIAL);
+	}
+}
+
+static void cancel_cpu_partial(struct kmem_cache *s, struct page *page)
+{
+	struct page *discard_page = NULL;
+	struct kmem_cache_node *n;
+	unsigned long flags;
+
+	n = get_node(s, page_to_nid(page));
+	spin_lock_irqsave(&n->list_lock, flags);
+	__unfreeze_partial(s, n, page, &discard_page);
+	spin_unlock_irqrestore(&n->list_lock, flags);
+
+	if (discard_page) {
+		stat(s, DEACTIVATE_EMPTY);
+		stat(s, FREE_SLAB);
+		discard_slab(s, page);
+	}
+}
+#endif
+
 /*
  * Unfreeze all the cpu partial slabs.
  *
@@ -1961,12 +2023,12 @@ static void unfreeze_partials(struct kmem_cache *s,
 	struct kmem_cache_node *n = NULL, *n2 = NULL;
 	struct page *page, *discard_page = NULL;
 
-	while ((page = c->partial)) {
-		struct page new;
-		struct page old;
-
+	while (CPU_PARTIAL_VALID(page = c->partial)) {
 		c->partial = page->next;
 
+		if (unlikely(s->flags & __DISABLE_CPU_PARTIAL) && !c->partial)
+			c->partial = CPU_PARTIAL_ZAPPED;
+
 		n2 = get_node(s, page_to_nid(page));
 		if (n != n2) {
 			if (n)
@@ -1976,29 +2038,7 @@ static void unfreeze_partials(struct kmem_cache *s,
 			spin_lock(&n->list_lock);
 		}
 
-		do {
-
-			old.freelist = page->freelist;
-			old.counters = page->counters;
-			VM_BUG_ON(!old.frozen);
-
-			new.counters = old.counters;
-			new.freelist = old.freelist;
-
-			new.frozen = 0;
-
-		} while (!__cmpxchg_double_slab(s, page,
-				old.freelist, old.counters,
-				new.freelist, new.counters,
-				"unfreezing slab"));
-
-		if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) {
-			page->next = discard_page;
-			discard_page = page;
-		} else {
-			add_partial(n, page, DEACTIVATE_TO_TAIL);
-			stat(s, FREE_ADD_PARTIAL);
-		}
+		__unfreeze_partial(s, n, page, &discard_page);
 	}
 
 	if (n)
@@ -2036,6 +2076,11 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
 		pobjects = 0;
 		oldpage = this_cpu_read(s->cpu_slab->partial);
 
+		if (unlikely(oldpage == CPU_PARTIAL_ZAPPED)) {
+			cancel_cpu_partial(s, page);
+			return;
+		}
+
 		if (oldpage) {
 			pobjects = oldpage->pobjects;
 			pages = oldpage->pages;
@@ -2106,7 +2151,7 @@ static bool has_cpu_slab(int cpu, void *info)
 	struct kmem_cache *s = info;
 	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
 
-	return c->page || c->partial;
+	return c->page || CPU_PARTIAL_VALID(c->partial);
 }
 
 static void flush_all(struct kmem_cache *s)
@@ -3411,6 +3456,11 @@ int __kmem_cache_shrink(struct kmem_cache *s)
 	if (!slabs_by_inuse)
 		return -ENOMEM;
 
+	if (memcg_cache_dead(s)) {
+		s->flags |= __DISABLE_CPU_PARTIAL;
+		s->min_partial = 0;
+	}
+
 	flush_all(s);
 	for_each_node_state(node, N_NORMAL_MEMORY) {
 		n = get_node(s, node);
@@ -4315,7 +4365,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
 			nodes[node] += x;
 
 			page = ACCESS_ONCE(c->partial);
-			if (page) {
+			if (CPU_PARTIAL_VALID(page)) {
 				node = page_to_nid(page);
 				if (flags & SO_TOTAL)
 					WARN_ON_ONCE(1);
@@ -4471,7 +4521,8 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
 	if (err)
 		return err;
 
-	set_min_partial(s, min);
+	if (!memcg_cache_dead(s))
+		set_min_partial(s, min);
 	return length;
 }
 SLAB_ATTR(min_partial);
@@ -4547,7 +4598,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
 	for_each_online_cpu(cpu) {
 		struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;
 
-		if (page) {
+		if (CPU_PARTIAL_VALID(page)) {
 			pages += page->pages;
 			objects += page->pobjects;
 		}
@@ -4559,7 +4610,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
 	for_each_online_cpu(cpu) {
 		struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial;
 
-		if (page && len < PAGE_SIZE - 20)
+		if (CPU_PARTIAL_VALID(page) && len < PAGE_SIZE - 20)
 			len += sprintf(buf + len, " C%d=%d(%d)", cpu,
 				page->pobjects, page->pages);
 	}

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline
  2014-05-19 18:27                 ` Vladimir Davydov
  2014-05-21 13:58                   ` Vladimir Davydov
@ 2014-05-21 14:41                   ` Christoph Lameter
  2014-05-21 15:04                     ` Vladimir Davydov
  1 sibling, 1 reply; 31+ messages in thread
From: Christoph Lameter @ 2014-05-21 14:41 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: hannes, mhocko, akpm, linux-kernel, linux-mm

On Mon, 19 May 2014, Vladimir Davydov wrote:

> 3) Per cpu partial slabs. We can disable this feature for dead caches by
> adding appropriate check to kmem_cache_has_cpu_partial.

There is already an s->cpu_partial number in kmem_cache. If that is
zero, then no partial cpu slabs should be kept.

> So far, everything looks very simple - it seems we don't have to modify
> __slab_free at all if we follow the instruction above.
>
> However, there is one thing regarding preemptable kernels. The problem
> is after forbidding the cache store free slabs in per-cpu/node partial
> lists by setting min_partial=0 and kmem_cache_has_cpu_partial=false
> (i.e. marking the cache as dead), we have to make sure that all frees
> that saw the cache as alive are over, otherwise they can occasionally
> add a free slab to a per-cpu/node partial list *after* the cache was
> marked dead. For instance,

OK, then let's switch off preempt there? Preemption is not supported by
most distributions and so this will have the least impact.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline
  2014-05-21 13:58                   ` Vladimir Davydov
@ 2014-05-21 14:45                     ` Christoph Lameter
  2014-05-21 15:14                       ` Vladimir Davydov
  0 siblings, 1 reply; 31+ messages in thread
From: Christoph Lameter @ 2014-05-21 14:45 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: hannes, mhocko, akpm, linux-kernel, linux-mm

On Wed, 21 May 2014, Vladimir Davydov wrote:

> Seems I've found a better way to avoid this race, which does not involve
> messing up free hot paths. The idea is to explicitly zap each per-cpu
> partial list by setting it pointing to an invalid ptr. Since
> put_cpu_partial(), which is called from __slab_free(), uses atomic
> cmpxchg for adding a new partial slab to a per cpu partial list, it is
> enough to add a check if partials are zapped there and bail out if so.
>
> The patch doing the trick is attached. Could you please take a look at
> it once time permit?

Well, if you set s->cpu_partial = 0 then the slab should not be added
to the partial lists. OK, it's put there temporarily, but then
immediately moved to the node partial list in put_cpu_partial().


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline
  2014-05-21 14:41                   ` Christoph Lameter
@ 2014-05-21 15:04                     ` Vladimir Davydov
  2014-05-22  0:13                       ` Christoph Lameter
  0 siblings, 1 reply; 31+ messages in thread
From: Vladimir Davydov @ 2014-05-21 15:04 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: hannes, mhocko, akpm, linux-kernel, linux-mm

On Wed, May 21, 2014 at 09:41:03AM -0500, Christoph Lameter wrote:
> On Mon, 19 May 2014, Vladimir Davydov wrote:
> 
> > 3) Per cpu partial slabs. We can disable this feature for dead caches by
> > adding appropriate check to kmem_cache_has_cpu_partial.
> 
> There is already a s->cpu_partial number in kmem_cache. If that is zero
> then no partial cpu slabs should be kept.
> 
> > So far, everything looks very simple - it seems we don't have to modify
> > __slab_free at all if we follow the instruction above.
> >
> > However, there is one thing regarding preemptable kernels. The problem
> > is after forbidding the cache store free slabs in per-cpu/node partial
> > lists by setting min_partial=0 and kmem_cache_has_cpu_partial=false
> > (i.e. marking the cache as dead), we have to make sure that all frees
> > that saw the cache as alive are over, otherwise they can occasionally
> > add a free slab to a per-cpu/node partial list *after* the cache was
> > marked dead. For instance,
> 
> Ok then lets switch off preeempt there? Preemption is not supported by
> most distribution and so will have the least impact.

Do I understand you correctly that the following change looks OK to you?

diff --git a/mm/slub.c b/mm/slub.c
index fdf0fe4da9a9..dc3582c2b5bb 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2676,31 +2676,31 @@ static __always_inline void slab_free(struct kmem_cache *s,
 redo:
 	/*
 	 * Determine the currently cpus per cpu slab.
 	 * The cpu may change afterward. However that does not matter since
 	 * data is retrieved via this pointer. If we are on the same cpu
 	 * during the cmpxchg then the free will succedd.
 	 */
 	preempt_disable();
 	c = this_cpu_ptr(s->cpu_slab);
 
 	tid = c->tid;
-	preempt_enable();
 
 	if (likely(page == c->page)) {
 		set_freepointer(s, object, c->freelist);
 
 		if (unlikely(!this_cpu_cmpxchg_double(
 				s->cpu_slab->freelist, s->cpu_slab->tid,
 				c->freelist, tid,
 				object, next_tid(tid)))) {
 
 			note_cmpxchg_failure("slab_free", s, tid);
 			goto redo;
 		}
 		stat(s, FREE_FASTPATH);
 	} else
 		__slab_free(s, page, x, addr);
 
+	preempt_enable();
 }
 
 void kmem_cache_free(struct kmem_cache *s, void *x)

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline
  2014-05-21 14:45                     ` Christoph Lameter
@ 2014-05-21 15:14                       ` Vladimir Davydov
  2014-05-22  0:15                         ` Christoph Lameter
  0 siblings, 1 reply; 31+ messages in thread
From: Vladimir Davydov @ 2014-05-21 15:14 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: hannes, mhocko, akpm, linux-kernel, linux-mm

On Wed, May 21, 2014 at 09:45:54AM -0500, Christoph Lameter wrote:
> On Wed, 21 May 2014, Vladimir Davydov wrote:
> 
> > Seems I've found a better way to avoid this race, which does not involve
> > messing up free hot paths. The idea is to explicitly zap each per-cpu
> > partial list by setting it pointing to an invalid ptr. Since
> > put_cpu_partial(), which is called from __slab_free(), uses atomic
> > cmpxchg for adding a new partial slab to a per cpu partial list, it is
> > enough to add a check if partials are zapped there and bail out if so.
> >
> > The patch doing the trick is attached. Could you please take a look at
> > it once time permit?
> 
> Well if you set s->cpu_partial = 0 then the slab should not be added to
> the partial lists. Ok its put on there temporarily but then immediately
> moved to the node partial list in put_cpu_partial().

I don't think so. AFAIU, put_cpu_partial() first checks whether the
per-cpu partial list has more than s->cpu_partial objects, draining it
if so, but then it adds the newly frozen slab there anyway.
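
For reference, the relevant structure of put_cpu_partial() is roughly
the following (paraphrased with details trimmed, so treat it as a
sketch rather than the exact code):

	static void put_cpu_partial(struct kmem_cache *s, struct page *page,
				    int drain)
	{
		struct page *oldpage;
		unsigned long flags;

		do {
			oldpage = this_cpu_read(s->cpu_slab->partial);

			if (oldpage && drain &&
			    oldpage->pobjects > s->cpu_partial) {
				/* list considered full: drain it to the node list */
				local_irq_save(flags);
				unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
				local_irq_restore(flags);
				oldpage = NULL;
			}

			/* ...but the newly frozen slab is queued unconditionally */
			page->next = oldpage;

		} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
								!= oldpage);
	}

So with s->cpu_partial == 0 the old contents do get drained, but the
newly frozen slab still ends up on the per-cpu partial list.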

Thanks.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline
  2014-05-21 15:04                     ` Vladimir Davydov
@ 2014-05-22  0:13                       ` Christoph Lameter
  2014-05-22 13:47                         ` Vladimir Davydov
  0 siblings, 1 reply; 31+ messages in thread
From: Christoph Lameter @ 2014-05-22  0:13 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: hannes, mhocko, akpm, linux-kernel, linux-mm

On Wed, 21 May 2014, Vladimir Davydov wrote:

> Do I understand you correctly that the following change looks OK to you?

Almost. Preemption needs to be enabled before calling functions that
invoke the page allocator, etc.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline
  2014-05-21 15:14                       ` Vladimir Davydov
@ 2014-05-22  0:15                         ` Christoph Lameter
  2014-05-22 14:07                           ` Vladimir Davydov
  0 siblings, 1 reply; 31+ messages in thread
From: Christoph Lameter @ 2014-05-22  0:15 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: hannes, mhocko, akpm, linux-kernel, linux-mm

On Wed, 21 May 2014, Vladimir Davydov wrote:

> Don't think so. AFAIU put_cpu_partial() first checks if the per-cpu
> partial list has more than s->cpu_partial objects draining it if so, but
> then it adds the newly frozen slab there anyway.

Hmmm... OK, true. Maybe insert some code there then.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline
  2014-05-22  0:13                       ` Christoph Lameter
@ 2014-05-22 13:47                         ` Vladimir Davydov
  2014-05-22 19:25                           ` Christoph Lameter
  0 siblings, 1 reply; 31+ messages in thread
From: Vladimir Davydov @ 2014-05-22 13:47 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: hannes, mhocko, akpm, linux-kernel, linux-mm

On Wed, May 21, 2014 at 07:13:21PM -0500, Christoph Lameter wrote:
> On Wed, 21 May 2014, Vladimir Davydov wrote:
> 
> > Do I understand you correctly that the following change looks OK to you?
> 
> Almost. Preemption needs to be enabled before functions that invoke the
> page allocator etc etc.

I only need to disable preemption in slab_free, which never blocks
according to its semantics, so everything should be fine as it is.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline
  2014-05-22  0:15                         ` Christoph Lameter
@ 2014-05-22 14:07                           ` Vladimir Davydov
  0 siblings, 0 replies; 31+ messages in thread
From: Vladimir Davydov @ 2014-05-22 14:07 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: hannes, mhocko, akpm, linux-kernel, linux-mm

On Wed, May 21, 2014 at 07:15:26PM -0500, Christoph Lameter wrote:
> On Wed, 21 May 2014, Vladimir Davydov wrote:
> 
> > Don't think so. AFAIU put_cpu_partial() first checks if the per-cpu
> > partial list has more than s->cpu_partial objects draining it if so, but
> > then it adds the newly frozen slab there anyway.
> 
> Hmmm... Ok true. Maybe insert some code there then.

Agreed, it's better to add the check to put_cpu_partial() rather than
to __slab_free(), because the latter is a hot path.
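
Just to illustrate where it would go (a sketch only, the actual patch
may differ; memcg_cache_dead() and cancel_cpu_partial() are the helpers
from the patch earlier in this thread):

	/* In put_cpu_partial(), before queueing the newly frozen slab: */
	if (unlikely(memcg_cache_dead(s))) {
		/*
		 * The cache is dead: don't cache the slab per cpu, unfreeze
		 * it right away so that an empty slab goes back to the page
		 * allocator and the cache's refcount can drop to zero.
		 */
		cancel_cpu_partial(s, page);
		return;
	}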

I'll send the patches soon.

Thank you!

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline
  2014-05-22 13:47                         ` Vladimir Davydov
@ 2014-05-22 19:25                           ` Christoph Lameter
  2014-05-23 15:26                             ` Vladimir Davydov
  0 siblings, 1 reply; 31+ messages in thread
From: Christoph Lameter @ 2014-05-22 19:25 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: hannes, mhocko, akpm, linux-kernel, linux-mm

On Thu, 22 May 2014, Vladimir Davydov wrote:

> On Wed, May 21, 2014 at 07:13:21PM -0500, Christoph Lameter wrote:
> > On Wed, 21 May 2014, Vladimir Davydov wrote:
> >
> > > Do I understand you correctly that the following change looks OK to you?
> >
> > Almost. Preemption needs to be enabled before functions that invoke the
> > page allocator etc etc.
>
> I need to disable preemption only in slab_free, which never blocks
> according to its semantics, so everything should be fine just like that.

slab_free calls __slab_free, which can release slabs to the page
allocator via put_cpu_partial()/unfreeze_partials()/discard_slab(). I'd
rather have preemption enabled there.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline
  2014-05-22 19:25                           ` Christoph Lameter
@ 2014-05-23 15:26                             ` Vladimir Davydov
  2014-05-23 17:45                               ` Christoph Lameter
  0 siblings, 1 reply; 31+ messages in thread
From: Vladimir Davydov @ 2014-05-23 15:26 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: hannes, mhocko, akpm, linux-kernel, linux-mm

On Thu, May 22, 2014 at 02:25:30PM -0500, Christoph Lameter wrote:
> slab_free calls __slab_free which can release slabs via
> put_cpu_partial()/unfreeze_partials()/discard_slab() to the page
> allocator. I'd rather have preemption enabled there.

Hmm, why? IMO, calling __free_pages with preempt disabled won't hurt
latency, because it completes really quickly. BTW, we already call it
for a bunch of pages from __slab_free() -> put_cpu_partial() ->
unfreeze_partials() with irqs disabled, which is an even stronger
restriction. FWIW, SLAB has the whole object free path executed under
local_irq_save/restore, and it doesn't bother enabling irqs for freeing
pages.

IMO, the latency improvement we can achieve by enabling preemption while
calling __free_pages is rather minor, and it isn't worth complicating
the code.

Thanks.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline
  2014-05-23 15:26                             ` Vladimir Davydov
@ 2014-05-23 17:45                               ` Christoph Lameter
  2014-05-23 19:57                                 ` Vladimir Davydov
  0 siblings, 1 reply; 31+ messages in thread
From: Christoph Lameter @ 2014-05-23 17:45 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: hannes, mhocko, akpm, linux-kernel, linux-mm

On Fri, 23 May 2014, Vladimir Davydov wrote:

> On Thu, May 22, 2014 at 02:25:30PM -0500, Christoph Lameter wrote:
> > slab_free calls __slab_free which can release slabs via
> > put_cpu_partial()/unfreeze_partials()/discard_slab() to the page
> > allocator. I'd rather have preemption enabled there.
>
> Hmm, why? IMO, calling __free_pages with preempt disabled won't hurt
> latency, because it proceeds really fast. BTW, we already call it for a
> bunch of pages from __slab_free() -> put_cpu_partial() ->
> unfreeze_partials() with irqs disabled, which is harder. FWIW, SLAB has
> the whole obj free path executed under local_irq_save/restore, and it
> doesn't bother enabling irqs for freeing pages.
>
> IMO, the latency improvement we can achieve by enabling preemption while
> calling __free_pages is rather minor, and it isn't worth complicating
> the code.

If you look at the end of unfreeze_partials(), you'll see that we
release the locks and therefore enable preemption before calling into
the page allocator.

You never know what other new features they are going to be adding to the
page allocator. I'd rather be safe than sorry on this one. We have had
some trouble in the past with some debugging logic triggering.



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline
  2014-05-23 17:45                               ` Christoph Lameter
@ 2014-05-23 19:57                                 ` Vladimir Davydov
  2014-05-27 14:38                                   ` Christoph Lameter
  0 siblings, 1 reply; 31+ messages in thread
From: Vladimir Davydov @ 2014-05-23 19:57 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: hannes, mhocko, akpm, linux-kernel, linux-mm

On Fri, May 23, 2014 at 12:45:48PM -0500, Christoph Lameter wrote:
> On Fri, 23 May 2014, Vladimir Davydov wrote:
> 
> > On Thu, May 22, 2014 at 02:25:30PM -0500, Christoph Lameter wrote:
> > > slab_free calls __slab_free which can release slabs via
> > > put_cpu_partial()/unfreeze_partials()/discard_slab() to the page
> > > allocator. I'd rather have preemption enabled there.
> >
> > Hmm, why? IMO, calling __free_pages with preempt disabled won't hurt
> > latency, because it proceeds really fast. BTW, we already call it for a
> > bunch of pages from __slab_free() -> put_cpu_partial() ->
> > unfreeze_partials() with irqs disabled, which is harder. FWIW, SLAB has
> > the whole obj free path executed under local_irq_save/restore, and it
> > doesn't bother enabling irqs for freeing pages.
> >
> > IMO, the latency improvement we can achieve by enabling preemption while
> > calling __free_pages is rather minor, and it isn't worth complicating
> > the code.
> 
> If you look at the end of unfreeze_partials() you see that we release
> locks and therefore enable preempt before calling into the page allocator.

Yes, we release the node's list_lock before calling discard_slab(), but
we don't enable irqs, which put_cpu_partial() disables just before
calling it, so we end up calling the page allocator with irqs off and
therefore preemption disabled.

> You never know what other new features they are going to be adding to the
> page allocator. I'd rather be safe than sorry on this one. We have had
> some trouble in the past with some debugging logic triggering.

I guess by "some trouble in the past with some debugging logic
triggering" you mean the issue that was fixed by commit 9ada19342b244?

    From: Shaohua Li <shaohua.li@intel.com>

    slub: move discard_slab out of node lock
    
    Lockdep reports there is potential deadlock for slub node list_lock.
    discard_slab() is called with the lock hold in unfreeze_partials(),
    which could trigger a slab allocation, which could hold the lock again.
    
    discard_slab() doesn't need hold the lock actually, if the slab is
    already removed from partial list.

If so, there is nothing to worry about, because I'm not going to call
the page allocator under an internal slab lock. What I propose is
calling __free_pages with preempt disabled, which already happens here
and there and can't result in deadlocks or lockdep warnings.

Thanks.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH RFC 3/3] slub: reparent memcg caches' slabs on memcg offline
  2014-05-23 19:57                                 ` Vladimir Davydov
@ 2014-05-27 14:38                                   ` Christoph Lameter
  0 siblings, 0 replies; 31+ messages in thread
From: Christoph Lameter @ 2014-05-27 14:38 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: hannes, mhocko, akpm, linux-kernel, linux-mm

On Fri, 23 May 2014, Vladimir Davydov wrote:

> > If you look at the end of unfreeze_partials() you see that we release
> > locks and therefore enable preempt before calling into the page allocator.
>
> Yes, we release the node's list_lock before calling discard_slab(), but
> we don't enable irqs, which are disabled in put_cpu_partial(), just
> before calling it, so we call the page allocator with irqs off and
> therefore preemption disabled.

OK, that is something that could be fixed.

^ permalink raw reply	[flat|nested] 31+ messages in thread
