* [PATCH] mm/slab: Improve performance of gathering slabinfo stats
@ 2016-08-02  0:09 ` Aruna Ramakrishna
  0 siblings, 0 replies; 12+ messages in thread
From: Aruna Ramakrishna @ 2016-08-02  0:09 UTC (permalink / raw)
  To: linux-mm, linux-kernel
  Cc: Mike Kravetz, Christoph Lameter, Pekka Enberg, David Rientjes,
	Joonsoo Kim, Andrew Morton

On large systems, when some slab caches grow to millions of objects (and
many gigabytes), running 'cat /proc/slabinfo' can take 1-2 seconds.
During this time, interrupts are disabled while walking the slab lists
(slabs_full, slabs_partial, and slabs_free) for each node, and this
sometimes causes timeouts in other drivers (for instance, InfiniBand).

This patch optimizes 'cat /proc/slabinfo' by maintaining slab counters to
keep track of the number of slabs per node, per cache. These counters are
updated as slabs are created and destroyed. This avoids having to scan the
slab lists to gather slabinfo stats, resulting in a dramatic
performance improvement. We tested this after growing the dentry cache to
70GB, and the time dropped from 2s to 2ms.

Signed-off-by: Aruna Ramakrishna <aruna.ramakrishna@oracle.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
Note: this has been tested only on x86_64.

 mm/slab.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++---------------
 mm/slab.h |  5 ++++
 2 files changed, 67 insertions(+), 19 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
index 09771ed..c205cd8 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -233,6 +233,11 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
 	spin_lock_init(&parent->list_lock);
 	parent->free_objects = 0;
 	parent->free_touched = 0;
+	parent->num_slabs_partial = 0;
+	parent->num_slabs_full = 0;
+	parent->num_slabs_free = 0;
+	parent->cache_grown = 0;
+	parent->using_free_slab = 0;
 }
 
 #define MAKE_LIST(cachep, listp, slab, nodeid)				\
@@ -2331,6 +2336,7 @@ static int drain_freelist(struct kmem_cache *cache,
 		 * to the cache.
 		 */
 		n->free_objects -= cache->num;
+		n->num_slabs_free--;
 		spin_unlock_irq(&n->list_lock);
 		slab_destroy(cache, page);
 		nr_freed++;
@@ -2731,6 +2737,14 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep,
 	kasan_poison_slab(page);
 	cache_init_objs(cachep, page);
 
+	spin_lock(&n->list_lock);
+	/*
+	 * Track the cache growth until the slabs lists and counters
+	 * are adjusted.
+	 */
+	n->cache_grown++;
+	spin_unlock(&n->list_lock);
+
 	if (gfpflags_allow_blocking(local_flags))
 		local_irq_disable();
 
@@ -2758,12 +2772,14 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
 	n = get_node(cachep, page_to_nid(page));
 
 	spin_lock(&n->list_lock);
-	if (!page->active)
+	if (!page->active) {
 		list_add_tail(&page->lru, &(n->slabs_free));
-	else
+		n->num_slabs_free++;
+	} else
 		fixup_slab_list(cachep, n, page, &list);
 	STATS_INC_GROWN(cachep);
 	n->free_objects += cachep->num - page->active;
+	n->cache_grown--;
 	spin_unlock(&n->list_lock);
 
 	fixup_objfreelist_debug(cachep, &list);
@@ -2867,8 +2883,26 @@ static inline void fixup_slab_list(struct kmem_cache *cachep,
 {
 	/* move slabp to correct slabp list: */
 	list_del(&page->lru);
+	/*
+	 * If the cache was not grown, then this slabp was deleted from a
+	 * slabs_partial or a slabs_free list, and we decrement the counter
+	 * for the appropriate list here. The flag using_free_slab is set
+	 * whenever a slab from the slabs_free list is used for allocation.
+	 * If set, we know that a slab was deleted from slabs_free and will be
+	 * moved to slabs_partial (or slabs_full) later below. Otherwise,
+	 * it was deleted from slabs_partial.
+	 * If the cache was grown, slabp points to a new slab not present in
+	 * any list - so we do not decrement any counters.
+	 */
+	if (!n->cache_grown) {
+		if (n->using_free_slab)
+			n->num_slabs_free--;
+		else
+			n->num_slabs_partial--;
+	}
 	if (page->active == cachep->num) {
 		list_add(&page->lru, &n->slabs_full);
+		n->num_slabs_full++;
 		if (OBJFREELIST_SLAB(cachep)) {
 #if DEBUG
 			/* Poisoning will be done without holding the lock */
@@ -2881,8 +2915,10 @@ static inline void fixup_slab_list(struct kmem_cache *cachep,
 #endif
 			page->freelist = NULL;
 		}
-	} else
+	} else {
 		list_add(&page->lru, &n->slabs_partial);
+		n->num_slabs_partial++;
+	}
 }
 
 /* Try to find non-pfmemalloc slab if needed */
@@ -2912,8 +2948,10 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
 		list_add_tail(&page->lru, &n->slabs_partial);
 
 	list_for_each_entry(page, &n->slabs_partial, lru) {
-		if (!PageSlabPfmemalloc(page))
+		if (!PageSlabPfmemalloc(page)) {
+			n->using_free_slab = 0;
 			return page;
+		}
 	}
 
 	list_for_each_entry(page, &n->slabs_free, lru) {
@@ -2934,6 +2972,8 @@ static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
 		n->free_touched = 1;
 		page = list_first_entry_or_null(&n->slabs_free,
 				struct page, lru);
+		if (page)
+			n->using_free_slab = 1;
 	}
 
 	if (sk_memalloc_socks())
@@ -3431,20 +3471,27 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
 		objp = objpp[i];
 
 		page = virt_to_head_page(objp);
+		if (page->active == cachep->num)
+			n->num_slabs_full--;
+		else
+			n->num_slabs_partial--;
+
 		list_del(&page->lru);
 		check_spinlock_acquired_node(cachep, node);
 		slab_put_obj(cachep, page, objp);
 		STATS_DEC_ACTIVE(cachep);
 
 		/* fixup slab chains */
-		if (page->active == 0)
+		if (page->active == 0) {
 			list_add(&page->lru, &n->slabs_free);
-		else {
+			n->num_slabs_free++;
+		} else {
 			/* Unconditionally move a slab to the end of the
 			 * partial list on free - maximum time for the
 			 * other objects to be freed, too.
 			 */
 			list_add_tail(&page->lru, &n->slabs_partial);
+			n->num_slabs_partial++;
 		}
 	}
 
@@ -3453,6 +3500,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
 
 		page = list_last_entry(&n->slabs_free, struct page, lru);
 		list_move(&page->lru, list);
+		n->num_slabs_free--;
 	}
 }
 
@@ -4121,24 +4169,20 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
 		check_irq_on();
 		spin_lock_irq(&n->list_lock);
 
-		list_for_each_entry(page, &n->slabs_full, lru) {
-			if (page->active != cachep->num && !error)
-				error = "slabs_full accounting error";
-			active_objs += cachep->num;
-			active_slabs++;
-		}
+		active_slabs += n->num_slabs_partial +
+				n->num_slabs_full;
+		num_slabs += n->num_slabs_free +
+				n->num_slabs_partial +
+				n->num_slabs_full;
+
+		active_objs += (n->num_slabs_full * cachep->num);
+
 		list_for_each_entry(page, &n->slabs_partial, lru) {
 			if (page->active == cachep->num && !error)
 				error = "slabs_partial accounting error";
 			if (!page->active && !error)
 				error = "slabs_partial accounting error";
 			active_objs += page->active;
-			active_slabs++;
-		}
-		list_for_each_entry(page, &n->slabs_free, lru) {
-			if (page->active && !error)
-				error = "slabs_free accounting error";
-			num_slabs++;
 		}
 		free_objects += n->free_objects;
 		if (n->shared)
@@ -4146,7 +4190,6 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
 
 		spin_unlock_irq(&n->list_lock);
 	}
-	num_slabs += active_slabs;
 	num_objs = num_slabs * cachep->num;
 	if (num_objs - active_objs != free_objects && !error)
 		error = "free_objects accounting error";
diff --git a/mm/slab.h b/mm/slab.h
index 9653f2e..c0e8084 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -432,12 +432,17 @@ struct kmem_cache_node {
 	struct list_head slabs_partial;	/* partial list first, better asm code */
 	struct list_head slabs_full;
 	struct list_head slabs_free;
+	unsigned long num_slabs_partial;
+	unsigned long num_slabs_full;
+	unsigned long num_slabs_free;
 	unsigned long free_objects;
 	unsigned int free_limit;
 	unsigned int colour_next;	/* Per-node cache coloring */
 	struct array_cache *shared;	/* shared per node */
 	struct alien_cache **alien;	/* on other nodes */
 	unsigned long next_reap;	/* updated without locking */
+	int cache_grown;
+	bool using_free_slab;
 	int free_touched;		/* updated without locking */
 #endif
 
-- 
1.8.3.1


* Re: [PATCH] mm/slab: Improve performance of gathering slabinfo stats
  2016-08-02  0:09 ` Aruna Ramakrishna
@ 2016-08-02  0:55   ` Joonsoo Kim
  -1 siblings, 0 replies; 12+ messages in thread
From: Joonsoo Kim @ 2016-08-02  0:55 UTC (permalink / raw)
  To: Aruna Ramakrishna
  Cc: linux-mm, linux-kernel, Mike Kravetz, Christoph Lameter,
	Pekka Enberg, David Rientjes, Andrew Morton

Hello, Aruna.

On Mon, Aug 01, 2016 at 05:09:08PM -0700, Aruna Ramakrishna wrote:
> On large systems, when some slab caches grow to millions of objects (and
> many gigabytes), running 'cat /proc/slabinfo' can take 1-2 seconds.
> During this time, interrupts are disabled while walking the slab lists
> (slabs_full, slabs_partial, and slabs_free) for each node, and this
> sometimes causes timeouts in other drivers (for instance, InfiniBand).

Okay.

> 
> This patch optimizes 'cat /proc/slabinfo' by maintaining slab counters to
> keep track of the number of slabs per node, per cache. These counters are
> updated as slabs are created and destroyed. This avoids having to scan the

Your patch updates these counters not only when slabs are created and
destroyed but also when an object is allocated/freed from a slab. This
would hurt runtime performance.

> slab lists to gather slabinfo stats, resulting in a dramatic
> performance improvement. We tested this after growing the dentry cache to
> 70GB, and the time dropped from 2s to 2ms.

Nice improvement. I can think of an alternative.

I guess the improvement from your change comes from skipping the iteration
of the n->slabs_full list. We can achieve it just by introducing num_slabs,
which can be updated when slabs are created and destroyed.

We can calculate num_slabs_full with the following equation.

num_slabs_full = num_slabs - num_slabs_partial - num_slabs_free

Calculating both num_slabs_partial and num_slabs_free by iterating the
n->slabs_XXX lists would not take too much time.
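
For example, get_slabinfo() could then do something like the following
(an untested sketch; n->num_slabs here is the single new per-node counter,
updated only where slabs are created and destroyed):

	unsigned long nr_partial = 0, nr_free = 0, nr_full;

	list_for_each_entry(page, &n->slabs_partial, lru) {
		active_objs += page->active;
		nr_partial++;
	}
	list_for_each_entry(page, &n->slabs_free, lru)
		nr_free++;

	/* full slabs are derived rather than counted directly */
	nr_full = n->num_slabs - nr_partial - nr_free;
	active_objs += nr_full * cachep->num;
	active_slabs += nr_full + nr_partial;
	num_slabs += n->num_slabs;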

How about this solution?

Thanks.

> 
> Signed-off-by: Aruna Ramakrishna <aruna.ramakrishna@oracle.com>
> Cc: Mike Kravetz <mike.kravetz@oracle.com>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Pekka Enberg <penberg@kernel.org>
> Cc: David Rientjes <rientjes@google.com>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> ---
> Note: this has been tested only on x86_64.
> 
>  mm/slab.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++---------------
>  mm/slab.h |  5 ++++
>  2 files changed, 67 insertions(+), 19 deletions(-)
> 
> diff --git a/mm/slab.c b/mm/slab.c
> index 09771ed..c205cd8 100644
> --- a/mm/slab.c
> +++ b/mm/slab.c
> @@ -233,6 +233,11 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
>  	spin_lock_init(&parent->list_lock);
>  	parent->free_objects = 0;
>  	parent->free_touched = 0;
> +	parent->num_slabs_partial = 0;
> +	parent->num_slabs_full = 0;
> +	parent->num_slabs_free = 0;
> +	parent->cache_grown = 0;
> +	parent->using_free_slab = 0;
>  }
>  
>  #define MAKE_LIST(cachep, listp, slab, nodeid)				\
> @@ -2331,6 +2336,7 @@ static int drain_freelist(struct kmem_cache *cache,
>  		 * to the cache.
>  		 */
>  		n->free_objects -= cache->num;
> +		n->num_slabs_free--;
>  		spin_unlock_irq(&n->list_lock);
>  		slab_destroy(cache, page);
>  		nr_freed++;
> @@ -2731,6 +2737,14 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep,
>  	kasan_poison_slab(page);
>  	cache_init_objs(cachep, page);
>  
> +	spin_lock(&n->list_lock);
> +	/*
> +	 * Track the cache growth until the slabs lists and counters
> +	 * are adjusted.
> +	 */
> +	n->cache_grown++;
> +	spin_unlock(&n->list_lock);
> +
>  	if (gfpflags_allow_blocking(local_flags))
>  		local_irq_disable();
>  
> @@ -2758,12 +2772,14 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
>  	n = get_node(cachep, page_to_nid(page));
>  
>  	spin_lock(&n->list_lock);
> -	if (!page->active)
> +	if (!page->active) {
>  		list_add_tail(&page->lru, &(n->slabs_free));
> -	else
> +		n->num_slabs_free++;
> +	} else
>  		fixup_slab_list(cachep, n, page, &list);
>  	STATS_INC_GROWN(cachep);
>  	n->free_objects += cachep->num - page->active;
> +	n->cache_grown--;
>  	spin_unlock(&n->list_lock);
>  
>  	fixup_objfreelist_debug(cachep, &list);
> @@ -2867,8 +2883,26 @@ static inline void fixup_slab_list(struct kmem_cache *cachep,
>  {
>  	/* move slabp to correct slabp list: */
>  	list_del(&page->lru);
> +	/*
> +	 * If the cache was not grown, then this slabp was deleted from a
> +	 * slabs_partial or a slabs_free list, and we decrement the counter
> +	 * for the appropriate list here. The flag using_free_slab is set
> +	 * whenever a slab from the slabs_free list is used for allocation.
> +	 * If set, we know that a slab was deleted from slabs_free and will be
> +	 * moved to slabs_partial (or slabs_full) later below. Otherwise,
> +	 * it was deleted from slabs_partial.
> +	 * If the cache was grown, slabp points to a new slab not present in
> +	 * any list - so we do not decrement any counters.
> +	 */
> +	if (!n->cache_grown) {
> +		if (n->using_free_slab)
> +			n->num_slabs_free--;
> +		else
> +			n->num_slabs_partial--;
> +	}
>  	if (page->active == cachep->num) {
>  		list_add(&page->lru, &n->slabs_full);
> +		n->num_slabs_full++;
>  		if (OBJFREELIST_SLAB(cachep)) {
>  #if DEBUG
>  			/* Poisoning will be done without holding the lock */
> @@ -2881,8 +2915,10 @@ static inline void fixup_slab_list(struct kmem_cache *cachep,
>  #endif
>  			page->freelist = NULL;
>  		}
> -	} else
> +	} else {
>  		list_add(&page->lru, &n->slabs_partial);
> +		n->num_slabs_partial++;
> +	}
>  }
>  
>  /* Try to find non-pfmemalloc slab if needed */
> @@ -2912,8 +2948,10 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
>  		list_add_tail(&page->lru, &n->slabs_partial);
>  
>  	list_for_each_entry(page, &n->slabs_partial, lru) {
> -		if (!PageSlabPfmemalloc(page))
> +		if (!PageSlabPfmemalloc(page)) {
> +			n->using_free_slab = 0;
>  			return page;
> +		}
>  	}
>  
>  	list_for_each_entry(page, &n->slabs_free, lru) {
> @@ -2934,6 +2972,8 @@ static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
>  		n->free_touched = 1;
>  		page = list_first_entry_or_null(&n->slabs_free,
>  				struct page, lru);
> +		if (page)
> +			n->using_free_slab = 1;
>  	}
>  
>  	if (sk_memalloc_socks())
> @@ -3431,20 +3471,27 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
>  		objp = objpp[i];
>  
>  		page = virt_to_head_page(objp);
> +		if (page->active == cachep->num)
> +			n->num_slabs_full--;
> +		else
> +			n->num_slabs_partial--;
> +
>  		list_del(&page->lru);
>  		check_spinlock_acquired_node(cachep, node);
>  		slab_put_obj(cachep, page, objp);
>  		STATS_DEC_ACTIVE(cachep);
>  
>  		/* fixup slab chains */
> -		if (page->active == 0)
> +		if (page->active == 0) {
>  			list_add(&page->lru, &n->slabs_free);
> -		else {
> +			n->num_slabs_free++;
> +		} else {
>  			/* Unconditionally move a slab to the end of the
>  			 * partial list on free - maximum time for the
>  			 * other objects to be freed, too.
>  			 */
>  			list_add_tail(&page->lru, &n->slabs_partial);
> +			n->num_slabs_partial++;
>  		}
>  	}
>  
> @@ -3453,6 +3500,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
>  
>  		page = list_last_entry(&n->slabs_free, struct page, lru);
>  		list_move(&page->lru, list);
> +		n->num_slabs_free--;
>  	}
>  }
>  
> @@ -4121,24 +4169,20 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
>  		check_irq_on();
>  		spin_lock_irq(&n->list_lock);
>  
> -		list_for_each_entry(page, &n->slabs_full, lru) {
> -			if (page->active != cachep->num && !error)
> -				error = "slabs_full accounting error";
> -			active_objs += cachep->num;
> -			active_slabs++;
> -		}
> +		active_slabs += n->num_slabs_partial +
> +				n->num_slabs_full;
> +		num_slabs += n->num_slabs_free +
> +				n->num_slabs_partial +
> +				n->num_slabs_full;
> +
> +		active_objs += (n->num_slabs_full * cachep->num);
> +
>  		list_for_each_entry(page, &n->slabs_partial, lru) {
>  			if (page->active == cachep->num && !error)
>  				error = "slabs_partial accounting error";
>  			if (!page->active && !error)
>  				error = "slabs_partial accounting error";
>  			active_objs += page->active;
> -			active_slabs++;
> -		}
> -		list_for_each_entry(page, &n->slabs_free, lru) {
> -			if (page->active && !error)
> -				error = "slabs_free accounting error";
> -			num_slabs++;
>  		}
>  		free_objects += n->free_objects;
>  		if (n->shared)
> @@ -4146,7 +4190,6 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
>  
>  		spin_unlock_irq(&n->list_lock);
>  	}
> -	num_slabs += active_slabs;
>  	num_objs = num_slabs * cachep->num;
>  	if (num_objs - active_objs != free_objects && !error)
>  		error = "free_objects accounting error";
> diff --git a/mm/slab.h b/mm/slab.h
> index 9653f2e..c0e8084 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -432,12 +432,17 @@ struct kmem_cache_node {
>  	struct list_head slabs_partial;	/* partial list first, better asm code */
>  	struct list_head slabs_full;
>  	struct list_head slabs_free;
> +	unsigned long num_slabs_partial;
> +	unsigned long num_slabs_full;
> +	unsigned long num_slabs_free;
>  	unsigned long free_objects;
>  	unsigned int free_limit;
>  	unsigned int colour_next;	/* Per-node cache coloring */
>  	struct array_cache *shared;	/* shared per node */
>  	struct alien_cache **alien;	/* on other nodes */
>  	unsigned long next_reap;	/* updated without locking */
> +	int cache_grown;
> +	bool using_free_slab;
>  	int free_touched;		/* updated without locking */
>  #endif
>  
> -- 
> 1.8.3.1
> 


* Re: [PATCH] mm/slab: Improve performance of gathering slabinfo stats
  2016-08-02  0:55   ` Joonsoo Kim
@ 2016-08-02  1:43     ` Aruna Ramakrishna
  -1 siblings, 0 replies; 12+ messages in thread
From: Aruna Ramakrishna @ 2016-08-02  1:43 UTC (permalink / raw)
  To: Joonsoo Kim
  Cc: linux-mm, linux-kernel, Mike Kravetz, Christoph Lameter,
	Pekka Enberg, David Rientjes, Andrew Morton

Hi Joonsoo,

On 08/01/2016 05:55 PM, Joonsoo Kim wrote:
> Your patch updates these counters not only when slabs are created and
> destroyed but also when an object is allocated/freed from a slab. This
> would hurt runtime performance.
>

The counters are not updated for each object allocation/free - only if 
that allocation/free results in that slab moving from one list 
(free/partial/full) to another.

>> > slab lists to gather slabinfo stats, resulting in a dramatic
>> > performance improvement. We tested this after growing the dentry cache to
>> > 70GB, and the time dropped from 2s to 2ms.
> Nice improvement. I can think of an alternative.
>
> I guess the improvement from your change comes from skipping the iteration
> of the n->slabs_full list. We can achieve it just by introducing num_slabs,
> which can be updated when slabs are created and destroyed.
>

Yes, slabs_full is typically the largest list.

> We can calculate num_slabs_full with the following equation.
>
> num_slabs_full = num_slabs - num_slabs_partial - num_slabs_free
>
> Calculating both num_slabs_partial and num_slabs_free by iterating the
> n->slabs_XXX lists would not take too much time.

Yes, this would work too. We cannot avoid traversal of slabs_partial, 
and slabs_free is usually a small list, so this should give us similar 
performance benefits. But having separate counters could also be useful 
for debugging, like the ones defined under CONFIG_DEBUG_SLAB/STATS. 
Won't that help?

Thanks,
Aruna


* Re: [PATCH] mm/slab: Improve performance of gathering slabinfo stats
  2016-08-02  1:43     ` Aruna Ramakrishna
@ 2016-08-02  2:43       ` Joonsoo Kim
  -1 siblings, 0 replies; 12+ messages in thread
From: Joonsoo Kim @ 2016-08-02  2:43 UTC (permalink / raw)
  To: Aruna Ramakrishna
  Cc: linux-mm, linux-kernel, Mike Kravetz, Christoph Lameter,
	Pekka Enberg, David Rientjes, Andrew Morton

On Mon, Aug 01, 2016 at 06:43:00PM -0700, Aruna Ramakrishna wrote:
> Hi Joonsoo,
> 
> On 08/01/2016 05:55 PM, Joonsoo Kim wrote:
> >Your patch updates these counters not only when slabs are created and
> >destroyed but also when an object is allocated/freed from a slab. This
> >would hurt runtime performance.
> >
> 
> The counters are not updated for each object allocation/free - only
> if that allocation/free results in that slab moving from one list
> (free/partial/full) to another.
> 
> >>> slab lists to gather slabinfo stats, resulting in a dramatic
> >>> performance improvement. We tested this after growing the dentry cache to
> >>> 70GB, and the time dropped from 2s to 2ms.
> >Nice improvement. I can think of an alternative.
> >
> >I guess the improvement from your change comes from skipping the iteration
> >of the n->slabs_full list. We can achieve it just by introducing num_slabs,
> >which can be updated when slabs are created and destroyed.
> >
> 
> Yes, slabs_full is typically the largest list.
> 
> >We can calculate num_slabs_full with the following equation.
> >
> >num_slabs_full = num_slabs - num_slabs_partial - num_slabs_free
> >
> >Calculating both num_slabs_partial and num_slabs_free by iterating the
> >n->slabs_XXX lists would not take too much time.
> 
> Yes, this would work too. We cannot avoid traversal of
> slabs_partial, and slabs_free is usually a small list, so this
> should give us similar performance benefits. But having separate
> counters could also be useful for debugging, like the ones defined
> under CONFIG_DEBUG_SLAB/STATS. Won't that help?

We can calculate these counters by traversing the lists, so they would not
be helpful except for performance reasons. The cost of maintaining these
counters isn't free, so it's better not to add more than we need.

Thanks.


* Re: [PATCH] mm/slab: Improve performance of gathering slabinfo stats
  2016-08-02  2:43       ` Joonsoo Kim
@ 2016-08-02 14:59         ` Christoph Lameter
  -1 siblings, 0 replies; 12+ messages in thread
From: Christoph Lameter @ 2016-08-02 14:59 UTC (permalink / raw)
  To: Joonsoo Kim
  Cc: Aruna Ramakrishna, linux-mm, linux-kernel, Mike Kravetz,
	Pekka Enberg, David Rientjes, Andrew Morton

Hmm.... What SLUB does is:

1. Keep a count of the total number of allocated slab pages per node.
	This counter only needs to be updated when a slab page is
	allocated from the page allocator or when it is freed to the
	page allocator. At that point we already hold the per-node lock,
	and page allocator operations are extremely costly anyway, so that
	is ok.

2. Keep a count of the number of partially allocated slab pages per node.
	At that point we have to access the partial list and take a
	per-node lock. Placing the counter in the same cacheline, and
	doing the increment/decrement while the lock is held, avoids
	the overhead.

The number of full pages is then

	total - partial
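
Simplified, that bookkeeping looks roughly like this (a sketch; the field
names follow mm/slub.c, but node_nr_full() is just for illustration and
the real code differs in detail):

	struct kmem_cache_node {
		spinlock_t list_lock;
		unsigned long nr_partial;	/* updated under list_lock */
		struct list_head partial;
		atomic_long_t nr_slabs;		/* all slab pages on this node */
	};

	static unsigned long node_nr_full(struct kmem_cache_node *n)
	{
		return atomic_long_read(&n->nr_slabs) - n->nr_partial;
	}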


If both allocators used the same scheme here, the code that maintains the
counters could be moved into mm/slab_common.c. Plus, the per-node
structures could be mostly harmonized between both allocators. Maybe even
the page allocator operations could become common code.

Aruna: Could you work on a solution like that?


* Re: [PATCH] mm/slab: Improve performance of gathering slabinfo stats
  2016-08-02 14:59         ` Christoph Lameter
@ 2016-08-02 17:39           ` Aruna Ramakrishna
  -1 siblings, 0 replies; 12+ messages in thread
From: Aruna Ramakrishna @ 2016-08-02 17:39 UTC (permalink / raw)
  To: Christoph Lameter, Joonsoo Kim
  Cc: linux-mm, linux-kernel, Mike Kravetz, Pekka Enberg,
	David Rientjes, Andrew Morton


On 08/02/2016 07:59 AM, Christoph Lameter wrote:
> Hmm.... What SLUB does is:
>
> 1. Keep a count of the total number of allocated slab pages per node.
> 	This counter only needs to be updated when a slab page is
> 	allocated from the page allocator or when it is freed to the
> 	page allocator. At that point we already hold the per-node lock,
> 	and page allocator operations are extremely costly anyway, so that
> 	is ok.
>
> 2. Keep a count of the number of partially allocated slab pages per node.
> 	At that point we have to access the partial list and take a
> 	per-node lock. Placing the counter in the same cacheline, and
> 	doing the increment/decrement while the lock is held, avoids
> 	the overhead.
>

As Joonsoo mentioned in his previous comment, the partial list is pretty 
small anyway. And we cannot avoid traversal of the partial list - we 
have to count the number of active objects in each partial slab:
	
	active_objs += page->active;

So keeping a count of partially allocated slabs seems unnecessary to me.

> The number of full pages is then
>
> 	total - partial
>
>
> If both allocators used the same scheme here, the code that maintains the
> counters could be moved into mm/slab_common.c. Plus, the per-node
> structures could be mostly harmonized between both allocators. Maybe even
> the page allocator operations could become common code.
>
> Aruna: Could you work on a solution like that?
>

Yup, I'll replace the 3 counters with a single count of the number of slabs
per node and send out a new patch. I'll try to make the counter
management as similar as possible between SLAB and SLUB.
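
Roughly like this (just a sketch; the name and exact hook points may
change in the actual patch):

	/* in struct kmem_cache_node (mm/slab.h): */
	unsigned long num_slabs;	/* total slabs on this node */

	/* in cache_grow_end(), under n->list_lock: */
	n->num_slabs++;

	/* and wherever a slab is torn down, e.g. in drain_freelist(),
	 * also under n->list_lock: */
	n->num_slabs--;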

Thanks,
Aruna

