* [PATCH 1/2] mm/slub: Introduce two counters for the partial objects
@ 2020-07-02  8:32 Xunlei Pang
  2020-07-02  8:32 ` [PATCH 2/2] mm/slub: Get rid of count_partial() Xunlei Pang
                   ` (3 more replies)
  0 siblings, 4 replies; 17+ messages in thread
From: Xunlei Pang @ 2020-07-02  8:32 UTC (permalink / raw)
  To: Christoph Lameter, Andrew Morton, Wen Yang, Yang Shi, Roman Gushchin
  Cc: linux-mm, linux-kernel

The node list_lock in count_partial() is held for a long time while
iterating large partial page lists, which can cause a thundering herd
effect on the list_lock contention, e.g. it causes business
response-time jitter when "/proc/slabinfo" is accessed in our
production environments.

This patch introduces two counters to maintain the actual number
of partial objects dynamically instead of iterating the partial
page lists with list_lock held.

The new counters in kmem_cache_node are: pfree_objects, ptotal_objects.
The main update operations happen under list_lock in the slow path,
so the performance impact is minimal.

Co-developed-by: Wen Yang <wenyang@linux.alibaba.com>
Signed-off-by: Xunlei Pang <xlpang@linux.alibaba.com>
---
 mm/slab.h |  2 ++
 mm/slub.c | 38 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/mm/slab.h b/mm/slab.h
index 7e94700..5935749 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -616,6 +616,8 @@ struct kmem_cache_node {
 #ifdef CONFIG_SLUB
 	unsigned long nr_partial;
 	struct list_head partial;
+	atomic_long_t pfree_objects; /* partial free objects */
+	atomic_long_t ptotal_objects; /* partial total objects */
 #ifdef CONFIG_SLUB_DEBUG
 	atomic_long_t nr_slabs;
 	atomic_long_t total_objects;
diff --git a/mm/slub.c b/mm/slub.c
index 6589b41..53890f3 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1775,10 +1775,24 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
 /*
  * Management of partially allocated slabs.
  */
+
+static inline void
+__update_partial_free(struct kmem_cache_node *n, long delta)
+{
+	atomic_long_add(delta, &n->pfree_objects);
+}
+
+static inline void
+__update_partial_total(struct kmem_cache_node *n, long delta)
+{
+	atomic_long_add(delta, &n->ptotal_objects);
+}
+
 static inline void
 __add_partial(struct kmem_cache_node *n, struct page *page, int tail)
 {
 	n->nr_partial++;
+	__update_partial_total(n, page->objects);
 	if (tail == DEACTIVATE_TO_TAIL)
 		list_add_tail(&page->slab_list, &n->partial);
 	else
@@ -1798,6 +1812,7 @@ static inline void remove_partial(struct kmem_cache_node *n,
 	lockdep_assert_held(&n->list_lock);
 	list_del(&page->slab_list);
 	n->nr_partial--;
+	__update_partial_total(n, -page->objects);
 }
 
 /*
@@ -1842,6 +1857,7 @@ static inline void *acquire_slab(struct kmem_cache *s,
 		return NULL;
 
 	remove_partial(n, page);
+	__update_partial_free(n, -*objects);
 	WARN_ON(!freelist);
 	return freelist;
 }
@@ -2174,8 +2190,11 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
 				"unfreezing slab"))
 		goto redo;
 
-	if (lock)
+	if (lock) {
+		if (m == M_PARTIAL)
+			__update_partial_free(n, page->objects - page->inuse);
 		spin_unlock(&n->list_lock);
+	}
 
 	if (m == M_PARTIAL)
 		stat(s, tail);
@@ -2241,6 +2260,7 @@ static void unfreeze_partials(struct kmem_cache *s,
 			discard_page = page;
 		} else {
 			add_partial(n, page, DEACTIVATE_TO_TAIL);
+			__update_partial_free(n, page->objects - page->inuse);
 			stat(s, FREE_ADD_PARTIAL);
 		}
 	}
@@ -2915,6 +2935,14 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 		head, new.counters,
 		"__slab_free"));
 
+	if (!was_frozen && prior) {
+		if (n)
+			__update_partial_free(n, cnt);
+		else
+			__update_partial_free(get_node(s, page_to_nid(page)),
+					cnt);
+	}
+
 	if (likely(!n)) {
 
 		/*
@@ -2944,6 +2972,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 	if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
 		remove_full(s, n, page);
 		add_partial(n, page, DEACTIVATE_TO_TAIL);
+		__update_partial_free(n, page->objects - page->inuse);
 		stat(s, FREE_ADD_PARTIAL);
 	}
 	spin_unlock_irqrestore(&n->list_lock, flags);
@@ -2955,6 +2984,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 		 * Slab on the partial list.
 		 */
 		remove_partial(n, page);
+		__update_partial_free(n, page->inuse - page->objects);
 		stat(s, FREE_REMOVE_PARTIAL);
 	} else {
 		/* Slab must be on the full list */
@@ -3364,6 +3394,8 @@ static inline int calculate_order(unsigned int size)
 	n->nr_partial = 0;
 	spin_lock_init(&n->list_lock);
 	INIT_LIST_HEAD(&n->partial);
+	atomic_long_set(&n->pfree_objects, 0);
+	atomic_long_set(&n->ptotal_objects, 0);
 #ifdef CONFIG_SLUB_DEBUG
 	atomic_long_set(&n->nr_slabs, 0);
 	atomic_long_set(&n->total_objects, 0);
@@ -3437,6 +3469,7 @@ static void early_kmem_cache_node_alloc(int node)
 	 * initialized and there is no concurrent access.
 	 */
 	__add_partial(n, page, DEACTIVATE_TO_HEAD);
+	__update_partial_free(n, page->objects - page->inuse);
 }
 
 static void free_kmem_cache_nodes(struct kmem_cache *s)
@@ -3747,6 +3780,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
 	list_for_each_entry_safe(page, h, &n->partial, slab_list) {
 		if (!page->inuse) {
 			remove_partial(n, page);
+			__update_partial_free(n, page->objects - page->inuse);
 			list_add(&page->slab_list, &discard);
 		} else {
 			list_slab_objects(s, page,
@@ -4045,6 +4079,8 @@ int __kmem_cache_shrink(struct kmem_cache *s)
 			if (free == page->objects) {
 				list_move(&page->slab_list, &discard);
 				n->nr_partial--;
+				__update_partial_free(n, -free);
+				__update_partial_total(n, -free);
 			} else if (free <= SHRINK_PROMOTE_MAX)
 				list_move(&page->slab_list, promote + free - 1);
 		}
-- 
1.8.3.1



* [PATCH 2/2] mm/slub: Get rid of count_partial()
  2020-07-02  8:32 [PATCH 1/2] mm/slub: Introduce two counters for the partial objects Xunlei Pang
@ 2020-07-02  8:32 ` Xunlei Pang
  2020-07-02 11:59 ` [PATCH 1/2] mm/slub: Introduce two counters for the partial objects Pekka Enberg
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 17+ messages in thread
From: Xunlei Pang @ 2020-07-02  8:32 UTC (permalink / raw)
  To: Christoph Lameter, Andrew Morton, Wen Yang, Yang Shi, Roman Gushchin
  Cc: linux-mm, linux-kernel

Now that the partial counters are ready, let's use them directly
and get rid of count_partial().

Co-developed-by: Wen Yang <wenyang@linux.alibaba.com>
Signed-off-by: Xunlei Pang <xlpang@linux.alibaba.com>
---
 mm/slub.c | 57 ++++++++++++++++++++++++---------------------------------
 1 file changed, 24 insertions(+), 33 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index 53890f3..f1946ed 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2414,11 +2414,6 @@ static inline int node_match(struct page *page, int node)
 }
 
 #ifdef CONFIG_SLUB_DEBUG
-static int count_free(struct page *page)
-{
-	return page->objects - page->inuse;
-}
-
 static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
 {
 	return atomic_long_read(&n->total_objects);
@@ -2426,19 +2421,27 @@ static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
 #endif /* CONFIG_SLUB_DEBUG */
 
 #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
-static unsigned long count_partial(struct kmem_cache_node *n,
-					int (*get_count)(struct page *))
-{
-	unsigned long flags;
-	unsigned long x = 0;
-	struct page *page;
+enum partial_item { PARTIAL_FREE, PARTIAL_INUSE, PARTIAL_TOTAL };
+
+static unsigned long partial_counter(struct kmem_cache_node *n,
+		enum partial_item item)
+{
+	unsigned long ret = 0;
+
+	if (item == PARTIAL_FREE) {
+		ret = atomic_long_read(&n->pfree_objects);
+	} else if (item == PARTIAL_TOTAL) {
+		ret = atomic_long_read(&n->total_objects);
+	} else if (item == PARTIAL_INUSE) {
+		ret = atomic_long_read(&n->total_objects) -
+		atomic_long_read(&n->pfree_objects);
+		if ((long)ret < 0)
+			ret = 0;
+	}
 
-	spin_lock_irqsave(&n->list_lock, flags);
-	list_for_each_entry(page, &n->partial, slab_list)
-		x += get_count(page);
-	spin_unlock_irqrestore(&n->list_lock, flags);
-	return x;
+	return ret;
 }
+
 #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
 
 static noinline void
@@ -2468,7 +2471,7 @@ static unsigned long count_partial(struct kmem_cache_node *n,
 		unsigned long nr_objs;
 		unsigned long nr_free;
 
-		nr_free  = count_partial(n, count_free);
+		nr_free  = partial_counter(n, PARTIAL_FREE);
 		nr_slabs = node_nr_slabs(n);
 		nr_objs  = node_nr_objs(n);
 
@@ -4445,18 +4448,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
 }
 #endif
 
-#ifdef CONFIG_SYSFS
-static int count_inuse(struct page *page)
-{
-	return page->inuse;
-}
-
-static int count_total(struct page *page)
-{
-	return page->objects;
-}
-#endif
-
 #ifdef CONFIG_SLUB_DEBUG
 static void validate_slab(struct kmem_cache *s, struct page *page)
 {
@@ -4913,7 +4904,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
 				x = atomic_long_read(&n->total_objects);
 			else if (flags & SO_OBJECTS)
 				x = atomic_long_read(&n->total_objects) -
-					count_partial(n, count_free);
+					partial_counter(n, PARTIAL_FREE);
 			else
 				x = atomic_long_read(&n->nr_slabs);
 			total += x;
@@ -4927,9 +4918,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
 
 		for_each_kmem_cache_node(s, node, n) {
 			if (flags & SO_TOTAL)
-				x = count_partial(n, count_total);
+				x = partial_counter(n, PARTIAL_TOTAL);
 			else if (flags & SO_OBJECTS)
-				x = count_partial(n, count_inuse);
+				x = partial_counter(n, PARTIAL_INUSE);
 			else
 				x = n->nr_partial;
 			total += x;
@@ -5962,7 +5953,7 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
 	for_each_kmem_cache_node(s, node, n) {
 		nr_slabs += node_nr_slabs(n);
 		nr_objs += node_nr_objs(n);
-		nr_free += count_partial(n, count_free);
+		nr_free += partial_counter(n, PARTIAL_FREE);
 	}
 
 	sinfo->active_objs = nr_objs - nr_free;
-- 
1.8.3.1



* Re: [PATCH 1/2] mm/slub: Introduce two counters for the partial objects
  2020-07-02  8:32 [PATCH 1/2] mm/slub: Introduce two counters for the partial objects Xunlei Pang
  2020-07-02  8:32 ` [PATCH 2/2] mm/slub: Get rid of count_partial() Xunlei Pang
@ 2020-07-02 11:59 ` Pekka Enberg
  2020-07-03  9:37   ` xunlei
  2020-07-07  6:59 ` Christopher Lameter
  2020-08-06 12:42 ` Vlastimil Babka
  3 siblings, 1 reply; 17+ messages in thread
From: Pekka Enberg @ 2020-07-02 11:59 UTC (permalink / raw)
  To: Xunlei Pang
  Cc: Christoph Lameter, Andrew Morton, Wen Yang, Yang Shi,
	Roman Gushchin, linux-mm, LKML

On Thu, Jul 2, 2020 at 11:32 AM Xunlei Pang <xlpang@linux.alibaba.com> wrote:
> The node list_lock in count_partial() spend long time iterating
> in case of large amount of partial page lists, which can cause
> thunder herd effect to the list_lock contention, e.g. it cause
> business response-time jitters when accessing "/proc/slabinfo"
> in our production environments.

Would you have any numbers to share to quantify this jitter? I have no
objections to this approach, but I think the original design
deliberately made reading "/proc/slabinfo" more expensive to avoid
atomic operations in the allocation/deallocation paths. It would be
good to understand what is the gain of this approach before we switch
to it. Maybe even run some slab-related benchmark (not sure if there's
something better than hackbench these days) to see if the overhead of
this approach shows up.

> This patch introduces two counters to maintain the actual number
> of partial objects dynamically instead of iterating the partial
> page lists with list_lock held.
>
> New counters of kmem_cache_node are: pfree_objects, ptotal_objects.
> The main operations are under list_lock in slow path, its performance
> impact is minimal.
>
> Co-developed-by: Wen Yang <wenyang@linux.alibaba.com>
> Signed-off-by: Xunlei Pang <xlpang@linux.alibaba.com>
> ---
>  mm/slab.h |  2 ++
>  mm/slub.c | 38 +++++++++++++++++++++++++++++++++++++-
>  2 files changed, 39 insertions(+), 1 deletion(-)
>
> diff --git a/mm/slab.h b/mm/slab.h
> index 7e94700..5935749 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -616,6 +616,8 @@ struct kmem_cache_node {
>  #ifdef CONFIG_SLUB
>         unsigned long nr_partial;
>         struct list_head partial;
> +       atomic_long_t pfree_objects; /* partial free objects */
> +       atomic_long_t ptotal_objects; /* partial total objects */

You could rename these to "nr_partial_free_objs" and
"nr_partial_total_objs" for readability.

- Pekka


* Re: [PATCH 1/2] mm/slub: Introduce two counters for the partial objects
  2020-07-02 11:59 ` [PATCH 1/2] mm/slub: Introduce two counters for the partial objects Pekka Enberg
@ 2020-07-03  9:37   ` xunlei
  2020-07-07 15:23     ` Pekka Enberg
  0 siblings, 1 reply; 17+ messages in thread
From: xunlei @ 2020-07-03  9:37 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Christoph Lameter, Andrew Morton, Wen Yang, Yang Shi,
	Roman Gushchin, linux-mm, LKML

On 2020/7/2 7:59 PM, Pekka Enberg wrote:
> On Thu, Jul 2, 2020 at 11:32 AM Xunlei Pang <xlpang@linux.alibaba.com> wrote:
>> The node list_lock in count_partial() spend long time iterating
>> in case of large amount of partial page lists, which can cause
>> thunder herd effect to the list_lock contention, e.g. it cause
>> business response-time jitters when accessing "/proc/slabinfo"
>> in our production environments.
> 
> Would you have any numbers to share to quantify this jitter? I have no

We have HSF RT (High-speed Service Framework Response-Time) monitors,
and the RT figures fluctuated randomly. We then deployed a tool that
detects "irq off" and "preempt off" periods to dump the culprit's
calltrace, and captured the list_lock being held for up to 100ms with
irqs off, triggered by "ss"; this also caused network timeouts.

> objections to this approach, but I think the original design
> deliberately made reading "/proc/slabinfo" more expensive to avoid
> atomic operations in the allocation/deallocation paths. It would be
> good to understand what is the gain of this approach before we switch
> to it. Maybe even run some slab-related benchmark (not sure if there's
> something better than hackbench these days) to see if the overhead of
> this approach shows up.

I thought about that before, but most of the atomic operations are
serialized by the list_lock anyway. Another possible way is to hold
list_lock in __slab_free(); then these two counters could be changed
from atomic to plain long.
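
A rough sketch of what I mean (hypothetical helper and field names,
assuming the counters become plain longs serialized by list_lock; not
a posted patch):

  /* Caller must hold n->list_lock, so no atomics are needed. */
  static inline void
  update_partial_free_locked(struct kmem_cache_node *n, long delta)
  {
      lockdep_assert_held(&n->list_lock);
      n->partial_free_objs += delta; /* hypothetical plain-long field */
  }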

I also have no idea what the standard SLUB benchmark for regression
testing is; any specific suggestion?

> 
>> This patch introduces two counters to maintain the actual number
>> of partial objects dynamically instead of iterating the partial
>> page lists with list_lock held.
>>
>> New counters of kmem_cache_node are: pfree_objects, ptotal_objects.
>> The main operations are under list_lock in slow path, its performance
>> impact is minimal.
>>
>> Co-developed-by: Wen Yang <wenyang@linux.alibaba.com>
>> Signed-off-by: Xunlei Pang <xlpang@linux.alibaba.com>
>> ---
>>  mm/slab.h |  2 ++
>>  mm/slub.c | 38 +++++++++++++++++++++++++++++++++++++-
>>  2 files changed, 39 insertions(+), 1 deletion(-)
>>
>> diff --git a/mm/slab.h b/mm/slab.h
>> index 7e94700..5935749 100644
>> --- a/mm/slab.h
>> +++ b/mm/slab.h
>> @@ -616,6 +616,8 @@ struct kmem_cache_node {
>>  #ifdef CONFIG_SLUB
>>         unsigned long nr_partial;
>>         struct list_head partial;
>> +       atomic_long_t pfree_objects; /* partial free objects */
>> +       atomic_long_t ptotal_objects; /* partial total objects */
> 
> You could rename these to "nr_partial_free_objs" and
> "nr_partial_total_objs" for readability.

Sounds good.

Thanks!

> 
> - Pekka
> 


* Re: [PATCH 1/2] mm/slub: Introduce two counters for the partial objects
  2020-07-02  8:32 [PATCH 1/2] mm/slub: Introduce two counters for the partial objects Xunlei Pang
  2020-07-02  8:32 ` [PATCH 2/2] mm/slub: Get rid of count_partial() Xunlei Pang
  2020-07-02 11:59 ` [PATCH 1/2] mm/slub: Introduce two counters for the partial objects Pekka Enberg
@ 2020-07-07  6:59 ` Christopher Lameter
  2020-07-31  2:52   ` xunlei
  2020-08-06 12:42 ` Vlastimil Babka
  3 siblings, 1 reply; 17+ messages in thread
From: Christopher Lameter @ 2020-07-07  6:59 UTC (permalink / raw)
  To: Xunlei Pang
  Cc: Andrew Morton, Wen Yang, Yang Shi, Roman Gushchin, linux-mm,
	linux-kernel

On Thu, 2 Jul 2020, Xunlei Pang wrote:

> This patch introduces two counters to maintain the actual number
> of partial objects dynamically instead of iterating the partial
> page lists with list_lock held.
>
> New counters of kmem_cache_node are: pfree_objects, ptotal_objects.
> The main operations are under list_lock in slow path, its performance
> impact is minimal.


If at all then these counters need to be under CONFIG_SLUB_DEBUG.

> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -616,6 +616,8 @@ struct kmem_cache_node {
>  #ifdef CONFIG_SLUB
>  	unsigned long nr_partial;
>  	struct list_head partial;
> +	atomic_long_t pfree_objects; /* partial free objects */
> +	atomic_long_t ptotal_objects; /* partial total objects */

Please put these in the CONFIG_SLUB_DEBUG section. Without
CONFIG_SLUB_DEBUG we need to build with a minimal memory footprint.

>  #ifdef CONFIG_SLUB_DEBUG
>  	atomic_long_t nr_slabs;
>  	atomic_long_t total_objects;
> diff --git a/mm/slub.c b/mm/slub.c



Also, this looks to be quite heavy on the cache and on execution time.
Note that the list_lock could be taken frequently in the
performance-sensitive case of freeing an object that is not on the
partial lists.



* Re: [PATCH 1/2] mm/slub: Introduce two counters for the partial objects
  2020-07-03  9:37   ` xunlei
@ 2020-07-07 15:23     ` Pekka Enberg
  2020-07-09 14:32       ` Christopher Lameter
  2020-07-31  2:57       ` xunlei
  0 siblings, 2 replies; 17+ messages in thread
From: Pekka Enberg @ 2020-07-07 15:23 UTC (permalink / raw)
  To: Xunlei Pang
  Cc: Christoph Lameter, Andrew Morton, Wen Yang, Yang Shi,
	Roman Gushchin, linux-mm, LKML

Hi!

(Sorry for the delay, I missed your response.)

On Fri, Jul 3, 2020 at 12:38 PM xunlei <xlpang@linux.alibaba.com> wrote:
>
> On 2020/7/2 PM 7:59, Pekka Enberg wrote:
> > On Thu, Jul 2, 2020 at 11:32 AM Xunlei Pang <xlpang@linux.alibaba.com> wrote:
> >> The node list_lock in count_partial() spend long time iterating
> >> in case of large amount of partial page lists, which can cause
> >> thunder herd effect to the list_lock contention, e.g. it cause
> >> business response-time jitters when accessing "/proc/slabinfo"
> >> in our production environments.
> >
> > Would you have any numbers to share to quantify this jitter? I have no
>
> We have HSF RT(High-speed Service Framework Response-Time) monitors, the
> RT figures fluctuated randomly, then we deployed a tool detecting "irq
> off" and "preempt off" to dump the culprit's calltrace, capturing the
> list_lock cost up to 100ms with irq off issued by "ss", this also caused
> network timeouts.

Thanks for the follow up. This sounds like a good enough motivation
for this patch, but please include it in the changelog.

> > objections to this approach, but I think the original design
> > deliberately made reading "/proc/slabinfo" more expensive to avoid
> > atomic operations in the allocation/deallocation paths. It would be
> > good to understand what is the gain of this approach before we switch
> > to it. Maybe even run some slab-related benchmark (not sure if there's
> > something better than hackbench these days) to see if the overhead of
> > this approach shows up.
>
> I thought that before, but most atomic operations are serialized by the
> list_lock. Another possible way is to hold list_lock in __slab_free(),
> then these two counters can be changed from atomic to long.
>
> I also have no idea what's the standard SLUB benchmark for the
> regression test, any specific suggestion?

I don't know what people use these days. When I did benchmarking in
the past, hackbench and netperf were known to be slab-allocation
intensive macro-benchmarks. Christoph also had some SLUB
micro-benchmarks, but I don't think we ever merged them into the tree.

- Pekka


* Re: [PATCH 1/2] mm/slub: Introduce two counters for the partial objects
  2020-07-07 15:23     ` Pekka Enberg
@ 2020-07-09 14:32       ` Christopher Lameter
  2020-07-31  2:57       ` xunlei
  1 sibling, 0 replies; 17+ messages in thread
From: Christopher Lameter @ 2020-07-09 14:32 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Xunlei Pang, Andrew Morton, Wen Yang, Yang Shi, Roman Gushchin,
	linux-mm, LKML

On Tue, 7 Jul 2020, Pekka Enberg wrote:

> On Fri, Jul 3, 2020 at 12:38 PM xunlei <xlpang@linux.alibaba.com> wrote:
> >
> > On 2020/7/2 PM 7:59, Pekka Enberg wrote:
> > > On Thu, Jul 2, 2020 at 11:32 AM Xunlei Pang <xlpang@linux.alibaba.com> wrote:
> > >> The node list_lock in count_partial() spend long time iterating
> > >> in case of large amount of partial page lists, which can cause
> > >> thunder herd effect to the list_lock contention, e.g. it cause
> > >> business response-time jitters when accessing "/proc/slabinfo"
> > >> in our production environments.
> > >
> > > Would you have any numbers to share to quantify this jitter? I have no
> >
> > We have HSF RT(High-speed Service Framework Response-Time) monitors, the
> > RT figures fluctuated randomly, then we deployed a tool detecting "irq
> > off" and "preempt off" to dump the culprit's calltrace, capturing the
> > list_lock cost up to 100ms with irq off issued by "ss", this also caused
> > network timeouts.
>
> Thanks for the follow up. This sounds like a good enough motivation
> for this patch, but please include it in the changelog.


Well, this is access via sysfs causing a holdoff. Another way of
accessing the same information, without adding atomics and counters,
would be best.

> > I also have no idea what's the standard SLUB benchmark for the
> > regression test, any specific suggestion?
>
> I don't know what people use these days. When I did benchmarking in
> the past, hackbench and netperf were known to be slab-allocation
> intensive macro-benchmarks. Christoph also had some SLUB
> micro-benchmarks, but I don't think we ever merged them into the tree.

They are still where they have been for the last decade or so, in my
git tree on kernel.org. They were also reworked a couple of times and
posted to linux-mm. There are historical posts going back over the
years where individuals have modified them and used them to create
multiple other tests.



* Re: [PATCH 1/2] mm/slub: Introduce two counters for the partial objects
  2020-07-07  6:59 ` Christopher Lameter
@ 2020-07-31  2:52   ` xunlei
  0 siblings, 0 replies; 17+ messages in thread
From: xunlei @ 2020-07-31  2:52 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Andrew Morton, Wen Yang, Yang Shi, Roman Gushchin, linux-mm,
	linux-kernel

On 2020/7/7 2:59 PM, Christopher Lameter wrote:
> On Thu, 2 Jul 2020, Xunlei Pang wrote:
> 
>> This patch introduces two counters to maintain the actual number
>> of partial objects dynamically instead of iterating the partial
>> page lists with list_lock held.
>>
>> New counters of kmem_cache_node are: pfree_objects, ptotal_objects.
>> The main operations are under list_lock in slow path, its performance
>> impact is minimal.
> 
> 
> If at all then these counters need to be under CONFIG_SLUB_DEBUG.
> 
>> --- a/mm/slab.h
>> +++ b/mm/slab.h
>> @@ -616,6 +616,8 @@ struct kmem_cache_node {
>>  #ifdef CONFIG_SLUB
>>  	unsigned long nr_partial;
>>  	struct list_head partial;
>> +	atomic_long_t pfree_objects; /* partial free objects */
>> +	atomic_long_t ptotal_objects; /* partial total objects */
> 
> Please in the CONFIG_SLUB_DEBUG. Without CONFIG_SLUB_DEBUG we need to
> build with minimal memory footprint.

Thanks for the comments.

show_slab_objects() also calls count_partial() under CONFIG_SYSFS:
    if (flags & SO_PARTIAL) {
        struct kmem_cache_node *n;

        for_each_kmem_cache_node(s, node, n) {
            if (flags & SO_TOTAL)
                x = count_partial(n, count_total);
            else if (flags & SO_OBJECTS)
                x = count_partial(n, count_inuse);
            else
                x = n->nr_partial;
            total += x;
            nodes[node] += x;
        }
    }

I'm not sure if it's due to some historical reason, but it works
without CONFIG_SLUB_DEBUG.

> 
>>  #ifdef CONFIG_SLUB_DEBUG
>>  	atomic_long_t nr_slabs;
>>  	atomic_long_t total_objects;
>> diff --git a/mm/slub.c b/mm/slub.c
> 
> 
> 
> Also this looks to be quite heavy on the cache and on execution time. Note
> that the list_lock could be taken frequently in the performance sensitive
> case of freeing an object that is not in the partial lists.
> 

Yes, the concurrent __slab_free() has potential lock/atomic contention;
how about using a percpu variable for the partial free count, like
below?

  static inline void
  __update_partial_free(struct kmem_cache_node *n, long delta)
  {
      atomic_long_add(delta, this_cpu_ptr(n->partial_free_objs));
  }
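
The read side would then sum the per-CPU values, roughly like this
(just a sketch, assuming n->partial_free_objs becomes an
"atomic_long_t __percpu *" as above):

  static unsigned long partial_free_approx(struct kmem_cache_node *n)
  {
      unsigned long total = 0;
      int cpu;

      /* The sum may be slightly stale, which is fine for statistics. */
      for_each_possible_cpu(cpu)
          total += atomic_long_read(per_cpu_ptr(n->partial_free_objs, cpu));

      return total;
  }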

-Xunlei


* Re: [PATCH 1/2] mm/slub: Introduce two counters for the partial objects
  2020-07-07 15:23     ` Pekka Enberg
  2020-07-09 14:32       ` Christopher Lameter
@ 2020-07-31  2:57       ` xunlei
  1 sibling, 0 replies; 17+ messages in thread
From: xunlei @ 2020-07-31  2:57 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Christoph Lameter, Andrew Morton, Wen Yang, Yang Shi,
	Roman Gushchin, linux-mm, LKML

On 2020/7/7 11:23 PM, Pekka Enberg wrote:
> Hi!
> 
> (Sorry for the delay, I missed your response.)
> 
> On Fri, Jul 3, 2020 at 12:38 PM xunlei <xlpang@linux.alibaba.com> wrote:
>>
>> On 2020/7/2 PM 7:59, Pekka Enberg wrote:
>>> On Thu, Jul 2, 2020 at 11:32 AM Xunlei Pang <xlpang@linux.alibaba.com> wrote:
>>>> The node list_lock in count_partial() spend long time iterating
>>>> in case of large amount of partial page lists, which can cause
>>>> thunder herd effect to the list_lock contention, e.g. it cause
>>>> business response-time jitters when accessing "/proc/slabinfo"
>>>> in our production environments.
>>>
>>> Would you have any numbers to share to quantify this jitter? I have no
>>
>> We have HSF RT(High-speed Service Framework Response-Time) monitors, the
>> RT figures fluctuated randomly, then we deployed a tool detecting "irq
>> off" and "preempt off" to dump the culprit's calltrace, capturing the
>> list_lock cost up to 100ms with irq off issued by "ss", this also caused
>> network timeouts.
> 
> Thanks for the follow up. This sounds like a good enough motivation
> for this patch, but please include it in the changelog.
> 
>>> objections to this approach, but I think the original design
>>> deliberately made reading "/proc/slabinfo" more expensive to avoid
>>> atomic operations in the allocation/deallocation paths. It would be
>>> good to understand what is the gain of this approach before we switch
>>> to it. Maybe even run some slab-related benchmark (not sure if there's
>>> something better than hackbench these days) to see if the overhead of
>>> this approach shows up.
>>
>> I thought that before, but most atomic operations are serialized by the
>> list_lock. Another possible way is to hold list_lock in __slab_free(),
>> then these two counters can be changed from atomic to long.
>>
>> I also have no idea what's the standard SLUB benchmark for the
>> regression test, any specific suggestion?
> 
> I don't know what people use these days. When I did benchmarking in
> the past, hackbench and netperf were known to be slab-allocation
> intensive macro-benchmarks. Christoph also had some SLUB
> micro-benchmarks, but I don't think we ever merged them into the tree.

I tested hackbench on a 24-CPU machine; here are the results:

"hackbench 20 thread 1000"

== original (without any patch)
Time: 53.793
Time: 54.305
Time: 54.073

== with my patch1~2
Time: 54.036
Time: 53.840
Time: 54.066
Time: 53.449

== with my patch1~2, plus using a percpu partial free objects counter
Time: 53.303
Time: 52.994
Time: 53.218
Time: 53.268
Time: 53.739
Time: 53.072

The results show no performance regression; it's strange that the
figures even get a little better when using the percpu counter.

Thanks,
Xunlei


* Re: [PATCH 1/2] mm/slub: Introduce two counters for the partial objects
  2020-07-02  8:32 [PATCH 1/2] mm/slub: Introduce two counters for the partial objects Xunlei Pang
                   ` (2 preceding siblings ...)
  2020-07-07  6:59 ` Christopher Lameter
@ 2020-08-06 12:42 ` Vlastimil Babka
  2020-08-07  7:25   ` Pekka Enberg
  3 siblings, 1 reply; 17+ messages in thread
From: Vlastimil Babka @ 2020-08-06 12:42 UTC (permalink / raw)
  To: Xunlei Pang, Christoph Lameter, Andrew Morton, Wen Yang,
	Yang Shi, Roman Gushchin
  Cc: linux-mm, linux-kernel, Konstantin Khlebnikov, David Rientjes

On 7/2/20 10:32 AM, Xunlei Pang wrote:
> The node list_lock in count_partial() spend long time iterating
> in case of large amount of partial page lists, which can cause
> thunder herd effect to the list_lock contention, e.g. it cause
> business response-time jitters when accessing "/proc/slabinfo"
> in our production environments.
> 
> This patch introduces two counters to maintain the actual number
> of partial objects dynamically instead of iterating the partial
> page lists with list_lock held.
> 
> New counters of kmem_cache_node are: pfree_objects, ptotal_objects.
> The main operations are under list_lock in slow path, its performance
> impact is minimal.
> 
> Co-developed-by: Wen Yang <wenyang@linux.alibaba.com>
> Signed-off-by: Xunlei Pang <xlpang@linux.alibaba.com>

This or similar things seem to be reported every few months now; the
last time was here [1] AFAIK. The solution there was to just stop
counting at some point.

Shall we perhaps add these counters under CONFIG_SLUB_DEBUG then and be done
with it? If anyone needs the extreme performance and builds without
CONFIG_SLUB_DEBUG, I'd assume they also don't have userspace programs reading
/proc/slabinfo periodically anyway?

[1]
https://lore.kernel.org/linux-mm/158860845968.33385.4165926113074799048.stgit@buzz/
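
Roughly, the idea there was to simply cap the walk, something like the
sketch below (from memory, with an arbitrary limit; not the exact
patch in [1]):

  #define MAX_PARTIAL_TO_COUNT 10000 /* arbitrary cap, for illustration */

  static unsigned long count_partial_limited(struct kmem_cache_node *n,
                                             int (*get_count)(struct page *))
  {
          unsigned long flags, x = 0, counted = 0;
          struct page *page;

          spin_lock_irqsave(&n->list_lock, flags);
          list_for_each_entry(page, &n->partial, slab_list) {
                  x += get_count(page);
                  /* Bound the time spent with IRQs off. */
                  if (++counted >= MAX_PARTIAL_TO_COUNT)
                          break;
          }
          spin_unlock_irqrestore(&n->list_lock, flags);
          return x;
  }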

> ---
>  mm/slab.h |  2 ++
>  mm/slub.c | 38 +++++++++++++++++++++++++++++++++++++-
>  2 files changed, 39 insertions(+), 1 deletion(-)
> 
> diff --git a/mm/slab.h b/mm/slab.h
> index 7e94700..5935749 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -616,6 +616,8 @@ struct kmem_cache_node {
>  #ifdef CONFIG_SLUB
>  	unsigned long nr_partial;
>  	struct list_head partial;
> +	atomic_long_t pfree_objects; /* partial free objects */
> +	atomic_long_t ptotal_objects; /* partial total objects */
>  #ifdef CONFIG_SLUB_DEBUG
>  	atomic_long_t nr_slabs;
>  	atomic_long_t total_objects;
> diff --git a/mm/slub.c b/mm/slub.c
> index 6589b41..53890f3 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -1775,10 +1775,24 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
>  /*
>   * Management of partially allocated slabs.
>   */
> +
> +static inline void
> +__update_partial_free(struct kmem_cache_node *n, long delta)
> +{
> +	atomic_long_add(delta, &n->pfree_objects);
> +}
> +
> +static inline void
> +__update_partial_total(struct kmem_cache_node *n, long delta)
> +{
> +	atomic_long_add(delta, &n->ptotal_objects);
> +}
> +
>  static inline void
>  __add_partial(struct kmem_cache_node *n, struct page *page, int tail)
>  {
>  	n->nr_partial++;
> +	__update_partial_total(n, page->objects);
>  	if (tail == DEACTIVATE_TO_TAIL)
>  		list_add_tail(&page->slab_list, &n->partial);
>  	else
> @@ -1798,6 +1812,7 @@ static inline void remove_partial(struct kmem_cache_node *n,
>  	lockdep_assert_held(&n->list_lock);
>  	list_del(&page->slab_list);
>  	n->nr_partial--;
> +	__update_partial_total(n, -page->objects);
>  }
>  
>  /*
> @@ -1842,6 +1857,7 @@ static inline void *acquire_slab(struct kmem_cache *s,
>  		return NULL;
>  
>  	remove_partial(n, page);
> +	__update_partial_free(n, -*objects);
>  	WARN_ON(!freelist);
>  	return freelist;
>  }
> @@ -2174,8 +2190,11 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
>  				"unfreezing slab"))
>  		goto redo;
>  
> -	if (lock)
> +	if (lock) {
> +		if (m == M_PARTIAL)
> +			__update_partial_free(n, page->objects - page->inuse);
>  		spin_unlock(&n->list_lock);
> +	}
>  
>  	if (m == M_PARTIAL)
>  		stat(s, tail);
> @@ -2241,6 +2260,7 @@ static void unfreeze_partials(struct kmem_cache *s,
>  			discard_page = page;
>  		} else {
>  			add_partial(n, page, DEACTIVATE_TO_TAIL);
> +			__update_partial_free(n, page->objects - page->inuse);
>  			stat(s, FREE_ADD_PARTIAL);
>  		}
>  	}
> @@ -2915,6 +2935,14 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
>  		head, new.counters,
>  		"__slab_free"));
>  
> +	if (!was_frozen && prior) {
> +		if (n)
> +			__update_partial_free(n, cnt);
> +		else
> +			__update_partial_free(get_node(s, page_to_nid(page)),
> +					cnt);
> +	}
> +
>  	if (likely(!n)) {
>  
>  		/*
> @@ -2944,6 +2972,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
>  	if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
>  		remove_full(s, n, page);
>  		add_partial(n, page, DEACTIVATE_TO_TAIL);
> +		__update_partial_free(n, page->objects - page->inuse);
>  		stat(s, FREE_ADD_PARTIAL);
>  	}
>  	spin_unlock_irqrestore(&n->list_lock, flags);
> @@ -2955,6 +2984,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
>  		 * Slab on the partial list.
>  		 */
>  		remove_partial(n, page);
> +		__update_partial_free(n, page->inuse - page->objects);
>  		stat(s, FREE_REMOVE_PARTIAL);
>  	} else {
>  		/* Slab must be on the full list */
> @@ -3364,6 +3394,8 @@ static inline int calculate_order(unsigned int size)
>  	n->nr_partial = 0;
>  	spin_lock_init(&n->list_lock);
>  	INIT_LIST_HEAD(&n->partial);
> +	atomic_long_set(&n->pfree_objects, 0);
> +	atomic_long_set(&n->ptotal_objects, 0);
>  #ifdef CONFIG_SLUB_DEBUG
>  	atomic_long_set(&n->nr_slabs, 0);
>  	atomic_long_set(&n->total_objects, 0);
> @@ -3437,6 +3469,7 @@ static void early_kmem_cache_node_alloc(int node)
>  	 * initialized and there is no concurrent access.
>  	 */
>  	__add_partial(n, page, DEACTIVATE_TO_HEAD);
> +	__update_partial_free(n, page->objects - page->inuse);
>  }
>  
>  static void free_kmem_cache_nodes(struct kmem_cache *s)
> @@ -3747,6 +3780,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
>  	list_for_each_entry_safe(page, h, &n->partial, slab_list) {
>  		if (!page->inuse) {
>  			remove_partial(n, page);
> +			__update_partial_free(n, page->objects - page->inuse);
>  			list_add(&page->slab_list, &discard);
>  		} else {
>  			list_slab_objects(s, page,
> @@ -4045,6 +4079,8 @@ int __kmem_cache_shrink(struct kmem_cache *s)
>  			if (free == page->objects) {
>  				list_move(&page->slab_list, &discard);
>  				n->nr_partial--;
> +				__update_partial_free(n, -free);
> +				__update_partial_total(n, -free);
>  			} else if (free <= SHRINK_PROMOTE_MAX)
>  				list_move(&page->slab_list, promote + free - 1);
>  		}
> 



* Re: [PATCH 1/2] mm/slub: Introduce two counters for the partial objects
  2020-08-06 12:42 ` Vlastimil Babka
@ 2020-08-07  7:25   ` Pekka Enberg
  2020-08-07 13:02     ` Christopher Lameter
  0 siblings, 1 reply; 17+ messages in thread
From: Pekka Enberg @ 2020-08-07  7:25 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Xunlei Pang, Christoph Lameter, Andrew Morton, Wen Yang,
	Yang Shi, Roman Gushchin, linux-mm, LKML, Konstantin Khlebnikov,
	David Rientjes

On Thu, Aug 6, 2020 at 3:42 PM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 7/2/20 10:32 AM, Xunlei Pang wrote:
> > The node list_lock in count_partial() spend long time iterating
> > in case of large amount of partial page lists, which can cause
> > thunder herd effect to the list_lock contention, e.g. it cause
> > business response-time jitters when accessing "/proc/slabinfo"
> > in our production environments.
> >
> > This patch introduces two counters to maintain the actual number
> > of partial objects dynamically instead of iterating the partial
> > page lists with list_lock held.
> >
> > New counters of kmem_cache_node are: pfree_objects, ptotal_objects.
> > The main operations are under list_lock in slow path, its performance
> > impact is minimal.
> >
> > Co-developed-by: Wen Yang <wenyang@linux.alibaba.com>
> > Signed-off-by: Xunlei Pang <xlpang@linux.alibaba.com>
>
> This or similar things seem to be reported every few months now, last time was
> here [1] AFAIK. The solution was to just stop counting at some point.
>
> Shall we perhaps add these counters under CONFIG_SLUB_DEBUG then and be done
> with it? If anyone needs the extreme performance and builds without
> CONFIG_SLUB_DEBUG, I'd assume they also don't have userspace programs reading
> /proc/slabinfo periodically anyway?

I think we can just default to the counters. After all, if I
understood correctly, we're talking about a period of up to 100 ms
with IRQs disabled when count_partial() is called. As this is
triggerable from user space, that's a performance bug whichever way
you look at it.

Whoever needs to eliminate these counters from fast-path, can wrap
them in a CONFIG_MAKE_SLABINFO_EXTREMELY_SLOW option.

So for this patch, with updated information about the severity of the
problem, and the hackbench numbers:

Acked-by: Pekka Enberg <penberg@kernel.org>

Christoph, others, any objections?

- Pekka


* Re: [PATCH 1/2] mm/slub: Introduce two counters for the partial objects
  2020-08-07  7:25   ` Pekka Enberg
@ 2020-08-07 13:02     ` Christopher Lameter
  2020-08-07 17:28       ` Pekka Enberg
  0 siblings, 1 reply; 17+ messages in thread
From: Christopher Lameter @ 2020-08-07 13:02 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Vlastimil Babka, Xunlei Pang, Andrew Morton, Wen Yang, Yang Shi,
	Roman Gushchin, linux-mm, LKML, Konstantin Khlebnikov,
	David Rientjes

On Fri, 7 Aug 2020, Pekka Enberg wrote:

> I think we can just default to the counters. After all, if I
> understood correctly, we're talking about up to 100 ms time period
> with IRQs disabled when count_partial() is called. As this is
> triggerable from user space, that's a performance bug whatever way you
> look at it.


Well yes, under extreme conditions, and this is only happening for
sysfs counter retrieval.

There could be other solutions to this. The solution here penalizes
every hotpath slab allocation for the sake of relatively infrequently
used counter monitoring. There is also the possibility of not
traversing the list and simply estimating the value based on the
number of slab pages allocated on that node.
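
For example (purely illustrative, with a made-up helper and the crude
assumption that, on average, half of each partial slab is free):

  static unsigned long estimate_partial_free(struct kmem_cache *s,
                                             struct kmem_cache_node *n)
  {
          /* Estimate from counters we already maintain; no list walk. */
          return READ_ONCE(n->nr_partial) * oo_objects(s->oo) / 2;
  }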

> Christoph, others, any objections?

Obviously .... ;-)



* Re: [PATCH 1/2] mm/slub: Introduce two counters for the partial objects
  2020-08-07 13:02     ` Christopher Lameter
@ 2020-08-07 17:28       ` Pekka Enberg
  2020-08-10 11:56         ` xunlei
  2020-08-11 12:52         ` Christopher Lameter
  0 siblings, 2 replies; 17+ messages in thread
From: Pekka Enberg @ 2020-08-07 17:28 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Vlastimil Babka, Xunlei Pang, Andrew Morton, Wen Yang, Yang Shi,
	Roman Gushchin, linux-mm, LKML, Konstantin Khlebnikov,
	David Rientjes

Hi Christopher,

On Fri, 7 Aug 2020, Pekka Enberg wrote:
> > I think we can just default to the counters. After all, if I
> > understood correctly, we're talking about up to 100 ms time period
> > with IRQs disabled when count_partial() is called. As this is
> > triggerable from user space, that's a performance bug whatever way you
> > look at it.

On Fri, Aug 7, 2020 at 4:02 PM Christopher Lameter <cl@linux.com> wrote:
> Well yes under extreme conditions and this is only happening for sysfs
> counter retrieval.

You will likely get some stall even in less extreme conditions, and in
any case, the kernel should not allow user space to trigger such a
stall.

On Fri, Aug 7, 2020 at 4:02 PM Christopher Lameter <cl@linux.com> wrote:
> There could be other solutions to this. This solution here is penalizing
> evertu hotpath slab allocation for the sake of relatively infrequently
> used counter monitoring. There the possibility of not traversing the list
> ande simply estimating the value based on the number of slab pages
> allocated on that node.

Why do you consider this to be a fast path? This is all partial list
accounting when we allocate/deallocate a slab, no? Just like
___slab_alloc() says, I assumed this to be the slow path... What am I
missing?

No objections to alternative fixes, of course, but wrapping the
counters under CONFIG_DEBUG seems like just hiding the actual issue...

- Pekka


* Re: [PATCH 1/2] mm/slub: Introduce two counters for the partial objects
  2020-08-07 17:28       ` Pekka Enberg
@ 2020-08-10 11:56         ` xunlei
  2020-08-11 12:52         ` Christopher Lameter
  1 sibling, 0 replies; 17+ messages in thread
From: xunlei @ 2020-08-10 11:56 UTC (permalink / raw)
  To: Pekka Enberg, Christopher Lameter
  Cc: Vlastimil Babka, Andrew Morton, Wen Yang, Yang Shi,
	Roman Gushchin, linux-mm, LKML, Konstantin Khlebnikov,
	David Rientjes

On 2020/8/8 1:28 AM, Pekka Enberg wrote:
> Hi Christopher,
> 
> On Fri, 7 Aug 2020, Pekka Enberg wrote:
>>> I think we can just default to the counters. After all, if I
>>> understood correctly, we're talking about up to 100 ms time period
>>> with IRQs disabled when count_partial() is called. As this is
>>> triggerable from user space, that's a performance bug whatever way you
>>> look at it.
> 
> On Fri, Aug 7, 2020 at 4:02 PM Christopher Lameter <cl@linux.com> wrote:
>> Well yes under extreme conditions and this is only happening for sysfs
>> counter retrieval.
> 
> You will likely get some stall even in less extreme conditions, and in
> any case, the kernel should not allow user space to trigger such a
> stall.
> 

Yes, agreed. This problem has been causing lots of trouble for us and
other people, and should be fixed.

Either my approach or the approach Vlastimil Babka pointed to [1] is
better than doing nothing.

[1]:
https://lore.kernel.org/linux-mm/158860845968.33385.4165926113074799048.stgit@buzz/

> On Fri, Aug 7, 2020 at 4:02 PM Christopher Lameter <cl@linux.com> wrote:
>> There could be other solutions to this. This solution here is penalizing
>> evertu hotpath slab allocation for the sake of relatively infrequently
>> used counter monitoring. There the possibility of not traversing the list
>> ande simply estimating the value based on the number of slab pages
>> allocated on that node.
> 
> Why do you consider this to be a fast path? This is all partial list
> accounting when we allocate/deallocate a slab, no? Just like
> ___slab_alloc() says, I assumed this to be the slow path... What am I
> missing?

The only hot path is __slab_free(). I've made an extra patch that uses
a percpu counter to avoid the potential performance degradation, and
will send v2 out for review.

> 
> No objections to alternative fixes, of course, but wrapping the
> counters under CONFIG_DEBUG seems like just hiding the actual issue...
> 
> - Pekka
> 


* Re: [PATCH 1/2] mm/slub: Introduce two counters for the partial objects
  2020-08-07 17:28       ` Pekka Enberg
  2020-08-10 11:56         ` xunlei
@ 2020-08-11 12:52         ` Christopher Lameter
  2020-08-20 13:58           ` Pekka Enberg
  1 sibling, 1 reply; 17+ messages in thread
From: Christopher Lameter @ 2020-08-11 12:52 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Vlastimil Babka, Xunlei Pang, Andrew Morton, Wen Yang, Yang Shi,
	Roman Gushchin, linux-mm, LKML, Konstantin Khlebnikov,
	David Rientjes

On Fri, 7 Aug 2020, Pekka Enberg wrote:

> Why do you consider this to be a fast path? This is all partial list
> accounting when we allocate/deallocate a slab, no? Just like
> ___slab_alloc() says, I assumed this to be the slow path... What am I
> missing?

I thought these were per object counters? If you just want to count the
number of slabs then you do not need the lock at all. We already have a
counter for the number of slabs.

> No objections to alternative fixes, of course, but wrapping the
> counters under CONFIG_DEBUG seems like just hiding the actual issue...

CONFIG_DEBUG is on by default. It just compiles in the debug code and
disables it, so we can enable it with a kernel boot option. This is
because we have had numerous issues in the past with "production"
kernels that could not be recompiled with debug options. So just
running the prod kernel with another option will allow you to find
hard-to-debug issues in a full-scale production deployment with
potentially proprietary modules etc.



* Re: [PATCH 1/2] mm/slub: Introduce two counters for the partial objects
  2020-08-11 12:52         ` Christopher Lameter
@ 2020-08-20 13:58           ` Pekka Enberg
  2020-08-24  9:59             ` xunlei
  0 siblings, 1 reply; 17+ messages in thread
From: Pekka Enberg @ 2020-08-20 13:58 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Vlastimil Babka, Xunlei Pang, Andrew Morton, Wen Yang, Yang Shi,
	Roman Gushchin, linux-mm, LKML, Konstantin Khlebnikov,
	David Rientjes

Hi Christopher,

On Tue, Aug 11, 2020 at 3:52 PM Christopher Lameter <cl@linux.com> wrote:
>
> On Fri, 7 Aug 2020, Pekka Enberg wrote:
>
> > Why do you consider this to be a fast path? This is all partial list
> > accounting when we allocate/deallocate a slab, no? Just like
> > ___slab_alloc() says, I assumed this to be the slow path... What am I
> > missing?
>
> I thought these were per object counters? If you just want to count the
> number of slabs then you do not need the lock at all. We already have a
> counter for the number of slabs.

The patch attempts to speed up count_partial(), which holds on to the
"n->list_lock" (with IRQs off) for the whole duration it takes to walk
the partial slab list:

        spin_lock_irqsave(&n->list_lock, flags);
        list_for_each_entry(page, &n->partial, slab_list)
                x += get_count(page);
        spin_unlock_irqrestore(&n->list_lock, flags);

It's counting the number of *objects*, but the counters are only
updated in bulk when we add/remove a slab to/from the partial list.
The counter updates are therefore *not* in the fast-path AFAICT.

Xunlei, please correct me if I'm reading your patches wrong.

On Tue, Aug 11, 2020 at 3:52 PM Christopher Lameter <cl@linux.com> wrote:
> > No objections to alternative fixes, of course, but wrapping the
> > counters under CONFIG_DEBUG seems like just hiding the actual issue...
>
> CONFIG_DEBUG is on by default. It just compiles in the debug code and
> disables it so we can enable it with a kernel boot option. This is because
> we have had numerous issues in the past with "production" kernels that
> could not be recompiled with debug options. So just running the prod
> kernel with another option will allow you to find hard to debug issues in
> a full scale producton deployment with potentially proprietary modules
> etc.

Yeah, it's been too long since I last looked at the code, and I did
not realize even count_partial() is wrapped in CONFIG_DEBUG. So by all
means, let's also wrap the counters with that too.

- Pekka


* Re: [PATCH 1/2] mm/slub: Introduce two counters for the partial objects
  2020-08-20 13:58           ` Pekka Enberg
@ 2020-08-24  9:59             ` xunlei
  0 siblings, 0 replies; 17+ messages in thread
From: xunlei @ 2020-08-24  9:59 UTC (permalink / raw)
  To: Pekka Enberg, Christopher Lameter
  Cc: Vlastimil Babka, Andrew Morton, Wen Yang, Yang Shi,
	Roman Gushchin, linux-mm, LKML, Konstantin Khlebnikov,
	David Rientjes

On 2020/8/20 9:58 PM, Pekka Enberg wrote:
> Hi Christopher,
> 
> On Tue, Aug 11, 2020 at 3:52 PM Christopher Lameter <cl@linux.com> wrote:
>>
>> On Fri, 7 Aug 2020, Pekka Enberg wrote:
>>
>>> Why do you consider this to be a fast path? This is all partial list
>>> accounting when we allocate/deallocate a slab, no? Just like
>>> ___slab_alloc() says, I assumed this to be the slow path... What am I
>>> missing?
>>
>> I thought these were per object counters? If you just want to count the
>> number of slabs then you do not need the lock at all. We already have a
>> counter for the number of slabs.
> 
> The patch attempts to speed up count_partial(), which holds on to the
> "n->list_lock" (with IRQs off) for the whole duration it takes to walk
> the partial slab list:
> 
>         spin_lock_irqsave(&n->list_lock, flags);
>         list_for_each_entry(page, &n->partial, slab_list)
>                 x += get_count(page);
>         spin_unlock_irqrestore(&n->list_lock, flags);
> 
> It's counting the number of *objects*, but the counters are only
> updated in bulk when we add/remove a slab to/from the partial list.
> The counter updates are therefore *not* in the fast-path AFAICT.
> 
> Xunlei, please correct me if I'm reading your patches wrong.

Yes, it's all in the slow path.

> 
> On Tue, Aug 11, 2020 at 3:52 PM Christopher Lameter <cl@linux.com> wrote:
>>> No objections to alternative fixes, of course, but wrapping the
>>> counters under CONFIG_DEBUG seems like just hiding the actual issue...
>>
>> CONFIG_DEBUG is on by default. It just compiles in the debug code and
>> disables it so we can enable it with a kernel boot option. This is because
>> we have had numerous issues in the past with "production" kernels that
>> could not be recompiled with debug options. So just running the prod
>> kernel with another option will allow you to find hard to debug issues in
>> a full scale producton deployment with potentially proprietary modules
>> etc.
> 
> Yeah, it's been too long since I last looked at the code and did not
> realize even count_partial() is wrapped in CONFIG_DEBUG. So by all

Besides CONFIG_DEBUG, count_partial() is also wrapped in CONFIG_SYSFS.

> means, let's also wrap the counters with that too.
> 
> - Pekka
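
A minimal sketch of the guard placement being discussed, for the two
fields from patch 1 in struct kmem_cache_node (using the same condition
count_partial() itself is compiled under; whether CONFIG_SLUB_DEBUG
alone is enough is exactly the open question):

  #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
  	atomic_long_t pfree_objects;	/* partial free objects */
  	atomic_long_t ptotal_objects;	/* partial total objects */
  #endif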
> 

