[v3,2/2] mm: memcg/slab: Create a new set of kmalloc-cg-<n> caches
diff mbox series

Message ID 20210505154613.17214-3-longman@redhat.com
State New, archived
Headers show
Series
  • mm: memcg/slab: Fix objcg pointer array handling problem
Related show

Commit Message

Waiman Long May 5, 2021, 3:46 p.m. UTC
There are currently two problems in the way the objcg pointer array
(memcg_data) in the page structure is being allocated and freed.

On its allocation, it is possible that the allocated objcg pointer
array comes from the same slab that requires memory accounting. If this
happens, the slab will never become empty again as there is at least
one object left (the obj_cgroup array) in the slab.

When it is freed, the objcg pointer array object may be the last one
in its slab and hence causes kfree() to be called again. With the
right workload, the slab cache may be set up in a way that allows the
recursive kfree() calling loop to nest deep enough to cause a kernel
stack overflow and panic the system.

One way to solve this problem is to split the kmalloc-<n> caches
(KMALLOC_NORMAL) into two separate sets - a new set of kmalloc-<n>
(KMALLOC_NORMAL) caches for non-accounted objects only and a new set of
kmalloc-cg-<n> (KMALLOC_CGROUP) caches for accounted objects only. All
the other caches can still allow a mix of accounted and non-accounted
objects.

With this change, all the objcg pointer array objects will come from
KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So
both the recursive kfree() problem and non-freeable slab problem are
gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer
have mixed accounted and unaccounted objects, this will slightly reduce
the number of objcg pointer arrays that need to be allocated and save
a bit of memory.

The new KMALLOC_CGROUP is added between KMALLOC_NORMAL and
KMALLOC_RECLAIM so that the first for loop in create_kmalloc_caches()
will include the newly added caches without change.

Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Waiman Long <longman@redhat.com>
---
 include/linux/slab.h | 42 ++++++++++++++++++++++++++++++++++--------
 mm/slab_common.c     | 23 +++++++++++++++--------
 2 files changed, 49 insertions(+), 16 deletions(-)

Comments

Vlastimil Babka May 5, 2021, 4:06 p.m. UTC | #1
On 5/5/21 5:46 PM, Waiman Long wrote:
> There are currently two problems in the way the objcg pointer array
> (memcg_data) in the page structure is being allocated and freed.
> 
> On its allocation, it is possible that the allocated objcg pointer
> array comes from the same slab that requires memory accounting. If this
> happens, the slab will never become empty again as there is at least
> one object left (the obj_cgroup array) in the slab.
> 
> When it is freed, the objcg pointer array object may be the last one
> in its slab and hence causes kfree() to be called again. With the
> right workload, the slab cache may be set up in a way that allows the
> recursive kfree() calling loop to nest deep enough to cause a kernel
> stack overflow and panic the system.
> 
> One way to solve this problem is to split the kmalloc-<n> caches
> (KMALLOC_NORMAL) into two separate sets - a new set of kmalloc-<n>
> (KMALLOC_NORMAL) caches for non-accounted objects only and a new set of
> kmalloc-cg-<n> (KMALLOC_CGROUP) caches for accounted objects only. All
> the other caches can still allow a mix of accounted and non-accounted
> objects.
> 
> With this change, all the objcg pointer array objects will come from
> KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So
> both the recursive kfree() problem and non-freeable slab problem are
> gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer
> have mixed accounted and unaccounted objects, this will slightly reduce
> the number of objcg pointer arrays that need to be allocated and save
> a bit of memory.
> 
> The new KMALLOC_CGROUP is added between KMALLOC_NORMAL and
> KMALLOC_RECLAIM so that the first for loop in create_kmalloc_caches()
> will include the newly added caches without change.
> 
> Suggested-by: Vlastimil Babka <vbabka@suse.cz>
> Signed-off-by: Waiman Long <longman@redhat.com>
> ---
>  include/linux/slab.h | 42 ++++++++++++++++++++++++++++++++++--------
>  mm/slab_common.c     | 23 +++++++++++++++--------
>  2 files changed, 49 insertions(+), 16 deletions(-)
> 
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> index 0c97d788762c..f2d9ebc34f5c 100644
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -305,9 +305,16 @@ static inline void __check_heap_object(const void *ptr, unsigned long n,
>  /*
>   * Whenever changing this, take care of that kmalloc_type() and
>   * create_kmalloc_caches() still work as intended.
> + *
> + * KMALLOC_NORMAL is for non-accounted objects only whereas KMALLOC_CGROUP
> + * is for accounted objects only. All the other kmem caches can have both
> + * accounted and non-accounted objects.
>   */
>  enum kmalloc_cache_type {
>  	KMALLOC_NORMAL = 0,
> +#ifdef CONFIG_MEMCG_KMEM
> +	KMALLOC_CGROUP,
> +#endif
>  	KMALLOC_RECLAIM,
>  #ifdef CONFIG_ZONE_DMA
>  	KMALLOC_DMA,
> @@ -315,28 +322,47 @@ enum kmalloc_cache_type {
>  	NR_KMALLOC_TYPES
>  };
>  
> +#ifndef CONFIG_MEMCG_KMEM
> +#define KMALLOC_CGROUP	KMALLOC_NORMAL
> +#endif
> +#ifndef CONFIG_ZONE_DMA
> +#define KMALLOC_DMA	KMALLOC_NORMAL
> +#endif

You could move this to the enum definition itself? E.g.:

#ifdef CONFIG_MEMCG_KMEM
	KMALLOC_CGROUP,
#else
	KMALLOC_CGROUP = KMALLOC_NORMAL,
#endif

> +
>  #ifndef CONFIG_SLOB
>  extern struct kmem_cache *
>  kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1];
>  
> +/*
> + * Define gfp bits that should not be set for KMALLOC_NORMAL.
> + */
> +#define KMALLOC_NOT_NORMAL_BITS					\
> +	(__GFP_RECLAIMABLE |					\
> +	(IS_ENABLED(CONFIG_ZONE_DMA)   ? __GFP_DMA : 0) |	\
> +	(IS_ENABLED(CONFIG_MEMCG_KMEM) ? __GFP_ACCOUNT : 0))
> +
>  static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags)
>  {
> -#ifdef CONFIG_ZONE_DMA
>  	/*
>  	 * The most common case is KMALLOC_NORMAL, so test for it
>  	 * with a single branch for both flags.
Not "both flags" anymore. Something like "so test with a single branch that
there are none of the flags that would select a different type"

>  	 */
> -	if (likely((flags & (__GFP_DMA | __GFP_RECLAIMABLE)) == 0))
> +	if (likely((flags & KMALLOC_NOT_NORMAL_BITS) == 0))
>  		return KMALLOC_NORMAL;
>  
>  	/*
> -	 * At least one of the flags has to be set. If both are, __GFP_DMA
> -	 * is more important.
> +	 * At least one of the flags has to be set. Their priorities in
> +	 * decreasing order are:
> +	 *  1) __GFP_DMA
> +	 *  2) __GFP_RECLAIMABLE
> +	 *  3) __GFP_ACCOUNT
>  	 */
> -	return flags & __GFP_DMA ? KMALLOC_DMA : KMALLOC_RECLAIM;
> -#else
> -	return flags & __GFP_RECLAIMABLE ? KMALLOC_RECLAIM : KMALLOC_NORMAL;
> -#endif
> +	if (IS_ENABLED(CONFIG_ZONE_DMA) && (flags & __GFP_DMA))
> +		return KMALLOC_DMA;
> +	if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || (flags & __GFP_RECLAIMABLE))
> +		return KMALLOC_RECLAIM;
> +	else
> +		return KMALLOC_CGROUP;
>  }

Works for me this way, thanks.

>  
>  /*
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index f8833d3e5d47..d750e3ba7af5 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -727,21 +727,25 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
>  }
>  
>  #ifdef CONFIG_ZONE_DMA
> -#define INIT_KMALLOC_INFO(__size, __short_size)			\
> -{								\
> -	.name[KMALLOC_NORMAL]  = "kmalloc-" #__short_size,	\
> -	.name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size,	\
> -	.name[KMALLOC_DMA]     = "dma-kmalloc-" #__short_size,	\
> -	.size = __size,						\
> -}
> +#define KMALLOC_DMA_NAME(sz)	.name[KMALLOC_DMA] = "dma-kmalloc-" #sz,
> +#else
> +#define KMALLOC_DMA_NAME(sz)
> +#endif
> +
> +#ifdef CONFIG_MEMCG_KMEM
> +#define KMALLOC_CGROUP_NAME(sz)	.name[KMALLOC_CGROUP] = "kmalloc-cg-" #sz,
>  #else
> +#define KMALLOC_CGROUP_NAME(sz)
> +#endif
> +
>  #define INIT_KMALLOC_INFO(__size, __short_size)			\
>  {								\
>  	.name[KMALLOC_NORMAL]  = "kmalloc-" #__short_size,	\
>  	.name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size,	\
> +	KMALLOC_CGROUP_NAME(__short_size)			\
> +	KMALLOC_DMA_NAME(__short_size)				\
>  	.size = __size,						\
>  }
> -#endif
>  
>  /*
>   * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
> @@ -847,6 +851,9 @@ void __init create_kmalloc_caches(slab_flags_t flags)
>  	int i;
>  	enum kmalloc_cache_type type;
>  
> +	/*
> +	 * Including KMALLOC_CGROUP if CONFIG_MEMCG_KMEM defined
> +	 */
>  	for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) {
>  		for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
>  			if (!kmalloc_caches[type][i])
>
Shakeel Butt May 5, 2021, 4:17 p.m. UTC | #2
On Wed, May 5, 2021 at 8:47 AM Waiman Long <longman@redhat.com> wrote:
>
> There are currently two problems in the way the objcg pointer array
> (memcg_data) in the page structure is being allocated and freed.
>
> On its allocation, it is possible that the allocated objcg pointer
> array comes from the same slab that requires memory accounting. If this
> happens, the slab will never become empty again as there is at least
> one object left (the obj_cgroup array) in the slab.
>
> When it is freed, the objcg pointer array object may be the last one
> in its slab and hence causes kfree() to be called again. With the
> right workload, the slab cache may be set up in a way that allows the
> recursive kfree() calling loop to nest deep enough to cause a kernel
> stack overflow and panic the system.
>
> One way to solve this problem is to split the kmalloc-<n> caches
> (KMALLOC_NORMAL) into two separate sets - a new set of kmalloc-<n>
> (KMALLOC_NORMAL) caches for non-accounted objects only and a new set of
> kmalloc-cg-<n> (KMALLOC_CGROUP) caches for accounted objects only. All
> the other caches can still allow a mix of accounted and non-accounted
> objects.
>
> With this change, all the objcg pointer array objects will come from
> KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So
> both the recursive kfree() problem and non-freeable slab problem are
> gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer
> have mixed accounted and unaccounted objects, this will slightly reduce
> the number of objcg pointer arrays that need to be allocated and save
> a bit of memory.
>
> The new KMALLOC_CGROUP is added between KMALLOC_NORMAL and
> KMALLOC_RECLAIM so that the first for loop in create_kmalloc_caches()
> will include the newly added caches without change.
>
> Suggested-by: Vlastimil Babka <vbabka@suse.cz>
> Signed-off-by: Waiman Long <longman@redhat.com>

One nit below and after incorporating Vlastimil's suggestions:

Reviewed-by: Shakeel Butt <shakeelb@google.com>

> ---
>  include/linux/slab.h | 42 ++++++++++++++++++++++++++++++++++--------
>  mm/slab_common.c     | 23 +++++++++++++++--------
>  2 files changed, 49 insertions(+), 16 deletions(-)
>
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> index 0c97d788762c..f2d9ebc34f5c 100644
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -305,9 +305,16 @@ static inline void __check_heap_object(const void *ptr, unsigned long n,
>  /*
>   * Whenever changing this, take care of that kmalloc_type() and
>   * create_kmalloc_caches() still work as intended.
> + *
> + * KMALLOC_NORMAL is for non-accounted objects only whereas KMALLOC_CGROUP
> + * is for accounted objects only.

I think you can say "KMALLOC_CGROUP is for accounted and unreclaimable
objects only".
Waiman Long May 5, 2021, 4:31 p.m. UTC | #3
On 5/5/21 12:06 PM, Vlastimil Babka wrote:
> On 5/5/21 5:46 PM, Waiman Long wrote:
>> There are currently two problems in the way the objcg pointer array
>> (memcg_data) in the page structure is being allocated and freed.
>>
>> On its allocation, it is possible that the allocated objcg pointer
>> array comes from the same slab that requires memory accounting. If this
>> happens, the slab will never become empty again as there is at least
>> one object left (the obj_cgroup array) in the slab.
>>
>> When it is freed, the objcg pointer array object may be the last one
>> in its slab and hence causes kfree() to be called again. With the
>> right workload, the slab cache may be set up in a way that allows the
>> recursive kfree() calling loop to nest deep enough to cause a kernel
>> stack overflow and panic the system.
>>
>> One way to solve this problem is to split the kmalloc-<n> caches
>> (KMALLOC_NORMAL) into two separate sets - a new set of kmalloc-<n>
>> (KMALLOC_NORMAL) caches for non-accounted objects only and a new set of
>> kmalloc-cg-<n> (KMALLOC_CGROUP) caches for accounted objects only. All
>> the other caches can still allow a mix of accounted and non-accounted
>> objects.
>>
>> With this change, all the objcg pointer array objects will come from
>> KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So
>> both the recursive kfree() problem and non-freeable slab problem are
>> gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer
>> have mixed accounted and unaccounted objects, this will slightly reduce
>> the number of objcg pointer arrays that need to be allocated and save
>> a bit of memory.
>>
>> The new KMALLOC_CGROUP is added between KMALLOC_NORMAL and
>> KMALLOC_RECLAIM so that the first for loop in create_kmalloc_caches()
>> will include the newly added caches without change.
>>
>> Suggested-by: Vlastimil Babka <vbabka@suse.cz>
>> Signed-off-by: Waiman Long <longman@redhat.com>
>> ---
>>   include/linux/slab.h | 42 ++++++++++++++++++++++++++++++++++--------
>>   mm/slab_common.c     | 23 +++++++++++++++--------
>>   2 files changed, 49 insertions(+), 16 deletions(-)
>>
>> diff --git a/include/linux/slab.h b/include/linux/slab.h
>> index 0c97d788762c..f2d9ebc34f5c 100644
>> --- a/include/linux/slab.h
>> +++ b/include/linux/slab.h
>> @@ -305,9 +305,16 @@ static inline void __check_heap_object(const void *ptr, unsigned long n,
>>   /*
>>    * Whenever changing this, take care of that kmalloc_type() and
>>    * create_kmalloc_caches() still work as intended.
>> + *
>> + * KMALLOC_NORMAL is for non-accounted objects only whereas KMALLOC_CGROUP
>> + * is for accounted objects only. All the other kmem caches can have both
>> + * accounted and non-accounted objects.
>>    */
>>   enum kmalloc_cache_type {
>>   	KMALLOC_NORMAL = 0,
>> +#ifdef CONFIG_MEMCG_KMEM
>> +	KMALLOC_CGROUP,
>> +#endif
>>   	KMALLOC_RECLAIM,
>>   #ifdef CONFIG_ZONE_DMA
>>   	KMALLOC_DMA,
>> @@ -315,28 +322,47 @@ enum kmalloc_cache_type {
>>   	NR_KMALLOC_TYPES
>>   };
>>   
>> +#ifndef CONFIG_MEMCG_KMEM
>> +#define KMALLOC_CGROUP	KMALLOC_NORMAL
>> +#endif
>> +#ifndef CONFIG_ZONE_DMA
>> +#define KMALLOC_DMA	KMALLOC_NORMAL
>> +#endif
> You could move this to the enum definition itself? E.g.:
>
> #ifdef CONFIG_MEMCG_KMEM
> 	KMALLOC_CGROUP,
> #else
> 	KMALLOC_CGROUP = KMALLOC_NORMAL,
> #endif
>
>> +
>>   #ifndef CONFIG_SLOB
>>   extern struct kmem_cache *
>>   kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1];
>>   
>> +/*
>> + * Define gfp bits that should not be set for KMALLOC_NORMAL.
>> + */
>> +#define KMALLOC_NOT_NORMAL_BITS					\
>> +	(__GFP_RECLAIMABLE |					\
>> +	(IS_ENABLED(CONFIG_ZONE_DMA)   ? __GFP_DMA : 0) |	\
>> +	(IS_ENABLED(CONFIG_MEMCG_KMEM) ? __GFP_ACCOUNT : 0))
>> +
>>   static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags)
>>   {
>> -#ifdef CONFIG_ZONE_DMA
>>   	/*
>>   	 * The most common case is KMALLOC_NORMAL, so test for it
>>   	 * with a single branch for both flags.
> Not "both flags" anymore. Something like "so test with a single branch that
> there are none of the flags that would select a different type"
Right. I just left the comment there without taking a deeper look. My bad.

Cheers,
Longman
Waiman Long May 5, 2021, 4:31 p.m. UTC | #4
On 5/5/21 12:17 PM, Shakeel Butt wrote:
> On Wed, May 5, 2021 at 8:47 AM Waiman Long <longman@redhat.com> wrote:
>> There are currently two problems in the way the objcg pointer array
>> (memcg_data) in the page structure is being allocated and freed.
>>
>> On its allocation, it is possible that the allocated objcg pointer
>> array comes from the same slab that requires memory accounting. If this
>> happens, the slab will never become empty again as there is at least
>> one object left (the obj_cgroup array) in the slab.
>>
>> When it is freed, the objcg pointer array object may be the last one
>> in its slab and hence causes kfree() to be called again. With the
>> right workload, the slab cache may be set up in a way that allows the
>> recursive kfree() calling loop to nest deep enough to cause a kernel
>> stack overflow and panic the system.
>>
>> One way to solve this problem is to split the kmalloc-<n> caches
>> (KMALLOC_NORMAL) into two separate sets - a new set of kmalloc-<n>
>> (KMALLOC_NORMAL) caches for non-accounted objects only and a new set of
>> kmalloc-cg-<n> (KMALLOC_CGROUP) caches for accounted objects only. All
>> the other caches can still allow a mix of accounted and non-accounted
>> objects.
>>
>> With this change, all the objcg pointer array objects will come from
>> KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So
>> both the recursive kfree() problem and non-freeable slab problem are
>> gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer
>> have mixed accounted and unaccounted objects, this will slightly reduce
>> the number of objcg pointer arrays that need to be allocated and save
>> a bit of memory.
>>
>> The new KMALLOC_CGROUP is added between KMALLOC_NORMAL and
>> KMALLOC_RECLAIM so that the first for loop in create_kmalloc_caches()
>> will include the newly added caches without change.
>>
>> Suggested-by: Vlastimil Babka <vbabka@suse.cz>
>> Signed-off-by: Waiman Long <longman@redhat.com>
> One nit below and after incorporating Vlastimil's suggestions:
>
> Reviewed-by: Shakeel Butt <shakeelb@google.com>
>
>> ---
>>   include/linux/slab.h | 42 ++++++++++++++++++++++++++++++++++--------
>>   mm/slab_common.c     | 23 +++++++++++++++--------
>>   2 files changed, 49 insertions(+), 16 deletions(-)
>>
>> diff --git a/include/linux/slab.h b/include/linux/slab.h
>> index 0c97d788762c..f2d9ebc34f5c 100644
>> --- a/include/linux/slab.h
>> +++ b/include/linux/slab.h
>> @@ -305,9 +305,16 @@ static inline void __check_heap_object(const void *ptr, unsigned long n,
>>   /*
>>    * Whenever changing this, take care of that kmalloc_type() and
>>    * create_kmalloc_caches() still work as intended.
>> + *
>> + * KMALLOC_NORMAL is for non-accounted objects only whereas KMALLOC_CGROUP
>> + * is for accounted objects only.
> I think you can say "KMALLOC_CGROUP is for accounted and unreclaimable
> objects only".
>
Thanks for the suggestion. Will incorporate that.

Cheers,
Longman
Roman Gushchin May 5, 2021, 5:30 p.m. UTC | #5
On Wed, May 05, 2021 at 11:46:13AM -0400, Waiman Long wrote:
> There are currently two problems in the way the objcg pointer array
> (memcg_data) in the page structure is being allocated and freed.
> 
> On its allocation, it is possible that the allocated objcg pointer
> array comes from the same slab that requires memory accounting. If this
> happens, the slab will never become empty again as there is at least
> one object left (the obj_cgroup array) in the slab.
> 
> When it is freed, the objcg pointer array object may be the last one
> in its slab and hence causes kfree() to be called again. With the
> right workload, the slab cache may be set up in a way that allows the
> recursive kfree() calling loop to nest deep enough to cause a kernel
> stack overflow and panic the system.
> 
> One way to solve this problem is to split the kmalloc-<n> caches
> (KMALLOC_NORMAL) into two separate sets - a new set of kmalloc-<n>
> (KMALLOC_NORMAL) caches for non-accounted objects only and a new set of
> kmalloc-cg-<n> (KMALLOC_CGROUP) caches for accounted objects only. All
> the other caches can still allow a mix of accounted and non-accounted
> objects.

I agree that it's likely the best approach here. Thanks for discovering
and fixing the problem!

> 
> With this change, all the objcg pointer array objects will come from
> KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So
> both the recursive kfree() problem and non-freeable slab problem are
> gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer
> have mixed accounted and unaccounted objects, this will slightly reduce
> the number of objcg pointer arrays that need to be allocated and save
> a bit of memory.

Unfortunately the positive effect of this change will be likely
reversed by a lower utilization due to a larger number of caches.

Btw, I wonder if we also need a change in the slab caches merging procedure?
KMALLOC_NORMAL caches should not be merged with caches which can potentially
include accounted objects.

> 
> The new KMALLOC_CGROUP is added between KMALLOC_NORMAL and
> KMALLOC_RECLAIM so that the first for loop in create_kmalloc_caches()
> will include the newly added caches without change.
> 
> Suggested-by: Vlastimil Babka <vbabka@suse.cz>
> Signed-off-by: Waiman Long <longman@redhat.com>
> ---
>  include/linux/slab.h | 42 ++++++++++++++++++++++++++++++++++--------
>  mm/slab_common.c     | 23 +++++++++++++++--------
>  2 files changed, 49 insertions(+), 16 deletions(-)
> 
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> index 0c97d788762c..f2d9ebc34f5c 100644
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -305,9 +305,16 @@ static inline void __check_heap_object(const void *ptr, unsigned long n,
>  /*
>   * Whenever changing this, take care of that kmalloc_type() and
>   * create_kmalloc_caches() still work as intended.
> + *
> + * KMALLOC_NORMAL is for non-accounted objects only whereas KMALLOC_CGROUP
> + * is for accounted objects only. All the other kmem caches can have both
> + * accounted and non-accounted objects.
>   */
>  enum kmalloc_cache_type {
>  	KMALLOC_NORMAL = 0,
> +#ifdef CONFIG_MEMCG_KMEM
> +	KMALLOC_CGROUP,
> +#endif
>  	KMALLOC_RECLAIM,
>  #ifdef CONFIG_ZONE_DMA
>  	KMALLOC_DMA,
> @@ -315,28 +322,47 @@ enum kmalloc_cache_type {
>  	NR_KMALLOC_TYPES
>  };
>  
> +#ifndef CONFIG_MEMCG_KMEM
> +#define KMALLOC_CGROUP	KMALLOC_NORMAL
> +#endif
> +#ifndef CONFIG_ZONE_DMA
> +#define KMALLOC_DMA	KMALLOC_NORMAL
> +#endif
> +
>  #ifndef CONFIG_SLOB
>  extern struct kmem_cache *
>  kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1];
>  
> +/*
> + * Define gfp bits that should not be set for KMALLOC_NORMAL.
> + */
> +#define KMALLOC_NOT_NORMAL_BITS					\
> +	(__GFP_RECLAIMABLE |					\
> +	(IS_ENABLED(CONFIG_ZONE_DMA)   ? __GFP_DMA : 0) |	\
> +	(IS_ENABLED(CONFIG_MEMCG_KMEM) ? __GFP_ACCOUNT : 0))
> +
>  static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags)
>  {
> -#ifdef CONFIG_ZONE_DMA
>  	/*
>  	 * The most common case is KMALLOC_NORMAL, so test for it
>  	 * with a single branch for both flags.
>  	 */
> -	if (likely((flags & (__GFP_DMA | __GFP_RECLAIMABLE)) == 0))
> +	if (likely((flags & KMALLOC_NOT_NORMAL_BITS) == 0))
>  		return KMALLOC_NORMAL;

Likely KMALLOC_CGROUP is also very popular, so maybe we want to change the
optimization here a bit.

>  
>  	/*
> -	 * At least one of the flags has to be set. If both are, __GFP_DMA
> -	 * is more important.
> +	 * At least one of the flags has to be set. Their priorities in
> +	 * decreasing order are:
> +	 *  1) __GFP_DMA
> +	 *  2) __GFP_RECLAIMABLE
> +	 *  3) __GFP_ACCOUNT
>  	 */
> -	return flags & __GFP_DMA ? KMALLOC_DMA : KMALLOC_RECLAIM;
> -#else
> -	return flags & __GFP_RECLAIMABLE ? KMALLOC_RECLAIM : KMALLOC_NORMAL;
> -#endif
> +	if (IS_ENABLED(CONFIG_ZONE_DMA) && (flags & __GFP_DMA))
> +		return KMALLOC_DMA;
> +	if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || (flags & __GFP_RECLAIMABLE))
> +		return KMALLOC_RECLAIM;
> +	else
> +		return KMALLOC_CGROUP;
>  }
>  
>  /*
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index f8833d3e5d47..d750e3ba7af5 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -727,21 +727,25 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
>  }
>  
>  #ifdef CONFIG_ZONE_DMA
> -#define INIT_KMALLOC_INFO(__size, __short_size)			\
> -{								\
> -	.name[KMALLOC_NORMAL]  = "kmalloc-" #__short_size,	\
> -	.name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size,	\
> -	.name[KMALLOC_DMA]     = "dma-kmalloc-" #__short_size,	\
> -	.size = __size,						\
> -}
> +#define KMALLOC_DMA_NAME(sz)	.name[KMALLOC_DMA] = "dma-kmalloc-" #sz,
> +#else
> +#define KMALLOC_DMA_NAME(sz)
> +#endif
> +
> +#ifdef CONFIG_MEMCG_KMEM
> +#define KMALLOC_CGROUP_NAME(sz)	.name[KMALLOC_CGROUP] = "kmalloc-cg-" #sz,
>  #else
> +#define KMALLOC_CGROUP_NAME(sz)
> +#endif
> +
>  #define INIT_KMALLOC_INFO(__size, __short_size)			\
>  {								\
>  	.name[KMALLOC_NORMAL]  = "kmalloc-" #__short_size,	\
>  	.name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size,	\
> +	KMALLOC_CGROUP_NAME(__short_size)			\
> +	KMALLOC_DMA_NAME(__short_size)				\
>  	.size = __size,						\
>  }
> -#endif
>  
>  /*
>   * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
> @@ -847,6 +851,9 @@ void __init create_kmalloc_caches(slab_flags_t flags)
>  	int i;
>  	enum kmalloc_cache_type type;
>  
> +	/*
> +	 * Including KMALLOC_CGROUP if CONFIG_MEMCG_KMEM defined
> +	 */
>  	for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) {
>  		for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
>  			if (!kmalloc_caches[type][i])
> -- 
> 2.18.1
>
Vlastimil Babka May 5, 2021, 6:02 p.m. UTC | #6
On 5/5/21 7:30 PM, Roman Gushchin wrote:
> On Wed, May 05, 2021 at 11:46:13AM -0400, Waiman Long wrote:
>> 
>> With this change, all the objcg pointer array objects will come from
>> KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So
>> both the recursive kfree() problem and non-freeable slab problem are
>> gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer
>> have mixed accounted and unaccounted objects, this will slightly reduce
>> the number of objcg pointer arrays that need to be allocated and save
>> a bit of memory.
> 
> Unfortunately the positive effect of this change will be likely
> reversed by a lower utilization due to a larger number of caches.
> 
> Btw, I wonder if we also need a change in the slab caches merging procedure?
> KMALLOC_NORMAL caches should not be merged with caches which can potentially
> include accounted objects.

Good point. But looks like kmalloc* caches are exempt from all merging in
create_boot_cache() via

	s->refcount = -1;       /* Exempt from merging for now */

It wouldn't hurt though to create the kmalloc-cg-* caches with SLAB_ACCOUNT flag
to prevent accidental merging in case the above is ever removed. It would also
better reflect reality, and ensure that the array is allocated immediately with
the page, AFAICS.
Waiman Long May 5, 2021, 6:11 p.m. UTC | #7
On 5/5/21 1:30 PM, Roman Gushchin wrote:
> On Wed, May 05, 2021 at 11:46:13AM -0400, Waiman Long wrote:
>> There are currently two problems in the way the objcg pointer array
>> (memcg_data) in the page structure is being allocated and freed.
>>
>> On its allocation, it is possible that the allocated objcg pointer
>> array comes from the same slab that requires memory accounting. If this
>> happens, the slab will never become empty again as there is at least
>> one object left (the obj_cgroup array) in the slab.
>>
>> When it is freed, the objcg pointer array object may be the last one
>> in its slab and hence causes kfree() to be called again. With the
>> right workload, the slab cache may be set up in a way that allows the
>> recursive kfree() calling loop to nest deep enough to cause a kernel
>> stack overflow and panic the system.
>>
>> One way to solve this problem is to split the kmalloc-<n> caches
>> (KMALLOC_NORMAL) into two separate sets - a new set of kmalloc-<n>
>> (KMALLOC_NORMAL) caches for non-accounted objects only and a new set of
>> kmalloc-cg-<n> (KMALLOC_CGROUP) caches for accounted objects only. All
>> the other caches can still allow a mix of accounted and non-accounted
>> objects.
> I agree that it's likely the best approach here. Thanks for discovering
> and fixing the problem!
>
>> With this change, all the objcg pointer array objects will come from
>> KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So
>> both the recursive kfree() problem and non-freeable slab problem are
>> gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer
>> have mixed accounted and unaccounted objects, this will slightly reduce
>> the number of objcg pointer arrays that need to be allocated and save
>> a bit of memory.
> Unfortunately the positive effect of this change will be likely
> reversed by a lower utilization due to a larger number of caches.

That is also true, will mention that.

>
> Btw, I wonder if we also need a change in the slab caches merging procedure?
> KMALLOC_NORMAL caches should not be merged with caches which can potentially
> include accounted objects.

Thanks for catching this omission.

I will take a look and modify the merging procedure in a new patch.
Accounting is usually specified at kmem_cache_create() time. Though I
did find one instance of setting the ACCOUNT flag in kmem_cache_alloc(),
I will ignore this case and merge accounted but unreclaimable caches
into KMALLOC_CGROUP.

>
>> The new KMALLOC_CGROUP is added between KMALLOC_NORMAL and
>> KMALLOC_RECLAIM so that the first for loop in create_kmalloc_caches()
>> will include the newly added caches without change.
>>
>> Suggested-by: Vlastimil Babka <vbabka@suse.cz>
>> Signed-off-by: Waiman Long <longman@redhat.com>
>> ---
>>   include/linux/slab.h | 42 ++++++++++++++++++++++++++++++++++--------
>>   mm/slab_common.c     | 23 +++++++++++++++--------
>>   2 files changed, 49 insertions(+), 16 deletions(-)
>>
>> diff --git a/include/linux/slab.h b/include/linux/slab.h
>> index 0c97d788762c..f2d9ebc34f5c 100644
>> --- a/include/linux/slab.h
>> +++ b/include/linux/slab.h
>> @@ -305,9 +305,16 @@ static inline void __check_heap_object(const void *ptr, unsigned long n,
>>   /*
>>    * Whenever changing this, take care of that kmalloc_type() and
>>    * create_kmalloc_caches() still work as intended.
>> + *
>> + * KMALLOC_NORMAL is for non-accounted objects only whereas KMALLOC_CGROUP
>> + * is for accounted objects only. All the other kmem caches can have both
>> + * accounted and non-accounted objects.
>>    */
>>   enum kmalloc_cache_type {
>>   	KMALLOC_NORMAL = 0,
>> +#ifdef CONFIG_MEMCG_KMEM
>> +	KMALLOC_CGROUP,
>> +#endif
>>   	KMALLOC_RECLAIM,
>>   #ifdef CONFIG_ZONE_DMA
>>   	KMALLOC_DMA,
>> @@ -315,28 +322,47 @@ enum kmalloc_cache_type {
>>   	NR_KMALLOC_TYPES
>>   };
>>   
>> +#ifndef CONFIG_MEMCG_KMEM
>> +#define KMALLOC_CGROUP	KMALLOC_NORMAL
>> +#endif
>> +#ifndef CONFIG_ZONE_DMA
>> +#define KMALLOC_DMA	KMALLOC_NORMAL
>> +#endif
>> +
>>   #ifndef CONFIG_SLOB
>>   extern struct kmem_cache *
>>   kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1];
>>   
>> +/*
>> + * Define gfp bits that should not be set for KMALLOC_NORMAL.
>> + */
>> +#define KMALLOC_NOT_NORMAL_BITS					\
>> +	(__GFP_RECLAIMABLE |					\
>> +	(IS_ENABLED(CONFIG_ZONE_DMA)   ? __GFP_DMA : 0) |	\
>> +	(IS_ENABLED(CONFIG_MEMCG_KMEM) ? __GFP_ACCOUNT : 0))
>> +
>>   static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags)
>>   {
>> -#ifdef CONFIG_ZONE_DMA
>>   	/*
>>   	 * The most common case is KMALLOC_NORMAL, so test for it
>>   	 * with a single branch for both flags.
>>   	 */
>> -	if (likely((flags & (__GFP_DMA | __GFP_RECLAIMABLE)) == 0))
>> +	if (likely((flags & KMALLOC_NOT_NORMAL_BITS) == 0))
>>   		return KMALLOC_NORMAL;
> Likely KMALLOC_CGROUP is also very popular, so maybe we want to change the
> optimization here a bit.

I doubt this optimization is really noticeable and whether 
KMALLOC_CGROUP is really popular will depend on the workloads. I am not 
planning to spend additional time to micro-optimize this part of the code.

Cheers,
Longman
Roman Gushchin May 5, 2021, 6:18 p.m. UTC | #8
On Wed, May 05, 2021 at 08:02:06PM +0200, Vlastimil Babka wrote:
> On 5/5/21 7:30 PM, Roman Gushchin wrote:
> > On Wed, May 05, 2021 at 11:46:13AM -0400, Waiman Long wrote:
> >> 
> >> With this change, all the objcg pointer array objects will come from
> >> KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So
> >> both the recursive kfree() problem and non-freeable slab problem are
> >> gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer
> >> have mixed accounted and unaccounted objects, this will slightly reduce
> >> the number of objcg pointer arrays that need to be allocated and save
> >> a bit of memory.
> > 
> > Unfortunately the positive effect of this change will be likely
> > reversed by a lower utilization due to a larger number of caches.
> > 
> > Btw, I wonder if we also need a change in the slab caches merging procedure?
> > KMALLOC_NORMAL caches should not be merged with caches which can potentially
> > include accounted objects.
> 
> Good point. But looks like kmalloc* caches are extempt from all merging in
> create_boot_cache() via
> 
> 	s->refcount = -1;       /* Exempt from merging for now */

Oh, interesting... I wonder if there is (still) a good reason for that? Maybe
we can remove this limitation and save some memory?

> 
> It wouldn't hurt though to create the kmalloc-cg-* caches with SLAB_ACCOUNT flag
> to prevent accidental merging in case the above is ever removed. It would also
> better reflect reality, and ensure that the array is allocated immediately with
> the page, AFAICS.

That wouldn't be enough, because a !SLAB_ACCOUNT cache can still have accounted
allocations and be merged with kmalloc-* cache. What we might wanna do is to
keep the no-merging rule for kmalloc-*, but relax it for kmalloc-cg-* caches.

But we can do it later, as a separate change.

Thanks!
Roman Gushchin May 5, 2021, 6:22 p.m. UTC | #9
On Wed, May 05, 2021 at 02:11:52PM -0400, Waiman Long wrote:
> On 5/5/21 1:30 PM, Roman Gushchin wrote:
> > On Wed, May 05, 2021 at 11:46:13AM -0400, Waiman Long wrote:
> > > There are currently two problems in the way the objcg pointer array
> > > (memcg_data) in the page structure is being allocated and freed.
> > > 
> > > On its allocation, it is possible that the allocated objcg pointer
> > > array comes from the same slab that requires memory accounting. If this
> > > happens, the slab will never become empty again as there is at least
> > > one object left (the obj_cgroup array) in the slab.
> > > 
> > > When it is freed, the objcg pointer array object may be the last one
> > > in its slab and hence causes kfree() to be called again. With the
> > > right workload, the slab cache may be set up in a way that allows the
> > > recursive kfree() calling loop to nest deep enough to cause a kernel
> > > stack overflow and panic the system.
> > > 
> > > One way to solve this problem is to split the kmalloc-<n> caches
> > > (KMALLOC_NORMAL) into two separate sets - a new set of kmalloc-<n>
> > > (KMALLOC_NORMAL) caches for non-accounted objects only and a new set of
> > > kmalloc-cg-<n> (KMALLOC_CGROUP) caches for accounted objects only. All
> > > the other caches can still allow a mix of accounted and non-accounted
> > > objects.
> > I agree that it's likely the best approach here. Thanks for discovering
> > and fixing the problem!
> > 
> > > With this change, all the objcg pointer array objects will come from
> > > KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So
> > > both the recursive kfree() problem and non-freeable slab problem are
> > > gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer
> > > have mixed accounted and unaccounted objects, this will slightly reduce
> > > the number of objcg pointer arrays that need to be allocated and save
> > > a bit of memory.
> > Unfortunately the positive effect of this change will be likely
> > reversed by a lower utilization due to a larger number of caches.
> 
> That is also true, will mention that.

Thanks!

> 
> > 
> > Btw, I wonder if we also need a change in the slab caches merging procedure?
> > KMALLOC_NORMAL caches should not be merged with caches which can potentially
> > include accounted objects.
> 
> Thank for catching this omission.
> 
> I will take a look and modify the merging procedure in a new patch.
> Accounting is usually specified at kmem_cache_create() time. Though, I did
> find one instance of setting ACCOUNT flag in kmem_cache_alloc(), I will
> ignore this case and merge accounted, but unreclaimable caches to
> KMALLOC_CGROUP.

Vlastimil pointed out that it's not an actual problem, because kmalloc
caches are exempt from the merging. Please, add a comment about it into
the commit log/code. We might wanna relax this rule for kmalloc-cg-*, but
we can do it later.

> 
> > 
> > > The new KMALLOC_CGROUP is added between KMALLOC_NORMAL and
> > > KMALLOC_RECLAIM so that the first for loop in create_kmalloc_caches()
> > > will include the newly added caches without change.
> > > 
> > > Suggested-by: Vlastimil Babka <vbabka@suse.cz>
> > > Signed-off-by: Waiman Long <longman@redhat.com>
> > > ---
> > >   include/linux/slab.h | 42 ++++++++++++++++++++++++++++++++++--------
> > >   mm/slab_common.c     | 23 +++++++++++++++--------
> > >   2 files changed, 49 insertions(+), 16 deletions(-)
> > > 
> > > diff --git a/include/linux/slab.h b/include/linux/slab.h
> > > index 0c97d788762c..f2d9ebc34f5c 100644
> > > --- a/include/linux/slab.h
> > > +++ b/include/linux/slab.h
> > > @@ -305,9 +305,16 @@ static inline void __check_heap_object(const void *ptr, unsigned long n,
> > >   /*
> > >    * Whenever changing this, take care of that kmalloc_type() and
> > >    * create_kmalloc_caches() still work as intended.
> > > + *
> > > + * KMALLOC_NORMAL is for non-accounted objects only whereas KMALLOC_CGROUP
> > > + * is for accounted objects only. All the other kmem caches can have both
> > > + * accounted and non-accounted objects.
> > >    */
> > >   enum kmalloc_cache_type {
> > >   	KMALLOC_NORMAL = 0,
> > > +#ifdef CONFIG_MEMCG_KMEM
> > > +	KMALLOC_CGROUP,
> > > +#endif
> > >   	KMALLOC_RECLAIM,
> > >   #ifdef CONFIG_ZONE_DMA
> > >   	KMALLOC_DMA,
> > > @@ -315,28 +322,47 @@ enum kmalloc_cache_type {
> > >   	NR_KMALLOC_TYPES
> > >   };
> > > +#ifndef CONFIG_MEMCG_KMEM
> > > +#define KMALLOC_CGROUP	KMALLOC_NORMAL
> > > +#endif
> > > +#ifndef CONFIG_ZONE_DMA
> > > +#define KMALLOC_DMA	KMALLOC_NORMAL
> > > +#endif
> > > +
> > >   #ifndef CONFIG_SLOB
> > >   extern struct kmem_cache *
> > >   kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1];
> > > +/*
> > > + * Define gfp bits that should not be set for KMALLOC_NORMAL.
> > > + */
> > > +#define KMALLOC_NOT_NORMAL_BITS					\
> > > +	(__GFP_RECLAIMABLE |					\
> > > +	(IS_ENABLED(CONFIG_ZONE_DMA)   ? __GFP_DMA : 0) |	\
> > > +	(IS_ENABLED(CONFIG_MEMCG_KMEM) ? __GFP_ACCOUNT : 0))
> > > +
> > >   static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags)
> > >   {
> > > -#ifdef CONFIG_ZONE_DMA
> > >   	/*
> > >   	 * The most common case is KMALLOC_NORMAL, so test for it
> > >   	 * with a single branch for both flags.
> > >   	 */
> > > -	if (likely((flags & (__GFP_DMA | __GFP_RECLAIMABLE)) == 0))
> > > +	if (likely((flags & KMALLOC_NOT_NORMAL_BITS) == 0))
> > >   		return KMALLOC_NORMAL;
> > Likely KMALLOC_CGROUP is also very popular, so maybe we want to change the
> > optimization here a bit.
> 
> I doubt this optimization is really noticeable and whether KMALLOC_CGROUP is
> really popular will depend on the workloads. I am not planning to spend
> additional time to micro-optimize this part of the code.

Ok.

Thanks!
Waiman Long May 5, 2021, 6:31 p.m. UTC | #10
On 5/5/21 2:02 PM, Vlastimil Babka wrote:
> On 5/5/21 7:30 PM, Roman Gushchin wrote:
>> On Wed, May 05, 2021 at 11:46:13AM -0400, Waiman Long wrote:
>>> With this change, all the objcg pointer array objects will come from
>>> KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So
>>> both the recursive kfree() problem and non-freeable slab problem are
>>> gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer
>>> have mixed accounted and unaccounted objects, this will slightly reduce
>>> the number of objcg pointer arrays that need to be allocated and save
>>> a bit of memory.
>> Unfortunately the positive effect of this change will be likely
>> reversed by a lower utilization due to a larger number of caches.
>>
>> Btw, I wonder if we also need a change in the slab caches merging procedure?
>> KMALLOC_NORMAL caches should not be merged with caches which can potentially
>> include accounted objects.
> Good point. But looks like kmalloc* caches are extempt from all merging in
> create_boot_cache() via
>
> 	s->refcount = -1;       /* Exempt from merging for now */
>
> It wouldn't hurt though to create the kmalloc-cg-* caches with SLAB_ACCOUNT flag
> to prevent accidental merging in case the above is ever removed. It would also
> better reflect reality, and ensure that the array is allocated immediately with
> the page, AFAICS.
>
I am not sure if this is really true.

struct kmem_cache *__init create_kmalloc_cache(const char *name,
                 unsigned int size, slab_flags_t flags,
                 unsigned int useroffset, unsigned int usersize)
{
         struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);

         if (!s)
                 panic("Out of memory when creating slab %s\n", name);

         create_boot_cache(s, name, size, flags, useroffset, usersize);
         kasan_cache_create_kmalloc(s);
         list_add(&s->list, &slab_caches);
         s->refcount = 1;
         return s;
}

Even though refcount is set to -1 initially, it is set back to 1 
afterward. So merging can still happen AFAICS.

Cheers,
Longman
Roman Gushchin May 5, 2021, 6:32 p.m. UTC | #11
On Wed, May 05, 2021 at 08:02:06PM +0200, Vlastimil Babka wrote:
> On 5/5/21 7:30 PM, Roman Gushchin wrote:
> > On Wed, May 05, 2021 at 11:46:13AM -0400, Waiman Long wrote:
> >> 
> >> With this change, all the objcg pointer array objects will come from
> >> KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So
> >> both the recursive kfree() problem and non-freeable slab problem are
> >> gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer
> >> have mixed accounted and unaccounted objects, this will slightly reduce
> >> the number of objcg pointer arrays that need to be allocated and save
> >> a bit of memory.
> > 
> > Unfortunately the positive effect of this change will be likely
> > reversed by a lower utilization due to a larger number of caches.
> > 
> > Btw, I wonder if we also need a change in the slab caches merging procedure?
> > KMALLOC_NORMAL caches should not be merged with caches which can potentially
> > include accounted objects.
> 
> Good point. But looks like kmalloc* caches are extempt from all merging in
> create_boot_cache() via
> 
> 	s->refcount = -1;       /* Exempt from merging for now */

Wait, s->refcount is adjusted to 1 in create_kmalloc_cache() after calling
into create_boot_cache?

It means they are not exempt actually.
Roman Gushchin May 5, 2021, 6:38 p.m. UTC | #12
On Wed, May 05, 2021 at 02:31:28PM -0400, Waiman Long wrote:
> On 5/5/21 2:02 PM, Vlastimil Babka wrote:
> > On 5/5/21 7:30 PM, Roman Gushchin wrote:
> > > On Wed, May 05, 2021 at 11:46:13AM -0400, Waiman Long wrote:
> > > > With this change, all the objcg pointer array objects will come from
> > > > KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So
> > > > both the recursive kfree() problem and non-freeable slab problem are
> > > > gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer
> > > > have mixed accounted and unaccounted objects, this will slightly reduce
> > > > the number of objcg pointer arrays that need to be allocated and save
> > > > a bit of memory.
> > > Unfortunately the positive effect of this change will be likely
> > > reversed by a lower utilization due to a larger number of caches.
> > > 
> > > Btw, I wonder if we also need a change in the slab caches merging procedure?
> > > KMALLOC_NORMAL caches should not be merged with caches which can potentially
> > > include accounted objects.
> > Good point. But looks like kmalloc* caches are extempt from all merging in
> > create_boot_cache() via
> > 
> > 	s->refcount = -1;       /* Exempt from merging for now */
> > 
> > It wouldn't hurt though to create the kmalloc-cg-* caches with SLAB_ACCOUNT flag
> > to prevent accidental merging in case the above is ever removed. It would also
> > better reflect reality, and ensure that the array is allocated immediately with
> > the page, AFAICS.
> > 
> I am not sure if this is really true.
> 
> struct kmem_cache *__init create_kmalloc_cache(const char *name,
>                 unsigned int size, slab_flags_t flags,
>                 unsigned int useroffset, unsigned int usersize)
> {
>         struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
> 
>         if (!s)
>                 panic("Out of memory when creating slab %s\n", name);
> 
>         create_boot_cache(s, name, size, flags, useroffset, usersize);
>         kasan_cache_create_kmalloc(s);
>         list_add(&s->list, &slab_caches);
>         s->refcount = 1;
>         return s;
> }
> 
> Even though refcount is set to -1 initially, it is set back to 1 afterward.
> So merging can still happen AFAICS.

Right, thanks, I already noticed it. Then yeah, we should make sure we're not
merging KMALLOC_NORMAL caches with any others.
Waiman Long May 5, 2021, 6:54 p.m. UTC | #13
On 5/5/21 2:11 PM, Waiman Long wrote:
> On 5/5/21 1:30 PM, Roman Gushchin wrote:
>
>>
>> Btw, I wonder if we also need a change in the slab caches merging 
>> procedure?
>> KMALLOC_NORMAL caches should not be merged with caches which can 
>> potentially
>> include accounted objects.
>
> Thank for catching this omission.
>
> I will take a look and modify the merging procedure in a new patch. 
> Accounting is usually specified at kmem_cache_create() time. Though, I 
> did find one instance of setting ACCOUNT flag in kmem_cache_alloc(), I 
> will ignore this case and merge accounted, but unreclaimable caches to 
> KMALLOC_CGROUP. 

In mm/slab_common.c:

#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
                          SLAB_CACHE_DMA32 | SLAB_ACCOUNT)

struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
   :
                 if ((flags & SLAB_MERGE_SAME) != (s->flags & 
SLAB_MERGE_SAME))
                         continue;

By making sure kmalloc-cg-* has the SLAB_ACCOUNT bit set, a kmemcache 
created with SLAB_ACCOUNT may merge with kmalloc-cg-* whereas one 
without SLAB_ACCOUNT may merge with kmalloc-* for now. So the current 
code should work fine for most cases. Though, if the ACCOUNT flag is set 
at kmem_cache_alloc() and the cache happens to be merged into kmalloc-*, 
we will have the rare case that an objcg pointer array may have to be 
added to a kmalloc-* cache. However, this is not a common practice, and 
the three cases (not one, sorry) that I found so far are in

arch/x86/kvm/x86.c:     ctxt = kmem_cache_zalloc(x86_emulator_cache, 
GFP_KERNEL_ACCOUNT);
fs/hostfs/hostfs_kern.c:        hi = 
kmem_cache_alloc(hostfs_inode_cache, GFP_KERNEL_ACCOUNT);
virt/kvm/kvm_main.c:    vcpu = kmem_cache_zalloc(kvm_vcpu_cache, 
GFP_KERNEL_ACCOUNT);

We will have to advise against doing that.

Cheers,
Longman
Waiman Long May 5, 2021, 6:56 p.m. UTC | #14
On 5/5/21 2:38 PM, Roman Gushchin wrote:
> On Wed, May 05, 2021 at 02:31:28PM -0400, Waiman Long wrote:
>> On 5/5/21 2:02 PM, Vlastimil Babka wrote:
>>> On 5/5/21 7:30 PM, Roman Gushchin wrote:
>>>> On Wed, May 05, 2021 at 11:46:13AM -0400, Waiman Long wrote:
>>>>> With this change, all the objcg pointer array objects will come from
>>>>> KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So
>>>>> both the recursive kfree() problem and non-freeable slab problem are
>>>>> gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer
>>>>> have mixed accounted and unaccounted objects, this will slightly reduce
>>>>> the number of objcg pointer arrays that need to be allocated and save
>>>>> a bit of memory.
>>>> Unfortunately the positive effect of this change will be likely
>>>> reversed by a lower utilization due to a larger number of caches.
>>>>
>>>> Btw, I wonder if we also need a change in the slab caches merging procedure?
>>>> KMALLOC_NORMAL caches should not be merged with caches which can potentially
>>>> include accounted objects.
>>> Good point. But looks like kmalloc* caches are extempt from all merging in
>>> create_boot_cache() via
>>>
>>> 	s->refcount = -1;       /* Exempt from merging for now */
>>>
>>> It wouldn't hurt though to create the kmalloc-cg-* caches with SLAB_ACCOUNT flag
>>> to prevent accidental merging in case the above is ever removed. It would also
>>> better reflect reality, and ensure that the array is allocated immediately with
>>> the page, AFAICS.
>>>
>> I am not sure if this is really true.
>>
>> struct kmem_cache *__init create_kmalloc_cache(const char *name,
>>                  unsigned int size, slab_flags_t flags,
>>                  unsigned int useroffset, unsigned int usersize)
>> {
>>          struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
>>
>>          if (!s)
>>                  panic("Out of memory when creating slab %s\n", name);
>>
>>          create_boot_cache(s, name, size, flags, useroffset, usersize);
>>          kasan_cache_create_kmalloc(s);
>>          list_add(&s->list, &slab_caches);
>>          s->refcount = 1;
>>          return s;
>> }
>>
>> Even though refcount is set to -1 initially, it is set back to 1 afterward.
>> So merging can still happen AFAICS.
> Right, thanks, I already noticed it. Then yeah, we should make sure we're not
> merging KMALLOC_NORMAL caches with any others.
>
That should be easy. We just set the refcount to -1 for the 
KMALLOC_NORMAL caches right after their creation then.

Cheers,
Longman
Vlastimil Babka May 5, 2021, 9:29 p.m. UTC | #15
On 5/5/21 8:32 PM, Roman Gushchin wrote:
> On Wed, May 05, 2021 at 08:02:06PM +0200, Vlastimil Babka wrote:
>> On 5/5/21 7:30 PM, Roman Gushchin wrote:
>> > On Wed, May 05, 2021 at 11:46:13AM -0400, Waiman Long wrote:
>> >> 
>> >> With this change, all the objcg pointer array objects will come from
>> >> KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So
>> >> both the recursive kfree() problem and non-freeable slab problem are
>> >> gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer
>> >> have mixed accounted and unaccounted objects, this will slightly reduce
>> >> the number of objcg pointer arrays that need to be allocated and save
>> >> a bit of memory.
>> > 
>> > Unfortunately the positive effect of this change will be likely
>> > reversed by a lower utilization due to a larger number of caches.
>> > 
>> > Btw, I wonder if we also need a change in the slab caches merging procedure?
>> > KMALLOC_NORMAL caches should not be merged with caches which can potentially
>> > include accounted objects.
>> 
>> Good point. But looks like kmalloc* caches are extempt from all merging in
>> create_boot_cache() via
>> 
>> 	s->refcount = -1;       /* Exempt from merging for now */
> 
> Wait, s->refcount is adjusted to 1 in create_kmalloc_cache() after calling
> into create_boot_cache?

Hmm I missed that

Now I wonder why all kmalloc caches on my system have 0 aliases :)
cat /sys/kernel/slab/kmalloc-*/aliases


> It means they are not exempt actually.
>
Roman Gushchin May 5, 2021, 10:19 p.m. UTC | #16
On Wed, May 05, 2021 at 11:29:54PM +0200, Vlastimil Babka wrote:
> On 5/5/21 8:32 PM, Roman Gushchin wrote:
> > On Wed, May 05, 2021 at 08:02:06PM +0200, Vlastimil Babka wrote:
> >> On 5/5/21 7:30 PM, Roman Gushchin wrote:
> >> > On Wed, May 05, 2021 at 11:46:13AM -0400, Waiman Long wrote:
> >> >> 
> >> >> With this change, all the objcg pointer array objects will come from
> >> >> KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So
> >> >> both the recursive kfree() problem and non-freeable slab problem are
> >> >> gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer
> >> >> have mixed accounted and unaccounted objects, this will slightly reduce
> >> >> the number of objcg pointer arrays that need to be allocated and save
> >> >> a bit of memory.
> >> > 
> >> > Unfortunately the positive effect of this change will be likely
> >> > reversed by a lower utilization due to a larger number of caches.
> >> > 
> >> > Btw, I wonder if we also need a change in the slab caches merging procedure?
> >> > KMALLOC_NORMAL caches should not be merged with caches which can potentially
> >> > include accounted objects.
> >> 
> >> Good point. But looks like kmalloc* caches are extempt from all merging in
> >> create_boot_cache() via
> >> 
> >> 	s->refcount = -1;       /* Exempt from merging for now */
> > 
> > Wait, s->refcount is adjusted to 1 in create_kmalloc_cache() after calling
> > into create_boot_cache?
> 
> Hmm I missed that
> 
> Now I wonder why all kmalloc caches on my system have 0 aliases :)
> cat /sys/kernel/slab/kmalloc-*/aliases

Yeah, I noticed it too, it's a good question. And I remember a case from
the past when it wasn't true (kmalloc-32 was shared with something else).
Waiman Long May 5, 2021, 11:06 p.m. UTC | #17
On 5/5/21 6:19 PM, Roman Gushchin wrote:
> On Wed, May 05, 2021 at 11:29:54PM +0200, Vlastimil Babka wrote:
>> On 5/5/21 8:32 PM, Roman Gushchin wrote:
>>> On Wed, May 05, 2021 at 08:02:06PM +0200, Vlastimil Babka wrote:
>>>> On 5/5/21 7:30 PM, Roman Gushchin wrote:
>>>>> On Wed, May 05, 2021 at 11:46:13AM -0400, Waiman Long wrote:
>>>>>> With this change, all the objcg pointer array objects will come from
>>>>>> KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So
>>>>>> both the recursive kfree() problem and non-freeable slab problem are
>>>>>> gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer
>>>>>> have mixed accounted and unaccounted objects, this will slightly reduce
>>>>>> the number of objcg pointer arrays that need to be allocated and save
>>>>>> a bit of memory.
>>>>> Unfortunately the positive effect of this change will be likely
>>>>> reversed by a lower utilization due to a larger number of caches.
>>>>>
>>>>> Btw, I wonder if we also need a change in the slab caches merging procedure?
>>>>> KMALLOC_NORMAL caches should not be merged with caches which can potentially
>>>>> include accounted objects.
>>>> Good point. But looks like kmalloc* caches are extempt from all merging in
>>>> create_boot_cache() via
>>>>
>>>> 	s->refcount = -1;       /* Exempt from merging for now */
>>> Wait, s->refcount is adjusted to 1 in create_kmalloc_cache() after calling
>>> into create_boot_cache?
>> Hmm I missed that
>>
>> Now I wonder why all kmalloc caches on my system have 0 aliases :)
>> cat /sys/kernel/slab/kmalloc-*/aliases
> Yeah, I noticed it too, it's a good question. And I remember a case from
> the past when it wasn't true (kmalloc-32 was shared with something else).
>
The criteria for cache merging require a close-to-exact match in all 
attributes, with a size difference of no more than sizeof(void *). So it 
is not easy to find a close match.

Cheers,
Longman

Patch
diff mbox series

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 0c97d788762c..f2d9ebc34f5c 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -305,9 +305,16 @@  static inline void __check_heap_object(const void *ptr, unsigned long n,
 /*
  * Whenever changing this, take care of that kmalloc_type() and
  * create_kmalloc_caches() still work as intended.
+ *
+ * KMALLOC_NORMAL is for non-accounted objects only whereas KMALLOC_CGROUP
+ * is for accounted objects only. All the other kmem caches can have both
+ * accounted and non-accounted objects.
  */
 enum kmalloc_cache_type {
 	KMALLOC_NORMAL = 0,
+#ifdef CONFIG_MEMCG_KMEM
+	KMALLOC_CGROUP,
+#endif
 	KMALLOC_RECLAIM,
 #ifdef CONFIG_ZONE_DMA
 	KMALLOC_DMA,
@@ -315,28 +322,47 @@  enum kmalloc_cache_type {
 	NR_KMALLOC_TYPES
 };
 
+#ifndef CONFIG_MEMCG_KMEM
+#define KMALLOC_CGROUP	KMALLOC_NORMAL
+#endif
+#ifndef CONFIG_ZONE_DMA
+#define KMALLOC_DMA	KMALLOC_NORMAL
+#endif
+
 #ifndef CONFIG_SLOB
 extern struct kmem_cache *
 kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1];
 
+/*
+ * Define gfp bits that should not be set for KMALLOC_NORMAL.
+ */
+#define KMALLOC_NOT_NORMAL_BITS					\
+	(__GFP_RECLAIMABLE |					\
+	(IS_ENABLED(CONFIG_ZONE_DMA)   ? __GFP_DMA : 0) |	\
+	(IS_ENABLED(CONFIG_MEMCG_KMEM) ? __GFP_ACCOUNT : 0))
+
 static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags)
 {
-#ifdef CONFIG_ZONE_DMA
 	/*
 	 * The most common case is KMALLOC_NORMAL, so test for it
 	 * with a single branch for both flags.
 	 */
-	if (likely((flags & (__GFP_DMA | __GFP_RECLAIMABLE)) == 0))
+	if (likely((flags & KMALLOC_NOT_NORMAL_BITS) == 0))
 		return KMALLOC_NORMAL;
 
 	/*
-	 * At least one of the flags has to be set. If both are, __GFP_DMA
-	 * is more important.
+	 * At least one of the flags has to be set. Their priorities in
+	 * decreasing order are:
+	 *  1) __GFP_DMA
+	 *  2) __GFP_RECLAIMABLE
+	 *  3) __GFP_ACCOUNT
 	 */
-	return flags & __GFP_DMA ? KMALLOC_DMA : KMALLOC_RECLAIM;
-#else
-	return flags & __GFP_RECLAIMABLE ? KMALLOC_RECLAIM : KMALLOC_NORMAL;
-#endif
+	if (IS_ENABLED(CONFIG_ZONE_DMA) && (flags & __GFP_DMA))
+		return KMALLOC_DMA;
+	if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || (flags & __GFP_RECLAIMABLE))
+		return KMALLOC_RECLAIM;
+	else
+		return KMALLOC_CGROUP;
 }
 
 /*
diff --git a/mm/slab_common.c b/mm/slab_common.c
index f8833d3e5d47..d750e3ba7af5 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -727,21 +727,25 @@  struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
 }
 
 #ifdef CONFIG_ZONE_DMA
-#define INIT_KMALLOC_INFO(__size, __short_size)			\
-{								\
-	.name[KMALLOC_NORMAL]  = "kmalloc-" #__short_size,	\
-	.name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size,	\
-	.name[KMALLOC_DMA]     = "dma-kmalloc-" #__short_size,	\
-	.size = __size,						\
-}
+#define KMALLOC_DMA_NAME(sz)	.name[KMALLOC_DMA] = "dma-kmalloc-" #sz,
+#else
+#define KMALLOC_DMA_NAME(sz)
+#endif
+
+#ifdef CONFIG_MEMCG_KMEM
+#define KMALLOC_CGROUP_NAME(sz)	.name[KMALLOC_CGROUP] = "kmalloc-cg-" #sz,
 #else
+#define KMALLOC_CGROUP_NAME(sz)
+#endif
+
 #define INIT_KMALLOC_INFO(__size, __short_size)			\
 {								\
 	.name[KMALLOC_NORMAL]  = "kmalloc-" #__short_size,	\
 	.name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size,	\
+	KMALLOC_CGROUP_NAME(__short_size)			\
+	KMALLOC_DMA_NAME(__short_size)				\
 	.size = __size,						\
 }
-#endif
 
 /*
  * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
@@ -847,6 +851,9 @@  void __init create_kmalloc_caches(slab_flags_t flags)
 	int i;
 	enum kmalloc_cache_type type;
 
+	/*
+	 * Including KMALLOC_CGROUP if CONFIG_MEMCG_KMEM defined
+	 */
 	for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) {
 		for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
 			if (!kmalloc_caches[type][i])