* [PATCH] mm: use vmalloc fallback path for certain memcg allocations
@ 2016-12-01  1:16 Anatoly Stepanov
  2016-12-02  8:19 ` Alexey Lyashkov
  2016-12-02  8:44 ` Vlastimil Babka
  0 siblings, 2 replies; 14+ messages in thread
From: Anatoly Stepanov @ 2016-12-01  1:16 UTC (permalink / raw)
  To: linux-mm; +Cc: akpm, vdavydov.dev, astepanov, umka, panda, vmeshkov

The memcg array size can be up to:
sizeof(struct memcg_cache_array) + kmemcg_id * sizeof(void *);

where kmemcg_id can be up to MEMCG_CACHES_MAX_SIZE.

When the memcg instance count is large enough, this can lead
to high-order allocations of up to order 7.

The same applies to the memcg_lrus allocations.
So let's work this around by utilizing a vmalloc fallback path.

Signed-off-by: Anatoly Stepanov <astepanov@cloudlinux.com>
---
 include/linux/memcontrol.h | 16 ++++++++++++++++
 mm/list_lru.c              | 14 +++++++-------
 mm/slab_common.c           | 21 ++++++++++++++-------
 3 files changed, 37 insertions(+), 14 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 61d20c1..a281622 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -29,6 +29,9 @@
 #include <linux/mmzone.h>
 #include <linux/writeback.h>
 #include <linux/page-flags.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
 
 struct mem_cgroup;
 struct page;
@@ -878,4 +881,17 @@ static inline void memcg_kmem_update_page_stat(struct page *page,
 }
 #endif /* CONFIG_MEMCG && !CONFIG_SLOB */
 
+static inline void memcg_free(const void *ptr)
+{
+	is_vmalloc_addr(ptr) ? vfree(ptr) : kfree(ptr);
+}
+
+static inline void *memcg_alloc(size_t size)
+{
+	if (likely(size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)))
+		return kzalloc(size, GFP_KERNEL|__GFP_NORETRY);
+
+	return vzalloc(size);
+}
+
 #endif /* _LINUX_MEMCONTROL_H */
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 234676e..8f49339 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -327,12 +327,12 @@ static int memcg_init_list_lru_node(struct list_lru_node *nlru)
 {
 	int size = memcg_nr_cache_ids;
 
-	nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL);
-	if (!nlru->memcg_lrus)
+	nlru->memcg_lrus = memcg_alloc(size * sizeof(void *));
+	if (nlru->memcg_lrus == NULL)
 		return -ENOMEM;
 
 	if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) {
-		kfree(nlru->memcg_lrus);
+		memcg_free(nlru->memcg_lrus);
 		return -ENOMEM;
 	}
 
@@ -342,7 +342,7 @@ static int memcg_init_list_lru_node(struct list_lru_node *nlru)
 static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
 {
 	__memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids);
-	kfree(nlru->memcg_lrus);
+	memcg_free(nlru->memcg_lrus);
 }
 
 static int memcg_update_list_lru_node(struct list_lru_node *nlru,
@@ -353,12 +353,12 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
 	BUG_ON(old_size > new_size);
 
 	old = nlru->memcg_lrus;
-	new = kmalloc(new_size * sizeof(void *), GFP_KERNEL);
+	new = memcg_alloc(new_size * sizeof(void *));
 	if (!new)
 		return -ENOMEM;
 
 	if (__memcg_init_list_lru_node(new, old_size, new_size)) {
-		kfree(new);
+		memcg_free(new);
 		return -ENOMEM;
 	}
 
@@ -375,7 +375,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
 	nlru->memcg_lrus = new;
 	spin_unlock_irq(&nlru->lock);
 
-	kfree(old);
+	memcg_free(old);
 	return 0;
 }
 
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 329b038..19f8cb5 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -157,9 +157,8 @@ static int init_memcg_params(struct kmem_cache *s,
 	if (!memcg_nr_cache_ids)
 		return 0;
 
-	arr = kzalloc(sizeof(struct memcg_cache_array) +
-		      memcg_nr_cache_ids * sizeof(void *),
-		      GFP_KERNEL);
+	arr = memcg_alloc(sizeof(struct memcg_cache_array) +
+			memcg_nr_cache_ids * sizeof(void *));
 	if (!arr)
 		return -ENOMEM;
 
@@ -170,7 +169,15 @@ static int init_memcg_params(struct kmem_cache *s,
 static void destroy_memcg_params(struct kmem_cache *s)
 {
 	if (is_root_cache(s))
-		kfree(rcu_access_pointer(s->memcg_params.memcg_caches));
+		memcg_free(rcu_access_pointer(s->memcg_params.memcg_caches));
+}
+
+static void memcg_rcu_free(struct rcu_head *rcu)
+{
+	struct memcg_cache_array *arr;
+
+	arr = container_of(rcu, struct memcg_cache_array, rcu);
+	memcg_free(arr);
 }
 
 static int update_memcg_params(struct kmem_cache *s, int new_array_size)
@@ -180,8 +187,8 @@ static int update_memcg_params(struct kmem_cache *s, int new_array_size)
 	if (!is_root_cache(s))
 		return 0;
 
-	new = kzalloc(sizeof(struct memcg_cache_array) +
-		      new_array_size * sizeof(void *), GFP_KERNEL);
+	new = memcg_alloc(sizeof(struct memcg_cache_array) +
+				new_array_size * sizeof(void *));
 	if (!new)
 		return -ENOMEM;
 
@@ -193,7 +200,7 @@ static int update_memcg_params(struct kmem_cache *s, int new_array_size)
 
 	rcu_assign_pointer(s->memcg_params.memcg_caches, new);
 	if (old)
-		kfree_rcu(old, rcu);
+		call_rcu(&old->rcu, memcg_rcu_free);
 	return 0;
 }
 
-- 
1.8.3.1


* Re: [PATCH] mm: use vmalloc fallback path for certain memcg allocations
  2016-12-02  9:19   ` Michal Hocko
@ 2016-12-02  6:54     ` Anatoly Stepanov
  2016-12-05  5:23       ` Michal Hocko
  0 siblings, 1 reply; 14+ messages in thread
From: Anatoly Stepanov @ 2016-12-02  6:54 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Vlastimil Babka, linux-mm, akpm, vdavydov.dev, umka, panda, vmeshkov

Alex, Vlastimil, Michal, thanks for your responses!

On Fri, Dec 02, 2016 at 10:19:33AM +0100, Michal Hocko wrote:
> Thanks for CCing me Vlastimil
> 
> On Fri 02-12-16 09:44:23, Vlastimil Babka wrote:
> > On 12/01/2016 02:16 AM, Anatoly Stepanov wrote:
> > > As memcg array size can be up to:
> > > sizeof(struct memcg_cache_array) + kmemcg_id * sizeof(void *);
> > > 
> > > where kmemcg_id can be up to MEMCG_CACHES_MAX_SIZE.
> > > 
> > > When a memcg instance count is large enough it can lead
> > > to high order allocations up to order 7.
> 
> This is definitely not nice and worth fixing! I am just wondering
> whether this is something you have encountered in the real life. Having
> thousands of memcgs sounds quite crazy^Wscary to me. I am not at all
> sure we are prepared for that and some controllers would have real
> issues with it AFAIR.

In our company we use a custom-made lightweight container technology; the thing is,
we can have up to several thousand containers on a server.
So those high-order allocations were observed on a real production workload.

> > > The same story with memcg_lrus allocations.
> > > So let's work this around by utilizing vmalloc fallback path.
> > > 
> > > Signed-off-by: Anatoly Stepanov <astepanov@cloudlinux.com>
> > > ---
> > >  include/linux/memcontrol.h | 16 ++++++++++++++++
> > >  mm/list_lru.c              | 14 +++++++-------
> > >  mm/slab_common.c           | 21 ++++++++++++++-------
> > >  3 files changed, 37 insertions(+), 14 deletions(-)
> > > 
> > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > > index 61d20c1..a281622 100644
> > > --- a/include/linux/memcontrol.h
> > > +++ b/include/linux/memcontrol.h
> > > @@ -29,6 +29,9 @@
> > >  #include <linux/mmzone.h>
> > >  #include <linux/writeback.h>
> > >  #include <linux/page-flags.h>
> > > +#include <linux/vmalloc.h>
> > > +#include <linux/slab.h>
> > > +#include <linux/mm.h>
> > > 
> > >  struct mem_cgroup;
> > >  struct page;
> > > @@ -878,4 +881,17 @@ static inline void memcg_kmem_update_page_stat(struct page *page,
> > >  }
> > >  #endif /* CONFIG_MEMCG && !CONFIG_SLOB */
> > > 
> > > +static inline void memcg_free(const void *ptr)
> > > +{
> > > +	is_vmalloc_addr(ptr) ? vfree(ptr) : kfree(ptr);
> > > +}
> > > +
> > > +static inline void *memcg_alloc(size_t size)
> > > +{
> > > +	if (likely(size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)))
> 
> you are mixing two different units here...
> 
> > > +		return kzalloc(size, GFP_KERNEL|__GFP_NORETRY);
> > > +
> > > +	return vzalloc(size);
> > 
> > That's not how I imagine a "fallback" to work. You should be trying
> > kzalloc() and if that fails, call vzalloc(), not distinguish it by costly
> > order check. Also IIRC __GFP_NORETRY can easily fail even for non-costly
> > orders.
> 
> Completely agreed! This should be done simply by
> 	gfp_t gfp_mask = GFP_KERNEL;
> 	void *ret;
> 	
> 	/*
> 	 * Do not invoke OOM killer for larger requests as we can fall
> 	 * back to the vmalloc
> 	 */
> 	if (size > PAGE_SIZE)
> 		gfp_mask |= __GFP_NORETRY | __GFP_NOWARN;

I think we should check against PAGE_ALLOC_COSTLY_ORDER anyway,
as there's no big need to allocate large contiguous chunks here, while at the same time someone else
in the kernel might really need them.

> 
> 	ret = kzalloc(size, gfp_mask);
> 	if (ret)
> 		return ret;
> 	return vzalloc(size);
> 

> I also do not like memcg_alloc helper name. It suggests we are
> allocating a memcg while it is used for cache arrays and slab LRUS.
> Anyway this pattern is quite widespread in the kernel so I would simply
> suggest adding kvmalloc function instead.

Agreed, it would be nice to have a generic call.
I would suggest an implementation like this:

void *kvmalloc(size_t size)
{
	gfp_t gfp_mask = GFP_KERNEL;
	void *ret;

	if (size > PAGE_SIZE)
		gfp_mask |= __GFP_NORETRY | __GFP_NOWARN;

	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
		ret = kzalloc(size, gfp_mask);
		if (ret)
			return ret;
	}

	return vzalloc(size);
}
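
As a usage sketch (hypothetical, simply mirroring the list_lru hunk from the patch with the new helpers), the call sites would then become:

	nlru->memcg_lrus = kvmalloc(size * sizeof(void *));
	if (!nlru->memcg_lrus)
		return -ENOMEM;

	if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) {
		kvfree(nlru->memcg_lrus);
		return -ENOMEM;
	}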

> > 
> > > +}
> > > +
> > >  #endif /* _LINUX_MEMCONTROL_H */
> > > diff --git a/mm/list_lru.c b/mm/list_lru.c
> > > index 234676e..8f49339 100644
> > > --- a/mm/list_lru.c
> > > +++ b/mm/list_lru.c
> > > @@ -327,12 +327,12 @@ static int memcg_init_list_lru_node(struct list_lru_node *nlru)
> > >  {
> > >  	int size = memcg_nr_cache_ids;
> > > 
> > > -	nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL);
> > > -	if (!nlru->memcg_lrus)
> > > +	nlru->memcg_lrus = memcg_alloc(size * sizeof(void *));
> > > +	if (nlru->memcg_lrus == NULL)
> > >  		return -ENOMEM;
> > > 
> > >  	if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) {
> > > -		kfree(nlru->memcg_lrus);
> > > +		memcg_free(nlru->memcg_lrus);
> > >  		return -ENOMEM;
> > >  	}
> > > 
> > > @@ -342,7 +342,7 @@ static int memcg_init_list_lru_node(struct list_lru_node *nlru)
> > >  static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
> > >  {
> > >  	__memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids);
> > > -	kfree(nlru->memcg_lrus);
> > > +	memcg_free(nlru->memcg_lrus);
> > >  }
> > > 
> > >  static int memcg_update_list_lru_node(struct list_lru_node *nlru,
> > > @@ -353,12 +353,12 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
> > >  	BUG_ON(old_size > new_size);
> > > 
> > >  	old = nlru->memcg_lrus;
> > > -	new = kmalloc(new_size * sizeof(void *), GFP_KERNEL);
> > > +	new = memcg_alloc(new_size * sizeof(void *));
> > >  	if (!new)
> > >  		return -ENOMEM;
> > > 
> > >  	if (__memcg_init_list_lru_node(new, old_size, new_size)) {
> > > -		kfree(new);
> > > +		memcg_free(new);
> > >  		return -ENOMEM;
> > >  	}
> > > 
> > > @@ -375,7 +375,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
> > >  	nlru->memcg_lrus = new;
> > >  	spin_unlock_irq(&nlru->lock);
> > > 
> > > -	kfree(old);
> > > +	memcg_free(old);
> > >  	return 0;
> > >  }
> > > 
> > > diff --git a/mm/slab_common.c b/mm/slab_common.c
> > > index 329b038..19f8cb5 100644
> > > --- a/mm/slab_common.c
> > > +++ b/mm/slab_common.c
> > > @@ -157,9 +157,8 @@ static int init_memcg_params(struct kmem_cache *s,
> > >  	if (!memcg_nr_cache_ids)
> > >  		return 0;
> > > 
> > > -	arr = kzalloc(sizeof(struct memcg_cache_array) +
> > > -		      memcg_nr_cache_ids * sizeof(void *),
> > > -		      GFP_KERNEL);
> > > +	arr = memcg_alloc(sizeof(struct memcg_cache_array) +
> > > +			memcg_nr_cache_ids * sizeof(void *));
> > >  	if (!arr)
> > >  		return -ENOMEM;
> > > 
> > > @@ -170,7 +169,15 @@ static int init_memcg_params(struct kmem_cache *s,
> > >  static void destroy_memcg_params(struct kmem_cache *s)
> > >  {
> > >  	if (is_root_cache(s))
> > > -		kfree(rcu_access_pointer(s->memcg_params.memcg_caches));
> > > +		memcg_free(rcu_access_pointer(s->memcg_params.memcg_caches));
> > > +}
> > > +
> > > +static void memcg_rcu_free(struct rcu_head *rcu)
> > > +{
> > > +	struct memcg_cache_array *arr;
> > > +
> > > +	arr = container_of(rcu, struct memcg_cache_array, rcu);
> > > +	memcg_free(arr);
> > >  }
> > > 
> > >  static int update_memcg_params(struct kmem_cache *s, int new_array_size)
> > > @@ -180,8 +187,8 @@ static int update_memcg_params(struct kmem_cache *s, int new_array_size)
> > >  	if (!is_root_cache(s))
> > >  		return 0;
> > > 
> > > -	new = kzalloc(sizeof(struct memcg_cache_array) +
> > > -		      new_array_size * sizeof(void *), GFP_KERNEL);
> > > +	new = memcg_alloc(sizeof(struct memcg_cache_array) +
> > > +				new_array_size * sizeof(void *));
> > >  	if (!new)
> > >  		return -ENOMEM;
> > > 
> > > @@ -193,7 +200,7 @@ static int update_memcg_params(struct kmem_cache *s, int new_array_size)
> > > 
> > >  	rcu_assign_pointer(s->memcg_params.memcg_caches, new);
> > >  	if (old)
> > > -		kfree_rcu(old, rcu);
> > > +		call_rcu(&old->rcu, memcg_rcu_free);
> > >  	return 0;
> > >  }
> > > 
> > > 
> 
> -- 
> Michal Hocko
> SUSE Labs


* Re: [PATCH] mm: use vmalloc fallback path for certain memcg allocations
  2016-12-01  1:16 [PATCH] mm: use vmalloc fallback path for certain memcg allocations Anatoly Stepanov
@ 2016-12-02  8:19 ` Alexey Lyashkov
  2016-12-02  8:44 ` Vlastimil Babka
  1 sibling, 0 replies; 14+ messages in thread
From: Alexey Lyashkov @ 2016-12-02  8:19 UTC (permalink / raw)
  To: Anatoly Stepanov; +Cc: linux-mm, akpm, vdavydov.dev, panda, vmeshkov


> On Dec 1, 2016, at 4:16, Anatoly Stepanov <astepanov@cloudlinux.com> wrote:
> 
> As memcg array size can be up to:
> sizeof(struct memcg_cache_array) + kmemcg_id * sizeof(void *);
> 
> where kmemcg_id can be up to MEMCG_CACHES_MAX_SIZE.
> 
> When a memcg instance count is large enough it can lead
> to high order allocations up to order 7.
> 
> The same story with memcg_lrus allocations.
> So let's work this around by utilizing vmalloc fallback path.
> 
> Signed-off-by: Anatoly Stepanov <astepanov@cloudlinux.com>
> ---
> include/linux/memcontrol.h | 16 ++++++++++++++++
> mm/list_lru.c              | 14 +++++++-------
> mm/slab_common.c           | 21 ++++++++++++++-------
> 3 files changed, 37 insertions(+), 14 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 61d20c1..a281622 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -29,6 +29,9 @@
> #include <linux/mmzone.h>
> #include <linux/writeback.h>
> #include <linux/page-flags.h>
> +#include <linux/vmalloc.h>
> +#include <linux/slab.h>
> +#include <linux/mm.h>
> 
> struct mem_cgroup;
> struct page;
> @@ -878,4 +881,17 @@ static inline void memcg_kmem_update_page_stat(struct page *page,
> }
> #endif /* CONFIG_MEMCG && !CONFIG_SLOB */
> 
> +static inline void memcg_free(const void *ptr)
> +{
> +	is_vmalloc_addr(ptr) ? vfree(ptr) : kfree(ptr);
> +}
Please use kvfree() instead.
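
For reference, kvfree() (mm/util.c) is essentially the following, so the open-coded ternary above duplicates it:

	void kvfree(const void *addr)
	{
		if (is_vmalloc_addr(addr))
			vfree(addr);
		else
			kfree(addr);
	}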



* Re: [PATCH] mm: use vmalloc fallback path for certain memcg allocations
  2016-12-01  1:16 [PATCH] mm: use vmalloc fallback path for certain memcg allocations Anatoly Stepanov
  2016-12-02  8:19 ` Alexey Lyashkov
@ 2016-12-02  8:44 ` Vlastimil Babka
  2016-12-02  9:19   ` Michal Hocko
  1 sibling, 1 reply; 14+ messages in thread
From: Vlastimil Babka @ 2016-12-02  8:44 UTC (permalink / raw)
  To: Anatoly Stepanov, linux-mm
  Cc: akpm, vdavydov.dev, umka, panda, vmeshkov, Michal Hocko

On 12/01/2016 02:16 AM, Anatoly Stepanov wrote:
> As memcg array size can be up to:
> sizeof(struct memcg_cache_array) + kmemcg_id * sizeof(void *);
>
> where kmemcg_id can be up to MEMCG_CACHES_MAX_SIZE.
>
> When a memcg instance count is large enough it can lead
> to high order allocations up to order 7.
>
> The same story with memcg_lrus allocations.
> So let's work this around by utilizing vmalloc fallback path.
>
> Signed-off-by: Anatoly Stepanov <astepanov@cloudlinux.com>
> ---
>  include/linux/memcontrol.h | 16 ++++++++++++++++
>  mm/list_lru.c              | 14 +++++++-------
>  mm/slab_common.c           | 21 ++++++++++++++-------
>  3 files changed, 37 insertions(+), 14 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 61d20c1..a281622 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -29,6 +29,9 @@
>  #include <linux/mmzone.h>
>  #include <linux/writeback.h>
>  #include <linux/page-flags.h>
> +#include <linux/vmalloc.h>
> +#include <linux/slab.h>
> +#include <linux/mm.h>
>
>  struct mem_cgroup;
>  struct page;
> @@ -878,4 +881,17 @@ static inline void memcg_kmem_update_page_stat(struct page *page,
>  }
>  #endif /* CONFIG_MEMCG && !CONFIG_SLOB */
>
> +static inline void memcg_free(const void *ptr)
> +{
> +	is_vmalloc_addr(ptr) ? vfree(ptr) : kfree(ptr);
> +}
> +
> +static inline void *memcg_alloc(size_t size)
> +{
> +	if (likely(size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)))
> +		return kzalloc(size, GFP_KERNEL|__GFP_NORETRY);
> +
> +	return vzalloc(size);

That's not how I imagine a "fallback" to work. You should be trying 
kzalloc() and if that fails, call vzalloc(), not distinguish it by 
costly order check. Also IIRC __GFP_NORETRY can easily fail even for 
non-costly orders.

> +}
> +
>  #endif /* _LINUX_MEMCONTROL_H */
> diff --git a/mm/list_lru.c b/mm/list_lru.c
> index 234676e..8f49339 100644
> --- a/mm/list_lru.c
> +++ b/mm/list_lru.c
> @@ -327,12 +327,12 @@ static int memcg_init_list_lru_node(struct list_lru_node *nlru)
>  {
>  	int size = memcg_nr_cache_ids;
>
> -	nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL);
> -	if (!nlru->memcg_lrus)
> +	nlru->memcg_lrus = memcg_alloc(size * sizeof(void *));
> +	if (nlru->memcg_lrus == NULL)
>  		return -ENOMEM;
>
>  	if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) {
> -		kfree(nlru->memcg_lrus);
> +		memcg_free(nlru->memcg_lrus);
>  		return -ENOMEM;
>  	}
>
> @@ -342,7 +342,7 @@ static int memcg_init_list_lru_node(struct list_lru_node *nlru)
>  static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
>  {
>  	__memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids);
> -	kfree(nlru->memcg_lrus);
> +	memcg_free(nlru->memcg_lrus);
>  }
>
>  static int memcg_update_list_lru_node(struct list_lru_node *nlru,
> @@ -353,12 +353,12 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
>  	BUG_ON(old_size > new_size);
>
>  	old = nlru->memcg_lrus;
> -	new = kmalloc(new_size * sizeof(void *), GFP_KERNEL);
> +	new = memcg_alloc(new_size * sizeof(void *));
>  	if (!new)
>  		return -ENOMEM;
>
>  	if (__memcg_init_list_lru_node(new, old_size, new_size)) {
> -		kfree(new);
> +		memcg_free(new);
>  		return -ENOMEM;
>  	}
>
> @@ -375,7 +375,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
>  	nlru->memcg_lrus = new;
>  	spin_unlock_irq(&nlru->lock);
>
> -	kfree(old);
> +	memcg_free(old);
>  	return 0;
>  }
>
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 329b038..19f8cb5 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -157,9 +157,8 @@ static int init_memcg_params(struct kmem_cache *s,
>  	if (!memcg_nr_cache_ids)
>  		return 0;
>
> -	arr = kzalloc(sizeof(struct memcg_cache_array) +
> -		      memcg_nr_cache_ids * sizeof(void *),
> -		      GFP_KERNEL);
> +	arr = memcg_alloc(sizeof(struct memcg_cache_array) +
> +			memcg_nr_cache_ids * sizeof(void *));
>  	if (!arr)
>  		return -ENOMEM;
>
> @@ -170,7 +169,15 @@ static int init_memcg_params(struct kmem_cache *s,
>  static void destroy_memcg_params(struct kmem_cache *s)
>  {
>  	if (is_root_cache(s))
> -		kfree(rcu_access_pointer(s->memcg_params.memcg_caches));
> +		memcg_free(rcu_access_pointer(s->memcg_params.memcg_caches));
> +}
> +
> +static void memcg_rcu_free(struct rcu_head *rcu)
> +{
> +	struct memcg_cache_array *arr;
> +
> +	arr = container_of(rcu, struct memcg_cache_array, rcu);
> +	memcg_free(arr);
>  }
>
>  static int update_memcg_params(struct kmem_cache *s, int new_array_size)
> @@ -180,8 +187,8 @@ static int update_memcg_params(struct kmem_cache *s, int new_array_size)
>  	if (!is_root_cache(s))
>  		return 0;
>
> -	new = kzalloc(sizeof(struct memcg_cache_array) +
> -		      new_array_size * sizeof(void *), GFP_KERNEL);
> +	new = memcg_alloc(sizeof(struct memcg_cache_array) +
> +				new_array_size * sizeof(void *));
>  	if (!new)
>  		return -ENOMEM;
>
> @@ -193,7 +200,7 @@ static int update_memcg_params(struct kmem_cache *s, int new_array_size)
>
>  	rcu_assign_pointer(s->memcg_params.memcg_caches, new);
>  	if (old)
> -		kfree_rcu(old, rcu);
> +		call_rcu(&old->rcu, memcg_rcu_free);
>  	return 0;
>  }
>
>


* Re: [PATCH] mm: use vmalloc fallback path for certain memcg allocations
  2016-12-02  8:44 ` Vlastimil Babka
@ 2016-12-02  9:19   ` Michal Hocko
  2016-12-02  6:54     ` Anatoly Stepanov
  0 siblings, 1 reply; 14+ messages in thread
From: Michal Hocko @ 2016-12-02  9:19 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Anatoly Stepanov, linux-mm, akpm, vdavydov.dev, umka, panda, vmeshkov

Thanks for CCing me Vlastimil

On Fri 02-12-16 09:44:23, Vlastimil Babka wrote:
> On 12/01/2016 02:16 AM, Anatoly Stepanov wrote:
> > As memcg array size can be up to:
> > sizeof(struct memcg_cache_array) + kmemcg_id * sizeof(void *);
> > 
> > where kmemcg_id can be up to MEMCG_CACHES_MAX_SIZE.
> > 
> > When a memcg instance count is large enough it can lead
> > to high order allocations up to order 7.

This is definitely not nice and worth fixing! I am just wondering
whether this is something you have encountered in real life. Having
thousands of memcgs sounds quite crazy^Wscary to me. I am not at all
sure we are prepared for that and some controllers would have real
issues with it AFAIR.

> > The same story with memcg_lrus allocations.
> > So let's work this around by utilizing vmalloc fallback path.
> > 
> > Signed-off-by: Anatoly Stepanov <astepanov@cloudlinux.com>
> > ---
> >  include/linux/memcontrol.h | 16 ++++++++++++++++
> >  mm/list_lru.c              | 14 +++++++-------
> >  mm/slab_common.c           | 21 ++++++++++++++-------
> >  3 files changed, 37 insertions(+), 14 deletions(-)
> > 
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index 61d20c1..a281622 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -29,6 +29,9 @@
> >  #include <linux/mmzone.h>
> >  #include <linux/writeback.h>
> >  #include <linux/page-flags.h>
> > +#include <linux/vmalloc.h>
> > +#include <linux/slab.h>
> > +#include <linux/mm.h>
> > 
> >  struct mem_cgroup;
> >  struct page;
> > @@ -878,4 +881,17 @@ static inline void memcg_kmem_update_page_stat(struct page *page,
> >  }
> >  #endif /* CONFIG_MEMCG && !CONFIG_SLOB */
> > 
> > +static inline void memcg_free(const void *ptr)
> > +{
> > +	is_vmalloc_addr(ptr) ? vfree(ptr) : kfree(ptr);
> > +}
> > +
> > +static inline void *memcg_alloc(size_t size)
> > +{
> > +	if (likely(size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)))

you are mixing two different units here...

> > +		return kzalloc(size, GFP_KERNEL|__GFP_NORETRY);
> > +
> > +	return vzalloc(size);
> 
> That's not how I imagine a "fallback" to work. You should be trying
> kzalloc() and if that fails, call vzalloc(), not distinguish it by costly
> order check. Also IIRC __GFP_NORETRY can easily fail even for non-costly
> orders.

Completely agreed! This should be done simply by
	gfp_t gfp_mask = GFP_KERNEL;
	void *ret;
	
	/*
	 * Do not invoke OOM killer for larger requests as we can fall
	 * back to the vmalloc
	 */
	if (size > PAGE_SIZE)
		gfp_mask |= __GFP_NORETRY | __GFP_NOWARN;

	ret = kzalloc(size, gfp_mask);
	if (ret)
		return ret;
	return vzalloc(size);

I also do not like the memcg_alloc helper name. It suggests we are
allocating a memcg, while it is used for cache arrays and slab LRUs.
Anyway, this pattern is quite widespread in the kernel, so I would simply
suggest adding a kvmalloc function instead.

> 
> > +}
> > +
> >  #endif /* _LINUX_MEMCONTROL_H */
> > diff --git a/mm/list_lru.c b/mm/list_lru.c
> > index 234676e..8f49339 100644
> > --- a/mm/list_lru.c
> > +++ b/mm/list_lru.c
> > @@ -327,12 +327,12 @@ static int memcg_init_list_lru_node(struct list_lru_node *nlru)
> >  {
> >  	int size = memcg_nr_cache_ids;
> > 
> > -	nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL);
> > -	if (!nlru->memcg_lrus)
> > +	nlru->memcg_lrus = memcg_alloc(size * sizeof(void *));
> > +	if (nlru->memcg_lrus == NULL)
> >  		return -ENOMEM;
> > 
> >  	if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) {
> > -		kfree(nlru->memcg_lrus);
> > +		memcg_free(nlru->memcg_lrus);
> >  		return -ENOMEM;
> >  	}
> > 
> > @@ -342,7 +342,7 @@ static int memcg_init_list_lru_node(struct list_lru_node *nlru)
> >  static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
> >  {
> >  	__memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids);
> > -	kfree(nlru->memcg_lrus);
> > +	memcg_free(nlru->memcg_lrus);
> >  }
> > 
> >  static int memcg_update_list_lru_node(struct list_lru_node *nlru,
> > @@ -353,12 +353,12 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
> >  	BUG_ON(old_size > new_size);
> > 
> >  	old = nlru->memcg_lrus;
> > -	new = kmalloc(new_size * sizeof(void *), GFP_KERNEL);
> > +	new = memcg_alloc(new_size * sizeof(void *));
> >  	if (!new)
> >  		return -ENOMEM;
> > 
> >  	if (__memcg_init_list_lru_node(new, old_size, new_size)) {
> > -		kfree(new);
> > +		memcg_free(new);
> >  		return -ENOMEM;
> >  	}
> > 
> > @@ -375,7 +375,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
> >  	nlru->memcg_lrus = new;
> >  	spin_unlock_irq(&nlru->lock);
> > 
> > -	kfree(old);
> > +	memcg_free(old);
> >  	return 0;
> >  }
> > 
> > diff --git a/mm/slab_common.c b/mm/slab_common.c
> > index 329b038..19f8cb5 100644
> > --- a/mm/slab_common.c
> > +++ b/mm/slab_common.c
> > @@ -157,9 +157,8 @@ static int init_memcg_params(struct kmem_cache *s,
> >  	if (!memcg_nr_cache_ids)
> >  		return 0;
> > 
> > -	arr = kzalloc(sizeof(struct memcg_cache_array) +
> > -		      memcg_nr_cache_ids * sizeof(void *),
> > -		      GFP_KERNEL);
> > +	arr = memcg_alloc(sizeof(struct memcg_cache_array) +
> > +			memcg_nr_cache_ids * sizeof(void *));
> >  	if (!arr)
> >  		return -ENOMEM;
> > 
> > @@ -170,7 +169,15 @@ static int init_memcg_params(struct kmem_cache *s,
> >  static void destroy_memcg_params(struct kmem_cache *s)
> >  {
> >  	if (is_root_cache(s))
> > -		kfree(rcu_access_pointer(s->memcg_params.memcg_caches));
> > +		memcg_free(rcu_access_pointer(s->memcg_params.memcg_caches));
> > +}
> > +
> > +static void memcg_rcu_free(struct rcu_head *rcu)
> > +{
> > +	struct memcg_cache_array *arr;
> > +
> > +	arr = container_of(rcu, struct memcg_cache_array, rcu);
> > +	memcg_free(arr);
> >  }
> > 
> >  static int update_memcg_params(struct kmem_cache *s, int new_array_size)
> > @@ -180,8 +187,8 @@ static int update_memcg_params(struct kmem_cache *s, int new_array_size)
> >  	if (!is_root_cache(s))
> >  		return 0;
> > 
> > -	new = kzalloc(sizeof(struct memcg_cache_array) +
> > -		      new_array_size * sizeof(void *), GFP_KERNEL);
> > +	new = memcg_alloc(sizeof(struct memcg_cache_array) +
> > +				new_array_size * sizeof(void *));
> >  	if (!new)
> >  		return -ENOMEM;
> > 
> > @@ -193,7 +200,7 @@ static int update_memcg_params(struct kmem_cache *s, int new_array_size)
> > 
> >  	rcu_assign_pointer(s->memcg_params.memcg_caches, new);
> >  	if (old)
> > -		kfree_rcu(old, rcu);
> > +		call_rcu(&old->rcu, memcg_rcu_free);
> >  	return 0;
> >  }
> > 
> > 

-- 
Michal Hocko
SUSE Labs


* Re: [PATCH] mm: use vmalloc fallback path for certain memcg allocations
  2016-12-05  5:23       ` Michal Hocko
@ 2016-12-02 22:09         ` Anatoly Stepanov
  2016-12-06  8:47           ` Michal Hocko
  2016-12-05 14:09         ` Heiko Carstens
  1 sibling, 1 reply; 14+ messages in thread
From: Anatoly Stepanov @ 2016-12-02 22:09 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Vlastimil Babka, linux-mm, akpm, vdavydov.dev, umka, panda, vmeshkov

On Mon, Dec 05, 2016 at 06:23:26AM +0100, Michal Hocko wrote:
> On Fri 02-12-16 09:54:17, Anatoly Stepanov wrote:
> > Alex, Vlasimil, Michal, thanks for your responses!
> > 
> > On Fri, Dec 02, 2016 at 10:19:33AM +0100, Michal Hocko wrote:
> > > Thanks for CCing me Vlastimil
> > > 
> > > On Fri 02-12-16 09:44:23, Vlastimil Babka wrote:
> > > > On 12/01/2016 02:16 AM, Anatoly Stepanov wrote:
> > > > > As memcg array size can be up to:
> > > > > sizeof(struct memcg_cache_array) + kmemcg_id * sizeof(void *);
> > > > > 
> > > > > where kmemcg_id can be up to MEMCG_CACHES_MAX_SIZE.
> > > > > 
> > > > > When a memcg instance count is large enough it can lead
> > > > > to high order allocations up to order 7.
> > > 
> > > This is definitely not nice and worth fixing! I am just wondering
> > > whether this is something you have encountered in the real life. Having
> > > thousands of memcgs sounds quite crazy^Wscary to me. I am not at all
> > > sure we are prepared for that and some controllers would have real
> > > issues with it AFAIR.
> > 
> > In our company we use custom-made lightweight container technology, the thing is
> > we can have up to several thousands of them on a server.
> > So those high-order allocations were observed on a real production workload.
> 
> OK, this is interesting. Definitely worth mentioning in the changelog!
> 
> [...]
> > > 	/*
> > > 	 * Do not invoke OOM killer for larger requests as we can fall
> > > 	 * back to the vmalloc
> > > 	 */
> > > 	if (size > PAGE_SIZE)
> > > 		gfp_mask |= __GFP_NORETRY | __GFP_NOWARN;
> > 
> > I think we should check against PAGE_ALLOC_COSTLY_ORDER anyway, as
> > there's no big need to allocate large contiguous chunks here, at the
> > same time someone in the kernel might really need them.
> 
> PAGE_ALLOC_COSTLY_ORDER is and should remain the page allocator internal
> implementation detail and shouldn't spread out much outside. GFP_NORETRY
> will already make sure we do not push hard here.

Maybe I didn't put my thoughts well, so let's discuss in more detail:

1. Yes, we don't try that hard to allocate high-order blocks with __GFP_NORETRY,
but we still can do compaction and direct reclaim, which can be heavy for a large chunk.
In the worst case we can even fail to find the chunk after all reclaim/compaction steps have been made.

2. The second point is, even if we get the desired chunk quickly, we end up wasting large contiguous chunks,
which might be needed for CMA or some h/w driver (DMA, for instance) that can't use non-contiguous chunks.

BTW, there are a few examples in the kernel, like alloc_fdmem() for instance, which
use that "costly order" idea for the fallback.

static void *alloc_fdmem(size_t size)
{
        /*
         * Very large allocations can stress page reclaim, so fall back to
         * vmalloc() if the allocation size will be considered "large" by the VM.
         */
        if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
                void *data = kmalloc(size, GFP_KERNEL_ACCOUNT |
                                     __GFP_NOWARN | __GFP_NORETRY);
                if (data != NULL)
                        return data;
        }
        return __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM, PAGE_KERNEL);
}

Also in netfilter, ceph, and some other places.

Please correct me if I'm mistaken somewhere in these thoughts.

And don't get me wrong, I'm just trying to get deeper into the problem,
especially as we're going to introduce something more or less generic.

> 
> > 
> > > 
> > > 	ret = kzalloc(size, gfp_mask);
> > > 	if (ret)
> > > 		return ret;
> > > 	return vzalloc(size);
> > > 
> > 
> > > I also do not like memcg_alloc helper name. It suggests we are
> > > allocating a memcg while it is used for cache arrays and slab LRUS.
> > > Anyway this pattern is quite widespread in the kernel so I would simply
> > > suggest adding kvmalloc function instead.
> > 
> > Agreed, it would be nice to have a generic call.
> > I would suggest an impl. like this:
> > 
> > void *kvmalloc(size_t size)
> 
> gfp_t gfp_mask should be a parameter as this should be a generic helper.
> 
> > {
> > 	gfp_t gfp_mask = GFP_KERNEL;
> 
> 
> > 	void *ret;
> > 
> >  	if (size > PAGE_SIZE)
> >  		gfp_mask |= __GFP_NORETRY | __GFP_NOWARN;
> > 
> > 
> > 	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
> > 		ret = kzalloc(size, gfp_mask);
> > 		if (ret)
> > 			return ret;
> > 	}
> 
> No, please just do as suggested above. Tweak the gfp_mask for higher
> order requests and do kmalloc first with vmalloc as a  fallback.
> 
> -- 
> Michal Hocko
> SUSE Labs


* Re: [PATCH] mm: use vmalloc fallback path for certain memcg allocations
  2016-12-05 14:19           ` Michal Hocko
@ 2016-12-02 22:15             ` Anatoly Stepanov
  2016-12-06  8:34               ` Michal Hocko
  0 siblings, 1 reply; 14+ messages in thread
From: Anatoly Stepanov @ 2016-12-02 22:15 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Heiko Carstens, Vlastimil Babka, linux-mm, akpm, vdavydov.dev,
	umka, panda, vmeshkov

On Mon, Dec 05, 2016 at 03:19:29PM +0100, Michal Hocko wrote:
> On Mon 05-12-16 15:09:33, Heiko Carstens wrote:
> > On Mon, Dec 05, 2016 at 06:23:26AM +0100, Michal Hocko wrote:
> > > > > 
> > > > > 	ret = kzalloc(size, gfp_mask);
> > > > > 	if (ret)
> > > > > 		return ret;
> > > > > 	return vzalloc(size);
> > > > > 
> > > > 
> > > > > I also do not like memcg_alloc helper name. It suggests we are
> > > > > allocating a memcg while it is used for cache arrays and slab LRUS.
> > > > > Anyway this pattern is quite widespread in the kernel so I would simply
> > > > > suggest adding kvmalloc function instead.
> > > > 
> > > > Agreed, it would be nice to have a generic call.
> > > > I would suggest an impl. like this:
> > > > 
> > > > void *kvmalloc(size_t size)
> > > 
> > > gfp_t gfp_mask should be a parameter as this should be a generic helper.
> > > 
> > > > {
> > > > 	gfp_t gfp_mask = GFP_KERNEL;
> > > 
> > > 
> > > > 	void *ret;
> > > > 
> > > >  	if (size > PAGE_SIZE)
> > > >  		gfp_mask |= __GFP_NORETRY | __GFP_NOWARN;
> > > > 
> > > > 
> > > > 	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
> > > > 		ret = kzalloc(size, gfp_mask);
> > > > 		if (ret)
> > > > 			return ret;
> > > > 	}
> > > 
> > > No, please just do as suggested above. Tweak the gfp_mask for higher
> > > order requests and do kmalloc first with vmalloc as a  fallback.
> > 
> > You may simply use the slightly different and open-coded variant within
> > fs/seq_file.c:seq_buf_alloc(). That one got a lot of testing in the
> > meantime...
> 
> Yeah. I would just add WARN_ON((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
> to catch users who might want to rely on GFP_NOFS, GFP_NOWAIT or other
> restricted requests because vmalloc cannot cope with those properly.

What about __vmalloc(size, gfp, prot)? I guess it's fine with these.

> 
> -- 
> Michal Hocko
> SUSE Labs


* Re: [PATCH] mm: use vmalloc fallback path for certain memcg allocations
  2016-12-06  8:47           ` Michal Hocko
@ 2016-12-03 15:55             ` Anatoly Stepanov
  2016-12-08  8:45               ` Michal Hocko
  0 siblings, 1 reply; 14+ messages in thread
From: Anatoly Stepanov @ 2016-12-03 15:55 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Vlastimil Babka, linux-mm, akpm, vdavydov.dev, umka, panda, vmeshkov

On Tue, Dec 06, 2016 at 09:47:35AM +0100, Michal Hocko wrote:
> On Sat 03-12-16 01:09:13, Anatoly Stepanov wrote:
> > On Mon, Dec 05, 2016 at 06:23:26AM +0100, Michal Hocko wrote:
> > > On Fri 02-12-16 09:54:17, Anatoly Stepanov wrote:
> > > > Alex, Vlasimil, Michal, thanks for your responses!
> > > > 
> > > > On Fri, Dec 02, 2016 at 10:19:33AM +0100, Michal Hocko wrote:
> > > > > Thanks for CCing me Vlastimil
> > > > > 
> > > > > On Fri 02-12-16 09:44:23, Vlastimil Babka wrote:
> > > > > > On 12/01/2016 02:16 AM, Anatoly Stepanov wrote:
> > > > > > > As memcg array size can be up to:
> > > > > > > sizeof(struct memcg_cache_array) + kmemcg_id * sizeof(void *);
> > > > > > > 
> > > > > > > where kmemcg_id can be up to MEMCG_CACHES_MAX_SIZE.
> > > > > > > 
> > > > > > > When a memcg instance count is large enough it can lead
> > > > > > > to high order allocations up to order 7.
> > > > > 
> > > > > This is definitely not nice and worth fixing! I am just wondering
> > > > > whether this is something you have encountered in the real life. Having
> > > > > thousands of memcgs sounds quite crazy^Wscary to me. I am not at all
> > > > > sure we are prepared for that and some controllers would have real
> > > > > issues with it AFAIR.
> > > > 
> > > > In our company we use custom-made lightweight container technology, the thing is
> > > > we can have up to several thousands of them on a server.
> > > > So those high-order allocations were observed on a real production workload.
> > > 
> > > OK, this is interesting. Definitely worth mentioning in the changelog!
> > > 
> > > [...]
> > > > > 	/*
> > > > > 	 * Do not invoke OOM killer for larger requests as we can fall
> > > > > 	 * back to the vmalloc
> > > > > 	 */
> > > > > 	if (size > PAGE_SIZE)
> > > > > 		gfp_mask |= __GFP_NORETRY | __GFP_NOWARN;
> > > > 
> > > > I think we should check against PAGE_ALLOC_COSTLY_ORDER anyway, as
> > > > there's no big need to allocate large contiguous chunks here, at the
> > > > same time someone in the kernel might really need them.
> > > 
> > > PAGE_ALLOC_COSTLY_ORDER is and should remain the page allocator internal
> > > implementation detail and shouldn't spread out much outside. GFP_NORETRY
> > > will already make sure we do not push hard here.
> > 
> > May be i didn't put my thoughts well, so let's discuss in more detail:
> > 
> > 1. Yes, we don't try that hard to allocate high-order blocks with
> > __GFP_NORETRY, but we still can do compaction and direct reclaim,
> > which can be heavy for large chunk.  In the worst case we can even
> > fail to find the chunk, after all reclaim/compaction steps were made.
> 
> Yes this is correct. But I am not sure what you are trying to tell
> by that. Highorder requests are a bit of a problem. That's why
> __GFP_NORETRY is implicit here. It also guarantees that we won't hit
> the OOM killer because we do have a reasonable fallback. I do not see a
> point to play with COSTLY_ORDER though. The page allocator knows how to
> handle those and we are trying hard that those requests are not too
> disruptive. Or am I still missing your point?

My point is that while we're trying to get a pretty big contiguous chunk (let's say of COSTLY_SIZE),
the reclaim can induce a lot of disk I/O, which can be crucial
for overall system performance, even though we don't actually need that chunk to be contiguous.

So, for COSTLY_SIZE chunks, vmalloc should perform better, as it's obviously more likely
to find order-0 blocks w/o reclaim.

The bottom line is that for COSTLY_SIZE chunks it's more likely we end up doing reclaim (=> disk I/O),
which means for those chunks it's better to allocate them via vmalloc():

(Pseudo-code)
if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
	ret = kmalloc(size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
	if (ret)
		return ret;
}

return vmalloc(size);

Right now I cannot see any significant overhead inside vmalloc() that could be an issue in this case.
Please tell me if you know of such a thing; it would be really interesting.

> 
> > 2. The second point is, even if we got the desired chunk quickly, we
> > end up wasting large contiguous chunks, which might be needed for CMA
> > or some h/w driver (DMA for inst.), when they can't use non-contiguous
> > chunks.
> 
> On the other hand vmalloc is not free either.

Of course, but as I explained above, I think kmalloc() is worse
for large contiguous chunks.

> 
> > BTW, in the kernel there are few examples like alloc_fdmem() for inst., which
> > use that "costly order" idea of the fallback.
> 
> I am not familiar with this code much so it is hard for me to comment.
> Anyway I am not entirely sure the code is still valid. We do not do
> excessive reclaim nor compaction for costly orders. THey are mostly an
> optimistic try without __GFP_REPEAT these days. So the assumption which
> it was based on back in 2011 might be no longer true.

Just to mention one more case I observed in a real-life scenario.
Once we encountered a problem on a production server where a ton of forking from different apps
caused a lot of direct reclaim, and we fixed it by limiting the usage of kmalloc() even further:
the PAGE_ALLOC_COSTLY_ORDER size was too big for us, and after reducing the threshold the server started feeling better.
So this limitation of kmalloc() really makes sense sometimes, and we need to take it into account.


> -- 
> Michal Hocko
> SUSE Labs


* Re: [PATCH] mm: use vmalloc fallback path for certain memcg allocations
  2016-12-02  6:54     ` Anatoly Stepanov
@ 2016-12-05  5:23       ` Michal Hocko
  2016-12-02 22:09         ` Anatoly Stepanov
  2016-12-05 14:09         ` Heiko Carstens
  0 siblings, 2 replies; 14+ messages in thread
From: Michal Hocko @ 2016-12-05  5:23 UTC (permalink / raw)
  To: Anatoly Stepanov
  Cc: Vlastimil Babka, linux-mm, akpm, vdavydov.dev, umka, panda, vmeshkov

On Fri 02-12-16 09:54:17, Anatoly Stepanov wrote:
> Alex, Vlasimil, Michal, thanks for your responses!
> 
> On Fri, Dec 02, 2016 at 10:19:33AM +0100, Michal Hocko wrote:
> > Thanks for CCing me Vlastimil
> > 
> > On Fri 02-12-16 09:44:23, Vlastimil Babka wrote:
> > > On 12/01/2016 02:16 AM, Anatoly Stepanov wrote:
> > > > As memcg array size can be up to:
> > > > sizeof(struct memcg_cache_array) + kmemcg_id * sizeof(void *);
> > > > 
> > > > where kmemcg_id can be up to MEMCG_CACHES_MAX_SIZE.
> > > > 
> > > > When a memcg instance count is large enough it can lead
> > > > to high order allocations up to order 7.
> > 
> > This is definitely not nice and worth fixing! I am just wondering
> > whether this is something you have encountered in the real life. Having
> > thousands of memcgs sounds quite crazy^Wscary to me. I am not at all
> > sure we are prepared for that and some controllers would have real
> > issues with it AFAIR.
> 
> In our company we use custom-made lightweight container technology, the thing is
> we can have up to several thousands of them on a server.
> So those high-order allocations were observed on a real production workload.

OK, this is interesting. Definitely worth mentioning in the changelog!

[...]
> > 	/*
> > 	 * Do not invoke OOM killer for larger requests as we can fall
> > 	 * back to the vmalloc
> > 	 */
> > 	if (size > PAGE_SIZE)
> > 		gfp_mask |= __GFP_NORETRY | __GFP_NOWARN;
> 
> I think we should check against PAGE_ALLOC_COSTLY_ORDER anyway, as
> there's no big need to allocate large contiguous chunks here, at the
> same time someone in the kernel might really need them.

PAGE_ALLOC_COSTLY_ORDER is and should remain a page allocator internal
implementation detail and shouldn't spread out much outside of it. __GFP_NORETRY
will already make sure we do not push hard here.

> 
> > 
> > 	ret = kzalloc(size, gfp_mask);
> > 	if (ret)
> > 		return ret;
> > 	return vzalloc(size);
> > 
> 
> > I also do not like memcg_alloc helper name. It suggests we are
> > allocating a memcg while it is used for cache arrays and slab LRUS.
> > Anyway this pattern is quite widespread in the kernel so I would simply
> > suggest adding kvmalloc function instead.
> 
> Agreed, it would be nice to have a generic call.
> I would suggest an impl. like this:
> 
> void *kvmalloc(size_t size)

gfp_t gfp_mask should be a parameter as this should be a generic helper.

> {
> 	gfp_t gfp_mask = GFP_KERNEL;


> 	void *ret;
> 
>  	if (size > PAGE_SIZE)
>  		gfp_mask |= __GFP_NORETRY | __GFP_NOWARN;
> 
> 
> 	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
> 		ret = kzalloc(size, gfp_mask);
> 		if (ret)
> 			return ret;
> 	}

No, please just do as suggested above. Tweak the gfp_mask for higher
order requests and do kmalloc first with vmalloc as a  fallback.

-- 
Michal Hocko
SUSE Labs


* Re: [PATCH] mm: use vmalloc fallback path for certain memcg allocations
  2016-12-05  5:23       ` Michal Hocko
  2016-12-02 22:09         ` Anatoly Stepanov
@ 2016-12-05 14:09         ` Heiko Carstens
  2016-12-05 14:19           ` Michal Hocko
  1 sibling, 1 reply; 14+ messages in thread
From: Heiko Carstens @ 2016-12-05 14:09 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Anatoly Stepanov, Vlastimil Babka, linux-mm, akpm, vdavydov.dev,
	umka, panda, vmeshkov

On Mon, Dec 05, 2016 at 06:23:26AM +0100, Michal Hocko wrote:
> > > 
> > > 	ret = kzalloc(size, gfp_mask);
> > > 	if (ret)
> > > 		return ret;
> > > 	return vzalloc(size);
> > > 
> > 
> > > I also do not like memcg_alloc helper name. It suggests we are
> > > allocating a memcg while it is used for cache arrays and slab LRUS.
> > > Anyway this pattern is quite widespread in the kernel so I would simply
> > > suggest adding kvmalloc function instead.
> > 
> > Agreed, it would be nice to have a generic call.
> > I would suggest an impl. like this:
> > 
> > void *kvmalloc(size_t size)
> 
> gfp_t gfp_mask should be a parameter as this should be a generic helper.
> 
> > {
> > 	gfp_t gfp_mask = GFP_KERNEL;
> 
> 
> > 	void *ret;
> > 
> >  	if (size > PAGE_SIZE)
> >  		gfp_mask |= __GFP_NORETRY | __GFP_NOWARN;
> > 
> > 
> > 	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
> > 		ret = kzalloc(size, gfp_mask);
> > 		if (ret)
> > 			return ret;
> > 	}
> 
> No, please just do as suggested above. Tweak the gfp_mask for higher
> order requests and do kmalloc first with vmalloc as a  fallback.

You may simply use the slightly different and open-coded variant within
fs/seq_file.c:seq_buf_alloc(). That one got a lot of testing in the
meantime...
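
For reference, that helper looks roughly like this (paraphrased from fs/seq_file.c of that era, not verbatim):

	static void *seq_buf_alloc(unsigned long size)
	{
		void *buf;
		gfp_t gfp = GFP_KERNEL;

		/*
		 * For high order allocations it's better to fall back to
		 * vmalloc() than to oom-kill, so use __GFP_NORETRY and
		 * suppress the allocation failure warning.
		 */
		if (size > PAGE_SIZE)
			gfp |= __GFP_NORETRY | __GFP_NOWARN;
		buf = kmalloc(size, gfp);
		if (!buf && size > PAGE_SIZE)
			buf = vmalloc(size);
		return buf;
	}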


* Re: [PATCH] mm: use vmalloc fallback path for certain memcg allocations
  2016-12-05 14:09         ` Heiko Carstens
@ 2016-12-05 14:19           ` Michal Hocko
  2016-12-02 22:15             ` Anatoly Stepanov
  0 siblings, 1 reply; 14+ messages in thread
From: Michal Hocko @ 2016-12-05 14:19 UTC (permalink / raw)
  To: Heiko Carstens
  Cc: Anatoly Stepanov, Vlastimil Babka, linux-mm, akpm, vdavydov.dev,
	umka, panda, vmeshkov

On Mon 05-12-16 15:09:33, Heiko Carstens wrote:
> On Mon, Dec 05, 2016 at 06:23:26AM +0100, Michal Hocko wrote:
> > > > 
> > > > 	ret = kzalloc(size, gfp_mask);
> > > > 	if (ret)
> > > > 		return ret;
> > > > 	return vzalloc(size);
> > > > 
> > > 
> > > > I also do not like memcg_alloc helper name. It suggests we are
> > > > allocating a memcg while it is used for cache arrays and slab LRUS.
> > > > Anyway this pattern is quite widespread in the kernel so I would simply
> > > > suggest adding kvmalloc function instead.
> > > 
> > > Agreed, it would be nice to have a generic call.
> > > I would suggest an impl. like this:
> > > 
> > > void *kvmalloc(size_t size)
> > 
> > gfp_t gfp_mask should be a parameter as this should be a generic helper.
> > 
> > > {
> > > 	gfp_t gfp_mask = GFP_KERNEL;
> > 
> > 
> > > 	void *ret;
> > > 
> > >  	if (size > PAGE_SIZE)
> > >  		gfp_mask |= __GFP_NORETRY | __GFP_NOWARN;
> > > 
> > > 
> > > 	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
> > > 		ret = kzalloc(size, gfp_mask);
> > > 		if (ret)
> > > 			return ret;
> > > 	}
> > 
> > No, please just do as suggested above. Tweak the gfp_mask for higher
> > order requests and do kmalloc first with vmalloc as a  fallback.
> 
> You may simply use the slightly different and open-coded variant within
> fs/seq_file.c:seq_buf_alloc(). That one got a lot of testing in the
> meantime...

Yeah. I would just add WARN_ON((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
to catch users who might want to rely on GFP_NOFS, GFP_NOWAIT or other
restricted requests because vmalloc cannot cope with those properly.
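
Putting it together, a minimal sketch of such a helper (hypothetical; the final name and signature are still being discussed in this thread) would be:

	void *kvmalloc(size_t size, gfp_t gfp_mask)
	{
		void *ret;

		/* vmalloc() cannot cope with GFP_NOFS/GFP_NOWAIT and the like */
		WARN_ON((gfp_mask & GFP_KERNEL) != GFP_KERNEL);

		/*
		 * Do not invoke the OOM killer for larger requests as we
		 * can fall back to vmalloc.
		 */
		if (size > PAGE_SIZE)
			gfp_mask |= __GFP_NORETRY | __GFP_NOWARN;

		ret = kmalloc(size, gfp_mask);
		if (ret)
			return ret;
		return vmalloc(size);
	}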

-- 
Michal Hocko
SUSE Labs


* Re: [PATCH] mm: use vmalloc fallback path for certain memcg allocations
  2016-12-02 22:15             ` Anatoly Stepanov
@ 2016-12-06  8:34               ` Michal Hocko
  0 siblings, 0 replies; 14+ messages in thread
From: Michal Hocko @ 2016-12-06  8:34 UTC (permalink / raw)
  To: Anatoly Stepanov
  Cc: Heiko Carstens, Vlastimil Babka, linux-mm, akpm, vdavydov.dev,
	umka, panda, vmeshkov

On Sat 03-12-16 01:15:10, Anatoly Stepanov wrote:
> On Mon, Dec 05, 2016 at 03:19:29PM +0100, Michal Hocko wrote:
> > On Mon 05-12-16 15:09:33, Heiko Carstens wrote:
> > > On Mon, Dec 05, 2016 at 06:23:26AM +0100, Michal Hocko wrote:
> > > > > > 
> > > > > > 	ret = kzalloc(size, gfp_mask);
> > > > > > 	if (ret)
> > > > > > 		return ret;
> > > > > > 	return vzalloc(size);
> > > > > > 
> > > > > 
> > > > > > I also do not like memcg_alloc helper name. It suggests we are
> > > > > > allocating a memcg while it is used for cache arrays and slab LRUS.
> > > > > > Anyway this pattern is quite widespread in the kernel so I would simply
> > > > > > suggest adding kvmalloc function instead.
> > > > > 
> > > > > Agreed, it would be nice to have a generic call.
> > > > > I would suggest an impl. like this:
> > > > > 
> > > > > void *kvmalloc(size_t size)
> > > > 
> > > > gfp_t gfp_mask should be a parameter as this should be a generic helper.
> > > > 
> > > > > {
> > > > > 	gfp_t gfp_mask = GFP_KERNEL;
> > > > 
> > > > 
> > > > > 	void *ret;
> > > > > 
> > > > >  	if (size > PAGE_SIZE)
> > > > >  		gfp_mask |= __GFP_NORETRY | __GFP_NOWARN;
> > > > > 
> > > > > 
> > > > > 	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
> > > > > 		ret = kzalloc(size, gfp_mask);
> > > > > 		if (ret)
> > > > > 			return ret;
> > > > > 	}
> > > > 
> > > > No, please just do as suggested above. Tweak the gfp_mask for higher
> > > > order requests and do kmalloc first with vmalloc as a  fallback.
> > > 
> > > You may simply use the slightly different and open-coded variant within
> > > fs/seq_file.c:seq_buf_alloc(). That one got a lot of testing in the
> > > meantime...
> > 
> > Yeah. I would just add WARN_ON((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
> > to catch users who might want to rely on GFP_NOFS, GFP_NOWAIT or other
> > restricted requests because vmalloc cannot cope with those properly.
> 
> What about __vmalloc(size, gfp, prot)? I guess it's fine with theese

Not really. Parts of the vmalloc allocator still use hardcoded
GFP_KERNEL AFAIR.
-- 
Michal Hocko
SUSE Labs


* Re: [PATCH] mm: use vmalloc fallback path for certain memcg allocations
  2016-12-02 22:09         ` Anatoly Stepanov
@ 2016-12-06  8:47           ` Michal Hocko
  2016-12-03 15:55             ` Anatoly Stepanov
  0 siblings, 1 reply; 14+ messages in thread
From: Michal Hocko @ 2016-12-06  8:47 UTC (permalink / raw)
  To: Anatoly Stepanov
  Cc: Vlastimil Babka, linux-mm, akpm, vdavydov.dev, umka, panda, vmeshkov

On Sat 03-12-16 01:09:13, Anatoly Stepanov wrote:
> On Mon, Dec 05, 2016 at 06:23:26AM +0100, Michal Hocko wrote:
> > On Fri 02-12-16 09:54:17, Anatoly Stepanov wrote:
> > > Alex, Vlasimil, Michal, thanks for your responses!
> > > 
> > > On Fri, Dec 02, 2016 at 10:19:33AM +0100, Michal Hocko wrote:
> > > > Thanks for CCing me Vlastimil
> > > > 
> > > > On Fri 02-12-16 09:44:23, Vlastimil Babka wrote:
> > > > > On 12/01/2016 02:16 AM, Anatoly Stepanov wrote:
> > > > > > As memcg array size can be up to:
> > > > > > sizeof(struct memcg_cache_array) + kmemcg_id * sizeof(void *);
> > > > > > 
> > > > > > where kmemcg_id can be up to MEMCG_CACHES_MAX_SIZE.
> > > > > > 
> > > > > > When a memcg instance count is large enough it can lead
> > > > > > to high order allocations up to order 7.
> > > > 
> > > > This is definitely not nice and worth fixing! I am just wondering
> > > > whether this is something you have encountered in the real life. Having
> > > > thousands of memcgs sounds quite crazy^Wscary to me. I am not at all
> > > > sure we are prepared for that and some controllers would have real
> > > > issues with it AFAIR.
> > > 
> > > In our company we use custom-made lightweight container technology, the thing is
> > > we can have up to several thousands of them on a server.
> > > So those high-order allocations were observed on a real production workload.
> > 
> > OK, this is interesting. Definitely worth mentioning in the changelog!
> > 
> > [...]
> > > > 	/*
> > > > 	 * Do not invoke OOM killer for larger requests as we can fall
> > > > 	 * back to the vmalloc
> > > > 	 */
> > > > 	if (size > PAGE_SIZE)
> > > > 		gfp_mask |= __GFP_NORETRY | __GFP_NOWARN;
> > > 
> > > I think we should check against PAGE_ALLOC_COSTLY_ORDER anyway, as
> > > there's no big need to allocate large contiguous chunks here, at the
> > > same time someone in the kernel might really need them.
> > 
> > PAGE_ALLOC_COSTLY_ORDER is and should remain the page allocator internal
> > implementation detail and shouldn't spread out much outside. GFP_NORETRY
> > will already make sure we do not push hard here.
> 
> May be i didn't put my thoughts well, so let's discuss in more detail:
> 
> 1. Yes, we don't try that hard to allocate high-order blocks with
> __GFP_NORETRY, but we still can do compaction and direct reclaim,
> which can be heavy for large chunk.  In the worst case we can even
> fail to find the chunk, after all reclaim/compaction steps were made.

Yes, this is correct, but I am not sure what you are trying to say
with that. High-order requests are a bit of a problem; that's why
__GFP_NORETRY is implicit here. It also guarantees that we won't hit
the OOM killer, because we do have a reasonable fallback. I do not see
a point in playing with COSTLY_ORDER though. The page allocator knows
how to handle such requests, and we already try hard to keep them from
being too disruptive. Or am I still missing your point?
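
To be explicit, the pattern we are talking about is just this (a
sketch of the shape, not the final code):

	/*
	 * __GFP_NORETRY makes the high-order attempt bail out after a
	 * single round of reclaim/compaction and never invokes the OOM
	 * killer; the vmalloc fallback then builds the mapping from
	 * order-0 pages.
	 */
	ptr = kmalloc(size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
	if (!ptr)
		ptr = vmalloc(size);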

> 2. The second point is, even if we get the desired chunk quickly, we
> end up consuming large contiguous chunks that might be needed for CMA
> or some h/w driver (DMA, for instance) which can't use non-contiguous
> memory.

On the other hand vmalloc is not free either.

> BTW, in the kernel there are a few examples, like alloc_fdmem() for
> instance, which use that "costly order" fallback idea.

I am not very familiar with this code, so it is hard for me to comment.
Anyway, I am not entirely sure the approach is still valid. We do not do
excessive reclaim or compaction for costly orders; they are mostly an
optimistic try without __GFP_REPEAT these days. So the assumption it
was based on back in 2011 might no longer be true.
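
For reference, the alloc_fdmem() mentioned above looks roughly like
this (loosely paraphrased from fs/file.c, so treat the details as
approximate):

	static void *alloc_fdmem(size_t size)
	{
		/*
		 * Very large allocations can stress page reclaim, so
		 * fall back to vmalloc() if the allocation size will be
		 * considered "large" by the VM.
		 */
		if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
			void *data = kmalloc(size, GFP_KERNEL_ACCOUNT |
					     __GFP_NOWARN | __GFP_NORETRY);
			if (data != NULL)
				return data;
		}
		return __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM,
				 PAGE_KERNEL);
	}
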
-- 
Michal Hocko
SUSE Labs

* Re: [PATCH] mm: use vmalloc fallback path for certain memcg allocations
  2016-12-03 15:55             ` Anatoly Stepanov
@ 2016-12-08  8:45               ` Michal Hocko
  0 siblings, 0 replies; 14+ messages in thread
From: Michal Hocko @ 2016-12-08  8:45 UTC (permalink / raw)
  To: Anatoly Stepanov
  Cc: Vlastimil Babka, linux-mm, akpm, vdavydov.dev, umka, panda, vmeshkov

On Sat 03-12-16 18:55:22, Anatoly Stepanov wrote:
> On Tue, Dec 06, 2016 at 09:47:35AM +0100, Michal Hocko wrote:
> > On Sat 03-12-16 01:09:13, Anatoly Stepanov wrote:
> > > [...]
> > > 
> > > Maybe I didn't put my thoughts across well, so let's discuss in more detail:
> > > 
> > > 1. Yes, we don't try that hard to allocate high-order blocks with
> > > __GFP_NORETRY, but we can still do compaction and direct reclaim,
> > > which can be heavy for a large chunk.  In the worst case we can even
> > > fail to find the chunk after all the reclaim/compaction steps were made.
> > 
> > Yes, this is correct, but I am not sure what you are trying to say
> > with that. High-order requests are a bit of a problem; that's why
> > __GFP_NORETRY is implicit here. It also guarantees that we won't hit
> > the OOM killer, because we do have a reasonable fallback. I do not see
> > a point in playing with COSTLY_ORDER though. The page allocator knows
> > how to handle such requests, and we already try hard to keep them from
> > being too disruptive. Or am I still missing your point?
> 
> My point is, while we're trying to get a pretty big contiguous chunk
> (let's say of PAGE_ALLOC_COSTLY_ORDER size), the reclaim can induce a lot of disk I/O

Not really, as I've tried to explain above. The page allocator really
doesn't try hard for costly orders and bails out early after the first
round of reclaim/compaction.

> which can be crucial for overall system performance, while at the same
> time we don't actually need that contiguous chunk.
> 
> So, for costly-order chunks, vmalloc should perform better, as it's
> obviously more likely to find order-0 pages without reclaim.

Again, vmalloc is not free either, and its limited address space makes
it a problem especially on 32-bit arches.

Anyway, I think we are going in circles here and repeating the same
arguments. Let me post what I think is the right implementation of
kvmalloc and you can build on top of that.
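
For illustration, given the constraints discussed in this thread
(vmalloc fallback only for GFP_KERNEL-compatible requests, implicit
__GFP_NOWARN/__GFP_NORETRY above PAGE_SIZE), I would expect it to look
roughly like this (a sketch only, not the actual patch):

	#include <linux/mm.h>
	#include <linux/slab.h>
	#include <linux/vmalloc.h>

	static void *kvmalloc(size_t size, gfp_t flags)
	{
		gfp_t kmalloc_flags = flags;
		void *ret;

		/* vmalloc cannot honor gfp-restricted contexts */
		WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL);

		/*
		 * No OOM killer and no allocation failure warnings for
		 * larger requests: we have a reasonable fallback.
		 */
		if (size > PAGE_SIZE)
			kmalloc_flags |= __GFP_NOWARN | __GFP_NORETRY;

		ret = kmalloc(size, kmalloc_flags);

		/* sub-page requests gain nothing from a vmalloc fallback */
		if (ret || size <= PAGE_SIZE)
			return ret;

		return vmalloc(size);
	}
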
-- 
Michal Hocko
SUSE Labs

end of thread

Thread overview: 14+ messages
2016-12-01  1:16 [PATCH] mm: use vmalloc fallback path for certain memcg allocations Anatoly Stepanov
2016-12-02  8:19 ` Alexey Lyashkov
2016-12-02  8:44 ` Vlastimil Babka
2016-12-02  9:19   ` Michal Hocko
2016-12-02  6:54     ` Anatoly Stepanov
2016-12-05  5:23       ` Michal Hocko
2016-12-02 22:09         ` Anatoly Stepanov
2016-12-06  8:47           ` Michal Hocko
2016-12-03 15:55             ` Anatoly Stepanov
2016-12-08  8:45               ` Michal Hocko
2016-12-05 14:09         ` Heiko Carstens
2016-12-05 14:19           ` Michal Hocko
2016-12-02 22:15             ` Anatoly Stepanov
2016-12-06  8:34               ` Michal Hocko
