* [patch] mm: memcontrol: support transparent huge pages under pressure
@ 2014-09-19 13:20 Johannes Weiner
  2014-09-22 10:18 ` Vladimir Davydov
  2014-09-23  5:52 ` Greg Thelen
  0 siblings, 2 replies; 9+ messages in thread
From: Johannes Weiner @ 2014-09-19 13:20 UTC (permalink / raw)
  To: linux-mm; +Cc: Michal Hocko, Greg Thelen, Dave Hansen, cgroups, linux-kernel

In a memcg with even just moderate cache pressure, success rates for
transparent huge page allocations drop to zero, wasting a lot of
effort that the allocator puts into assembling these pages.

The reason for this is that the memcg reclaim code was never designed
for higher-order charges.  It reclaims in small batches until there is
room for at least one page.  Huge page charges only succeed when
these batches add up over a series of huge faults, which is unlikely
under any significant load involving order-0 allocations in the group.
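
[ A rough back-of-the-envelope illustration, assuming 4k base pages,
  2MB huge pages (512 base pages) and the SWAP_CLUSTER_MAX reclaim
  batch of 32 pages: each reclaim pass frees on the order of 32 pages
  and the loop stops as soon as there is any margin at all, so the
  leftovers of roughly 512 / 32 = 16 such passes have to accumulate
  across consecutive huge faults before a single THP charge fits - and
  any concurrent order-0 allocation in the group consumes that margin
  first. ]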

Remove that loop on the memcg side in favor of passing the actual
reclaim goal to direct reclaim, which is already set up and optimized
to meet higher-order goals efficiently.
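
A condensed sketch of the resulting slow path in try_charge(), lifted
from the mm/memcontrol.c hunks below with the OOM and retry-limit
handling left out; the comments are added here only for orientation:

	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
						    gfp_mask, may_swap);
	/* enough room for the full (possibly huge) charge? */
	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
		goto retry;
	/* the per-cpu charge caches may hold the missing margin */
	if (!drained) {
		drain_all_stock_async(mem_over_limit);
		drained = true;
		goto retry;
	}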

This brings memcg's THP policy in line with the system policy: if the
allocator painstakingly assembles a hugepage, memcg will at least make
an honest effort to charge it.  As a result, transparent hugepage
allocation rates amid cache activity are drastically improved:

                                      vanilla                 patched
pgalloc                 4717530.80 (  +0.00%)   4451376.40 (  -5.64%)
pgfault                  491370.60 (  +0.00%)    225477.40 ( -54.11%)
pgmajfault                    2.00 (  +0.00%)         1.80 (  -6.67%)
thp_fault_alloc               0.00 (  +0.00%)       531.60 (+100.00%)
thp_fault_fallback          749.00 (  +0.00%)       217.40 ( -70.88%)

[ Note: this may in turn increase memory consumption from internal
  fragmentation, which is an inherent risk of transparent hugepages.
  Some setups may have to adjust the memcg limits accordingly to
  accommodate this - or, if the machine is already packed to capacity,
  disable the transparent huge page feature. ]

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 include/linux/swap.h |  6 ++--
 mm/memcontrol.c      | 86 +++++++++++-----------------------------------------
 mm/vmscan.c          |  7 +++--
 3 files changed, 25 insertions(+), 74 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index ea4f926e6b9b..37a585beef5c 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -327,8 +327,10 @@ extern void lru_cache_add_active_or_unevictable(struct page *page,
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask, nodemask_t *mask);
 extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
-extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
-						  gfp_t gfp_mask, bool noswap);
+extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
+						  unsigned long nr_pages,
+						  gfp_t gfp_mask,
+						  bool may_swap);
 extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 						gfp_t gfp_mask, bool noswap,
 						struct zone *zone,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9431024e490c..e2def11f1ec1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -315,9 +315,6 @@ struct mem_cgroup {
 	/* OOM-Killer disable */
 	int		oom_kill_disable;
 
-	/* set when res.limit == memsw.limit */
-	bool		memsw_is_minimum;
-
 	/* protect arrays of thresholds */
 	struct mutex thresholds_lock;
 
@@ -481,14 +478,6 @@ enum res_type {
 #define OOM_CONTROL		(0)
 
 /*
- * Reclaim flags for mem_cgroup_hierarchical_reclaim
- */
-#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
-#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
-#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
-#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
-
-/*
  * The memcg_create_mutex will be held whenever a new cgroup is created.
  * As a consequence, any change that needs to protect against new child cgroups
  * appearing has to hold it as well.
@@ -1794,42 +1783,6 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 			 NULL, "Memory cgroup out of memory");
 }
 
-static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
-					gfp_t gfp_mask,
-					unsigned long flags)
-{
-	unsigned long total = 0;
-	bool noswap = false;
-	int loop;
-
-	if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
-		noswap = true;
-	if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
-		noswap = true;
-
-	for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
-		if (loop)
-			drain_all_stock_async(memcg);
-		total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
-		/*
-		 * Allow limit shrinkers, which are triggered directly
-		 * by userspace, to catch signals and stop reclaim
-		 * after minimal progress, regardless of the margin.
-		 */
-		if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
-			break;
-		if (mem_cgroup_margin(memcg))
-			break;
-		/*
-		 * If nothing was reclaimed after two attempts, there
-		 * may be no reclaimable pages in this hierarchy.
-		 */
-		if (loop && !total)
-			break;
-	}
-	return total;
-}
-
 /**
  * test_mem_cgroup_node_reclaimable
  * @memcg: the target memcg
@@ -2532,8 +2485,9 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	struct mem_cgroup *mem_over_limit;
 	struct res_counter *fail_res;
 	unsigned long nr_reclaimed;
-	unsigned long flags = 0;
 	unsigned long long size;
+	bool may_swap = true;
+	bool drained = false;
 	int ret = 0;
 
 	if (mem_cgroup_is_root(memcg))
@@ -2550,7 +2504,7 @@ retry:
 			goto done_restock;
 		res_counter_uncharge(&memcg->res, size);
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
-		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
+		may_swap = false;
 	} else
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
 
@@ -2576,11 +2530,18 @@ retry:
 	if (!(gfp_mask & __GFP_WAIT))
 		goto nomem;
 
-	nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
+	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
+						    gfp_mask, may_swap);
 
 	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
 		goto retry;
 
+	if (!drained) {
+		drain_all_stock_async(mem_over_limit);
+		drained = true;
+		goto retry;
+	}
+
 	if (gfp_mask & __GFP_NORETRY)
 		goto nomem;
 	/*
@@ -3655,19 +3616,13 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 			enlarge = 1;
 
 		ret = res_counter_set_limit(&memcg->res, val);
-		if (!ret) {
-			if (memswlimit == val)
-				memcg->memsw_is_minimum = true;
-			else
-				memcg->memsw_is_minimum = false;
-		}
 		mutex_unlock(&set_limit_mutex);
 
 		if (!ret)
 			break;
 
-		mem_cgroup_reclaim(memcg, GFP_KERNEL,
-				   MEM_CGROUP_RECLAIM_SHRINK);
+		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
+
 		curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
 		/* Usage is reduced ? */
 		if (curusage >= oldusage)
@@ -3714,20 +3669,13 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 		if (memswlimit < val)
 			enlarge = 1;
 		ret = res_counter_set_limit(&memcg->memsw, val);
-		if (!ret) {
-			if (memlimit == val)
-				memcg->memsw_is_minimum = true;
-			else
-				memcg->memsw_is_minimum = false;
-		}
 		mutex_unlock(&set_limit_mutex);
 
 		if (!ret)
 			break;
 
-		mem_cgroup_reclaim(memcg, GFP_KERNEL,
-				   MEM_CGROUP_RECLAIM_NOSWAP |
-				   MEM_CGROUP_RECLAIM_SHRINK);
+		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
+
 		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
 		/* Usage is reduced ? */
 		if (curusage >= oldusage)
@@ -3976,8 +3924,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
 		if (signal_pending(current))
 			return -EINTR;
 
-		progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
-						false);
+		progress = try_to_free_mem_cgroup_pages(memcg, 1,
+							GFP_KERNEL, true);
 		if (!progress) {
 			nr_retries--;
 			/* maybe some writeback is necessary */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b672e2c6becc..97d31ec17d06 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2759,21 +2759,22 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
 }
 
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
+					   unsigned long nr_pages,
 					   gfp_t gfp_mask,
-					   bool noswap)
+					   bool may_swap)
 {
 	struct zonelist *zonelist;
 	unsigned long nr_reclaimed;
 	int nid;
 	struct scan_control sc = {
-		.nr_to_reclaim = SWAP_CLUSTER_MAX,
+		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
 		.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 				(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
 		.target_mem_cgroup = memcg,
 		.priority = DEF_PRIORITY,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
-		.may_swap = !noswap,
+		.may_swap = may_swap,
 	};
 
 	/*
-- 
2.1.0



* Re: [patch] mm: memcontrol: support transparent huge pages under pressure
  2014-09-19 13:20 [patch] mm: memcontrol: support transparent huge pages under pressure Johannes Weiner
@ 2014-09-22 10:18 ` Vladimir Davydov
  2014-09-23  5:52 ` Greg Thelen
  1 sibling, 0 replies; 9+ messages in thread
From: Vladimir Davydov @ 2014-09-22 10:18 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: linux-mm, Michal Hocko, Greg Thelen, Dave Hansen, cgroups, linux-kernel

On Fri, Sep 19, 2014 at 09:20:40AM -0400, Johannes Weiner wrote:
> In a memcg with even just moderate cache pressure, success rates for
> transparent huge page allocations drop to zero, wasting a lot of
> effort that the allocator puts into assembling these pages.
> 
> The reason for this is that the memcg reclaim code was never designed
> for higher-order charges.  It reclaims in small batches until there is
> room for at least one page.  Huge page charges only succeed when
> these batches add up over a series of huge faults, which is unlikely
> under any significant load involving order-0 allocations in the group.
> 
> Remove that loop on the memcg side in favor of passing the actual
> reclaim goal to direct reclaim, which is already set up and optimized
> to meet higher-order goals efficiently.
> 
> This brings memcg's THP policy in line with the system policy: if the
> allocator painstakingly assembles a hugepage, memcg will at least make
> an honest effort to charge it.  As a result, transparent hugepage
> allocation rates amid cache activity are drastically improved:
> 
>                                       vanilla                 patched
> pgalloc                 4717530.80 (  +0.00%)   4451376.40 (  -5.64%)
> pgfault                  491370.60 (  +0.00%)    225477.40 ( -54.11%)
> pgmajfault                    2.00 (  +0.00%)         1.80 (  -6.67%)
> thp_fault_alloc               0.00 (  +0.00%)       531.60 (+100.00%)
> thp_fault_fallback          749.00 (  +0.00%)       217.40 ( -70.88%)
> 
> [ Note: this may in turn increase memory consumption from internal
>   fragmentation, which is an inherent risk of transparent hugepages.
>   Some setups may have to adjust the memcg limits accordingly to
>   accommodate this - or, if the machine is already packed to capacity,
>   disable the transparent huge page feature. ]
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

Looks like a really nice change to me. FWIW,

Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>


* Re: [patch] mm: memcontrol: support transparent huge pages under pressure
  2014-09-19 13:20 [patch] mm: memcontrol: support transparent huge pages under pressure Johannes Weiner
  2014-09-22 10:18 ` Vladimir Davydov
@ 2014-09-23  5:52 ` Greg Thelen
  2014-09-23  8:29   ` Vladimir Davydov
  2014-09-23 11:16   ` Johannes Weiner
  1 sibling, 2 replies; 9+ messages in thread
From: Greg Thelen @ 2014-09-23  5:52 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: linux-mm, Michal Hocko, Dave Hansen, cgroups, linux-kernel


On Fri, Sep 19 2014, Johannes Weiner wrote:

> In a memcg with even just moderate cache pressure, success rates for
> transparent huge page allocations drop to zero, wasting a lot of
> effort that the allocator puts into assembling these pages.
>
> The reason for this is that the memcg reclaim code was never designed
> for higher-order charges.  It reclaims in small batches until there is
> room for at least one page.  Huge page charges only succeed when
> these batches add up over a series of huge faults, which is unlikely
> under any significant load involving order-0 allocations in the group.
>
> Remove that loop on the memcg side in favor of passing the actual
> reclaim goal to direct reclaim, which is already set up and optimized
> to meet higher-order goals efficiently.
>
> This brings memcg's THP policy in line with the system policy: if the
> allocator painstakingly assembles a hugepage, memcg will at least make
> an honest effort to charge it.  As a result, transparent hugepage
> allocation rates amid cache activity are drastically improved:
>
>                                       vanilla                 patched
> pgalloc                 4717530.80 (  +0.00%)   4451376.40 (  -5.64%)
> pgfault                  491370.60 (  +0.00%)    225477.40 ( -54.11%)
> pgmajfault                    2.00 (  +0.00%)         1.80 (  -6.67%)
> thp_fault_alloc               0.00 (  +0.00%)       531.60 (+100.00%)
> thp_fault_fallback          749.00 (  +0.00%)       217.40 ( -70.88%)
>
> [ Note: this may in turn increase memory consumption from internal
>   fragmentation, which is an inherent risk of transparent hugepages.
>   Some setups may have to adjust the memcg limits accordingly to
>   accommodate this - or, if the machine is already packed to capacity,
>   disable the transparent huge page feature. ]

We're using an earlier version of this patch, so I approve of the
general direction.  But I have some feedback.

The memsw aspect of this change seems somewhat separate.  Can it be
split into a different patch?

The memsw aspect of this patch seems to change behavior.  Is this
intended?  If so, a mention of it in the commit log would assuage the
reader.  I'll explain...  Assume a machine with swap enabled and
res.limit==memsw.limit, thus memsw_is_minimum is true.  My understanding
is that memsw.usage represents sum(ram_usage, swap_usage).  So when
memsw_is_minimum=true, then both swap_usage=0 and
memsw.usage==res.usage.  In this condition, if res usage is at limit
then there's no point in swapping because memsw.usage is already
maximal.  Prior to this patch I think the kernel did the right thing,
but not afterwards.

Before this patch:
  if res.usage == res.limit, try_charge() indirectly calls
  try_to_free_mem_cgroup_pages(noswap=true)

After this patch:
  if res.usage == res.limit, try_charge() calls
  try_to_free_mem_cgroup_pages(may_swap=true)

Notice the inverted swap-is-allowed value.

I haven't had time to look at your other outstanding memcg patches.
These comments were made with this patch in isolation.

> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> ---
>  include/linux/swap.h |  6 ++--
>  mm/memcontrol.c      | 86 +++++++++++-----------------------------------------
>  mm/vmscan.c          |  7 +++--
>  3 files changed, 25 insertions(+), 74 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index ea4f926e6b9b..37a585beef5c 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -327,8 +327,10 @@ extern void lru_cache_add_active_or_unevictable(struct page *page,
>  extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
>  					gfp_t gfp_mask, nodemask_t *mask);
>  extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
> -extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
> -						  gfp_t gfp_mask, bool noswap);
> +extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
> +						  unsigned long nr_pages,
> +						  gfp_t gfp_mask,
> +						  bool may_swap);
>  extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
>  						gfp_t gfp_mask, bool noswap,
>  						struct zone *zone,
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 9431024e490c..e2def11f1ec1 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -315,9 +315,6 @@ struct mem_cgroup {
>  	/* OOM-Killer disable */
>  	int		oom_kill_disable;
>  
> -	/* set when res.limit == memsw.limit */
> -	bool		memsw_is_minimum;
> -
>  	/* protect arrays of thresholds */
>  	struct mutex thresholds_lock;
>  
> @@ -481,14 +478,6 @@ enum res_type {
>  #define OOM_CONTROL		(0)
>  
>  /*
> - * Reclaim flags for mem_cgroup_hierarchical_reclaim
> - */
> -#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
> -#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
> -#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
> -#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
> -
> -/*
>   * The memcg_create_mutex will be held whenever a new cgroup is created.
>   * As a consequence, any change that needs to protect against new child cgroups
>   * appearing has to hold it as well.
> @@ -1794,42 +1783,6 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
>  			 NULL, "Memory cgroup out of memory");
>  }
>  
> -static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
> -					gfp_t gfp_mask,
> -					unsigned long flags)
> -{
> -	unsigned long total = 0;
> -	bool noswap = false;
> -	int loop;
> -
> -	if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
> -		noswap = true;
> -	if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
> -		noswap = true;
> -
> -	for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
> -		if (loop)
> -			drain_all_stock_async(memcg);
> -		total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
> -		/*
> -		 * Allow limit shrinkers, which are triggered directly
> -		 * by userspace, to catch signals and stop reclaim
> -		 * after minimal progress, regardless of the margin.
> -		 */
> -		if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
> -			break;
> -		if (mem_cgroup_margin(memcg))
> -			break;
> -		/*
> -		 * If nothing was reclaimed after two attempts, there
> -		 * may be no reclaimable pages in this hierarchy.
> -		 */
> -		if (loop && !total)
> -			break;
> -	}
> -	return total;
> -}
> -
>  /**
>   * test_mem_cgroup_node_reclaimable
>   * @memcg: the target memcg
> @@ -2532,8 +2485,9 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
>  	struct mem_cgroup *mem_over_limit;
>  	struct res_counter *fail_res;
>  	unsigned long nr_reclaimed;
> -	unsigned long flags = 0;
>  	unsigned long long size;
> +	bool may_swap = true;
> +	bool drained = false;
>  	int ret = 0;
>  
>  	if (mem_cgroup_is_root(memcg))
> @@ -2550,7 +2504,7 @@ retry:
>  			goto done_restock;
>  		res_counter_uncharge(&memcg->res, size);
>  		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
> -		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
> +		may_swap = false;
>  	} else
>  		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
>  
> @@ -2576,11 +2530,18 @@ retry:
>  	if (!(gfp_mask & __GFP_WAIT))
>  		goto nomem;
>  
> -	nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
> +	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
> +						    gfp_mask, may_swap);
>  
>  	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
>  		goto retry;
>  
> +	if (!drained) {
> +		drain_all_stock_async(mem_over_limit);
> +		drained = true;
> +		goto retry;
> +	}
> +
>  	if (gfp_mask & __GFP_NORETRY)
>  		goto nomem;
>  	/*
> @@ -3655,19 +3616,13 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
>  			enlarge = 1;
>  
>  		ret = res_counter_set_limit(&memcg->res, val);
> -		if (!ret) {
> -			if (memswlimit == val)
> -				memcg->memsw_is_minimum = true;
> -			else
> -				memcg->memsw_is_minimum = false;
> -		}
>  		mutex_unlock(&set_limit_mutex);
>  
>  		if (!ret)
>  			break;
>  
> -		mem_cgroup_reclaim(memcg, GFP_KERNEL,
> -				   MEM_CGROUP_RECLAIM_SHRINK);
> +		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
> +
>  		curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
>  		/* Usage is reduced ? */
>  		if (curusage >= oldusage)
> @@ -3714,20 +3669,13 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
>  		if (memswlimit < val)
>  			enlarge = 1;
>  		ret = res_counter_set_limit(&memcg->memsw, val);
> -		if (!ret) {
> -			if (memlimit == val)
> -				memcg->memsw_is_minimum = true;
> -			else
> -				memcg->memsw_is_minimum = false;
> -		}
>  		mutex_unlock(&set_limit_mutex);
>  
>  		if (!ret)
>  			break;
>  
> -		mem_cgroup_reclaim(memcg, GFP_KERNEL,
> -				   MEM_CGROUP_RECLAIM_NOSWAP |
> -				   MEM_CGROUP_RECLAIM_SHRINK);
> +		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
> +
>  		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
>  		/* Usage is reduced ? */
>  		if (curusage >= oldusage)
> @@ -3976,8 +3924,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
>  		if (signal_pending(current))
>  			return -EINTR;
>  
> -		progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
> -						false);
> +		progress = try_to_free_mem_cgroup_pages(memcg, 1,
> +							GFP_KERNEL, true);
>  		if (!progress) {
>  			nr_retries--;
>  			/* maybe some writeback is necessary */
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index b672e2c6becc..97d31ec17d06 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2759,21 +2759,22 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
>  }
>  
>  unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
> +					   unsigned long nr_pages,
>  					   gfp_t gfp_mask,
> -					   bool noswap)
> +					   bool may_swap)
>  {
>  	struct zonelist *zonelist;
>  	unsigned long nr_reclaimed;
>  	int nid;
>  	struct scan_control sc = {
> -		.nr_to_reclaim = SWAP_CLUSTER_MAX,
> +		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
>  		.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
>  				(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
>  		.target_mem_cgroup = memcg,
>  		.priority = DEF_PRIORITY,
>  		.may_writepage = !laptop_mode,
>  		.may_unmap = 1,
> -		.may_swap = !noswap,
> +		.may_swap = may_swap,
>  	};
>  
>  	/*


* Re: [patch] mm: memcontrol: support transparent huge pages under pressure
  2014-09-23  5:52 ` Greg Thelen
@ 2014-09-23  8:29   ` Vladimir Davydov
  2014-09-23 11:44     ` Vladimir Davydov
  2014-09-23 11:48     ` Johannes Weiner
  2014-09-23 11:16   ` Johannes Weiner
  1 sibling, 2 replies; 9+ messages in thread
From: Vladimir Davydov @ 2014-09-23  8:29 UTC (permalink / raw)
  To: Greg Thelen
  Cc: Johannes Weiner, linux-mm, Michal Hocko, Dave Hansen, cgroups,
	linux-kernel

[sorry for butting in, but I think I can answer your question]

On Mon, Sep 22, 2014 at 10:52:50PM -0700, Greg Thelen wrote:
> 
> On Fri, Sep 19 2014, Johannes Weiner wrote:
> 
> > In a memcg with even just moderate cache pressure, success rates for
> > transparent huge page allocations drop to zero, wasting a lot of
> > effort that the allocator puts into assembling these pages.
> >
> > The reason for this is that the memcg reclaim code was never designed
> > for higher-order charges.  It reclaims in small batches until there is
> > room for at least one page.  Huge page charges only succeed when
> > these batches add up over a series of huge faults, which is unlikely
> > under any significant load involving order-0 allocations in the group.
> >
> > Remove that loop on the memcg side in favor of passing the actual
> > reclaim goal to direct reclaim, which is already set up and optimized
> > to meet higher-order goals efficiently.
> >
> > This brings memcg's THP policy in line with the system policy: if the
> > allocator painstakingly assembles a hugepage, memcg will at least make
> > an honest effort to charge it.  As a result, transparent hugepage
> > allocation rates amid cache activity are drastically improved:
> >
> >                                       vanilla                 patched
> > pgalloc                 4717530.80 (  +0.00%)   4451376.40 (  -5.64%)
> > pgfault                  491370.60 (  +0.00%)    225477.40 ( -54.11%)
> > pgmajfault                    2.00 (  +0.00%)         1.80 (  -6.67%)
> > thp_fault_alloc               0.00 (  +0.00%)       531.60 (+100.00%)
> > thp_fault_fallback          749.00 (  +0.00%)       217.40 ( -70.88%)
> >
> > [ Note: this may in turn increase memory consumption from internal
> >   fragmentation, which is an inherent risk of transparent hugepages.
> >   Some setups may have to adjust the memcg limits accordingly to
> >   accommodate this - or, if the machine is already packed to capacity,
> >   disable the transparent huge page feature. ]
> 
> We're using an earlier version of this patch, so I approve of the
> general direction.  But I have some feedback.
> 
> The memsw aspect of this change seems somewhat separate.  Can it be
> split into a different patch?
> 
> The memsw aspect of this patch seems to change behavior.  Is this
> intended?  If so, a mention of it in the commit log would assuage the
> reader.  I'll explain...  Assume a machine with swap enabled and
> res.limit==memsw.limit, thus memsw_is_minimum is true.  My understanding
> is that memsw.usage represents sum(ram_usage, swap_usage).  So when
> memsw_is_minimum=true, then both swap_usage=0 and
> memsw.usage==res.usage. 

Not necessarily, we can have memsw.usage > res.usage due to global
pressure. The point is (memsw.limit-res.limit) is not the swap usage
limit. This is really confusing. As Johannes pointed out, we should drop
memsw.limit in favor of separate mem.limit and swap.limit.
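
(To make that concrete - a purely hypothetical scenario, numbers picked
only for illustration: res.limit = memsw.limit = 1G and the group has
1G of RAM charged.  Global reclaim swaps out 100M of it; memory+swap is
unchanged, so memsw.usage stays at 1G while res.usage drops to 900M.
The limits are equal, yet memsw.usage > res.usage, and 100M of the
group's data sits in swap.)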

> In this condition, if res usage is at limit then there's no point in
> swapping because memsw.usage is already maximal.  Prior to this patch
> I think the kernel did the right thing, but not afterwards.
> 
> Before this patch:
>   if res.usage == res.limit, try_charge() indirectly calls
>   try_to_free_mem_cgroup_pages(noswap=true)

But this is wrong. If we fail to charge res, we should try to do swap
out along with page cache reclaim. Swap out won't affect memsw.usage,
but will diminish res.usage so that the allocation may succeed.

AFAIU we should only disable swap-out if we succeeded to charge res, but
failed with memsw. And this is the rule that this patch follows.

Anyway, I agree with you that this is far from obvious and deserves a
separate patch.

Thanks,
Vladimir

> 
> After this patch:
>   if res.usage == res.limit, try_charge() calls
>   try_to_free_mem_cgroup_pages(may_swap=true)
> 
> Notice the inverted swap-is-allowed value.
> 
> I haven't had time to look at your other outstanding memcg patches.
> These comments were made with this patch in isolation.
[...]


* Re: [patch] mm: memcontrol: support transparent huge pages under pressure
  2014-09-23  5:52 ` Greg Thelen
  2014-09-23  8:29   ` Vladimir Davydov
@ 2014-09-23 11:16   ` Johannes Weiner
  2014-09-24  0:07     ` Greg Thelen
  1 sibling, 1 reply; 9+ messages in thread
From: Johannes Weiner @ 2014-09-23 11:16 UTC (permalink / raw)
  To: Greg Thelen; +Cc: linux-mm, Michal Hocko, Dave Hansen, cgroups, linux-kernel

On Mon, Sep 22, 2014 at 10:52:50PM -0700, Greg Thelen wrote:
> 
> On Fri, Sep 19 2014, Johannes Weiner wrote:
> 
> > In a memcg with even just moderate cache pressure, success rates for
> > transparent huge page allocations drop to zero, wasting a lot of
> > effort that the allocator puts into assembling these pages.
> >
> > The reason for this is that the memcg reclaim code was never designed
> > for higher-order charges.  It reclaims in small batches until there is
> > room for at least one page.  Huge page charges only succeed when
> > these batches add up over a series of huge faults, which is unlikely
> > under any significant load involving order-0 allocations in the group.
> >
> > Remove that loop on the memcg side in favor of passing the actual
> > reclaim goal to direct reclaim, which is already set up and optimized
> > to meet higher-order goals efficiently.
> >
> > This brings memcg's THP policy in line with the system policy: if the
> > allocator painstakingly assembles a hugepage, memcg will at least make
> > an honest effort to charge it.  As a result, transparent hugepage
> > allocation rates amid cache activity are drastically improved:
> >
> >                                       vanilla                 patched
> > pgalloc                 4717530.80 (  +0.00%)   4451376.40 (  -5.64%)
> > pgfault                  491370.60 (  +0.00%)    225477.40 ( -54.11%)
> > pgmajfault                    2.00 (  +0.00%)         1.80 (  -6.67%)
> > thp_fault_alloc               0.00 (  +0.00%)       531.60 (+100.00%)
> > thp_fault_fallback          749.00 (  +0.00%)       217.40 ( -70.88%)
> >
> > [ Note: this may in turn increase memory consumption from internal
> >   fragmentation, which is an inherent risk of transparent hugepages.
> >   Some setups may have to adjust the memcg limits accordingly to
> >   accommodate this - or, if the machine is already packed to capacity,
> >   disable the transparent huge page feature. ]
> 
> We're using an earlier version of this patch, so I approve of the
> general direction.  But I have some feedback.
> 
> The memsw aspect of this change seems somewhat separate.  Can it be
> split into a different patch?
> 
> The memsw aspect of this patch seems to change behavior.  Is this
> intended?  If so, a mention of it in the commit log would assuage the
> reader.  I'll explain...  Assume a machine with swap enabled and
> res.limit==memsw.limit, thus memsw_is_minimum is true.  My understanding
> is that memsw.usage represents sum(ram_usage, swap_usage).  So when
> memsw_is_minimum=true, then both swap_usage=0 and
> memsw.usage==res.usage.  In this condition, if res usage is at limit
> then there's no point in swapping because memsw.usage is already
> maximal.  Prior to this patch I think the kernel did the right thing,
> but not afterwards.
> 
> Before this patch:
>   if res.usage == res.limit, try_charge() indirectly calls
>   try_to_free_mem_cgroup_pages(noswap=true)
> 
> After this patch:
>   if res.usage == res.limit, try_charge() calls
>   try_to_free_mem_cgroup_pages(may_swap=true)
> 
> Notice the inverted swap-is-allowed value.

For some reason I had myself convinced that this is dead code due to a
change in callsites a long time ago, but you are right that currently
try_charge() relies on it, thanks for pointing it out.

However, memsw is always equal to or bigger than the memory limit - so
instead of keeping a separate state variable to track when memory
failure implies memsw failure, couldn't we just charge memsw first?

How about the following?  But yeah, I'd split this into a separate
patch now.

---
 mm/memcontrol.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e2def11f1ec1..7c9a8971d0f4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2497,16 +2497,17 @@ retry:
 		goto done;
 
 	size = batch * PAGE_SIZE;
-	if (!res_counter_charge(&memcg->res, size, &fail_res)) {
-		if (!do_swap_account)
+	if (!do_swap_account ||
+	    !res_counter_charge(&memcg->memsw, size, &fail_res)) {
+		if (!res_counter_charge(&memcg->res, size, &fail_res))
 			goto done_restock;
-		if (!res_counter_charge(&memcg->memsw, size, &fail_res))
-			goto done_restock;
-		res_counter_uncharge(&memcg->res, size);
+		if (do_swap_account)
+			res_counter_uncharge(&memcg->memsw, size);
+		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+	} else {
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
 		may_swap = false;
-	} else
-		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+	}
 
 	if (batch > nr_pages) {
 		batch = nr_pages;
-- 
2.1.0


* Re: [patch] mm: memcontrol: support transparent huge pages under pressure
  2014-09-23  8:29   ` Vladimir Davydov
@ 2014-09-23 11:44     ` Vladimir Davydov
  2014-09-23 11:48     ` Johannes Weiner
  1 sibling, 0 replies; 9+ messages in thread
From: Vladimir Davydov @ 2014-09-23 11:44 UTC (permalink / raw)
  To: Greg Thelen
  Cc: Johannes Weiner, linux-mm, Michal Hocko, Dave Hansen, cgroups,
	linux-kernel

On Tue, Sep 23, 2014 at 12:29:27PM +0400, Vladimir Davydov wrote:
> On Mon, Sep 22, 2014 at 10:52:50PM -0700, Greg Thelen wrote:
> > In this condition, if res usage is at limit then there's no point in
> > swapping because memsw.usage is already maximal.  Prior to this patch
> > I think the kernel did the right thing, but not afterwards.
> > 
> > Before this patch:
> >   if res.usage == res.limit, try_charge() indirectly calls
> >   try_to_free_mem_cgroup_pages(noswap=true)
> 
> But this is wrong. If we fail to charge res, we should try to do swap
> out along with page cache reclaim. Swap out won't affect memsw.usage,
> but will diminish res.usage so that the allocation may succeed.

Oops, I missed your point, sorry. If we hit the res.limit and
memsw.limit=res.limit, we automatically hit memsw.limit too, so there's
no point swapping out.

Thanks,
Vladimir


* Re: [patch] mm: memcontrol: support transparent huge pages under pressure
  2014-09-23  8:29   ` Vladimir Davydov
  2014-09-23 11:44     ` Vladimir Davydov
@ 2014-09-23 11:48     ` Johannes Weiner
  2014-09-23 11:56       ` Vladimir Davydov
  1 sibling, 1 reply; 9+ messages in thread
From: Johannes Weiner @ 2014-09-23 11:48 UTC (permalink / raw)
  To: Vladimir Davydov
  Cc: Greg Thelen, linux-mm, Michal Hocko, Dave Hansen, cgroups, linux-kernel

On Tue, Sep 23, 2014 at 12:29:27PM +0400, Vladimir Davydov wrote:
> [sorry for butting in, but I think I can answer your question]
> 
> On Mon, Sep 22, 2014 at 10:52:50PM -0700, Greg Thelen wrote:
> > 
> > On Fri, Sep 19 2014, Johannes Weiner wrote:
> > 
> > > In a memcg with even just moderate cache pressure, success rates for
> > > transparent huge page allocations drop to zero, wasting a lot of
> > > effort that the allocator puts into assembling these pages.
> > >
> > > The reason for this is that the memcg reclaim code was never designed
> > > for higher-order charges.  It reclaims in small batches until there is
> > > room for at least one page.  Huge page charges only succeed when
> > > these batches add up over a series of huge faults, which is unlikely
> > > under any significant load involving order-0 allocations in the group.
> > >
> > > Remove that loop on the memcg side in favor of passing the actual
> > > reclaim goal to direct reclaim, which is already set up and optimized
> > > to meet higher-order goals efficiently.
> > >
> > > This brings memcg's THP policy in line with the system policy: if the
> > > allocator painstakingly assembles a hugepage, memcg will at least make
> > > an honest effort to charge it.  As a result, transparent hugepage
> > > allocation rates amid cache activity are drastically improved:
> > >
> > >                                       vanilla                 patched
> > > pgalloc                 4717530.80 (  +0.00%)   4451376.40 (  -5.64%)
> > > pgfault                  491370.60 (  +0.00%)    225477.40 ( -54.11%)
> > > pgmajfault                    2.00 (  +0.00%)         1.80 (  -6.67%)
> > > thp_fault_alloc               0.00 (  +0.00%)       531.60 (+100.00%)
> > > thp_fault_fallback          749.00 (  +0.00%)       217.40 ( -70.88%)
> > >
> > > [ Note: this may in turn increase memory consumption from internal
> > >   fragmentation, which is an inherent risk of transparent hugepages.
> > >   Some setups may have to adjust the memcg limits accordingly to
> > >   accommodate this - or, if the machine is already packed to capacity,
> > >   disable the transparent huge page feature. ]
> > 
> > We're using an earlier version of this patch, so I approve of the
> > general direction.  But I have some feedback.
> > 
> > The memsw aspect of this change seems somewhat separate.  Can it be
> > split into a different patch?
> > 
> > The memsw aspect of this patch seems to change behavior.  Is this
> > intended?  If so, a mention of it in the commit log would assuage the
> > reader.  I'll explain...  Assume a machine with swap enabled and
> > res.limit==memsw.limit, thus memsw_is_minimum is true.  My understanding
> > is that memsw.usage represents sum(ram_usage, swap_usage).  So when
> > memsw_is_minimum=true, then both swap_usage=0 and
> > memsw.usage==res.usage. 
> 
> Not necessarily, we can have memsw.usage > res.usage due to global
> pressure. The point is (memsw.limit-res.limit) is not the swap usage
> limit. This is really confusing. As Johannes pointed out, we should drop
> memsw.limit in favor of separate mem.limit and swap.limit.

Memsw can exceed memory, but not the other way round, so if the limits
are equal and we hit the memory limit, surely the memsw limit must be
hit as well?
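
(Spelled out, under the usual accounting invariants: memsw.usage >=
res.usage always holds, so res.usage == res.limit == memsw.limit
implies the memsw counter is at its ceiling too - swapping cannot
create memsw headroom.)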

> > In this condition, if res usage is at limit then there's no point in
> > swapping because memsw.usage is already maximal.  Prior to this patch
> > I think the kernel did the right thing, but not afterwards.
> > 
> > Before this patch:
> >   if res.usage == res.limit, try_charge() indirectly calls
> >   try_to_free_mem_cgroup_pages(noswap=true)
> 
> But this is wrong. If we fail to charge res, we should try to do swap
> out along with page cache reclaim. Swap out won't affect memsw.usage,
> but will diminish res.usage so that the allocation may succeed.

But we know that the memsw limit must be hit as well in that case, and
swapping only makes progress in the sense that the memory charge then
succeeds.  But we still fail to charge memsw.


* Re: [patch] mm: memcontrol: support transparent huge pages under pressure
  2014-09-23 11:48     ` Johannes Weiner
@ 2014-09-23 11:56       ` Vladimir Davydov
  0 siblings, 0 replies; 9+ messages in thread
From: Vladimir Davydov @ 2014-09-23 11:56 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Greg Thelen, linux-mm, Michal Hocko, Dave Hansen, cgroups, linux-kernel

On Tue, Sep 23, 2014 at 07:48:27AM -0400, Johannes Weiner wrote:
> On Tue, Sep 23, 2014 at 12:29:27PM +0400, Vladimir Davydov wrote:
> > On Mon, Sep 22, 2014 at 10:52:50PM -0700, Greg Thelen wrote:
> > > In this condition, if res usage is at limit then there's no point in
> > > swapping because memsw.usage is already maximal.  Prior to this patch
> > > I think the kernel did the right thing, but not afterwards.
> > > 
> > > Before this patch:
> > >   if res.usage == res.limit, try_charge() indirectly calls
> > >   try_to_free_mem_cgroup_pages(noswap=true)
> > 
> > But this is wrong. If we fail to charge res, we should try to do swap
> > out along with page cache reclaim. Swap out won't affect memsw.usage,
> > but will diminish res.usage so that the allocation may succeed.
> 
> But we know that the memsw limit must be hit as well in that case, and
> swapping only makes progress in the sense that we are then succeeding
> the memory charge.  But we still fail to charge memsw.

Yeah, I admit I said nonsense. The problem Greg pointed out does exist.
I think your second patch (charging memsw before res) should fix it.

Thanks,
Vladimir


* Re: [patch] mm: memcontrol: support transparent huge pages under pressure
  2014-09-23 11:16   ` Johannes Weiner
@ 2014-09-24  0:07     ` Greg Thelen
  0 siblings, 0 replies; 9+ messages in thread
From: Greg Thelen @ 2014-09-24  0:07 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: linux-mm, Michal Hocko, Dave Hansen, cgroups, linux-kernel


On Tue, Sep 23 2014, Johannes Weiner wrote:

> On Mon, Sep 22, 2014 at 10:52:50PM -0700, Greg Thelen wrote:
>> 
>> On Fri, Sep 19 2014, Johannes Weiner wrote:
>> 
>> > In a memcg with even just moderate cache pressure, success rates for
>> > transparent huge page allocations drop to zero, wasting a lot of
>> > effort that the allocator puts into assembling these pages.
>> >
>> > The reason for this is that the memcg reclaim code was never designed
>> > for higher-order charges.  It reclaims in small batches until there is
>> > room for at least one page.  Huge pages charges only succeed when
>> > these batches add up over a series of huge faults, which is unlikely
>> > under any significant load involving order-0 allocations in the group.
>> >
>> > Remove that loop on the memcg side in favor of passing the actual
>> > reclaim goal to direct reclaim, which is already set up and optimized
>> > to meet higher-order goals efficiently.
>> >
>> > This brings memcg's THP policy in line with the system policy: if the
>> > allocator painstakingly assembles a hugepage, memcg will at least make
>> > an honest effort to charge it.  As a result, transparent hugepage
>> > allocation rates amid cache activity are drastically improved:
>> >
>> >                                       vanilla                 patched
>> > pgalloc                 4717530.80 (  +0.00%)   4451376.40 (  -5.64%)
>> > pgfault                  491370.60 (  +0.00%)    225477.40 ( -54.11%)
>> > pgmajfault                    2.00 (  +0.00%)         1.80 (  -6.67%)
>> > thp_fault_alloc               0.00 (  +0.00%)       531.60 (+100.00%)
>> > thp_fault_fallback          749.00 (  +0.00%)       217.40 ( -70.88%)
>> >
>> > [ Note: this may in turn increase memory consumption from internal
>> >   fragmentation, which is an inherent risk of transparent hugepages.
>> >   Some setups may have to adjust the memcg limits accordingly to
>> >   accommodate this - or, if the machine is already packed to capacity,
>> >   disable the transparent huge page feature. ]
>> 
>> We're using an earlier version of this patch, so I approve of the
>> general direction.  But I have some feedback.
>> 
>> The memsw aspect of this change seems somewhat separate.  Can it be
>> split into a different patch?
>> 
>> The memsw aspect of this patch seems to change behavior.  Is this
>> intended?  If so, a mention of it in the commit log would assuage the
>> reader.  I'll explain...  Assume a machine with swap enabled and
>> res.limit==memsw.limit, thus memsw_is_minimum is true.  My understanding
>> is that memsw.usage represents sum(ram_usage, swap_usage).  So when
>> memsw_is_minimum=true, then both swap_usage=0 and
>> memsw.usage==res.usage.  In this condition, if res usage is at limit
>> then there's no point in swapping because memsw.usage is already
>> maximal.  Prior to this patch I think the kernel did the right thing,
>> but not afterwards.
>> 
>> Before this patch:
>>   if res.usage == res.limit, try_charge() indirectly calls
>>   try_to_free_mem_cgroup_pages(noswap=true)
>> 
>> After this patch:
>>   if res.usage == res.limit, try_charge() calls
>>   try_to_free_mem_cgroup_pages(may_swap=true)
>> 
>> Notice the inverted swap-is-allowed value.
>
> For some reason I had myself convinced that this is dead code due to a
> change in callsites a long time ago, but you are right that currently
> try_charge() relies on it, thanks for pointing it out.
>
> However, memsw is always equal to or bigger than the memory limit - so
> instead of keeping a separate state variable to track when memory
> failure implies memsw failure, couldn't we just charge memsw first?
>
> How about the following?  But yeah, I'd split this into a separate
> patch now.

Looks good to me.  Thanks.

Acked-by: Greg Thelen <gthelen@google.com>

> ---
>  mm/memcontrol.c | 15 ++++++++-------
>  1 file changed, 8 insertions(+), 7 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e2def11f1ec1..7c9a8971d0f4 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2497,16 +2497,17 @@ retry:
>  		goto done;
>  
>  	size = batch * PAGE_SIZE;
> -	if (!res_counter_charge(&memcg->res, size, &fail_res)) {
> -		if (!do_swap_account)
> +	if (!do_swap_account ||
> +	    !res_counter_charge(&memcg->memsw, size, &fail_res)) {
> +		if (!res_counter_charge(&memcg->res, size, &fail_res))
>  			goto done_restock;
> -		if (!res_counter_charge(&memcg->memsw, size, &fail_res))
> -			goto done_restock;
> -		res_counter_uncharge(&memcg->res, size);
> +		if (do_swap_account)
> +			res_counter_uncharge(&memcg->memsw, size);
> +		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
> +	} else {
>  		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
>  		may_swap = false;
> -	} else
> -		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
> +	}
>  
>  	if (batch > nr_pages) {
>  		batch = nr_pages;


