* [patch] mm: memcontrol: support transparent huge pages under pressure
@ 2014-09-19 13:20 Johannes Weiner
2014-09-22 10:18 ` Vladimir Davydov
2014-09-23 5:52 ` Greg Thelen
0 siblings, 2 replies; 9+ messages in thread
From: Johannes Weiner @ 2014-09-19 13:20 UTC (permalink / raw)
To: linux-mm; +Cc: Michal Hocko, Greg Thelen, Dave Hansen, cgroups, linux-kernel
In a memcg with even just moderate cache pressure, success rates for
transparent huge page allocations drop to zero, wasting a lot of
effort that the allocator puts into assembling these pages.
The reason for this is that the memcg reclaim code was never designed
for higher-order charges. It reclaims in small batches until there is
room for at least one page. Huge page charges only succeed when
these batches add up over a series of huge faults, which is unlikely
under any significant load involving order-0 allocations in the group.
Remove that loop on the memcg side in favor of passing the actual
reclaim goal to direct reclaim, which is already set up and optimized
to meet higher-order goals efficiently.
This brings memcg's THP policy in line with the system policy: if the
allocator painstakingly assembles a hugepage, memcg will at least make
an honest effort to charge it. As a result, transparent hugepage
allocation rates amid cache activity are drastically improved:
vanilla patched
pgalloc 4717530.80 ( +0.00%) 4451376.40 ( -5.64%)
pgfault 491370.60 ( +0.00%) 225477.40 ( -54.11%)
pgmajfault 2.00 ( +0.00%) 1.80 ( -6.67%)
thp_fault_alloc 0.00 ( +0.00%) 531.60 (+100.00%)
thp_fault_fallback 749.00 ( +0.00%) 217.40 ( -70.88%)
[ Note: this may in turn increase memory consumption from internal
fragmentation, which is an inherent risk of transparent hugepages.
Some setups may have to adjust the memcg limits accordingly to
accommodate this - or, if the machine is already packed to capacity,
disable the transparent huge page feature. ]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
include/linux/swap.h | 6 ++--
mm/memcontrol.c | 86 +++++++++++-----------------------------------------
mm/vmscan.c | 7 +++--
3 files changed, 25 insertions(+), 74 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index ea4f926e6b9b..37a585beef5c 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -327,8 +327,10 @@ extern void lru_cache_add_active_or_unevictable(struct page *page,
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *mask);
extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
-extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
- gfp_t gfp_mask, bool noswap);
+extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
+ unsigned long nr_pages,
+ gfp_t gfp_mask,
+ bool may_swap);
extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap,
struct zone *zone,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9431024e490c..e2def11f1ec1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -315,9 +315,6 @@ struct mem_cgroup {
/* OOM-Killer disable */
int oom_kill_disable;
- /* set when res.limit == memsw.limit */
- bool memsw_is_minimum;
-
/* protect arrays of thresholds */
struct mutex thresholds_lock;
@@ -481,14 +478,6 @@ enum res_type {
#define OOM_CONTROL (0)
/*
- * Reclaim flags for mem_cgroup_hierarchical_reclaim
- */
-#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
-#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
-#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
-#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
-
-/*
* The memcg_create_mutex will be held whenever a new cgroup is created.
* As a consequence, any change that needs to protect against new child cgroups
* appearing has to hold it as well.
@@ -1794,42 +1783,6 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
NULL, "Memory cgroup out of memory");
}
-static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
- gfp_t gfp_mask,
- unsigned long flags)
-{
- unsigned long total = 0;
- bool noswap = false;
- int loop;
-
- if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
- noswap = true;
- if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
- noswap = true;
-
- for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
- if (loop)
- drain_all_stock_async(memcg);
- total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
- /*
- * Allow limit shrinkers, which are triggered directly
- * by userspace, to catch signals and stop reclaim
- * after minimal progress, regardless of the margin.
- */
- if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
- break;
- if (mem_cgroup_margin(memcg))
- break;
- /*
- * If nothing was reclaimed after two attempts, there
- * may be no reclaimable pages in this hierarchy.
- */
- if (loop && !total)
- break;
- }
- return total;
-}
-
/**
* test_mem_cgroup_node_reclaimable
* @memcg: the target memcg
@@ -2532,8 +2485,9 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
struct mem_cgroup *mem_over_limit;
struct res_counter *fail_res;
unsigned long nr_reclaimed;
- unsigned long flags = 0;
unsigned long long size;
+ bool may_swap = true;
+ bool drained = false;
int ret = 0;
if (mem_cgroup_is_root(memcg))
@@ -2550,7 +2504,7 @@ retry:
goto done_restock;
res_counter_uncharge(&memcg->res, size);
mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
- flags |= MEM_CGROUP_RECLAIM_NOSWAP;
+ may_swap = false;
} else
mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
@@ -2576,11 +2530,18 @@ retry:
if (!(gfp_mask & __GFP_WAIT))
goto nomem;
- nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
+ nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
+ gfp_mask, may_swap);
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
goto retry;
+ if (!drained) {
+ drain_all_stock_async(mem_over_limit);
+ drained = true;
+ goto retry;
+ }
+
if (gfp_mask & __GFP_NORETRY)
goto nomem;
/*
@@ -3655,19 +3616,13 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
enlarge = 1;
ret = res_counter_set_limit(&memcg->res, val);
- if (!ret) {
- if (memswlimit == val)
- memcg->memsw_is_minimum = true;
- else
- memcg->memsw_is_minimum = false;
- }
mutex_unlock(&set_limit_mutex);
if (!ret)
break;
- mem_cgroup_reclaim(memcg, GFP_KERNEL,
- MEM_CGROUP_RECLAIM_SHRINK);
+ try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
+
curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
/* Usage is reduced ? */
if (curusage >= oldusage)
@@ -3714,20 +3669,13 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
if (memswlimit < val)
enlarge = 1;
ret = res_counter_set_limit(&memcg->memsw, val);
- if (!ret) {
- if (memlimit == val)
- memcg->memsw_is_minimum = true;
- else
- memcg->memsw_is_minimum = false;
- }
mutex_unlock(&set_limit_mutex);
if (!ret)
break;
- mem_cgroup_reclaim(memcg, GFP_KERNEL,
- MEM_CGROUP_RECLAIM_NOSWAP |
- MEM_CGROUP_RECLAIM_SHRINK);
+ try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
+
curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
/* Usage is reduced ? */
if (curusage >= oldusage)
@@ -3976,8 +3924,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
if (signal_pending(current))
return -EINTR;
- progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
- false);
+ progress = try_to_free_mem_cgroup_pages(memcg, 1,
+ GFP_KERNEL, true);
if (!progress) {
nr_retries--;
/* maybe some writeback is necessary */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b672e2c6becc..97d31ec17d06 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2759,21 +2759,22 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
}
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
+ unsigned long nr_pages,
gfp_t gfp_mask,
- bool noswap)
+ bool may_swap)
{
struct zonelist *zonelist;
unsigned long nr_reclaimed;
int nid;
struct scan_control sc = {
- .nr_to_reclaim = SWAP_CLUSTER_MAX,
+ .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
.target_mem_cgroup = memcg,
.priority = DEF_PRIORITY,
.may_writepage = !laptop_mode,
.may_unmap = 1,
- .may_swap = !noswap,
+ .may_swap = may_swap,
};
/*
--
2.1.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [patch] mm: memcontrol: support transparent huge pages under pressure
2014-09-19 13:20 [patch] mm: memcontrol: support transparent huge pages under pressure Johannes Weiner
@ 2014-09-22 10:18 ` Vladimir Davydov
2014-09-23 5:52 ` Greg Thelen
1 sibling, 0 replies; 9+ messages in thread
From: Vladimir Davydov @ 2014-09-22 10:18 UTC (permalink / raw)
To: Johannes Weiner
Cc: linux-mm, Michal Hocko, Greg Thelen, Dave Hansen, cgroups, linux-kernel
On Fri, Sep 19, 2014 at 09:20:40AM -0400, Johannes Weiner wrote:
> In a memcg with even just moderate cache pressure, success rates for
> transparent huge page allocations drop to zero, wasting a lot of
> effort that the allocator puts into assembling these pages.
>
> The reason for this is that the memcg reclaim code was never designed
> for higher-order charges. It reclaims in small batches until there is
> room for at least one page. Huge page charges only succeed when
> these batches add up over a series of huge faults, which is unlikely
> under any significant load involving order-0 allocations in the group.
>
> Remove that loop on the memcg side in favor of passing the actual
> reclaim goal to direct reclaim, which is already set up and optimized
> to meet higher-order goals efficiently.
>
> This brings memcg's THP policy in line with the system policy: if the
> allocator painstakingly assembles a hugepage, memcg will at least make
> an honest effort to charge it. As a result, transparent hugepage
> allocation rates amid cache activity are drastically improved:
>
> vanilla patched
> pgalloc 4717530.80 ( +0.00%) 4451376.40 ( -5.64%)
> pgfault 491370.60 ( +0.00%) 225477.40 ( -54.11%)
> pgmajfault 2.00 ( +0.00%) 1.80 ( -6.67%)
> thp_fault_alloc 0.00 ( +0.00%) 531.60 (+100.00%)
> thp_fault_fallback 749.00 ( +0.00%) 217.40 ( -70.88%)
>
> [ Note: this may in turn increase memory consumption from internal
> fragmentation, which is an inherent risk of transparent hugepages.
> Some setups may have to adjust the memcg limits accordingly to
> accommodate this - or, if the machine is already packed to capacity,
> disable the transparent huge page feature. ]
>
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Looks like a really nice change to me. FWIW,
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [patch] mm: memcontrol: support transparent huge pages under pressure
2014-09-19 13:20 [patch] mm: memcontrol: support transparent huge pages under pressure Johannes Weiner
2014-09-22 10:18 ` Vladimir Davydov
@ 2014-09-23 5:52 ` Greg Thelen
2014-09-23 8:29 ` Vladimir Davydov
2014-09-23 11:16 ` Johannes Weiner
1 sibling, 2 replies; 9+ messages in thread
From: Greg Thelen @ 2014-09-23 5:52 UTC (permalink / raw)
To: Johannes Weiner
Cc: linux-mm, Michal Hocko, Dave Hansen, cgroups, linux-kernel
On Fri, Sep 19 2014, Johannes Weiner wrote:
> In a memcg with even just moderate cache pressure, success rates for
> transparent huge page allocations drop to zero, wasting a lot of
> effort that the allocator puts into assembling these pages.
>
> The reason for this is that the memcg reclaim code was never designed
> for higher-order charges. It reclaims in small batches until there is
> room for at least one page. Huge page charges only succeed when
> these batches add up over a series of huge faults, which is unlikely
> under any significant load involving order-0 allocations in the group.
>
> Remove that loop on the memcg side in favor of passing the actual
> reclaim goal to direct reclaim, which is already set up and optimized
> to meet higher-order goals efficiently.
>
> This brings memcg's THP policy in line with the system policy: if the
> allocator painstakingly assembles a hugepage, memcg will at least make
> an honest effort to charge it. As a result, transparent hugepage
> allocation rates amid cache activity are drastically improved:
>
> vanilla patched
> pgalloc 4717530.80 ( +0.00%) 4451376.40 ( -5.64%)
> pgfault 491370.60 ( +0.00%) 225477.40 ( -54.11%)
> pgmajfault 2.00 ( +0.00%) 1.80 ( -6.67%)
> thp_fault_alloc 0.00 ( +0.00%) 531.60 (+100.00%)
> thp_fault_fallback 749.00 ( +0.00%) 217.40 ( -70.88%)
>
> [ Note: this may in turn increase memory consumption from internal
> fragmentation, which is an inherent risk of transparent hugepages.
> Some setups may have to adjust the memcg limits accordingly to
> accommodate this - or, if the machine is already packed to capacity,
> disable the transparent huge page feature. ]
We're using an earlier version of this patch, so I approve of the
general direction. But I have some feedback.
The memsw aspect of this change seems somewhat separate. Can it be
split into a different patch?
The memsw aspect of this patch seems to change behavior. Is this
intended? If so, a mention of it in the commit log would assuage the
reader. I'll explain... Assume a machine with swap enabled and
res.limit==memsw.limit, thus memsw_is_minimum is true. My understanding
is that memsw.usage represents sum(ram_usage, swap_usage). So when
memsw_is_minimum=true, then both swap_usage=0 and
memsw.usage==res.usage. In this condition, if res usage is at limit
then there's no point in swapping because memsw.usage is already
maximal. Prior to this patch I think the kernel did the right thing,
but not afterwards.
Before this patch:
if res.usage == res.limit, try_charge() indirectly calls
try_to_free_mem_cgroup_pages(noswap=true)
After this patch:
if res.usage == res.limit, try_charge() calls
try_to_free_mem_cgroup_pages(may_swap=true)
Notice the inverted swap-is-allowed value.
I haven't had time to look at your other outstanding memcg patches.
These comments were made with this patch in isolation.
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> ---
> include/linux/swap.h | 6 ++--
> mm/memcontrol.c | 86 +++++++++++-----------------------------------------
> mm/vmscan.c | 7 +++--
> 3 files changed, 25 insertions(+), 74 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index ea4f926e6b9b..37a585beef5c 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -327,8 +327,10 @@ extern void lru_cache_add_active_or_unevictable(struct page *page,
> extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
> gfp_t gfp_mask, nodemask_t *mask);
> extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
> -extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
> - gfp_t gfp_mask, bool noswap);
> +extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
> + unsigned long nr_pages,
> + gfp_t gfp_mask,
> + bool may_swap);
> extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
> gfp_t gfp_mask, bool noswap,
> struct zone *zone,
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 9431024e490c..e2def11f1ec1 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -315,9 +315,6 @@ struct mem_cgroup {
> /* OOM-Killer disable */
> int oom_kill_disable;
>
> - /* set when res.limit == memsw.limit */
> - bool memsw_is_minimum;
> -
> /* protect arrays of thresholds */
> struct mutex thresholds_lock;
>
> @@ -481,14 +478,6 @@ enum res_type {
> #define OOM_CONTROL (0)
>
> /*
> - * Reclaim flags for mem_cgroup_hierarchical_reclaim
> - */
> -#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
> -#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
> -#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
> -#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
> -
> -/*
> * The memcg_create_mutex will be held whenever a new cgroup is created.
> * As a consequence, any change that needs to protect against new child cgroups
> * appearing has to hold it as well.
> @@ -1794,42 +1783,6 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
> NULL, "Memory cgroup out of memory");
> }
>
> -static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
> - gfp_t gfp_mask,
> - unsigned long flags)
> -{
> - unsigned long total = 0;
> - bool noswap = false;
> - int loop;
> -
> - if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
> - noswap = true;
> - if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
> - noswap = true;
> -
> - for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
> - if (loop)
> - drain_all_stock_async(memcg);
> - total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
> - /*
> - * Allow limit shrinkers, which are triggered directly
> - * by userspace, to catch signals and stop reclaim
> - * after minimal progress, regardless of the margin.
> - */
> - if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
> - break;
> - if (mem_cgroup_margin(memcg))
> - break;
> - /*
> - * If nothing was reclaimed after two attempts, there
> - * may be no reclaimable pages in this hierarchy.
> - */
> - if (loop && !total)
> - break;
> - }
> - return total;
> -}
> -
> /**
> * test_mem_cgroup_node_reclaimable
> * @memcg: the target memcg
> @@ -2532,8 +2485,9 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
> struct mem_cgroup *mem_over_limit;
> struct res_counter *fail_res;
> unsigned long nr_reclaimed;
> - unsigned long flags = 0;
> unsigned long long size;
> + bool may_swap = true;
> + bool drained = false;
> int ret = 0;
>
> if (mem_cgroup_is_root(memcg))
> @@ -2550,7 +2504,7 @@ retry:
> goto done_restock;
> res_counter_uncharge(&memcg->res, size);
> mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
> - flags |= MEM_CGROUP_RECLAIM_NOSWAP;
> + may_swap = false;
> } else
> mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
>
> @@ -2576,11 +2530,18 @@ retry:
> if (!(gfp_mask & __GFP_WAIT))
> goto nomem;
>
> - nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
> + nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
> + gfp_mask, may_swap);
>
> if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
> goto retry;
>
> + if (!drained) {
> + drain_all_stock_async(mem_over_limit);
> + drained = true;
> + goto retry;
> + }
> +
> if (gfp_mask & __GFP_NORETRY)
> goto nomem;
> /*
> @@ -3655,19 +3616,13 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
> enlarge = 1;
>
> ret = res_counter_set_limit(&memcg->res, val);
> - if (!ret) {
> - if (memswlimit == val)
> - memcg->memsw_is_minimum = true;
> - else
> - memcg->memsw_is_minimum = false;
> - }
> mutex_unlock(&set_limit_mutex);
>
> if (!ret)
> break;
>
> - mem_cgroup_reclaim(memcg, GFP_KERNEL,
> - MEM_CGROUP_RECLAIM_SHRINK);
> + try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
> +
> curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
> /* Usage is reduced ? */
> if (curusage >= oldusage)
> @@ -3714,20 +3669,13 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
> if (memswlimit < val)
> enlarge = 1;
> ret = res_counter_set_limit(&memcg->memsw, val);
> - if (!ret) {
> - if (memlimit == val)
> - memcg->memsw_is_minimum = true;
> - else
> - memcg->memsw_is_minimum = false;
> - }
> mutex_unlock(&set_limit_mutex);
>
> if (!ret)
> break;
>
> - mem_cgroup_reclaim(memcg, GFP_KERNEL,
> - MEM_CGROUP_RECLAIM_NOSWAP |
> - MEM_CGROUP_RECLAIM_SHRINK);
> + try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
> +
> curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
> /* Usage is reduced ? */
> if (curusage >= oldusage)
> @@ -3976,8 +3924,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
> if (signal_pending(current))
> return -EINTR;
>
> - progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
> - false);
> + progress = try_to_free_mem_cgroup_pages(memcg, 1,
> + GFP_KERNEL, true);
> if (!progress) {
> nr_retries--;
> /* maybe some writeback is necessary */
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index b672e2c6becc..97d31ec17d06 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2759,21 +2759,22 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
> }
>
> unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
> + unsigned long nr_pages,
> gfp_t gfp_mask,
> - bool noswap)
> + bool may_swap)
> {
> struct zonelist *zonelist;
> unsigned long nr_reclaimed;
> int nid;
> struct scan_control sc = {
> - .nr_to_reclaim = SWAP_CLUSTER_MAX,
> + .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
> .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
> (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
> .target_mem_cgroup = memcg,
> .priority = DEF_PRIORITY,
> .may_writepage = !laptop_mode,
> .may_unmap = 1,
> - .may_swap = !noswap,
> + .may_swap = may_swap,
> };
>
> /*
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [patch] mm: memcontrol: support transparent huge pages under pressure
2014-09-23 5:52 ` Greg Thelen
@ 2014-09-23 8:29 ` Vladimir Davydov
2014-09-23 11:44 ` Vladimir Davydov
2014-09-23 11:48 ` Johannes Weiner
2014-09-23 11:16 ` Johannes Weiner
1 sibling, 2 replies; 9+ messages in thread
From: Vladimir Davydov @ 2014-09-23 8:29 UTC (permalink / raw)
To: Greg Thelen
Cc: Johannes Weiner, linux-mm, Michal Hocko, Dave Hansen, cgroups,
linux-kernel
[sorry for butting in, but I think I can answer your question]
On Mon, Sep 22, 2014 at 10:52:50PM -0700, Greg Thelen wrote:
>
> On Fri, Sep 19 2014, Johannes Weiner wrote:
>
> > In a memcg with even just moderate cache pressure, success rates for
> > transparent huge page allocations drop to zero, wasting a lot of
> > effort that the allocator puts into assembling these pages.
> >
> > The reason for this is that the memcg reclaim code was never designed
> > for higher-order charges. It reclaims in small batches until there is
> > room for at least one page. Huge page charges only succeed when
> > these batches add up over a series of huge faults, which is unlikely
> > under any significant load involving order-0 allocations in the group.
> >
> > Remove that loop on the memcg side in favor of passing the actual
> > reclaim goal to direct reclaim, which is already set up and optimized
> > to meet higher-order goals efficiently.
> >
> > This brings memcg's THP policy in line with the system policy: if the
> > allocator painstakingly assembles a hugepage, memcg will at least make
> > an honest effort to charge it. As a result, transparent hugepage
> > allocation rates amid cache activity are drastically improved:
> >
> > vanilla patched
> > pgalloc 4717530.80 ( +0.00%) 4451376.40 ( -5.64%)
> > pgfault 491370.60 ( +0.00%) 225477.40 ( -54.11%)
> > pgmajfault 2.00 ( +0.00%) 1.80 ( -6.67%)
> > thp_fault_alloc 0.00 ( +0.00%) 531.60 (+100.00%)
> > thp_fault_fallback 749.00 ( +0.00%) 217.40 ( -70.88%)
> >
> > [ Note: this may in turn increase memory consumption from internal
> > fragmentation, which is an inherent risk of transparent hugepages.
> > Some setups may have to adjust the memcg limits accordingly to
> > accommodate this - or, if the machine is already packed to capacity,
> > disable the transparent huge page feature. ]
>
> We're using an earlier version of this patch, so I approve of the
> general direction. But I have some feedback.
>
> The memsw aspect of this change seems somewhat separate. Can it be
> split into a different patch?
>
> The memsw aspect of this patch seems to change behavior. Is this
> intended? If so, a mention of it in the commit log would assuage the
> reader. I'll explain... Assume a machine with swap enabled and
> res.limit==memsw.limit, thus memsw_is_minimum is true. My understanding
> is that memsw.usage represents sum(ram_usage, swap_usage). So when
> memsw_is_minimum=true, then both swap_usage=0 and
> memsw.usage==res.usage.
Not necessarily, we can have memsw.usage > res.usage due to global
pressure. The point is (memsw.limit-res.limit) is not the swap usage
limit. This is really confusing. As Johannes pointed out, we should drop
memsw.limit in favor of separate mem.limit and swap.limit.
> In this condition, if res usage is at limit then there's no point in
> swapping because memsw.usage is already maximal. Prior to this patch
> I think the kernel did the right thing, but not afterwards.
>
> Before this patch:
> if res.usage == res.limit, try_charge() indirectly calls
> try_to_free_mem_cgroup_pages(noswap=true)
But this is wrong. If we fail to charge res, we should try to do swap
out along with page cache reclaim. Swap out won't affect memsw.usage,
but will diminish res.usage so that the allocation may succeed.
AFAIU we should only disable swap-out if we succeeded to charge res, but
failed with memsw. And this is the rule that this patch follows.
Anyway, I agree with you that this is far not obvious and deserves a
separate patch.
Thanks,
Vladimir
>
> After this patch:
> if res.usage == res.limit, try_charge() calls
> try_to_free_mem_cgroup_pages(may_swap=true)
>
> Notice the inverted swap-is-allowed value.
>
> I haven't had time to look at your other outstanding memcg patches.
> These comments were made with this patch in isolation.
[...]
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [patch] mm: memcontrol: support transparent huge pages under pressure
2014-09-23 5:52 ` Greg Thelen
2014-09-23 8:29 ` Vladimir Davydov
@ 2014-09-23 11:16 ` Johannes Weiner
2014-09-24 0:07 ` Greg Thelen
1 sibling, 1 reply; 9+ messages in thread
From: Johannes Weiner @ 2014-09-23 11:16 UTC (permalink / raw)
To: Greg Thelen; +Cc: linux-mm, Michal Hocko, Dave Hansen, cgroups, linux-kernel
On Mon, Sep 22, 2014 at 10:52:50PM -0700, Greg Thelen wrote:
>
> On Fri, Sep 19 2014, Johannes Weiner wrote:
>
> > In a memcg with even just moderate cache pressure, success rates for
> > transparent huge page allocations drop to zero, wasting a lot of
> > effort that the allocator puts into assembling these pages.
> >
> > The reason for this is that the memcg reclaim code was never designed
> > for higher-order charges. It reclaims in small batches until there is
> > room for at least one page. Huge page charges only succeed when
> > these batches add up over a series of huge faults, which is unlikely
> > under any significant load involving order-0 allocations in the group.
> >
> > Remove that loop on the memcg side in favor of passing the actual
> > reclaim goal to direct reclaim, which is already set up and optimized
> > to meet higher-order goals efficiently.
> >
> > This brings memcg's THP policy in line with the system policy: if the
> > allocator painstakingly assembles a hugepage, memcg will at least make
> > an honest effort to charge it. As a result, transparent hugepage
> > allocation rates amid cache activity are drastically improved:
> >
> > vanilla patched
> > pgalloc 4717530.80 ( +0.00%) 4451376.40 ( -5.64%)
> > pgfault 491370.60 ( +0.00%) 225477.40 ( -54.11%)
> > pgmajfault 2.00 ( +0.00%) 1.80 ( -6.67%)
> > thp_fault_alloc 0.00 ( +0.00%) 531.60 (+100.00%)
> > thp_fault_fallback 749.00 ( +0.00%) 217.40 ( -70.88%)
> >
> > [ Note: this may in turn increase memory consumption from internal
> > fragmentation, which is an inherent risk of transparent hugepages.
> > Some setups may have to adjust the memcg limits accordingly to
> > accommodate this - or, if the machine is already packed to capacity,
> > disable the transparent huge page feature. ]
>
> We're using an earlier version of this patch, so I approve of the
> general direction. But I have some feedback.
>
> The memsw aspect of this change seems somewhat separate. Can it be
> split into a different patch?
>
> The memsw aspect of this patch seems to change behavior. Is this
> intended? If so, a mention of it in the commit log would assuage the
> reader. I'll explain... Assume a machine with swap enabled and
> res.limit==memsw.limit, thus memsw_is_minimum is true. My understanding
> is that memsw.usage represents sum(ram_usage, swap_usage). So when
> memsw_is_minimum=true, then both swap_usage=0 and
> memsw.usage==res.usage. In this condition, if res usage is at limit
> then there's no point in swapping because memsw.usage is already
> maximal. Prior to this patch I think the kernel did the right thing,
> but not afterwards.
>
> Before this patch:
> if res.usage == res.limit, try_charge() indirectly calls
> try_to_free_mem_cgroup_pages(noswap=true)
>
> After this patch:
> if res.usage == res.limit, try_charge() calls
> try_to_free_mem_cgroup_pages(may_swap=true)
>
> Notice the inverted swap-is-allowed value.
For some reason I had myself convinced that this is dead code due to a
change in callsites a long time ago, but you are right that currently
try_charge() relies on it, thanks for pointing it out.
However, memsw is always equal to or bigger than the memory limit - so
instead of keeping a separate state variable to track when memory
failure implies memsw failure, couldn't we just charge memsw first?
How about the following? But yeah, I'd split this into a separate
patch now.
---
mm/memcontrol.c | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e2def11f1ec1..7c9a8971d0f4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2497,16 +2497,17 @@ retry:
goto done;
size = batch * PAGE_SIZE;
- if (!res_counter_charge(&memcg->res, size, &fail_res)) {
- if (!do_swap_account)
+ if (!do_swap_account ||
+ !res_counter_charge(&memcg->memsw, size, &fail_res)) {
+ if (!res_counter_charge(&memcg->res, size, &fail_res))
goto done_restock;
- if (!res_counter_charge(&memcg->memsw, size, &fail_res))
- goto done_restock;
- res_counter_uncharge(&memcg->res, size);
+ if (do_swap_account)
+ res_counter_uncharge(&memcg->memsw, size);
+ mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+ } else {
mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
may_swap = false;
- } else
- mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+ }
if (batch > nr_pages) {
batch = nr_pages;
--
2.1.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [patch] mm: memcontrol: support transparent huge pages under pressure
2014-09-23 8:29 ` Vladimir Davydov
@ 2014-09-23 11:44 ` Vladimir Davydov
2014-09-23 11:48 ` Johannes Weiner
1 sibling, 0 replies; 9+ messages in thread
From: Vladimir Davydov @ 2014-09-23 11:44 UTC (permalink / raw)
To: Greg Thelen
Cc: Johannes Weiner, linux-mm, Michal Hocko, Dave Hansen, cgroups,
linux-kernel
On Tue, Sep 23, 2014 at 12:29:27PM +0400, Vladimir Davydov wrote:
> On Mon, Sep 22, 2014 at 10:52:50PM -0700, Greg Thelen wrote:
> > In this condition, if res usage is at limit then there's no point in
> > swapping because memsw.usage is already maximal. Prior to this patch
> > I think the kernel did the right thing, but not afterwards.
> >
> > Before this patch:
> > if res.usage == res.limit, try_charge() indirectly calls
> > try_to_free_mem_cgroup_pages(noswap=true)
>
> But this is wrong. If we fail to charge res, we should try to do swap
> out along with page cache reclaim. Swap out won't affect memsw.usage,
> but will diminish res.usage so that the allocation may succeed.
Oops, I missed your point, sorry. If we hit the res.limit and
memsw.limit=res.limit, we automatically hit memsw.limit too, so there's
no point swapping out.
Thanks,
Vladimir
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [patch] mm: memcontrol: support transparent huge pages under pressure
2014-09-23 8:29 ` Vladimir Davydov
2014-09-23 11:44 ` Vladimir Davydov
@ 2014-09-23 11:48 ` Johannes Weiner
2014-09-23 11:56 ` Vladimir Davydov
1 sibling, 1 reply; 9+ messages in thread
From: Johannes Weiner @ 2014-09-23 11:48 UTC (permalink / raw)
To: Vladimir Davydov
Cc: Greg Thelen, linux-mm, Michal Hocko, Dave Hansen, cgroups, linux-kernel
On Tue, Sep 23, 2014 at 12:29:27PM +0400, Vladimir Davydov wrote:
> [sorry for butting in, but I think I can answer your question]
>
> On Mon, Sep 22, 2014 at 10:52:50PM -0700, Greg Thelen wrote:
> >
> > On Fri, Sep 19 2014, Johannes Weiner wrote:
> >
> > > In a memcg with even just moderate cache pressure, success rates for
> > > transparent huge page allocations drop to zero, wasting a lot of
> > > effort that the allocator puts into assembling these pages.
> > >
> > > The reason for this is that the memcg reclaim code was never designed
> > > for higher-order charges. It reclaims in small batches until there is
> > > room for at least one page. Huge page charges only succeed when
> > > these batches add up over a series of huge faults, which is unlikely
> > > under any significant load involving order-0 allocations in the group.
> > >
> > > Remove that loop on the memcg side in favor of passing the actual
> > > reclaim goal to direct reclaim, which is already set up and optimized
> > > to meet higher-order goals efficiently.
> > >
> > > This brings memcg's THP policy in line with the system policy: if the
> > > allocator painstakingly assembles a hugepage, memcg will at least make
> > > an honest effort to charge it. As a result, transparent hugepage
> > > allocation rates amid cache activity are drastically improved:
> > >
> > > vanilla patched
> > > pgalloc 4717530.80 ( +0.00%) 4451376.40 ( -5.64%)
> > > pgfault 491370.60 ( +0.00%) 225477.40 ( -54.11%)
> > > pgmajfault 2.00 ( +0.00%) 1.80 ( -6.67%)
> > > thp_fault_alloc 0.00 ( +0.00%) 531.60 (+100.00%)
> > > thp_fault_fallback 749.00 ( +0.00%) 217.40 ( -70.88%)
> > >
> > > [ Note: this may in turn increase memory consumption from internal
> > > fragmentation, which is an inherent risk of transparent hugepages.
> > > Some setups may have to adjust the memcg limits accordingly to
accommodate this - or, if the machine is already packed to capacity,
> > > disable the transparent huge page feature. ]
> >
> > We're using an earlier version of this patch, so I approve of the
> > general direction. But I have some feedback.
> >
> > The memsw aspect of this change seems somewhat separate. Can it be
> > split into a different patch?
> >
> > The memsw aspect of this patch seems to change behavior. Is this
> > intended? If so, a mention of it in the commit log would assuage the
> > reader. I'll explain... Assume a machine with swap enabled and
> > res.limit==memsw.limit, thus memsw_is_minimum is true. My understanding
> > is that memsw.usage represents sum(ram_usage, swap_usage). So when
> > memsw_is_minimum=true, then both swap_usage=0 and
> > memsw.usage==res.usage.
>
> Not necessarily, we can have memsw.usage > res.usage due to global
> pressure. The point is (memsw.limit-res.limit) is not the swap usage
> limit. This is really confusing. As Johannes pointed out, we should drop
> memsw.limit in favor of separate mem.limit and swap.limit.
Memsw can exceed memory, but not the other way round, so if the limits
are equal and we hit the memory limit, surely the memsw limit must be
hit as well?
> > In this condition, if res usage is at limit then there's no point in
> > swapping because memsw.usage is already maximal. Prior to this patch
> > I think the kernel did the right thing, but not afterwards.
> >
> > Before this patch:
> > if res.usage == res.limit, try_charge() indirectly calls
> > try_to_free_mem_cgroup_pages(noswap=true)
>
> But this is wrong. If we fail to charge res, we should try to do swap
> out along with page cache reclaim. Swap out won't affect memsw.usage,
> but will diminish res.usage so that the allocation may succeed.
But we know that the memsw limit must be hit as well in that case, and
swapping only makes progress in the sense that we are then succeeding
the memory charge. But we still fail to charge memsw.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [patch] mm: memcontrol: support transparent huge pages under pressure
2014-09-23 11:48 ` Johannes Weiner
@ 2014-09-23 11:56 ` Vladimir Davydov
0 siblings, 0 replies; 9+ messages in thread
From: Vladimir Davydov @ 2014-09-23 11:56 UTC (permalink / raw)
To: Johannes Weiner
Cc: Greg Thelen, linux-mm, Michal Hocko, Dave Hansen, cgroups, linux-kernel
On Tue, Sep 23, 2014 at 07:48:27AM -0400, Johannes Weiner wrote:
> On Tue, Sep 23, 2014 at 12:29:27PM +0400, Vladimir Davydov wrote:
> > On Mon, Sep 22, 2014 at 10:52:50PM -0700, Greg Thelen wrote:
> > > In this condition, if res usage is at limit then there's no point in
> > > swapping because memsw.usage is already maximal. Prior to this patch
> > > I think the kernel did the right thing, but not afterwards.
> > >
> > > Before this patch:
> > > if res.usage == res.limit, try_charge() indirectly calls
> > > try_to_free_mem_cgroup_pages(noswap=true)
> >
> > But this is wrong. If we fail to charge res, we should try to do swap
> > out along with page cache reclaim. Swap out won't affect memsw.usage,
> > but will diminish res.usage so that the allocation may succeed.
>
> But we know that the memsw limit must be hit as well in that case, and
> swapping only makes progress in the sense that we are then succeeding
> the memory charge. But we still fail to charge memsw.
Yeah, I admit I said nonsense. The problem Greg pointed out does exist.
I think your second patch (charging memsw before res) should fix it.
Thanks,
Vladimir
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [patch] mm: memcontrol: support transparent huge pages under pressure
2014-09-23 11:16 ` Johannes Weiner
@ 2014-09-24 0:07 ` Greg Thelen
0 siblings, 0 replies; 9+ messages in thread
From: Greg Thelen @ 2014-09-24 0:07 UTC (permalink / raw)
To: Johannes Weiner
Cc: linux-mm, Michal Hocko, Dave Hansen, cgroups, linux-kernel
On Tue, Sep 23 2014, Johannes Weiner wrote:
> On Mon, Sep 22, 2014 at 10:52:50PM -0700, Greg Thelen wrote:
>>
>> On Fri, Sep 19 2014, Johannes Weiner wrote:
>>
>> > In a memcg with even just moderate cache pressure, success rates for
>> > transparent huge page allocations drop to zero, wasting a lot of
>> > effort that the allocator puts into assembling these pages.
>> >
>> > The reason for this is that the memcg reclaim code was never designed
>> > for higher-order charges. It reclaims in small batches until there is
>> > room for at least one page. Huge page charges only succeed when
>> > these batches add up over a series of huge faults, which is unlikely
>> > under any significant load involving order-0 allocations in the group.
>> >
>> > Remove that loop on the memcg side in favor of passing the actual
>> > reclaim goal to direct reclaim, which is already set up and optimized
>> > to meet higher-order goals efficiently.
>> >
>> > This brings memcg's THP policy in line with the system policy: if the
>> > allocator painstakingly assembles a hugepage, memcg will at least make
>> > an honest effort to charge it. As a result, transparent hugepage
>> > allocation rates amid cache activity are drastically improved:
>> >
>> > vanilla patched
>> > pgalloc 4717530.80 ( +0.00%) 4451376.40 ( -5.64%)
>> > pgfault 491370.60 ( +0.00%) 225477.40 ( -54.11%)
>> > pgmajfault 2.00 ( +0.00%) 1.80 ( -6.67%)
>> > thp_fault_alloc 0.00 ( +0.00%) 531.60 (+100.00%)
>> > thp_fault_fallback 749.00 ( +0.00%) 217.40 ( -70.88%)
>> >
>> > [ Note: this may in turn increase memory consumption from internal
>> > fragmentation, which is an inherent risk of transparent hugepages.
>> > Some setups may have to adjust the memcg limits accordingly to
>> > accommodate this - or, if the machine is already packed to capacity,
>> > disable the transparent huge page feature. ]
>>
>> We're using an earlier version of this patch, so I approve of the
>> general direction. But I have some feedback.
>>
>> The memsw aspect of this change seems somewhat separate. Can it be
>> split into a different patch?
>>
>> The memsw aspect of this patch seems to change behavior. Is this
>> intended? If so, a mention of it in the commit log would assuage the
>> reader. I'll explain... Assume a machine with swap enabled and
>> res.limit==memsw.limit, thus memsw_is_minimum is true. My understanding
>> is that memsw.usage represents sum(ram_usage, swap_usage). So when
>> memsw_is_minimum=true, then both swap_usage=0 and
>> memsw.usage==res.usage. In this condition, if res usage is at limit
>> then there's no point in swapping because memsw.usage is already
>> maximal. Prior to this patch I think the kernel did the right thing,
>> but not afterwards.
>>
>> Before this patch:
>> if res.usage == res.limit, try_charge() indirectly calls
>> try_to_free_mem_cgroup_pages(noswap=true)
>>
>> After this patch:
>> if res.usage == res.limit, try_charge() calls
>> try_to_free_mem_cgroup_pages(may_swap=true)
>>
>> Notice the inverted swap-is-allowed value.
>
> For some reason I had myself convinced that this is dead code due to a
> change in callsites a long time ago, but you are right that currently
> try_charge() relies on it, thanks for pointing it out.
>
> However, memsw is always equal to or bigger than the memory limit - so
> instead of keeping a separate state variable to track when memory
> failure implies memsw failure, couldn't we just charge memsw first?
>
> How about the following? But yeah, I'd split this into a separate
> patch now.
Looks good to me. Thanks.
Acked-by: Greg Thelen <gthelen@google.com>
> ---
> mm/memcontrol.c | 15 ++++++++-------
> 1 file changed, 8 insertions(+), 7 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e2def11f1ec1..7c9a8971d0f4 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2497,16 +2497,17 @@ retry:
> goto done;
>
> size = batch * PAGE_SIZE;
> - if (!res_counter_charge(&memcg->res, size, &fail_res)) {
> - if (!do_swap_account)
> + if (!do_swap_account ||
> + !res_counter_charge(&memcg->memsw, size, &fail_res)) {
> + if (!res_counter_charge(&memcg->res, size, &fail_res))
> goto done_restock;
> - if (!res_counter_charge(&memcg->memsw, size, &fail_res))
> - goto done_restock;
> - res_counter_uncharge(&memcg->res, size);
> + if (do_swap_account)
> + res_counter_uncharge(&memcg->memsw, size);
> + mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
> + } else {
> mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
> may_swap = false;
> - } else
> - mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
> + }
>
> if (batch > nr_pages) {
> batch = nr_pages;
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2014-09-24 0:08 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-09-19 13:20 [patch] mm: memcontrol: support transparent huge pages under pressure Johannes Weiner
2014-09-22 10:18 ` Vladimir Davydov
2014-09-23 5:52 ` Greg Thelen
2014-09-23 8:29 ` Vladimir Davydov
2014-09-23 11:44 ` Vladimir Davydov
2014-09-23 11:48 ` Johannes Weiner
2014-09-23 11:56 ` Vladimir Davydov
2014-09-23 11:16 ` Johannes Weiner
2014-09-24 0:07 ` Greg Thelen
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).