* [PATCH 0/3] follow up nodereclaim for 32b fix
From: Michal Hocko @ 2017-01-17 10:36 UTC
  To: Andrew Morton
  Cc: Johannes Weiner, Mel Gorman, Minchan Kim, Hillf Danton, linux-mm, LKML

Hi,
I have previously posted this as an RFC [1]. There didn't seem to be
any objections other than some requests to reorganize the changes in
a slightly different way, so I am reposting the series and asking for
inclusion.

This is a follow-up on top of [2]. Patch 1 cleans up the code a bit.
I haven't seen any real issues or bug reports, but conceptually,
ignoring the maximum eligible zone in get_scan_count is wrong by
definition; patch 2 fixes that. Patch 3 removes
inactive_reclaimable_pages, which was a workaround for a problem that
should have been addressed in get_scan_count in the first place.

There is one more place which needs special handling and is not part
of this series: too_many_isolated can get confused as well. I already
have some preliminary work, but it still needs testing, so I will post
it separately.

Michal Hocko (3):
      mm, vmscan: cleanup lru size calculations
      mm, vmscan: consider eligible zones in get_scan_count
      Revert "mm: bail out in shrink_inactive_list()"

 include/linux/mmzone.h |   2 +-
 mm/vmscan.c            | 116 +++++++++++++++++++------------------------------
 mm/workingset.c        |   2 +-
 3 files changed, 46 insertions(+), 74 deletions(-)

[1] http://lkml.kernel.org/r/20170110125552.4170-1-mhocko@kernel.org
[2] http://lkml.kernel.org/r/20170104100825.3729-1-mhocko@kernel.org

* [PATCH 1/3] mm, vmscan: cleanup lru size calculations
From: Michal Hocko @ 2017-01-17 10:37 UTC
  To: Andrew Morton
  Cc: Johannes Weiner, Mel Gorman, Minchan Kim, Hillf Danton, linux-mm,
	LKML, Michal Hocko

From: Michal Hocko <mhocko@suse.com>

lruvec_lru_size returns the full size of the LRU list, but we sometimes
need a value restricted to the eligible zones (e.g. for lowmem
requests); inactive_list_is_low is one such user, and later patches
will add more. Add a new parameter to lruvec_lru_size and allow it to
filter out zones which are not eligible for the given context.

Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Michal Hocko <mhocko@suse.com>
---
 include/linux/mmzone.h |  2 +-
 mm/vmscan.c            | 89 +++++++++++++++++++++++++-------------------------
 mm/workingset.c        |  2 +-
 3 files changed, 46 insertions(+), 47 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d1d440cff60e..91f69aa0d581 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -780,7 +780,7 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
 #endif
 }
 
-extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru);
+extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx);
 
 #ifdef CONFIG_HAVE_MEMORY_PRESENT
 void memory_present(int nid, unsigned long start, unsigned long end);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index cf940af609fd..aed39dc272c0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -234,22 +234,39 @@ bool pgdat_reclaimable(struct pglist_data *pgdat)
 		pgdat_reclaimable_pages(pgdat) * 6;
 }
 
-unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
+/**
+ * lruvec_lru_size -  Returns the number of pages on the given LRU list.
+ * @lruvec: lru vector
+ * @lru: lru to use
+ * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
+ */
+unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
 {
+	unsigned long lru_size;
+	int zid;
+
 	if (!mem_cgroup_disabled())
-		return mem_cgroup_get_lru_size(lruvec, lru);
+		lru_size = mem_cgroup_get_lru_size(lruvec, lru);
+	else
+		lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
 
-	return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
-}
+	for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
+		struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
+		unsigned long size;
 
-unsigned long lruvec_zone_lru_size(struct lruvec *lruvec, enum lru_list lru,
-				   int zone_idx)
-{
-	if (!mem_cgroup_disabled())
-		return mem_cgroup_get_zone_lru_size(lruvec, lru, zone_idx);
+		if (!managed_zone(zone))
+			continue;
+
+		if (!mem_cgroup_disabled())
+			size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
+		else
+			size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid],
+				       NR_ZONE_LRU_BASE + lru);
+		lru_size -= min(size, lru_size);
+	}
+
+	return lru_size;
 
-	return zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zone_idx],
-			       NR_ZONE_LRU_BASE + lru);
 }
 
 /*
@@ -2051,11 +2068,10 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
 						struct scan_control *sc, bool trace)
 {
 	unsigned long inactive_ratio;
-	unsigned long total_inactive, inactive;
-	unsigned long total_active, active;
+	unsigned long inactive, active;
+	enum lru_list inactive_lru = file * LRU_FILE;
+	enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
 	unsigned long gb;
-	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
-	int zid;
 
 	/*
 	 * If we don't have swap space, anonymous page deactivation
@@ -2064,27 +2080,8 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
 	if (!file && !total_swap_pages)
 		return false;
 
-	total_inactive = inactive = lruvec_lru_size(lruvec, file * LRU_FILE);
-	total_active = active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);
-
-	/*
-	 * For zone-constrained allocations, it is necessary to check if
-	 * deactivations are required for lowmem to be reclaimed. This
-	 * calculates the inactive/active pages available in eligible zones.
-	 */
-	for (zid = sc->reclaim_idx + 1; zid < MAX_NR_ZONES; zid++) {
-		struct zone *zone = &pgdat->node_zones[zid];
-		unsigned long inactive_zone, active_zone;
-
-		if (!managed_zone(zone))
-			continue;
-
-		inactive_zone = lruvec_zone_lru_size(lruvec, file * LRU_FILE, zid);
-		active_zone = lruvec_zone_lru_size(lruvec, (file * LRU_FILE) + LRU_ACTIVE, zid);
-
-		inactive -= min(inactive, inactive_zone);
-		active -= min(active, active_zone);
-	}
+	inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
+	active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
 
 	gb = (inactive + active) >> (30 - PAGE_SHIFT);
 	if (gb)
@@ -2093,10 +2090,12 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
 		inactive_ratio = 1;
 
 	if (trace)
-		trace_mm_vmscan_inactive_list_is_low(pgdat->node_id,
+		trace_mm_vmscan_inactive_list_is_low(lruvec_pgdat(lruvec)->node_id,
 				sc->reclaim_idx,
-				total_inactive, inactive,
-				total_active, active, inactive_ratio, file);
+				lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
+				lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
+				inactive_ratio, file);
+
 	return inactive * inactive_ratio < active;
 }
 
@@ -2236,7 +2235,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	 * system is under heavy pressure.
 	 */
 	if (!inactive_list_is_low(lruvec, true, sc, false) &&
-	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
+	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES) >> sc->priority) {
 		scan_balance = SCAN_FILE;
 		goto out;
 	}
@@ -2262,10 +2261,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	 * anon in [0], file in [1]
 	 */
 
-	anon  = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) +
-		lruvec_lru_size(lruvec, LRU_INACTIVE_ANON);
-	file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
-		lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
+	anon  = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
+		lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
+	file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
+		lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);
 
 	spin_lock_irq(&pgdat->lru_lock);
 	if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
@@ -2303,7 +2302,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 			unsigned long size;
 			unsigned long scan;
 
-			size = lruvec_lru_size(lruvec, lru);
+			size = lruvec_lru_size(lruvec, lru, MAX_NR_ZONES);
 			scan = size >> sc->priority;
 
 			if (!scan && pass && force_scan)
diff --git a/mm/workingset.c b/mm/workingset.c
index abb58ffa3c64..a67f5796b995 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -267,7 +267,7 @@ bool workingset_refault(void *shadow)
 	}
 	lruvec = mem_cgroup_lruvec(pgdat, memcg);
 	refault = atomic_long_read(&lruvec->inactive_age);
-	active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
+	active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);
 	rcu_read_unlock();
 
 	/*
-- 
2.11.0

* [PATCH 2/3] mm, vmscan: consider eligible zones in get_scan_count
From: Michal Hocko @ 2017-01-17 10:37 UTC
  To: Andrew Morton
  Cc: Johannes Weiner, Mel Gorman, Minchan Kim, Hillf Danton, linux-mm,
	LKML, Michal Hocko

From: Michal Hocko <mhocko@suse.com>

get_scan_count considers the whole node LRU size when
- doing SCAN_FILE due to many page cache inactive pages
- calculating the number of pages to scan

In both cases this might lead to unexpected behavior, especially on 32b
systems, where we can expect lowmem memory pressure very often.

A large highmem zone can easily distort the SCAN_FILE heuristic because
there might be only a few file pages from the eligible zones on the
node LRU and we would still enforce file LRU scanning, which can lead
to thrashing, while we could still scan anonymous pages instead.

The latter use of lruvec_lru_size can be problematic as well,
especially when there are not many pages from the eligible zones. We
would have to skip over many pages to find anything to reclaim, but
shrink_node_memcg would only reduce the remaining number to scan by
SWAP_CLUSTER_MAX at maximum. Therefore we can end up going over a large
LRU many times without actually having a chance to reclaim much, if
anything at all. The closer the lowmem zone is to being out of memory,
the worse the problem becomes.

Fix this by filtering out all the ineligible zones when calculating the
LRU size for both paths, considering only zones up to sc->reclaim_idx.
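
To make the effect concrete, a rough sketch with made-up numbers (a
hypothetical 32b node where highmem dominates; assuming sc->priority
== DEF_PRIORITY == 12, these values are illustrative only, not taken
from a real report):

	/* ~780MB of inactive file pages, almost all of it highmem */
	unsigned long total_file = 200000;
	/* ~400kB of inactive file pages in the eligible lowmem zones */
	unsigned long eligible_file = 100;

	/* old: 200000 >> 12 == 48, so SCAN_FILE stays enforced even
	 * though a lowmem request can reclaim almost none of it */
	unsigned long scan_old = total_file >> 12;
	/* new: 100 >> 12 == 0, the heuristic backs off and anonymous
	 * pages can be scanned as well */
	unsigned long scan_new = eligible_file >> 12;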

Acked-by: Minchan Kim <minchan@kernel.org>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Signed-off-by: Michal Hocko <mhocko@suse.com>
---
 mm/vmscan.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index aed39dc272c0..ffac8fa7bdd8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2235,7 +2235,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	 * system is under heavy pressure.
 	 */
 	if (!inactive_list_is_low(lruvec, true, sc, false) &&
-	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES) >> sc->priority) {
+	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
 		scan_balance = SCAN_FILE;
 		goto out;
 	}
@@ -2302,7 +2302,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 			unsigned long size;
 			unsigned long scan;
 
-			size = lruvec_lru_size(lruvec, lru, MAX_NR_ZONES);
+			size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
 			scan = size >> sc->priority;
 
 			if (!scan && pass && force_scan)
-- 
2.11.0

* [PATCH 3/3] Revert "mm: bail out in shrink_inactive_list()"
From: Michal Hocko @ 2017-01-17 10:37 UTC
  To: Andrew Morton
  Cc: Johannes Weiner, Mel Gorman, Minchan Kim, Hillf Danton, linux-mm,
	LKML, Michal Hocko

From: Michal Hocko <mhocko@suse.com>

This reverts commit 91dcade47a3d0e7c31464ef05f56c08e92a0e9c2.

inactive_reclaimable_pages shouldn't be needed anymore now that
get_scan_count is aware of the eligible zones ("mm, vmscan: consider
eligible zones in get_scan_count").
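
A minimal sketch of why the bail-out became redundant (condition
shapes taken from the diffs in this series, simplified rather than
literal kernel code): once get_scan_count sizes the LRUs against
sc->reclaim_idx, an LRU with no eligible pages gets a zero scan
target, so shrink_inactive_list is not asked to do any work for it in
the first place:

	size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
	scan = size >> sc->priority;	/* 0 when nothing is eligible */
	/* nr[lru] == 0 means shrink_list() never reaches
	 * shrink_inactive_list() for this lru */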

Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Michal Hocko <mhocko@suse.com>
---
 mm/vmscan.c | 27 ---------------------------
 1 file changed, 27 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index ffac8fa7bdd8..f3255702f3df 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1701,30 +1701,6 @@ static int current_may_throttle(void)
 		bdi_write_congested(current->backing_dev_info);
 }
 
-static bool inactive_reclaimable_pages(struct lruvec *lruvec,
-				struct scan_control *sc, enum lru_list lru)
-{
-	int zid;
-	struct zone *zone;
-	int file = is_file_lru(lru);
-	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
-
-	if (!global_reclaim(sc))
-		return true;
-
-	for (zid = sc->reclaim_idx; zid >= 0; zid--) {
-		zone = &pgdat->node_zones[zid];
-		if (!managed_zone(zone))
-			continue;
-
-		if (zone_page_state_snapshot(zone, NR_ZONE_LRU_BASE +
-				LRU_FILE * file) >= SWAP_CLUSTER_MAX)
-			return true;
-	}
-
-	return false;
-}
-
 /*
  * shrink_inactive_list() is a helper for shrink_node().  It returns the number
  * of reclaimed pages
@@ -1743,9 +1719,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
 
-	if (!inactive_reclaimable_pages(lruvec, sc, lru))
-		return 0;
-
 	while (unlikely(too_many_isolated(pgdat, file, sc))) {
 		congestion_wait(BLK_RW_ASYNC, HZ/10);
 
-- 
2.11.0

* Re: [PATCH 0/3] follow up nodereclaim for 32b fix
From: Mel Gorman @ 2017-01-17 11:13 UTC
  To: Michal Hocko
  Cc: Andrew Morton, Johannes Weiner, Minchan Kim, Hillf Danton,
	linux-mm, LKML

On Tue, Jan 17, 2017 at 11:36:59AM +0100, Michal Hocko wrote:
> Hi,
> I have previously posted this as an RFC [1]. There didn't seem to be
> any objections other than some requests to reorganize the changes in
> a slightly different way, so I am reposting the series and asking for
> inclusion.
> 
> This is a follow-up on top of [2]. Patch 1 cleans up the code a bit.
> I haven't seen any real issues or bug reports, but conceptually,
> ignoring the maximum eligible zone in get_scan_count is wrong by
> definition; patch 2 fixes that. Patch 3 removes
> inactive_reclaimable_pages, which was a workaround for a problem that
> should have been addressed in get_scan_count in the first place.
>
> There is one more place which needs special handling and is not part
> of this series: too_many_isolated can get confused as well. I already
> have some preliminary work, but it still needs testing, so I will post
> it separately.
> 
> Michal Hocko (3):
>       mm, vmscan: cleanup lru size calculations
>       mm, vmscan: consider eligible zones in get_scan_count
>       Revert "mm: bail out in shrink_inactive_list()"
> 

Acked-by: Mel Gorman <mgorman@suse.de>

-- 
Mel Gorman
SUSE Labs

* Re: [PATCH 2/3] mm, vmscan: consider eligible zones in get_scan_count
From: Johannes Weiner @ 2017-01-18 16:46 UTC
  To: Michal Hocko
  Cc: Andrew Morton, Mel Gorman, Minchan Kim, Hillf Danton, linux-mm,
	LKML, Michal Hocko

On Tue, Jan 17, 2017 at 11:37:01AM +0100, Michal Hocko wrote:
> From: Michal Hocko <mhocko@suse.com>
> 
> get_scan_count considers the whole node LRU size when
> - doing SCAN_FILE due to many page cache inactive pages
> - calculating the number of pages to scan
> 
> In both cases this might lead to unexpected behavior, especially on 32b
> systems, where we can expect lowmem memory pressure very often.
>
> A large highmem zone can easily distort the SCAN_FILE heuristic because
> there might be only a few file pages from the eligible zones on the
> node LRU and we would still enforce file LRU scanning, which can lead
> to thrashing, while we could still scan anonymous pages instead.
>
> The latter use of lruvec_lru_size can be problematic as well,
> especially when there are not many pages from the eligible zones. We
> would have to skip over many pages to find anything to reclaim, but
> shrink_node_memcg would only reduce the remaining number to scan by
> SWAP_CLUSTER_MAX at maximum. Therefore we can end up going over a large
> LRU many times without actually having a chance to reclaim much, if
> anything at all. The closer the lowmem zone is to being out of memory,
> the worse the problem becomes.
>
> Fix this by filtering out all the ineligible zones when calculating the
> LRU size for both paths, considering only zones up to sc->reclaim_idx.
> 
> Acked-by: Minchan Kim <minchan@kernel.org>
> Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
> Signed-off-by: Michal Hocko <mhocko@suse.com>

Acked-by: Johannes Weiner <hannes@cmpxchg.org>

* Re: [PATCH 3/3] Revert "mm: bail out in shrink_inactive_list()"
From: Johannes Weiner @ 2017-01-18 16:48 UTC
  To: Michal Hocko
  Cc: Andrew Morton, Mel Gorman, Minchan Kim, Hillf Danton, linux-mm,
	LKML, Michal Hocko

On Tue, Jan 17, 2017 at 11:37:02AM +0100, Michal Hocko wrote:
> From: Michal Hocko <mhocko@suse.com>
> 
> This reverts commit 91dcade47a3d0e7c31464ef05f56c08e92a0e9c2.
> 
> inactive_reclaimable_pages shouldn't be needed anymore now that
> get_scan_count is aware of the eligible zones ("mm, vmscan: consider
> eligible zones in get_scan_count").
> 
> Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
> Acked-by: Minchan Kim <minchan@kernel.org>
> Signed-off-by: Michal Hocko <mhocko@suse.com>

Acked-by: Johannes Weiner <hannes@cmpxchg.org>

* Re: [PATCH 2/3] mm, vmscan: consider eligible zones in get_scan_count
From: Michal Hocko @ 2017-02-06  8:10 UTC
  To: Andrew Morton
  Cc: Johannes Weiner, Mel Gorman, Minchan Kim, Hillf Danton, linux-mm,
	LKML, Trevor Cordes

Hi Andrew,
it turned out that this is not a theoretical issue after all. Trevor
(added to the CC) was seeing premature OOM killer invocations [1],
bisected to b2e18757f2c9 ("mm, vmscan: begin reclaiming pages on a
per-node basis").
After some going back and forth it turned out that b4536f0c829c ("mm,
memcg: fix the active list aging for lowmem requests when memcg is
enabled") helped a lot but wasn't sufficient on its own. We also need
this patch to make the OOM behavior stable again, so I suggest
backporting it to stable as well. Could you update the changelog as
follows?

The patch would need to be tweaked a bit to apply to 4.10 and older
but I will do that as soon as it hits the Linus tree in the next merge
window.

[1] http://lkml.kernel.org/r/20170111103243.GA27795@pog.tecnopolis.ca

On Tue 17-01-17 11:37:01, Michal Hocko wrote:
> From: Michal Hocko <mhocko@suse.com>
> 
> get_scan_count considers the whole node LRU size when
> - doing SCAN_FILE due to many page cache inactive pages
> - calculating the number of pages to scan
> 
> In both cases this might lead to unexpected behavior, especially on 32b
> systems, where we can expect lowmem memory pressure very often.
>
> A large highmem zone can easily distort the SCAN_FILE heuristic because
> there might be only a few file pages from the eligible zones on the
> node LRU and we would still enforce file LRU scanning, which can lead
> to thrashing, while we could still scan anonymous pages instead.
>
> The latter use of lruvec_lru_size can be problematic as well,
> especially when there are not many pages from the eligible zones. We
> would have to skip over many pages to find anything to reclaim, but
> shrink_node_memcg would only reduce the remaining number to scan by
> SWAP_CLUSTER_MAX at maximum. Therefore we can end up going over a large
> LRU many times without actually having a chance to reclaim much, if
> anything at all. The closer the lowmem zone is to being out of memory,
> the worse the problem becomes.
>
> Fix this by filtering out all the ineligible zones when calculating the
> LRU size for both paths, considering only zones up to sc->reclaim_idx.
> 

Fixes: b2e18757f2c9 ("mm, vmscan: begin reclaiming pages on a per-node basis")
Cc: stable # 4.8+
Tested-by: Trevor Cordes <trevor@tecnopolis.ca>

> Acked-by: Minchan Kim <minchan@kernel.org>
> Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
> Signed-off-by: Michal Hocko <mhocko@suse.com>
> ---
>  mm/vmscan.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index aed39dc272c0..ffac8fa7bdd8 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2235,7 +2235,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
>  	 * system is under heavy pressure.
>  	 */
>  	if (!inactive_list_is_low(lruvec, true, sc, false) &&
> -	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES) >> sc->priority) {
> +	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
>  		scan_balance = SCAN_FILE;
>  		goto out;
>  	}
> @@ -2302,7 +2302,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
>  			unsigned long size;
>  			unsigned long scan;
>  
> -			size = lruvec_lru_size(lruvec, lru, MAX_NR_ZONES);
> +			size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
>  			scan = size >> sc->priority;
>  
>  			if (!scan && pass && force_scan)
> -- 
> 2.11.0
> 

-- 
Michal Hocko
SUSE Labs

* Re: [PATCH 2/3] mm, vmscan: consider eligible zones in get_scan_count
From: Andrew Morton @ 2017-02-06 23:40 UTC
  To: Michal Hocko
  Cc: Johannes Weiner, Mel Gorman, Minchan Kim, Hillf Danton, linux-mm,
	LKML, Trevor Cordes

On Mon, 6 Feb 2017 09:10:07 +0100 Michal Hocko <mhocko@kernel.org> wrote:

> Hi Andrew,
> it turned out that this is not a theoretical issue after all. Trevor
> (added to the CC) was seeing premature OOM killer invocations [1],
> bisected to b2e18757f2c9 ("mm, vmscan: begin reclaiming pages on a
> per-node basis").
> After some going back and forth it turned out that b4536f0c829c ("mm,
> memcg: fix the active list aging for lowmem requests when memcg is
> enabled") helped a lot but wasn't sufficient on its own. We also need
> this patch to make the OOM behavior stable again, so I suggest
> backporting it to stable as well. Could you update the changelog as
> follows?
> 
> The patch would need to be tweaked a bit to apply to 4.10 and older
> but I will do that as soon as it hits the Linus tree in the next merge
> window.
> 
> ...
>
> Fixes: b2e18757f2c9 ("mm, vmscan: begin reclaiming pages on a per-node basis")
> Cc: stable # 4.8+
> Tested-by: Trevor Cordes <trevor@tecnopolis.ca>

No probs.

* Re: [PATCH 2/3] mm, vmscan: consider eligible zones in get_scan_count
From: Hillf Danton @ 2017-01-17  3:42 UTC
  To: 'Michal Hocko', 'Johannes Weiner'
  Cc: 'Minchan Kim', 'Mel Gorman',
	linux-mm, 'LKML', 'Michal Hocko'


On Tuesday, January 17, 2017 3:33 AM Michal Hocko wrote: 
> 
> From: Michal Hocko <mhocko@suse.com>
> 
> get_scan_count considers the whole node LRU size when
> - doing SCAN_FILE due to many page cache inactive pages
> - calculating the number of pages to scan
> 
> In both cases this might lead to unexpected behavior, especially on 32b
> systems, where we can expect lowmem memory pressure very often.
>
> A large highmem zone can easily distort the SCAN_FILE heuristic because
> there might be only a few file pages from the eligible zones on the
> node LRU and we would still enforce file LRU scanning, which can lead
> to thrashing, while we could still scan anonymous pages instead.
>
> The latter use of lruvec_lru_size can be problematic as well,
> especially when there are not many pages from the eligible zones. We
> would have to skip over many pages to find anything to reclaim, but
> shrink_node_memcg would only reduce the remaining number to scan by
> SWAP_CLUSTER_MAX at maximum. Therefore we can end up going over a large
> LRU many times without actually having a chance to reclaim much, if
> anything at all. The closer the lowmem zone is to being out of memory,
> the worse the problem becomes.
>
> Fix this by filtering out all the ineligible zones when calculating the
> LRU size for both paths, considering only zones up to sc->reclaim_idx.
> 
> Acked-by: Minchan Kim <minchan@kernel.org>
> Signed-off-by: Michal Hocko <mhocko@suse.com>
> ---
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com> 

* [PATCH 2/3] mm, vmscan: consider eligible zones in get_scan_count
From: Michal Hocko @ 2017-01-16 19:33 UTC
  To: Johannes Weiner
  Cc: Minchan Kim, Mel Gorman, Hillf Danton, linux-mm, LKML, Michal Hocko

From: Michal Hocko <mhocko@suse.com>

get_scan_count considers the whole node LRU size when
- doing SCAN_FILE due to many page cache inactive pages
- calculating the number of pages to scan

In both cases this might lead to unexpected behavior, especially on 32b
systems, where we can expect lowmem memory pressure very often.

A large highmem zone can easily distort the SCAN_FILE heuristic because
there might be only a few file pages from the eligible zones on the
node LRU and we would still enforce file LRU scanning, which can lead
to thrashing, while we could still scan anonymous pages instead.

The latter use of lruvec_lru_size can be problematic as well,
especially when there are not many pages from the eligible zones. We
would have to skip over many pages to find anything to reclaim, but
shrink_node_memcg would only reduce the remaining number to scan by
SWAP_CLUSTER_MAX at maximum. Therefore we can end up going over a large
LRU many times without actually having a chance to reclaim much, if
anything at all. The closer the lowmem zone is to being out of memory,
the worse the problem becomes.

Fix this by filtering out all the ineligible zones when calculating the
LRU size for both paths, considering only zones up to sc->reclaim_idx.

Acked-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Michal Hocko <mhocko@suse.com>
---
 mm/vmscan.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1cb0ebdef305..a88e222784ea 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2234,7 +2234,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	 * system is under heavy pressure.
 	 */
 	if (!inactive_list_is_low(lruvec, true, sc, false) &&
-	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES) >> sc->priority) {
+	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
 		scan_balance = SCAN_FILE;
 		goto out;
 	}
@@ -2301,7 +2301,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 			unsigned long size;
 			unsigned long scan;
 
-			size = lruvec_lru_size(lruvec, lru, MAX_NR_ZONES);
+			size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
 			scan = size >> sc->priority;
 
 			if (!scan && pass && force_scan)
-- 
2.11.0
