linux-kernel.vger.kernel.org archive mirror
* [PATCH 1/3] mm, vmscan: cleanup lru size calculations
       [not found] <20170116160123.GB30300@cmpxchg.org>
@ 2017-01-16 19:33 ` Michal Hocko
  2017-01-16 19:33   ` [PATCH 2/3] mm, vmscan: consider eligible zones in get_scan_count Michal Hocko
                     ` (3 more replies)
  0 siblings, 4 replies; 12+ messages in thread
From: Michal Hocko @ 2017-01-16 19:33 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Minchan Kim, Mel Gorman, Hillf Danton, linux-mm, LKML, Michal Hocko

From: Michal Hocko <mhocko@suse.com>

lruvec_lru_size returns the full size of the LRU list, while we sometimes
need a value restricted only to the eligible zones (e.g. for lowmem
requests). inactive_list_is_low is one such user. Later patches will add
more of them. Add a new parameter to lruvec_lru_size and allow it to
filter out zones which are not eligible for the given context.

Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Michal Hocko <mhocko@suse.com>
---
 include/linux/mmzone.h |  2 +-
 mm/vmscan.c            | 88 ++++++++++++++++++++++++--------------------------
 mm/workingset.c        |  2 +-
 3 files changed, 45 insertions(+), 47 deletions(-)
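
As an illustration only (not part of the patch), the semantics of the new
zone_idx argument can be modelled in plain userspace C: the eligible size
is the node-wide LRU size minus the pages sitting in zones above zone_idx,
so passing MAX_NR_ZONES keeps the old whole-node behaviour. The zone layout
and page counts below are made up:

#include <stdio.h>

#define MAX_NR_ZONES 4	/* stand-in for the kernel constant */

/* mirrors the subtraction loop added by the patch below */
static unsigned long model_lru_size(const unsigned long zone_pages[MAX_NR_ZONES],
				    int zone_idx)
{
	unsigned long size = 0;
	int zid;

	for (zid = 0; zid < MAX_NR_ZONES; zid++)
		size += zone_pages[zid];		/* whole-node LRU size */

	for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++)
		size -= (zone_pages[zid] < size) ? zone_pages[zid] : size;

	return size;
}

int main(void)
{
	/* made-up inactive-file counts: DMA, Normal, HighMem, Movable */
	unsigned long pages[MAX_NR_ZONES] = { 1000, 30000, 450000, 0 };

	printf("whole node : %lu\n", model_lru_size(pages, MAX_NR_ZONES));
	printf("lowmem only: %lu\n", model_lru_size(pages, 1));
	return 0;
}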

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d1d440cff60e..91f69aa0d581 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -780,7 +780,7 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
 #endif
 }
 
-extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru);
+extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx);
 
 #ifdef CONFIG_HAVE_MEMORY_PRESENT
 void memory_present(int nid, unsigned long start, unsigned long end);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index cf940af609fd..1cb0ebdef305 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -234,22 +234,38 @@ bool pgdat_reclaimable(struct pglist_data *pgdat)
 		pgdat_reclaimable_pages(pgdat) * 6;
 }
 
-unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
+/** lruvec_lru_size -  Returns the number of pages on the given LRU list.
+ * @lruvec: lru vector
+ * @lru: lru to use
+ * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
+ */
+unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
 {
+	unsigned long lru_size;
+	int zid;
+
 	if (!mem_cgroup_disabled())
-		return mem_cgroup_get_lru_size(lruvec, lru);
+		lru_size = mem_cgroup_get_lru_size(lruvec, lru);
+	else
+		lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
 
-	return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
-}
+	for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
+		struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
+		unsigned long size;
 
-unsigned long lruvec_zone_lru_size(struct lruvec *lruvec, enum lru_list lru,
-				   int zone_idx)
-{
-	if (!mem_cgroup_disabled())
-		return mem_cgroup_get_zone_lru_size(lruvec, lru, zone_idx);
+		if (!managed_zone(zone))
+			continue;
+
+		if (!mem_cgroup_disabled())
+			size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
+		else
+			size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid],
+				       NR_ZONE_LRU_BASE + lru);
+		lru_size -= min(size, lru_size);
+	}
+
+	return lru_size;
 
-	return zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zone_idx],
-			       NR_ZONE_LRU_BASE + lru);
 }
 
 /*
@@ -2051,11 +2067,10 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
 						struct scan_control *sc, bool trace)
 {
 	unsigned long inactive_ratio;
-	unsigned long total_inactive, inactive;
-	unsigned long total_active, active;
+	unsigned long inactive, active;
+	enum lru_list inactive_lru = file * LRU_FILE;
+	enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
 	unsigned long gb;
-	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
-	int zid;
 
 	/*
 	 * If we don't have swap space, anonymous page deactivation
@@ -2064,27 +2079,8 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
 	if (!file && !total_swap_pages)
 		return false;
 
-	total_inactive = inactive = lruvec_lru_size(lruvec, file * LRU_FILE);
-	total_active = active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);
-
-	/*
-	 * For zone-constrained allocations, it is necessary to check if
-	 * deactivations are required for lowmem to be reclaimed. This
-	 * calculates the inactive/active pages available in eligible zones.
-	 */
-	for (zid = sc->reclaim_idx + 1; zid < MAX_NR_ZONES; zid++) {
-		struct zone *zone = &pgdat->node_zones[zid];
-		unsigned long inactive_zone, active_zone;
-
-		if (!managed_zone(zone))
-			continue;
-
-		inactive_zone = lruvec_zone_lru_size(lruvec, file * LRU_FILE, zid);
-		active_zone = lruvec_zone_lru_size(lruvec, (file * LRU_FILE) + LRU_ACTIVE, zid);
-
-		inactive -= min(inactive, inactive_zone);
-		active -= min(active, active_zone);
-	}
+	inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
+	active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
 
 	gb = (inactive + active) >> (30 - PAGE_SHIFT);
 	if (gb)
@@ -2093,10 +2089,12 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
 		inactive_ratio = 1;
 
 	if (trace)
-		trace_mm_vmscan_inactive_list_is_low(pgdat->node_id,
+		trace_mm_vmscan_inactive_list_is_low(lruvec_pgdat(lruvec)->node_id,
 				sc->reclaim_idx,
-				total_inactive, inactive,
-				total_active, active, inactive_ratio, file);
+				lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
+				lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
+				inactive_ratio, file);
+
 	return inactive * inactive_ratio < active;
 }
 
@@ -2236,7 +2234,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	 * system is under heavy pressure.
 	 */
 	if (!inactive_list_is_low(lruvec, true, sc, false) &&
-	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
+	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES) >> sc->priority) {
 		scan_balance = SCAN_FILE;
 		goto out;
 	}
@@ -2262,10 +2260,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	 * anon in [0], file in [1]
 	 */
 
-	anon  = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) +
-		lruvec_lru_size(lruvec, LRU_INACTIVE_ANON);
-	file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
-		lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
+	anon  = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
+		lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
+	file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
+		lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);
 
 	spin_lock_irq(&pgdat->lru_lock);
 	if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
@@ -2303,7 +2301,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 			unsigned long size;
 			unsigned long scan;
 
-			size = lruvec_lru_size(lruvec, lru);
+			size = lruvec_lru_size(lruvec, lru, MAX_NR_ZONES);
 			scan = size >> sc->priority;
 
 			if (!scan && pass && force_scan)
diff --git a/mm/workingset.c b/mm/workingset.c
index abb58ffa3c64..a67f5796b995 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -267,7 +267,7 @@ bool workingset_refault(void *shadow)
 	}
 	lruvec = mem_cgroup_lruvec(pgdat, memcg);
 	refault = atomic_long_read(&lruvec->inactive_age);
-	active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
+	active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);
 	rcu_read_unlock();
 
 	/*
-- 
2.11.0


* [PATCH 2/3] mm, vmscan: consider eligible zones in get_scan_count
  2017-01-16 19:33 ` [PATCH 1/3] mm, vmscan: cleanup lru size calculations Michal Hocko
@ 2017-01-16 19:33   ` Michal Hocko
  2017-01-17  3:42     ` Hillf Danton
  2017-01-16 19:33   ` [PATCH 3/3] Reverted "mm: bail out in shrink_inactive_list()" Michal Hocko
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 12+ messages in thread
From: Michal Hocko @ 2017-01-16 19:33 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Minchan Kim, Mel Gorman, Hillf Danton, linux-mm, LKML, Michal Hocko

From: Michal Hocko <mhocko@suse.com>

get_scan_count considers the whole node LRU size when
- doing SCAN_FILE due to many page cache inactive pages
- calculating the number of pages to scan

In both cases this might lead to unexpected behavior, especially on 32b
systems where we can expect lowmem memory pressure very often.

A large highmem zone can easily distort the SCAN_FILE heuristic because
there might be only a few file pages from the eligible zones on the node
LRU and we would still enforce file LRU scanning, which can lead to
thrashing while we could still scan anonymous pages.

The latter use of lruvec_lru_size can be problematic as well, especially
when there are not many pages from the eligible zones. We would have to
skip over many pages to find anything to reclaim, but shrink_node_memcg
would only reduce the remaining number to scan by SWAP_CLUSTER_MAX at
most. Therefore we can end up going over a large LRU many times without
actually having a chance to reclaim much, if anything at all. The closer
we are to running out of memory in the lowmem zones, the worse the
problem will be.

Fix this by filtering out all the ineligible zones when calculating the
LRU size for both paths and considering only sc->reclaim_idx zones.

Acked-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Michal Hocko <mhocko@suse.com>
---
 mm/vmscan.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
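
To make the SCAN_FILE distortion concrete, a stand-alone back-of-the-envelope
sketch (not part of the patch, the numbers are made up; DEF_PRIORITY is 12):

#include <stdio.h>

int main(void)
{
	/* a 32b box with a big highmem zone, lowmem (GFP_KERNEL) request */
	unsigned long node_wide = 500000;	/* inactive file pages, whole node */
	unsigned long eligible  = 64;		/* ... in zones <= sc->reclaim_idx */
	int priority = 12;			/* DEF_PRIORITY */

	/* old check: non-zero, so scan_balance = SCAN_FILE is forced */
	printf("node wide >> priority = %lu\n", node_wide >> priority);

	/* new check: zero, the shortcut no longer fires and the anon LRUs
	 * remain candidates for scanning */
	printf("eligible  >> priority = %lu\n", eligible >> priority);

	return 0;
}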

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1cb0ebdef305..a88e222784ea 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2234,7 +2234,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	 * system is under heavy pressure.
 	 */
 	if (!inactive_list_is_low(lruvec, true, sc, false) &&
-	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES) >> sc->priority) {
+	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
 		scan_balance = SCAN_FILE;
 		goto out;
 	}
@@ -2301,7 +2301,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 			unsigned long size;
 			unsigned long scan;
 
-			size = lruvec_lru_size(lruvec, lru, MAX_NR_ZONES);
+			size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
 			scan = size >> sc->priority;
 
 			if (!scan && pass && force_scan)
-- 
2.11.0


* [PATCH 3/3] Reverted "mm: bail out in shrink_inactive_list()"
  2017-01-16 19:33 ` [PATCH 1/3] mm, vmscan: cleanup lru size calculations Michal Hocko
  2017-01-16 19:33   ` [PATCH 2/3] mm, vmscan: consider eligible zones in get_scan_count Michal Hocko
@ 2017-01-16 19:33   ` Michal Hocko
  2017-01-17  3:58     ` Hillf Danton
  2017-01-17  6:58     ` Minchan Kim
  2017-01-17  3:40   ` [PATCH 1/3] mm, vmscan: cleanup lru size calculations Hillf Danton
  2017-01-17  6:58   ` Minchan Kim
  3 siblings, 2 replies; 12+ messages in thread
From: Michal Hocko @ 2017-01-16 19:33 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Minchan Kim, Mel Gorman, Hillf Danton, linux-mm, LKML, Michal Hocko

From: Michal Hocko <mhocko@suse.com>

This reverts commit 91dcade47a3d0e7c31464ef05f56c08e92a0e9c2.
inactive_reclaimable_pages shouldn't be needed anymore now that
get_scan_count is aware of the eligible zones ("mm, vmscan: consider
eligible zones in get_scan_count").

Signed-off-by: Michal Hocko <mhocko@suse.com>
---
 mm/vmscan.c | 27 ---------------------------
 1 file changed, 27 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index a88e222784ea..486ba6d7dc4c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1700,30 +1700,6 @@ static int current_may_throttle(void)
 		bdi_write_congested(current->backing_dev_info);
 }
 
-static bool inactive_reclaimable_pages(struct lruvec *lruvec,
-				struct scan_control *sc, enum lru_list lru)
-{
-	int zid;
-	struct zone *zone;
-	int file = is_file_lru(lru);
-	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
-
-	if (!global_reclaim(sc))
-		return true;
-
-	for (zid = sc->reclaim_idx; zid >= 0; zid--) {
-		zone = &pgdat->node_zones[zid];
-		if (!managed_zone(zone))
-			continue;
-
-		if (zone_page_state_snapshot(zone, NR_ZONE_LRU_BASE +
-				LRU_FILE * file) >= SWAP_CLUSTER_MAX)
-			return true;
-	}
-
-	return false;
-}
-
 /*
  * shrink_inactive_list() is a helper for shrink_node().  It returns the number
  * of reclaimed pages
@@ -1742,9 +1718,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
 
-	if (!inactive_reclaimable_pages(lruvec, sc, lru))
-		return 0;
-
 	while (unlikely(too_many_isolated(pgdat, file, sc))) {
 		congestion_wait(BLK_RW_ASYNC, HZ/10);
 
-- 
2.11.0


* Re: [PATCH 1/3] mm, vmscan: cleanup lru size calculations
  2017-01-16 19:33 ` [PATCH 1/3] mm, vmscan: cleanup lru size calculations Michal Hocko
  2017-01-16 19:33   ` [PATCH 2/3] mm, vmscan: consider eligible zones in get_scan_count Michal Hocko
  2017-01-16 19:33   ` [PATCH 3/3] Reverted "mm: bail out in shrink_inactive_list()" Michal Hocko
@ 2017-01-17  3:40   ` Hillf Danton
  2017-01-17  6:58   ` Minchan Kim
  3 siblings, 0 replies; 12+ messages in thread
From: Hillf Danton @ 2017-01-17  3:40 UTC (permalink / raw)
  To: 'Michal Hocko', 'Johannes Weiner'
  Cc: 'Minchan Kim', 'Mel Gorman',
	linux-mm, 'LKML', 'Michal Hocko'

On Tuesday, January 17, 2017 3:33 AM Michal Hocko wrote: 
> 
> From: Michal Hocko <mhocko@suse.com>
> 
> lruvec_lru_size returns the full size of the LRU list, while we sometimes
> need a value restricted only to the eligible zones (e.g. for lowmem
> requests). inactive_list_is_low is one such user. Later patches will add
> more of them. Add a new parameter to lruvec_lru_size and allow it to
> filter out zones which are not eligible for the given context.
> 
> Acked-by: Johannes Weiner <hannes@cmpxchg.org>
> Signed-off-by: Michal Hocko <mhocko@suse.com>
> ---
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com> 


* Re: [PATCH 2/3] mm, vmscan: consider eligible zones in get_scan_count
  2017-01-16 19:33   ` [PATCH 2/3] mm, vmscan: consider eligible zones in get_scan_count Michal Hocko
@ 2017-01-17  3:42     ` Hillf Danton
  0 siblings, 0 replies; 12+ messages in thread
From: Hillf Danton @ 2017-01-17  3:42 UTC (permalink / raw)
  To: 'Michal Hocko', 'Johannes Weiner'
  Cc: 'Minchan Kim', 'Mel Gorman',
	linux-mm, 'LKML', 'Michal Hocko'


On Tuesday, January 17, 2017 3:33 AM Michal Hocko wrote: 
> 
> From: Michal Hocko <mhocko@suse.com>
> 
> get_scan_count considers the whole node LRU size when
> - doing SCAN_FILE due to many page cache inactive pages
> - calculating the number of pages to scan
> 
> In both cases this might lead to unexpected behavior, especially on 32b
> systems where we can expect lowmem memory pressure very often.
> 
> A large highmem zone can easily distort the SCAN_FILE heuristic because
> there might be only a few file pages from the eligible zones on the node
> LRU and we would still enforce file LRU scanning, which can lead to
> thrashing while we could still scan anonymous pages.
> 
> The latter use of lruvec_lru_size can be problematic as well, especially
> when there are not many pages from the eligible zones. We would have to
> skip over many pages to find anything to reclaim, but shrink_node_memcg
> would only reduce the remaining number to scan by SWAP_CLUSTER_MAX at
> most. Therefore we can end up going over a large LRU many times without
> actually having a chance to reclaim much, if anything at all. The closer
> we are to running out of memory in the lowmem zones, the worse the
> problem will be.
> 
> Fix this by filtering out all the ineligible zones when calculating the
> LRU size for both paths and considering only sc->reclaim_idx zones.
> 
> Acked-by: Minchan Kim <minchan@kernel.org>
> Signed-off-by: Michal Hocko <mhocko@suse.com>
> ---
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com> 


* Re: [PATCH 3/3] Reverted "mm: bail out in shrink_inactive_list()"
  2017-01-16 19:33   ` [PATCH 3/3] Reverted "mm: bail out in shrink_inactive_list()" Michal Hocko
@ 2017-01-17  3:58     ` Hillf Danton
  2017-01-17  6:58     ` Minchan Kim
  1 sibling, 0 replies; 12+ messages in thread
From: Hillf Danton @ 2017-01-17  3:58 UTC (permalink / raw)
  To: 'Michal Hocko', 'Johannes Weiner'
  Cc: 'Minchan Kim', 'Mel Gorman',
	linux-mm, 'LKML', 'Michal Hocko'


On Tuesday, January 17, 2017 3:33 AM Michal Hocko wrote: 
> 
> From: Michal Hocko <mhocko@suse.com>
> 
> This reverts commit 91dcade47a3d0e7c31464ef05f56c08e92a0e9c2.
> inactive_reclaimable_pages shouldn't be needed anymore now that
> get_scan_count is aware of the eligible zones ("mm, vmscan: consider
> eligible zones in get_scan_count").
> 
Looks radical ;)

> Signed-off-by: Michal Hocko <mhocko@suse.com>
> ---
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com> 

>  mm/vmscan.c | 27 ---------------------------
>  1 file changed, 27 deletions(-)
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index a88e222784ea..486ba6d7dc4c 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1700,30 +1700,6 @@ static int current_may_throttle(void)
>  		bdi_write_congested(current->backing_dev_info);
>  }
> 
> -static bool inactive_reclaimable_pages(struct lruvec *lruvec,
> -				struct scan_control *sc, enum lru_list lru)
> -{
> -	int zid;
> -	struct zone *zone;
> -	int file = is_file_lru(lru);
> -	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
> -
> -	if (!global_reclaim(sc))
> -		return true;
> -
> -	for (zid = sc->reclaim_idx; zid >= 0; zid--) {
> -		zone = &pgdat->node_zones[zid];
> -		if (!managed_zone(zone))
> -			continue;
> -
> -		if (zone_page_state_snapshot(zone, NR_ZONE_LRU_BASE +
> -				LRU_FILE * file) >= SWAP_CLUSTER_MAX)
> -			return true;
> -	}
> -
> -	return false;
> -}
> -
>  /*
>   * shrink_inactive_list() is a helper for shrink_node().  It returns the number
>   * of reclaimed pages
> @@ -1742,9 +1718,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
>  	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>  	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
> 
> -	if (!inactive_reclaimable_pages(lruvec, sc, lru))
> -		return 0;
> -
>  	while (unlikely(too_many_isolated(pgdat, file, sc))) {
>  		congestion_wait(BLK_RW_ASYNC, HZ/10);
> 
> --
> 2.11.0


* Re: [PATCH 1/3] mm, vmscan: cleanup lru size calculations
  2017-01-16 19:33 ` [PATCH 1/3] mm, vmscan: cleanup lru size calculations Michal Hocko
                     ` (2 preceding siblings ...)
  2017-01-17  3:40   ` [PATCH 1/3] mm, vmscan: cleanup lru size calculations Hillf Danton
@ 2017-01-17  6:58   ` Minchan Kim
  3 siblings, 0 replies; 12+ messages in thread
From: Minchan Kim @ 2017-01-17  6:58 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Johannes Weiner, Mel Gorman, Hillf Danton, linux-mm, LKML, Michal Hocko

On Mon, Jan 16, 2017 at 08:33:15PM +0100, Michal Hocko wrote:
> From: Michal Hocko <mhocko@suse.com>
> 
> lruvec_lru_size returns the full size of the LRU list, while we sometimes
> need a value restricted only to the eligible zones (e.g. for lowmem
> requests). inactive_list_is_low is one such user. Later patches will add
> more of them. Add a new parameter to lruvec_lru_size and allow it to
> filter out zones which are not eligible for the given context.
> 
> Acked-by: Johannes Weiner <hannes@cmpxchg.org>
> Signed-off-by: Michal Hocko <mhocko@suse.com>
> ---
>  include/linux/mmzone.h |  2 +-
>  mm/vmscan.c            | 88 ++++++++++++++++++++++++--------------------------
>  mm/workingset.c        |  2 +-
>  3 files changed, 45 insertions(+), 47 deletions(-)
> 
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index d1d440cff60e..91f69aa0d581 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -780,7 +780,7 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
>  #endif
>  }
>  
> -extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru);
> +extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx);
>  
>  #ifdef CONFIG_HAVE_MEMORY_PRESENT
>  void memory_present(int nid, unsigned long start, unsigned long end);
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index cf940af609fd..1cb0ebdef305 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -234,22 +234,38 @@ bool pgdat_reclaimable(struct pglist_data *pgdat)
>  		pgdat_reclaimable_pages(pgdat) * 6;
>  }
>  
> -unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
> +/** lruvec_lru_size -  Returns the number of pages on the given LRU list.

minor:

/*
 * lruvec_lru_size

I don't have any preference, I just noticed it.
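
IOW with the opener on its own line, something along the lines of the
usual kernel-doc layout (just a sketch):

/**
 * lruvec_lru_size - Returns the number of pages on the given LRU list.
 * @lruvec: lru vector
 * @lru: lru to use
 * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
 */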

Otherwise,
Acked-by: Minchan Kim <minchan@kernel.org>


* Re: [PATCH 3/3] Reverted "mm: bail out in shrink_inactive_list()"
  2017-01-16 19:33   ` [PATCH 3/3] Reverted "mm: bail out in shrink_inactive_list()" Michal Hocko
  2017-01-17  3:58     ` Hillf Danton
@ 2017-01-17  6:58     ` Minchan Kim
  1 sibling, 0 replies; 12+ messages in thread
From: Minchan Kim @ 2017-01-17  6:58 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Johannes Weiner, Mel Gorman, Hillf Danton, linux-mm, LKML, Michal Hocko

On Mon, Jan 16, 2017 at 08:33:17PM +0100, Michal Hocko wrote:
> From: Michal Hocko <mhocko@suse.com>
> 
> This reverts commit 91dcade47a3d0e7c31464ef05f56c08e92a0e9c2.
> inactive_reclaimable_pages shouldn't be needed anymore now that
> get_scan_count is aware of the eligible zones ("mm, vmscan: consider
> eligible zones in get_scan_count").
> 
> Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Minchan Kim <minchan@kernel.org>


* Re: [PATCH 2/3] mm, vmscan: consider eligible zones in get_scan_count
  2017-02-06  8:10   ` Michal Hocko
@ 2017-02-06 23:40     ` Andrew Morton
  0 siblings, 0 replies; 12+ messages in thread
From: Andrew Morton @ 2017-02-06 23:40 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Johannes Weiner, Mel Gorman, Minchan Kim, Hillf Danton, linux-mm,
	LKML, Trevor Cordes

On Mon, 6 Feb 2017 09:10:07 +0100 Michal Hocko <mhocko@kernel.org> wrote:

> Hi Andrew,
> it turned out that this is not a theoretical issue after all. Trevor
> (added to the CC) was seeing pre-mature OOM killer triggering [1]
> bisected to b2e18757f2c9 ("mm, vmscan: begin reclaiming pages on a
> per-node basis").
> After some going back and forth it turned out that b4536f0c829c ("mm,
> memcg: fix the active list aging for lowmem requests when memcg is
> enabled") helped a lot but it wasn't sufficient on its own. We also
> need this patch to make the oom behavior stable again. So I suggest
> backporting this to stable as well. Could you update the changelog as
> follows?
> 
> The patch would need to be tweaked a bit to apply to 4.10 and older
> but I will do that as soon as it hits the Linus tree in the next merge
> window.
> 
> ...
>
> Fixes: b2e18757f2c9 ("mm, vmscan: begin reclaiming pages on a per-node basis")
> Cc: stable # 4.8+
> Tested-by: Trevor Cordes <trevor@tecnopolis.ca>

No probs.


* Re: [PATCH 2/3] mm, vmscan: consider eligible zones in get_scan_count
  2017-01-17 10:37 ` [PATCH 2/3] mm, vmscan: consider eligible zones in get_scan_count Michal Hocko
  2017-01-18 16:46   ` Johannes Weiner
@ 2017-02-06  8:10   ` Michal Hocko
  2017-02-06 23:40     ` Andrew Morton
  1 sibling, 1 reply; 12+ messages in thread
From: Michal Hocko @ 2017-02-06  8:10 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Johannes Weiner, Mel Gorman, Minchan Kim, Hillf Danton, linux-mm,
	LKML, Trevor Cordes

Hi Andrew,
it turned out that this is not a theoretical issue after all. Trevor
(added to the CC) was seeing premature OOM killer triggering [1],
bisected to b2e18757f2c9 ("mm, vmscan: begin reclaiming pages on a
per-node basis").
After some going back and forth it turned out that b4536f0c829c ("mm,
memcg: fix the active list aging for lowmem requests when memcg is
enabled") helped a lot but it wasn't sufficient on its own. We also
need this patch to make the OOM behavior stable again. So I suggest
backporting this to stable as well. Could you update the changelog as
follows?

The patch would need to be tweaked a bit to apply to 4.10 and older
but I will do that as soon as it hits the Linus tree in the next merge
window.

[1] http://lkml.kernel.org/r/20170111103243.GA27795@pog.tecnopolis.ca

On Tue 17-01-17 11:37:01, Michal Hocko wrote:
> From: Michal Hocko <mhocko@suse.com>
> 
> get_scan_count considers the whole node LRU size when
> - doing SCAN_FILE due to many page cache inactive pages
> - calculating the number of pages to scan
> 
> In both cases this might lead to unexpected behavior, especially on 32b
> systems where we can expect lowmem memory pressure very often.
> 
> A large highmem zone can easily distort the SCAN_FILE heuristic because
> there might be only a few file pages from the eligible zones on the node
> LRU and we would still enforce file LRU scanning, which can lead to
> thrashing while we could still scan anonymous pages.
> 
> The latter use of lruvec_lru_size can be problematic as well, especially
> when there are not many pages from the eligible zones. We would have to
> skip over many pages to find anything to reclaim, but shrink_node_memcg
> would only reduce the remaining number to scan by SWAP_CLUSTER_MAX at
> most. Therefore we can end up going over a large LRU many times without
> actually having a chance to reclaim much, if anything at all. The closer
> we are to running out of memory in the lowmem zones, the worse the
> problem will be.
> 
> Fix this by filtering out all the ineligible zones when calculating the
> LRU size for both paths and considering only sc->reclaim_idx zones.
> 

Fixes: b2e18757f2c9 ("mm, vmscan: begin reclaiming pages on a per-node basis")
Cc: stable # 4.8+
Tested-by: Trevor Cordes <trevor@tecnopolis.ca>

> Acked-by: Minchan Kim <minchan@kernel.org>
> Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
> Signed-off-by: Michal Hocko <mhocko@suse.com>
> ---
>  mm/vmscan.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index aed39dc272c0..ffac8fa7bdd8 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2235,7 +2235,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
>  	 * system is under heavy pressure.
>  	 */
>  	if (!inactive_list_is_low(lruvec, true, sc, false) &&
> -	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES) >> sc->priority) {
> +	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
>  		scan_balance = SCAN_FILE;
>  		goto out;
>  	}
> @@ -2302,7 +2302,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
>  			unsigned long size;
>  			unsigned long scan;
>  
> -			size = lruvec_lru_size(lruvec, lru, MAX_NR_ZONES);
> +			size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
>  			scan = size >> sc->priority;
>  
>  			if (!scan && pass && force_scan)
> -- 
> 2.11.0
> 

-- 
Michal Hocko
SUSE Labs


* Re: [PATCH 2/3] mm, vmscan: consider eligible zones in get_scan_count
  2017-01-17 10:37 ` [PATCH 2/3] mm, vmscan: consider eligible zones in get_scan_count Michal Hocko
@ 2017-01-18 16:46   ` Johannes Weiner
  2017-02-06  8:10   ` Michal Hocko
  1 sibling, 0 replies; 12+ messages in thread
From: Johannes Weiner @ 2017-01-18 16:46 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Andrew Morton, Mel Gorman, Minchan Kim, Hillf Danton, linux-mm,
	LKML, Michal Hocko

On Tue, Jan 17, 2017 at 11:37:01AM +0100, Michal Hocko wrote:
> From: Michal Hocko <mhocko@suse.com>
> 
> get_scan_count considers the whole node LRU size when
> - doing SCAN_FILE due to many page cache inactive pages
> - calculating the number of pages to scan
> 
> In both cases this might lead to unexpected behavior, especially on 32b
> systems where we can expect lowmem memory pressure very often.
> 
> A large highmem zone can easily distort the SCAN_FILE heuristic because
> there might be only a few file pages from the eligible zones on the node
> LRU and we would still enforce file LRU scanning, which can lead to
> thrashing while we could still scan anonymous pages.
> 
> The latter use of lruvec_lru_size can be problematic as well, especially
> when there are not many pages from the eligible zones. We would have to
> skip over many pages to find anything to reclaim, but shrink_node_memcg
> would only reduce the remaining number to scan by SWAP_CLUSTER_MAX at
> most. Therefore we can end up going over a large LRU many times without
> actually having a chance to reclaim much, if anything at all. The closer
> we are to running out of memory in the lowmem zones, the worse the
> problem will be.
> 
> Fix this by filtering out all the ineligible zones when calculating the
> LRU size for both paths and considering only sc->reclaim_idx zones.
> 
> Acked-by: Minchan Kim <minchan@kernel.org>
> Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
> Signed-off-by: Michal Hocko <mhocko@suse.com>

Acked-by: Johannes Weiner <hannes@cmpxchg.org>


* [PATCH 2/3] mm, vmscan: consider eligible zones in get_scan_count
  2017-01-17 10:36 [PATCH 0/3] follow up nodereclaim for 32b fix Michal Hocko
@ 2017-01-17 10:37 ` Michal Hocko
  2017-01-18 16:46   ` Johannes Weiner
  2017-02-06  8:10   ` Michal Hocko
  0 siblings, 2 replies; 12+ messages in thread
From: Michal Hocko @ 2017-01-17 10:37 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Johannes Weiner, Mel Gorman, Minchan Kim, Hillf Danton, linux-mm,
	LKML, Michal Hocko

From: Michal Hocko <mhocko@suse.com>

get_scan_count considers the whole node LRU size when
- doing SCAN_FILE due to many page cache inactive pages
- calculating the number of pages to scan

In both cases this might lead to unexpected behavior, especially on 32b
systems where we can expect lowmem memory pressure very often.

A large highmem zone can easily distort the SCAN_FILE heuristic because
there might be only a few file pages from the eligible zones on the node
LRU and we would still enforce file LRU scanning, which can lead to
thrashing while we could still scan anonymous pages.

The latter use of lruvec_lru_size can be problematic as well, especially
when there are not many pages from the eligible zones. We would have to
skip over many pages to find anything to reclaim, but shrink_node_memcg
would only reduce the remaining number to scan by SWAP_CLUSTER_MAX at
most. Therefore we can end up going over a large LRU many times without
actually having a chance to reclaim much, if anything at all. The closer
we are to running out of memory in the lowmem zones, the worse the
problem will be.

Fix this by filtering out all the ineligible zones when calculating the
LRU size for both paths and considering only sc->reclaim_idx zones.

Acked-by: Minchan Kim <minchan@kernel.org>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Signed-off-by: Michal Hocko <mhocko@suse.com>
---
 mm/vmscan.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index aed39dc272c0..ffac8fa7bdd8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2235,7 +2235,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	 * system is under heavy pressure.
 	 */
 	if (!inactive_list_is_low(lruvec, true, sc, false) &&
-	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES) >> sc->priority) {
+	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
 		scan_balance = SCAN_FILE;
 		goto out;
 	}
@@ -2302,7 +2302,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 			unsigned long size;
 			unsigned long scan;
 
-			size = lruvec_lru_size(lruvec, lru, MAX_NR_ZONES);
+			size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
 			scan = size >> sc->priority;
 
 			if (!scan && pass && force_scan)
-- 
2.11.0



Thread overview: 12+ messages
     [not found] <20170116160123.GB30300@cmpxchg.org>
2017-01-16 19:33 ` [PATCH 1/3] mm, vmscan: cleanup lru size calculations Michal Hocko
2017-01-16 19:33   ` [PATCH 2/3] mm, vmscan: consider eligible zones in get_scan_count Michal Hocko
2017-01-17  3:42     ` Hillf Danton
2017-01-16 19:33   ` [PATCH 3/3] Reverted "mm: bail out in shrink_inactive_list()" Michal Hocko
2017-01-17  3:58     ` Hillf Danton
2017-01-17  6:58     ` Minchan Kim
2017-01-17  3:40   ` [PATCH 1/3] mm, vmscan: cleanup lru size calculations Hillf Danton
2017-01-17  6:58   ` Minchan Kim
2017-01-17 10:36 [PATCH 0/3] follow up nodereclaim for 32b fix Michal Hocko
2017-01-17 10:37 ` [PATCH 2/3] mm, vmscan: consider eligible zones in get_scan_count Michal Hocko
2017-01-18 16:46   ` Johannes Weiner
2017-02-06  8:10   ` Michal Hocko
2017-02-06 23:40     ` Andrew Morton
