From: Vladimir Davydov <vdavydov@parallels.com>
To: Johannes Weiner <hannes@cmpxchg.org>
Cc: Andrew Morton <akpm@linux-foundation.org>,
	Dave Chinner <david@fromorbit.com>, <linux-mm@kvack.org>,
	<linux-kernel@vger.kernel.org>
Subject: Re: [patch] mm: vmscan: invoke slab shrinkers from shrink_zone()
Date: Fri, 28 Nov 2014 19:06:37 +0300	[thread overview]
Message-ID: <20141128160637.GH6948@esperanza> (raw)
In-Reply-To: <1416939830-20289-1-git-send-email-hannes@cmpxchg.org>

Hi Johannes,

The patch generally looks good to me: it simplifies the code flow
significantly and makes it easier for me to introduce per-memcg slab
reclaim (thanks!). However, it has one serious flaw. Please see the
comment inline.

On Tue, Nov 25, 2014 at 01:23:50PM -0500, Johannes Weiner wrote:
[...]
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index a384339bf718..8c2b45bfe610 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
[...]
> @@ -2376,12 +2407,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
>  	struct zone *zone;
>  	unsigned long nr_soft_reclaimed;
>  	unsigned long nr_soft_scanned;
> -	unsigned long lru_pages = 0;
> -	struct reclaim_state *reclaim_state = current->reclaim_state;
>  	gfp_t orig_mask;
> -	struct shrink_control shrink = {
> -		.gfp_mask = sc->gfp_mask,
> -	};
>  	enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
>  	bool reclaimable = false;
>  
> @@ -2394,10 +2420,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
>  	if (buffer_heads_over_limit)
>  		sc->gfp_mask |= __GFP_HIGHMEM;
>  
> -	nodes_clear(shrink.nodes_to_scan);
> -
>  	for_each_zone_zonelist_nodemask(zone, z, zonelist,
> -					gfp_zone(sc->gfp_mask), sc->nodemask) {
> +					requested_highidx, sc->nodemask) {
>  		if (!populated_zone(zone))
>  			continue;
>  		/*
> @@ -2409,9 +2433,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
>  						 GFP_KERNEL | __GFP_HARDWALL))
>  				continue;
>  
> -			lru_pages += zone_reclaimable_pages(zone);
> -			node_set(zone_to_nid(zone), shrink.nodes_to_scan);
> -
>  			if (sc->priority != DEF_PRIORITY &&
>  			    !zone_reclaimable(zone))
>  				continue;	/* Let kswapd poll it */
> @@ -2450,7 +2471,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
>  			/* need some check for avoid more shrink_zone() */
>  		}
>  
> -		if (shrink_zone(zone, sc))
> +		if (shrink_zone(zone, sc, zone_idx(zone) == requested_highidx))
>  			reclaimable = true;
>  
>  		if (global_reclaim(sc) &&

If the highest zone (zone_idx == requested_highidx) is not populated, we
won't scan slab caches on direct reclaim at all, which may result in an
OOM kill even when plenty of freeable dentries are available.

This is especially relevant for VMs, which often have less than 4G of
RAM: on x86_64 only ZONE_DMA and ZONE_DMA32 are then populated, while
ZONE_NORMAL is empty.
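
To make this concrete, here is the code path with illustrative numbers
(the 2G VM configuration is an assumption, picked to show the problem):

	/*
	 * Hypothetical 2G x86_64 VM: only ZONE_DMA and ZONE_DMA32 are
	 * populated; ZONE_NORMAL exists but is empty.
	 */
	requested_highidx = gfp_zone(GFP_KERNEL);	/* == ZONE_NORMAL */

	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					requested_highidx, sc->nodemask) {
		if (!populated_zone(zone))
			continue;	/* skips the empty ZONE_NORMAL */
		...
		/*
		 * zone is ZONE_DMA or ZONE_DMA32 here, so the condition
		 * below is always false and the slab caches are never
		 * shrunk on this path:
		 */
		if (shrink_zone(zone, sc, zone_idx(zone) == requested_highidx))
			reclaimable = true;
	}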

What about distributing the pressure across zones proportionally to
their number of present pages? Something like this:

From 5face0e29300950575bf9ccbd995828e2f183da1 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Fri, 28 Nov 2014 17:58:43 +0300
Subject: [PATCH] vmscan: fix slab vs page cache reclaim balance

Though page cache reclaim is done per-zone, the finest granularity for
slab cache reclaim is per-node. To put proportional pressure on both, we
therefore shrink slab caches only when scanning the class zone, which is
the highest zone suitable for the allocation. However, the class zone
may be empty: e.g. ZONE_NORMAL/ZONE_HIGHMEM, the class zones for most
allocations, are empty on x86_64 with less than 4G of RAM. This results
in the slab caches being scanned only by kswapd, which in turn may lead
to a premature OOM kill.

This patch attempts to fix this by calling shrink_node_slabs() for each
eligible zone, distributing the pressure between zones proportionally to
their number of present pages.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9130cf67bac1..dd80625a1be5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2298,8 +2298,7 @@ static inline bool should_continue_reclaim(struct zone *zone,
 	}
 }
 
-static bool shrink_zone(struct zone *zone, struct scan_control *sc,
-			bool is_classzone)
+static bool shrink_zone(struct zone *zone, struct scan_control *sc)
 {
 	unsigned long nr_reclaimed, nr_scanned;
 	bool reclaimable = false;
@@ -2310,7 +2309,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
 			.zone = zone,
 			.priority = sc->priority,
 		};
-		unsigned long zone_lru_pages = 0;
+		unsigned long nr_eligible = 0;
 		struct mem_cgroup *memcg;
 
 		nr_reclaimed = sc->nr_reclaimed;
@@ -2319,6 +2318,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
 		memcg = mem_cgroup_iter(root, NULL, &reclaim);
 		do {
 			unsigned long lru_pages;
+			unsigned long long tmp;
 			struct lruvec *lruvec;
 			int swappiness;
 
@@ -2326,7 +2326,17 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
 			swappiness = mem_cgroup_swappiness(memcg);
 
 			shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
-			zone_lru_pages += lru_pages;
+
+			/*
+			 * Scale lru_pages inversely to the zone size so
+			 * that slab caches, which are zone-unaware, do
+			 * not get over-reclaimed on behalf of small zones.
+			 */
+			tmp = lru_pages;
+			tmp *= zone->zone_pgdat->node_present_pages;
+			do_div(tmp, zone->present_pages);
+
+			nr_eligible += tmp;
 
 			/*
 			 * Direct reclaim and kswapd have to scan all memory
@@ -2350,12 +2360,12 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
 		 * Shrink the slab caches in the same proportion that
 		 * the eligible LRU pages were scanned.
 		 */
-		if (global_reclaim(sc) && is_classzone) {
+		if (global_reclaim(sc)) {
 			struct reclaim_state *reclaim_state;
 
 			shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone),
 					  sc->nr_scanned - nr_scanned,
-					  zone_lru_pages);
+					  nr_eligible);
 
 			reclaim_state = current->reclaim_state;
 			if (reclaim_state) {
@@ -2503,7 +2513,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 			/* need some check for avoid more shrink_zone() */
 		}
 
-		if (shrink_zone(zone, sc, zone_idx(zone) == requested_highidx))
+		if (shrink_zone(zone, sc))
 			reclaimable = true;
 
 		if (global_reclaim(sc) &&
@@ -3010,7 +3020,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
 						balance_gap, classzone_idx))
 		return true;
 
-	shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
+	shrink_zone(zone, sc);
 
 	/* Account for the number of pages attempted to reclaim */
 	*nr_attempted += sc->nr_to_reclaim;
@@ -3656,7 +3666,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		 * priorities until we have enough memory freed.
 		 */
 		do {
-			shrink_zone(zone, &sc, true);
+			shrink_zone(zone, &sc);
 		} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
 	}
 

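To see how the scaling plays out, take a hypothetical 2G node (the
numbers below are illustrative, not measured):

	node_present_pages = 524288			/* 2G in 4K pages */

	ZONE_DMA:   present_pages =   4096  =>  nr_eligible += lru_pages * 128
	ZONE_DMA32: present_pages = 520192  =>  nr_eligible += lru_pages * ~1.008

shrink_node_slabs() applies pressure proportional to nr_scanned /
nr_eligible, so LRU scanning in ZONE_DMA, which holds 1/128 of the
node's pages, contributes only 1/128 of the slab pressure that the same
amount of scanning would in a node-sized zone. Summed over all populated
zones, the slab caches end up seeing roughly the same relative pressure
as the node's LRU pages, no matter how the memory is split between
zones.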