From: Mel Gorman <mgorman@techsingularity.net>
To: Andrew Morton <akpm@linux-foundation.org>, Linux-MM <linux-mm@kvack.org>
Cc: Rik van Riel <riel@surriel.com>, Vlastimil Babka <vbabka@suse.cz>,
	Johannes Weiner <hannes@cmpxchg.org>,
	LKML <linux-kernel@vger.kernel.org>,
	Mel Gorman <mgorman@techsingularity.net>
Subject: [PATCH 04/27] mm, vmscan: Begin reclaiming pages on a per-node basis
Date: Thu,  9 Jun 2016 19:04:20 +0100
Message-ID: <1465495483-11855-5-git-send-email-mgorman@techsingularity.net>
In-Reply-To: <1465495483-11855-1-git-send-email-mgorman@techsingularity.net>

This patch makes reclaim decisions on a per-node basis. A reclaimer knows
what zone is required by the allocation request and skips pages from
higher zones. In many cases this will be ok because it's a GFP_HIGHMEM
request of some description. On 64-bit, ZONE_DMA32 requests will cause
some problems but 32-bit devices on 64-bit platforms are increasingly
rare. Historically it would have been a major problem on 32-bit with big
Highmem:Lowmem ratios but such configurations are also now rare and even
where they exist, they are not encouraged. If it really becomes a
problem, it'll manifest as very low reclaim efficiencies.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 mm/vmscan.c | 72 ++++++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 47 insertions(+), 25 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index f87a5a0f8793..ab1b28e7e20a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -84,6 +84,9 @@ struct scan_control {
 	/* Scan (total_size >> priority) pages at once */
 	int priority;
 
+	/* The highest zone to isolate pages for reclaim from */
+	enum zone_type reclaim_idx;
+
 	unsigned int may_writepage:1;
 
 	/* Can mapped pages be reclaimed? */
@@ -1369,6 +1372,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 	struct list_head *src = &lruvec->lists[lru];
 	unsigned long nr_taken = 0;
 	unsigned long scan;
+	LIST_HEAD(pages_skipped);
 
 	for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
 					!list_empty(src); scan++) {
@@ -1379,6 +1383,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 
 		VM_BUG_ON_PAGE(!PageLRU(page), page);
 
+		if (page_zonenum(page) > sc->reclaim_idx) {
+			list_move(&page->lru, &pages_skipped);
+			continue;
+		}
+
 		switch (__isolate_lru_page(page, mode)) {
 		case 0:
 			nr_taken += hpage_nr_pages(page);
@@ -1395,6 +1404,15 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 		}
 	}
 
+	/*
+	 * Splice any skipped pages to the start of the LRU list. Note that
+	 * this disrupts the LRU order when reclaiming for lower zones but
+	 * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
+	 * scanning would soon rescan the same pages to skip and put the
+	 * system at risk of premature OOM.
+	 */
+	if (!list_empty(&pages_skipped))
+		list_splice(&pages_skipped, src);
 	*nr_scanned = scan;
 	trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
 				    nr_taken, mode, is_file_lru(lru));
@@ -1557,7 +1575,7 @@ static int current_may_throttle(void)
 }
 
 /*
- * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
+ * shrink_inactive_list() is a helper for shrink_node().  It returns the number
  * of reclaimed pages
  */
 static noinline_for_stack unsigned long
@@ -2371,12 +2389,13 @@ static inline bool should_continue_reclaim(struct zone *zone,
 	}
 }
 
-static bool shrink_zone(struct zone *zone, struct scan_control *sc,
-			bool is_classzone)
+static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc,
+			enum zone_type classzone_idx)
 {
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long nr_reclaimed, nr_scanned;
 	bool reclaimable = false;
+	struct zone *zone = &pgdat->node_zones[classzone_idx];
 
 	do {
 		struct mem_cgroup *root = sc->target_mem_cgroup;
@@ -2408,7 +2427,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
 			shrink_zone_memcg(zone, memcg, sc, &lru_pages);
 			zone_lru_pages += lru_pages;
 
-			if (memcg && is_classzone)
+			if (!global_reclaim(sc) && sc->reclaim_idx == classzone_idx)
 				shrink_slab(sc->gfp_mask, zone_to_nid(zone),
 					    memcg, sc->nr_scanned - scanned,
 					    lru_pages);
@@ -2439,7 +2458,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
 		 * Shrink the slab caches in the same proportion that
 		 * the eligible LRU pages were scanned.
 		 */
-		if (global_reclaim(sc) && is_classzone)
+		if (global_reclaim(sc) && sc->reclaim_idx == classzone_idx)
 			shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL,
 				    sc->nr_scanned - nr_scanned,
 				    zone_lru_pages);
@@ -2516,14 +2535,14 @@ static inline bool compaction_ready(struct zone *zone, int order, int classzone_
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
  */
-static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
+static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc,
+			 enum zone_type classzone_idx)
 {
 	struct zoneref *z;
 	struct zone *zone;
 	unsigned long nr_soft_reclaimed;
 	unsigned long nr_soft_scanned;
 	gfp_t orig_mask;
-	enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
 
 	/*
 	 * If the number of buffer_heads in the machine exceeds the maximum
@@ -2536,15 +2555,15 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 					gfp_zone(sc->gfp_mask), sc->nodemask) {
-		enum zone_type classzone_idx;
-
 		if (!populated_zone(zone))
 			continue;
 
-		classzone_idx = requested_highidx;
 		while (!populated_zone(zone->zone_pgdat->node_zones +
-							classzone_idx))
+							classzone_idx)) {
+			sc->reclaim_idx--;
 			classzone_idx--;
+			continue;
+		}
 
 		/*
 		 * Take care memory controller reclaiming has small influence
@@ -2570,8 +2589,8 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 			 */
 			if (IS_ENABLED(CONFIG_COMPACTION) &&
 			    sc->order > PAGE_ALLOC_COSTLY_ORDER &&
-			    zonelist_zone_idx(z) <= requested_highidx &&
-			    compaction_ready(zone, sc->order, requested_highidx)) {
+			    zonelist_zone_idx(z) <= classzone_idx &&
+			    compaction_ready(zone, sc->order, classzone_idx)) {
 				sc->compaction_ready = true;
 				continue;
 			}
@@ -2591,7 +2610,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 			/* need some check for avoid more shrink_zone() */
 		}
 
-		shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
+		shrink_node(zone->zone_pgdat, sc, classzone_idx);
 	}
 
 	/*
@@ -2623,6 +2642,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 	int initial_priority = sc->priority;
 	unsigned long total_scanned = 0;
 	unsigned long writeback_threshold;
+	enum zone_type classzone_idx = gfp_zone(sc->gfp_mask);
 
 retry:
 	delayacct_freepages_start();
@@ -2633,7 +2653,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
 				sc->priority);
 		sc->nr_scanned = 0;
-		shrink_zones(zonelist, sc);
+		shrink_zones(zonelist, sc, classzone_idx);
 
 		total_scanned += sc->nr_scanned;
 		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
@@ -3088,7 +3108,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
 						balance_gap, classzone_idx))
 		return true;
 
-	shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
+	shrink_node(zone->zone_pgdat, sc, classzone_idx);
 
 	/* TODO: ANOMALY */
 	clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
@@ -3137,6 +3157,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 	unsigned long nr_soft_scanned;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
+		.reclaim_idx = MAX_NR_ZONES - 1,
 		.order = order,
 		.priority = DEF_PRIORITY,
 		.may_writepage = !laptop_mode,
@@ -3207,15 +3228,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 			sc.may_writepage = 1;
 
 		/*
-		 * Now scan the zone in the dma->highmem direction, stopping
-		 * at the last zone which needs scanning.
-		 *
-		 * We do this because the page allocator works in the opposite
-		 * direction.  This prevents the page allocator from allocating
-		 * pages behind kswapd's direction of progress, which would
-		 * cause too much scanning of the lower zones.
+		 * Continue scanning in the highmem->dma direction stopping at
+		 * the last zone which needs scanning. This may reclaim lowmem
+		 * pages that are not necessary for zone balancing but it
+		 * preserves LRU ordering. It is assumed that the bulk of
+		 * allocation requests can use arbitrary zones with the
+		 * possible exception of big highmem:lowmem configurations.
 		 */
-		for (i = 0; i <= end_zone; i++) {
+		for (i = end_zone; i >= end_zone; i--) {
 			struct zone *zone = pgdat->node_zones + i;
 
 			if (!populated_zone(zone))
@@ -3226,6 +3246,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 				continue;
 
 			sc.nr_scanned = 0;
+			sc.reclaim_idx = i;
 			nr_soft_scanned = 0;
 
 			/*
@@ -3674,6 +3695,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
 		.may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP),
 		.may_swap = 1,
+		.reclaim_idx = zone_idx(zone),
 	};
 
 	cond_resched();
@@ -3693,7 +3715,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	 * priorities until we have enough memory freed.
 	 */
 	do {
-		shrink_zone(zone, &sc, true);
+		shrink_node(zone->zone_pgdat, &sc, zone_idx(zone));
 	} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
 }
 
-- 
2.6.4
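For readers following the review, the core mechanism of this patch -- skip
pages belonging to zones above sc->reclaim_idx during isolation, then splice
the skipped pages back to the *head* of the LRU -- can be modelled outside
the kernel. The program below is a minimal standalone userspace sketch, not
kernel code: struct fake_page, pop(), push(), the zone values and the
single-linked list are invented stand-ins for struct page, list_move(),
list_splice() and page_zonenum().

/*
 * Userspace sketch of the skip-and-splice behaviour isolate_lru_pages()
 * gains in this patch. All names here are illustrative stand-ins.
 */
#include <stdio.h>

struct fake_page {
	int zone;			/* stand-in for page_zonenum(page) */
	struct fake_page *next;
};

/* Pop from head of a list. */
static struct fake_page *pop(struct fake_page **head)
{
	struct fake_page *p = *head;
	if (p)
		*head = p->next;
	return p;
}

/* Push to head of a list (models list_move() to another list head). */
static void push(struct fake_page **head, struct fake_page *p)
{
	p->next = *head;
	*head = p;
}

int main(void)
{
	/* LRU holding pages from zones 0..3; reclaim_idx limits us to <= 1. */
	struct fake_page pages[6] = { {3}, {0}, {2}, {1}, {3}, {0} };
	struct fake_page *lru = NULL, *skipped = NULL, *taken = NULL, *p;
	int reclaim_idx = 1, nr_taken = 0;
	int i;

	for (i = 5; i >= 0; i--)
		push(&lru, &pages[i]);

	/* Scan: isolate eligible pages, set ineligible ones aside. */
	while ((p = pop(&lru))) {
		if (p->zone > reclaim_idx) {
			/* list_move(&page->lru, &pages_skipped) */
			push(&skipped, p);
			continue;
		}
		push(&taken, p);
		nr_taken++;
	}

	/*
	 * Splice the skipped pages back to the head of the LRU, as the
	 * patch does. Splicing to the tail instead would make the next
	 * SWAP_CLUSTER_MAX scan rescan and re-skip the same ineligible
	 * pages, risking a premature OOM.
	 */
	while ((p = pop(&skipped)))
		push(&lru, p);

	printf("isolated %d pages from zones <= %d\n", nr_taken, reclaim_idx);
	return 0;
}

Run as-is, this isolates the three pages from zones 0 and 1 and leaves the
zone 2 and 3 pages at the head of the list, which is exactly the property
the new comment in isolate_lru_pages() relies on, at the cost of the LRU
order disruption it also documents.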