linux-kernel.vger.kernel.org archive mirror
* [patch 1/2] mm, zone: track number of pages in free area by migratetype
@ 2016-11-17  1:32 David Rientjes
  2016-11-17  1:32 ` [patch 2/2] mm, compaction: avoid async compaction if most free memory is ineligible David Rientjes
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: David Rientjes @ 2016-11-17  1:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Vlastimil Babka, Michal Hocko, Joonsoo Kim,
	linux-kernel, linux-mm

Each zone's free_area tracks the number of free pages for all free lists.
This does not allow the number of free pages for a specific migratetype
to be determined without iterating its free list.
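
For reference, the only way to obtain such a count today is to walk the
free list, which is roughly what reading /proc/pagetypeinfo does (sketch
only, using the same names as the existing code):

	struct free_area *area = &zone->free_area[order];
	struct list_head *curr;
	unsigned long freecount = 0;

	list_for_each(curr, &area->free_list[mtype])
		freecount++;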

An upcoming change will use this information to preclude doing async
memory compaction when the amount of free memory in MIGRATE_MOVABLE
pageblocks is below a certain threshold.

The total number of free pages is still tracked so that zone_watermark_ok()
does not become more expensive.  As a side effect, reading
/proc/pagetypeinfo is faster.

This patch introduces no functional change and increases the amount of
per-zone metadata at worst by 48 bytes per memory zone (when CONFIG_CMA
and CONFIG_MEMORY_ISOLATION are enabled).

Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/mmzone.h |  3 ++-
 mm/compaction.c        |  4 ++--
 mm/page_alloc.c        | 47 ++++++++++++++++++++++++++++-------------------
 mm/vmstat.c            | 18 +++++-------------
 4 files changed, 37 insertions(+), 35 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -89,7 +89,8 @@ extern int page_group_by_mobility_disabled;
 
 struct free_area {
 	struct list_head	free_list[MIGRATE_TYPES];
-	unsigned long		nr_free;
+	unsigned long		nr_free[MIGRATE_TYPES];
+	unsigned long		total_free;
 };
 
 struct pglist_data;
diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1320,13 +1320,13 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_
 		bool can_steal;
 
 		/* Job done if page is free of the right migratetype */
-		if (!list_empty(&area->free_list[migratetype]))
+		if (area->nr_free[migratetype])
 			return COMPACT_SUCCESS;
 
 #ifdef CONFIG_CMA
 		/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
 		if (migratetype == MIGRATE_MOVABLE &&
-			!list_empty(&area->free_list[MIGRATE_CMA]))
+						area->nr_free[MIGRATE_CMA])
 			return COMPACT_SUCCESS;
 #endif
 		/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -821,7 +821,8 @@ static inline void __free_one_page(struct page *page,
 			clear_page_guard(zone, buddy, order, migratetype);
 		} else {
 			list_del(&buddy->lru);
-			zone->free_area[order].nr_free--;
+			zone->free_area[order].nr_free[migratetype]--;
+			zone->free_area[order].total_free--;
 			rmv_page_order(buddy);
 		}
 		combined_idx = buddy_idx & page_idx;
@@ -880,7 +881,8 @@ static inline void __free_one_page(struct page *page,
 
 	list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
 out:
-	zone->free_area[order].nr_free++;
+	zone->free_area[order].nr_free[migratetype]++;
+	zone->free_area[order].total_free++;
 }
 
 /*
@@ -1648,7 +1650,8 @@ static inline void expand(struct zone *zone, struct page *page,
 			continue;
 
 		list_add(&page[size].lru, &area->free_list[migratetype]);
-		area->nr_free++;
+		area->nr_free[migratetype]++;
+		area->total_free++;
 		set_page_order(&page[size], high);
 	}
 }
@@ -1802,7 +1805,8 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
 			continue;
 		list_del(&page->lru);
 		rmv_page_order(page);
-		area->nr_free--;
+		area->nr_free[migratetype]--;
+		area->total_free--;
 		expand(zone, page, order, current_order, area, migratetype);
 		set_pcppage_migratetype(page, migratetype);
 		return page;
@@ -1991,7 +1995,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
 	int i;
 	int fallback_mt;
 
-	if (area->nr_free == 0)
+	if (!area->total_free)
 		return -1;
 
 	*can_steal = false;
@@ -2000,7 +2004,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
 		if (fallback_mt == MIGRATE_TYPES)
 			break;
 
-		if (list_empty(&area->free_list[fallback_mt]))
+		if (!area->nr_free[fallback_mt])
 			continue;
 
 		if (can_steal_fallback(order, migratetype))
@@ -2163,7 +2167,8 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
 			steal_suitable_fallback(zone, page, start_migratetype);
 
 		/* Remove the page from the freelists */
-		area->nr_free--;
+		area->nr_free[fallback_mt]--;
+		area->total_free--;
 		list_del(&page->lru);
 		rmv_page_order(page);
 
@@ -2549,7 +2554,8 @@ int __isolate_free_page(struct page *page, unsigned int order)
 
 	/* Remove page from free list */
 	list_del(&page->lru);
-	zone->free_area[order].nr_free--;
+	zone->free_area[order].nr_free[mt]--;
+	zone->free_area[order].total_free--;
 	rmv_page_order(page);
 
 	/*
@@ -2808,22 +2814,19 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
 		struct free_area *area = &z->free_area[o];
 		int mt;
 
-		if (!area->nr_free)
+		if (!area->total_free)
 			continue;
 
 		if (alloc_harder)
 			return true;
 
-		for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
-			if (!list_empty(&area->free_list[mt]))
+		for (mt = 0; mt < MIGRATE_PCPTYPES; mt++)
+			if (area->nr_free[mt])
 				return true;
-		}
 
 #ifdef CONFIG_CMA
-		if ((alloc_flags & ALLOC_CMA) &&
-		    !list_empty(&area->free_list[MIGRATE_CMA])) {
+		if ((alloc_flags & ALLOC_CMA) && area->nr_free[MIGRATE_CMA])
 			return true;
-		}
 #endif
 	}
 	return false;
@@ -4431,12 +4434,12 @@ void show_free_areas(unsigned int filter)
 			struct free_area *area = &zone->free_area[order];
 			int type;
 
-			nr[order] = area->nr_free;
+			nr[order] = area->total_free;
 			total += nr[order] << order;
 
 			types[order] = 0;
 			for (type = 0; type < MIGRATE_TYPES; type++) {
-				if (!list_empty(&area->free_list[type]))
+				if (area->nr_free[type])
 					types[order] |= 1 << type;
 			}
 		}
@@ -5100,8 +5103,10 @@ static void __meminit zone_init_free_lists(struct zone *zone)
 	unsigned int order, t;
 	for_each_migratetype_order(order, t) {
 		INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
-		zone->free_area[order].nr_free = 0;
+		zone->free_area[order].nr_free[t] = 0;
 	}
+	for (order = 0; order < MAX_ORDER; order++)
+		zone->free_area[order].total_free = 0;
 }
 
 #ifndef __HAVE_ARCH_MEMMAP_INIT
@@ -7416,6 +7421,8 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 	spin_lock_irqsave(&zone->lock, flags);
 	pfn = start_pfn;
 	while (pfn < end_pfn) {
+		int migratetype;
+
 		if (!pfn_valid(pfn)) {
 			pfn++;
 			continue;
@@ -7438,9 +7445,11 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 		pr_info("remove from free list %lx %d %lx\n",
 			pfn, 1 << order, end_pfn);
 #endif
+		migratetype = get_pageblock_migratetype(page);
 		list_del(&page->lru);
 		rmv_page_order(page);
-		zone->free_area[order].nr_free--;
+		zone->free_area[order].nr_free[migratetype]--;
+		zone->free_area[order].total_free--;
 		for (i = 0; i < (1 << order); i++)
 			SetPageReserved((page+i));
 		pfn += (1 << order);
diff --git a/mm/vmstat.c b/mm/vmstat.c
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -846,7 +846,7 @@ static void fill_contig_page_info(struct zone *zone,
 		unsigned long blocks;
 
 		/* Count number of free blocks */
-		blocks = zone->free_area[order].nr_free;
+		blocks = zone->free_area[order].total_free;
 		info->free_blocks_total += blocks;
 
 		/* Count free base pages */
@@ -1146,7 +1146,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
 
 	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
 	for (order = 0; order < MAX_ORDER; ++order)
-		seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
+		seq_printf(m, "%6lu ", zone->free_area[order].total_free);
 	seq_putc(m, '\n');
 }
 
@@ -1170,17 +1170,9 @@ static void pagetypeinfo_showfree_print(struct seq_file *m,
 					pgdat->node_id,
 					zone->name,
 					migratetype_names[mtype]);
-		for (order = 0; order < MAX_ORDER; ++order) {
-			unsigned long freecount = 0;
-			struct free_area *area;
-			struct list_head *curr;
-
-			area = &(zone->free_area[order]);
-
-			list_for_each(curr, &area->free_list[mtype])
-				freecount++;
-			seq_printf(m, "%6lu ", freecount);
-		}
+		for (order = 0; order < MAX_ORDER; ++order)
+			seq_printf(m, "%6lu ",
+				   zone->free_area[order].nr_free[mtype]);
 		seq_putc(m, '\n');
 	}
 }


* [patch 2/2] mm, compaction: avoid async compaction if most free memory is ineligible
  2016-11-17  1:32 [patch 1/2] mm, zone: track number of pages in free area by migratetype David Rientjes
@ 2016-11-17  1:32 ` David Rientjes
  2016-11-17 17:04 ` [patch 1/2] mm, zone: track number of pages in free area by migratetype Vlastimil Babka
  2016-11-30  0:16 ` [patch v2 1/2] mm, zone: track number of movable free pages David Rientjes
  2 siblings, 0 replies; 8+ messages in thread
From: David Rientjes @ 2016-11-17  1:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Vlastimil Babka, Michal Hocko, Joonsoo Kim,
	linux-kernel, linux-mm

Memory compaction will only migrate memory to MIGRATE_MOVABLE pageblocks
for asynchronous compaction.

If most free memory on the system is not eligible for migration in this
context, isolate_freepages() can take an extreme amount of time trying to
find a free page.  For example, we have encountered the following
scenario many times, specifically due to slab fragmentation:

Free pages count per migrate type at order       0      1      2      3      4      5      6      7      8      9     10 
Node    0, zone   Normal, type    Unmovable  40000   3778      2      0      0      0      0      0      0      0      0 
Node    0, zone   Normal, type  Reclaimable     11      6      0      0      0      0      0      0      0      0      0 
Node    0, zone   Normal, type      Movable      1      1      0      0      0      0      0      0      0      0      0 
Node    0, zone   Normal, type      Reserve      0      0      0      0      0      0      0      0      0      0      0

The compaction freeing scanner will end up scanning this entire zone,
perhaps finding no memory free and terminating compaction after pages
have already been isolated for migration.  It is unnecessary to even
start async compaction in a scenario where free memory cannot be
isolated as a migration target.

This patch does not deem async compaction to be suitable when 1/64th or
less of free memory is from MIGRATE_MOVABLE pageblocks.  This heuristic is
somewhat arbitrarily defined, but in the example above it would easily
trigger before async compaction becomes very expensive.
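
To illustrate with the example above: the Unmovable lists hold roughly
40000 + 3778*2 + 2*4 = 47564 free base pages while the Movable lists hold
only 1 + 1*2 = 3, so far less than 1/64th (~744 pages) of the ~47.6k free
pages is movable and async compaction would be skipped immediately.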

It would also be possible to check zone watermarks in
__compaction_suitable() using the amount of MIGRATE_MOVABLE memory as
an alternative.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 fs/buffer.c                |  2 +-
 include/linux/compaction.h |  8 ++++----
 include/linux/swap.h       |  3 ++-
 mm/compaction.c            | 49 +++++++++++++++++++++++++++++++++++++---------
 mm/internal.h              |  1 +
 mm/page_alloc.c            | 15 +++++++-------
 mm/vmscan.c                | 20 ++++++++++++++++---
 7 files changed, 73 insertions(+), 25 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -268,7 +268,7 @@ static void free_more_memory(void)
 						gfp_zone(GFP_NOFS), NULL);
 		if (z->zone)
 			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
-						GFP_NOFS, NULL);
+					  GFP_NOFS, MIN_COMPACT_PRIORITY, NULL);
 	}
 }
 
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -97,7 +97,7 @@ extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
 		const struct alloc_context *ac, enum compact_priority prio);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern enum compact_result compaction_suitable(struct zone *zone, int order,
-		unsigned int alloc_flags, int classzone_idx);
+		unsigned int alloc_flags, int classzone_idx, bool sync);
 
 extern void defer_compaction(struct zone *zone, int order);
 extern bool compaction_deferred(struct zone *zone, int order);
@@ -171,7 +171,7 @@ static inline bool compaction_withdrawn(enum compact_result result)
 
 
 bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
-					int alloc_flags);
+				  int alloc_flags, enum compact_priority prio);
 
 extern int kcompactd_run(int nid);
 extern void kcompactd_stop(int nid);
@@ -182,8 +182,8 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat)
 {
 }
 
-static inline enum compact_result compaction_suitable(struct zone *zone, int order,
-					int alloc_flags, int classzone_idx)
+static inline enum compact_result compaction_suitable(struct zone *zone,
+		int order, int alloc_flags, int classzone_idx, bool sync)
 {
 	return COMPACT_SKIPPED;
 }
diff --git a/include/linux/swap.h b/include/linux/swap.h
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -11,6 +11,7 @@
 #include <linux/fs.h>
 #include <linux/atomic.h>
 #include <linux/page-flags.h>
+#include <linux/compaction.h>
 #include <asm/page.h>
 
 struct notifier_block;
@@ -315,7 +316,7 @@ extern void lru_cache_add_active_or_unevictable(struct page *page,
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
 extern unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat);
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
-					gfp_t gfp_mask, nodemask_t *mask);
+		gfp_t gfp_mask, enum compact_priority prio, nodemask_t *mask);
 extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 						  unsigned long nr_pages,
diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1365,7 +1365,7 @@ static enum compact_result compact_finished(struct zone *zone,
 static enum compact_result __compaction_suitable(struct zone *zone, int order,
 					unsigned int alloc_flags,
 					int classzone_idx,
-					unsigned long wmark_target)
+					unsigned long wmark_target, bool sync)
 {
 	unsigned long watermark;
 
@@ -1402,18 +1402,46 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
 						ALLOC_CMA, wmark_target))
 		return COMPACT_SKIPPED;
 
+	if (!sync) {
+		unsigned long nr_movable_free_pages = 0;
+		unsigned long nr_free_pages;
+		int i;
+
+		for (i = 0; i < MAX_ORDER; i++) {
+			nr_movable_free_pages +=
+				zone->free_area[i].nr_free[MIGRATE_MOVABLE];
+#ifdef CONFIG_CMA
+			nr_movable_free_pages +=
+				zone->free_area[i].nr_free[MIGRATE_CMA];
+#endif
+		}
+		nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
+#ifdef CONFIG_CMA
+		nr_free_pages += zone_page_state(zone, NR_FREE_CMA_PAGES);
+#endif
+		/*
+		 * Page migration can only migrate pages to MIGRATE_MOVABLE or
+		 * MIGRATE_CMA pageblocks for async compaction.  If the amount
+		 * of ineligible pageblocks substantially outweighs the amount
+		 * of eligible pageblocks, do not attempt compaction since it
+		 * will be unnecessarily expensive.
+		 */
+		if (nr_movable_free_pages <= (nr_free_pages / 64))
+			return COMPACT_SKIPPED;
+	}
+
 	return COMPACT_CONTINUE;
 }
 
 enum compact_result compaction_suitable(struct zone *zone, int order,
 					unsigned int alloc_flags,
-					int classzone_idx)
+					int classzone_idx, bool sync)
 {
 	enum compact_result ret;
 	int fragindex;
 
 	ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx,
-				    zone_page_state(zone, NR_FREE_PAGES));
+				    zone_page_state(zone, NR_FREE_PAGES), sync);
 	/*
 	 * fragmentation index determines if allocation failures are due to
 	 * low memory or external fragmentation
@@ -1444,7 +1472,7 @@ enum compact_result compaction_suitable(struct zone *zone, int order,
 }
 
 bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
-		int alloc_flags)
+				  int alloc_flags, enum compact_priority prio)
 {
 	struct zone *zone;
 	struct zoneref *z;
@@ -1467,7 +1495,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
 		available = zone_reclaimable_pages(zone) / order;
 		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
 		compact_result = __compaction_suitable(zone, order, alloc_flags,
-				ac_classzone_idx(ac), available);
+				ac_classzone_idx(ac), available, prio);
 		if (compact_result != COMPACT_SKIPPED)
 			return true;
 	}
@@ -1484,7 +1512,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
 	const bool sync = cc->mode != MIGRATE_ASYNC;
 
 	ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
-							cc->classzone_idx);
+				  cc->classzone_idx, sync);
 	/* Compaction is likely to fail */
 	if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
 		return ret;
@@ -1860,13 +1888,16 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
 	enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx;
 
 	for (zoneid = 0; zoneid <= classzone_idx; zoneid++) {
+		enum compact_result result;
+
 		zone = &pgdat->node_zones[zoneid];
 
 		if (!populated_zone(zone))
 			continue;
 
-		if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
-					classzone_idx) == COMPACT_CONTINUE)
+		result = compaction_suitable(zone, pgdat->kcompactd_max_order,
+					     0, classzone_idx, true);
+		if (result == COMPACT_CONTINUE)
 			return true;
 	}
 
@@ -1903,7 +1934,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
 		if (compaction_deferred(zone, cc.order))
 			continue;
 
-		if (compaction_suitable(zone, cc.order, 0, zoneid) !=
+		if (compaction_suitable(zone, cc.order, 0, zoneid, true) !=
 							COMPACT_CONTINUE)
 			continue;
 
diff --git a/mm/internal.h b/mm/internal.h
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -451,6 +451,7 @@ extern unsigned long  __must_check vm_mmap_pgoff(struct file *, unsigned long,
 
 extern void set_pageblock_order(void);
 unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+					    enum migrate_mode mode,
 					    struct list_head *page_list);
 /* The ALLOC_WMARK bits are used as an index to zone->watermark */
 #define ALLOC_WMARK_MIN		WMARK_MIN
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3217,7 +3217,8 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
 	 * compaction.
 	 */
 	if (compaction_withdrawn(compact_result))
-		return compaction_zonelist_suitable(ac, order, alloc_flags);
+		return compaction_zonelist_suitable(ac, order, alloc_flags,
+						    *compact_priority);
 
 	/*
 	 * !costly requests are much more important than __GFP_REPEAT
@@ -3287,7 +3288,7 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla
 /* Perform direct synchronous page reclaim */
 static int
 __perform_reclaim(gfp_t gfp_mask, unsigned int order,
-					const struct alloc_context *ac)
+		  const struct alloc_context *ac, enum compact_priority prio)
 {
 	struct reclaim_state reclaim_state;
 	int progress;
@@ -3301,7 +3302,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 	reclaim_state.reclaimed_slab = 0;
 	current->reclaim_state = &reclaim_state;
 
-	progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
+	progress = try_to_free_pages(ac->zonelist, order, gfp_mask, prio,
 								ac->nodemask);
 
 	current->reclaim_state = NULL;
@@ -3317,12 +3318,12 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 static inline struct page *
 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 		unsigned int alloc_flags, const struct alloc_context *ac,
-		unsigned long *did_some_progress)
+		enum compact_priority prio, unsigned long *did_some_progress)
 {
 	struct page *page = NULL;
 	bool drained = false;
 
-	*did_some_progress = __perform_reclaim(gfp_mask, order, ac);
+	*did_some_progress = __perform_reclaim(gfp_mask, order, ac, prio);
 	if (unlikely(!(*did_some_progress)))
 		return NULL;
 
@@ -3666,7 +3667,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 
 	/* Try direct reclaim and then allocating */
 	page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
-							&did_some_progress);
+					compact_priority, &did_some_progress);
 	if (page)
 		goto got_pg;
 
@@ -7191,7 +7192,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 			break;
 		}
 
-		nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
+		nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, cc->mode,
 							&cc->migratepages);
 		cc->nr_migratepages -= nr_reclaimed;
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -84,6 +84,8 @@ struct scan_control {
 	/* Scan (total_size >> priority) pages at once */
 	int priority;
 
+	enum compact_priority compact_priority;
+
 	/* The highest zone to isolate pages for reclaim from */
 	enum zone_type reclaim_idx;
 
@@ -1275,11 +1277,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 }
 
 unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+					    enum migrate_mode mode,
 					    struct list_head *page_list)
 {
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
 		.priority = DEF_PRIORITY,
+		.compact_priority = mode == MIGRATE_ASYNC ?
+				    COMPACT_PRIO_ASYNC :
+				    COMPACT_PRIO_SYNC_LIGHT,
 		.may_unmap = 1,
 	};
 	unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5;
@@ -2500,7 +2506,8 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
 		if (!managed_zone(zone))
 			continue;
 
-		switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
+		switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx,
+					    sc->compact_priority)) {
 		case COMPACT_SUCCESS:
 		case COMPACT_CONTINUE:
 			return false;
@@ -2613,7 +2620,8 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
 	unsigned long watermark;
 	enum compact_result suitable;
 
-	suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
+	suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx,
+				       sc->compact_priority);
 	if (suitable == COMPACT_SUCCESS)
 		/* Allocation should succeed already. Don't reclaim. */
 		return true;
@@ -2942,7 +2950,8 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
 }
 
 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
-				gfp_t gfp_mask, nodemask_t *nodemask)
+				gfp_t gfp_mask, enum compact_priority prio,
+				nodemask_t *nodemask)
 {
 	unsigned long nr_reclaimed;
 	struct scan_control sc = {
@@ -2952,6 +2961,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.order = order,
 		.nodemask = nodemask,
 		.priority = DEF_PRIORITY,
+		.compact_priority = prio,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = 1,
@@ -3032,6 +3042,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.reclaim_idx = MAX_NR_ZONES - 1,
 		.target_mem_cgroup = memcg,
 		.priority = DEF_PRIORITY,
+		.compact_priority = DEF_COMPACT_PRIORITY,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = may_swap,
@@ -3203,6 +3214,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		.gfp_mask = GFP_KERNEL,
 		.order = order,
 		.priority = DEF_PRIORITY,
+		.compact_priority = DEF_COMPACT_PRIORITY,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = 1,
@@ -3536,6 +3548,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 		.gfp_mask = GFP_HIGHUSER_MOVABLE,
 		.reclaim_idx = MAX_NR_ZONES - 1,
 		.priority = DEF_PRIORITY,
+		.compact_priority = DEF_COMPACT_PRIORITY,
 		.may_writepage = 1,
 		.may_unmap = 1,
 		.may_swap = 1,
@@ -3724,6 +3737,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
 		.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
 		.order = order,
 		.priority = NODE_RECLAIM_PRIORITY,
+		.compact_priority = DEF_COMPACT_PRIORITY,
 		.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
 		.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
 		.may_swap = 1,


* Re: [patch 1/2] mm, zone: track number of pages in free area by migratetype
  2016-11-17  1:32 [patch 1/2] mm, zone: track number of pages in free area by migratetype David Rientjes
  2016-11-17  1:32 ` [patch 2/2] mm, compaction: avoid async compaction if most free memory is ineligible David Rientjes
@ 2016-11-17 17:04 ` Vlastimil Babka
  2016-11-17 22:11   ` David Rientjes
  2016-11-30  0:16 ` [patch v2 1/2] mm, zone: track number of movable free pages David Rientjes
  2 siblings, 1 reply; 8+ messages in thread
From: Vlastimil Babka @ 2016-11-17 17:04 UTC (permalink / raw)
  To: David Rientjes, Andrew Morton
  Cc: Mel Gorman, Michal Hocko, Joonsoo Kim, linux-kernel, linux-mm

On 11/17/2016 02:32 AM, David Rientjes wrote:
> Each zone's free_area tracks the number of free pages for all free lists.
> This does not allow the number of free pages for a specific migratetype
> to be determined without iterating its free list.
> 
> An upcoming change will use this information to preclude doing async
> memory compaction when the amount of free memory in MIGRATE_MOVABLE
> pageblocks is below a certain threshold.
> 
> The total number of free pages is still tracked so that zone_watermark_ok()
> does not become more expensive.  As a side effect, reading
> /proc/pagetypeinfo is faster.

Yeah I've already seen a case with /proc/pagetypeinfo causing soft
lockups due to high number of iterations...

> This patch introduces no functional change and increases the amount of
> per-zone metadata at worst by 48 bytes per memory zone (when CONFIG_CMA
> and CONFIG_MEMORY_ISOLATION are enabled).

Isn't it 48 bytes per zone and order?

> Signed-off-by: David Rientjes <rientjes@google.com>

I'd be for this if there are no performance regressions. It affects hot
paths and increases cache footprint. I think at least some allocator
intensive microbenchmark should be used.

Vlastimil


* Re: [patch 1/2] mm, zone: track number of pages in free area by migratetype
  2016-11-17 17:04 ` [patch 1/2] mm, zone: track number of pages in free area by migratetype Vlastimil Babka
@ 2016-11-17 22:11   ` David Rientjes
  2016-11-18 20:58     ` Vlastimil Babka
  0 siblings, 1 reply; 8+ messages in thread
From: David Rientjes @ 2016-11-17 22:11 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, Mel Gorman, Michal Hocko, Joonsoo Kim,
	linux-kernel, linux-mm

On Thu, 17 Nov 2016, Vlastimil Babka wrote:

> > The total number of free pages is still tracked so that zone_watermark_ok()
> > does not become more expensive.  As a side effect, reading
> > /proc/pagetypeinfo is faster.
> 
> Yeah I've already seen a case with /proc/pagetypeinfo causing soft
> lockups due to high number of iterations...
> 

Thanks for taking a look at the patchset!

Wow, I haven't seen /proc/pagetypeinfo soft lockups yet, I thought this 
was a relatively minor point :)  But it looks like we need some 
improvement in this behavior independent of memory compaction anyway.

> > This patch introduces no functional change and increases the amount of
> > per-zone metadata at worst by 48 bytes per memory zone (when CONFIG_CMA
> > and CONFIG_MEMORY_ISOLATION are enabled).
> 
> Isn't it 48 bytes per zone and order?
> 

Yes, sorry, I'll fix that in v2.  I think less than half a kilobyte for 
each memory zone is satisfactory for extra tracking, compaction 
improvements, and optimized /proc/pagetypeinfo, though.

> > Signed-off-by: David Rientjes <rientjes@google.com>
> 
> I'd be for this if there are no performance regressions. It affects hot
> paths and increases cache footprint. I think at least some allocator
> intensive microbenchmark should be used.
> 

I can easily implement a test to stress movable page allocations from 
fallback MIGRATE_UNMOVABLE pageblocks and freeing back to the same 
pageblocks.  I assume we're not interested in memory offline benchmarks.

What do you think about the logic presented in patch 2/2?  Are you 
comfortable with a hard-coded ratio such as 1/64th of free memory or would 
you prefer to look at the zone's watermark with the number of free pages 
from MIGRATE_MOVABLE pageblocks rather than NR_FREE_PAGES?  I was split 
between the two options.


* Re: [patch 1/2] mm, zone: track number of pages in free area by migratetype
  2016-11-17 22:11   ` David Rientjes
@ 2016-11-18 20:58     ` Vlastimil Babka
  0 siblings, 0 replies; 8+ messages in thread
From: Vlastimil Babka @ 2016-11-18 20:58 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Mel Gorman, Michal Hocko, Joonsoo Kim,
	linux-kernel, linux-mm

On 11/17/2016 11:11 PM, David Rientjes wrote:
> On Thu, 17 Nov 2016, Vlastimil Babka wrote:
> 
>>> The total number of free pages is still tracked so that zone_watermark_ok()
>>> does not become more expensive.  As a side effect, reading
>>> /proc/pagetypeinfo is faster.
>>
>> Yeah I've already seen a case with /proc/pagetypeinfo causing soft
>> lockups due to high number of iterations...
>>
> 
> Thanks for taking a look at the patchset!
> 
> Wow, I haven't seen /proc/pagetypeinfo soft lockups yet, I thought this 
> was a relatively minor point :)

Well to be honest, it was a system misconfigured with numa=off which
made the lists both longer and more numa-distant. But nevertheless, we
might get there. It's not nice when userspace can so easily trigger long
iterations under the zone/node lock...

> But it looks like we need some 
> improvement in this behavior independent of memory compaction anyway.

Yeah.

>>> This patch introduces no functional change and increases the amount of
>>> per-zone metadata at worst by 48 bytes per memory zone (when CONFIG_CMA
>>> and CONFIG_MEMORY_ISOLATION are enabled).
>>
>> Isn't it 48 bytes per zone and order?
>>
> 
> Yes, sorry, I'll fix that in v2.  I think less than half a kilobyte for 
> each memory zone is satisfactory for extra tracking, compaction 
> improvements, and optimized /proc/pagetypeinfo, though.

I'm not worried about memory usage, but perhaps cache usage.

>>> Signed-off-by: David Rientjes <rientjes@google.com>
>>
>> I'd be for this if there are no performance regressions. It affects hot
>> paths and increases cache footprint. I think at least some allocator
>> intensive microbenchmark should be used.
>>
> 
> I can easily implement a test to stress movable page allocations from 
> fallback MIGRATE_UNMOVABLE pageblocks and freeing back to the same 
> pageblocks.  I assume we're not interested in memory offline benchmarks.

I meant just allocation benchmarks to see how much the extra operations
and cache footprint matter.

> What do you think about the logic presented in patch 2/2?  Are you 
> comfortable with a hard-coded ratio such as 1/64th of free memory or would 
> you prefer to look at the zone's watermark with the number of free pages 
> from MIGRATE_MOVABLE pageblocks rather than NR_FREE_PAGES?  I was split 
> between the two options.

The second option makes more sense to me intuitively as it resembles
what we've been doing until now. Maybe just don't require such a large
gap as compaction_suitable does?



* [patch v2 1/2] mm, zone: track number of movable free pages
  2016-11-17  1:32 [patch 1/2] mm, zone: track number of pages in free area by migratetype David Rientjes
  2016-11-17  1:32 ` [patch 2/2] mm, compaction: avoid async compaction if most free memory is ineligible David Rientjes
  2016-11-17 17:04 ` [patch 1/2] mm, zone: track number of pages in free area by migratetype Vlastimil Babka
@ 2016-11-30  0:16 ` David Rientjes
  2016-11-30  0:16   ` [patch v2 2/2] mm, compaction: avoid async compaction if most free memory is ineligible David Rientjes
  2016-11-30  7:34   ` [patch v2 1/2] mm, zone: track number of movable free pages Vlastimil Babka
  2 siblings, 2 replies; 8+ messages in thread
From: David Rientjes @ 2016-11-30  0:16 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Vlastimil Babka, Michal Hocko, Joonsoo Kim,
	linux-kernel, linux-mm

An upcoming compaction change will need the number of movable free pages
per zone to determine if async compaction will become unnecessarily
expensive.

This patch introduces no functional change or increased memory footprint.
It simply tracks the number of free movable pages as a subset of the
total number of free pages.  This is exported to userspace as part of a
new /proc/vmstat field.
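
As a rough sketch (not part of this patch), a later consumer could then
compare the movable subset against the total without walking any free
lists:

	/* Sketch only: read the new counter without touching free lists */
	unsigned long nr_free = zone_page_state(zone, NR_FREE_PAGES);
	unsigned long nr_movable = zone_page_state(zone, NR_FREE_MOVABLE_PAGES);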

Signed-off-by: David Rientjes <rientjes@google.com>
---
 v2: do not track free pages per migratetype since page allocator stress
     testing reveals this tracking can impact workloads and there is no
     substantial benefit when thp is disabled.  This is because entire
     pageblocks can be converted to new migratetypes, which requires
     iterating the free_areas in the hot paths for proper tracking.

 include/linux/mmzone.h | 1 +
 include/linux/vmstat.h | 2 ++
 mm/page_alloc.c        | 8 +++++++-
 mm/vmstat.c            | 1 +
 4 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -138,6 +138,7 @@ enum zone_stat_item {
 	NUMA_OTHER,		/* allocation from other node */
 #endif
 	NR_FREE_CMA_PAGES,
+	NR_FREE_MOVABLE_PAGES,
 	NR_VM_ZONE_STAT_ITEMS };
 
 enum node_stat_item {
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -347,6 +347,8 @@ static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages,
 	__mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);
 	if (is_migrate_cma(migratetype))
 		__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
+	if (migratetype == MIGRATE_MOVABLE)
+		__mod_zone_page_state(zone, NR_FREE_MOVABLE_PAGES, nr_pages);
 }
 
 extern const char * const vmstat_text[];
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2197,6 +2197,8 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 	spin_lock(&zone->lock);
 	for (i = 0; i < count; ++i) {
 		struct page *page = __rmqueue(zone, order, migratetype);
+		int mt;
+
 		if (unlikely(page == NULL))
 			break;
 
@@ -2217,9 +2219,13 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		else
 			list_add_tail(&page->lru, list);
 		list = &page->lru;
-		if (is_migrate_cma(get_pcppage_migratetype(page)))
+		mt = get_pcppage_migratetype(page);
+		if (is_migrate_cma(mt))
 			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
 					      -(1 << order));
+		if (mt == MIGRATE_MOVABLE)
+			__mod_zone_page_state(zone, NR_FREE_MOVABLE_PAGES,
+					      -(1 << order));
 	}
 	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
 	spin_unlock(&zone->lock);
diff --git a/mm/vmstat.c b/mm/vmstat.c
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -945,6 +945,7 @@ const char * const vmstat_text[] = {
 	"numa_other",
 #endif
 	"nr_free_cma",
+	"nr_free_movable",
 
 	/* Node-based counters */
 	"nr_inactive_anon",


* [patch v2 2/2] mm, compaction: avoid async compaction if most free memory is ineligible
  2016-11-30  0:16 ` [patch v2 1/2] mm, zone: track number of movable free pages David Rientjes
@ 2016-11-30  0:16   ` David Rientjes
  2016-11-30  7:34   ` [patch v2 1/2] mm, zone: track number of movable free pages Vlastimil Babka
  1 sibling, 0 replies; 8+ messages in thread
From: David Rientjes @ 2016-11-30  0:16 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Vlastimil Babka, Michal Hocko, Joonsoo Kim,
	linux-kernel, linux-mm

Memory compaction will only migrate memory to MIGRATE_MOVABLE pageblocks
for asynchronous compaction.

If most free memory on the system is not eligible for migration in this
context, isolate_freepages() can take an extreme amount of time trying to
find a free page.  For example, we have encountered the following
scenario many times, specifically due to slab fragmentation:

Free pages count per migrate type at order       0      1      2      3      4      5      6      7      8      9     10 
Node    0, zone   Normal, type    Unmovable  40000   3778      2      0      0      0      0      0      0      0      0 
Node    0, zone   Normal, type  Reclaimable     11      6      0      0      0      0      0      0      0      0      0 
Node    0, zone   Normal, type      Movable      1      1      0      0      0      0      0      0      0      0      0 
Node    0, zone   Normal, type      Reserve      0      0      0      0      0      0      0      0      0      0      0

The compaction freeing scanner will end up scanning this entire zone,
perhaps finding no memory free and terminating compaction after pages
have already been isolated for migration.  It is unnecessary to even
start async compaction in a scenario where free memory cannot be
isolated as a migration target.

This patch does not deem async compaction to be suitable when the
watermark check fails using only the amount of free movable memory.
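
In the pagetypeinfo example above, the Movable lists hold only a few free
base pages, which cannot satisfy a low watermark on any reasonably sized
zone, so the check would skip async compaction before any scanning starts.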

Signed-off-by: David Rientjes <rientjes@google.com>
---
 v2: convert to per-zone watermark check

 fs/buffer.c                |  2 +-
 include/linux/compaction.h |  8 ++++----
 include/linux/swap.h       |  3 ++-
 mm/compaction.c            | 37 ++++++++++++++++++++++++++++---------
 mm/internal.h              |  1 +
 mm/page_alloc.c            | 15 ++++++++-------
 mm/vmscan.c                | 20 +++++++++++++++++---
 7 files changed, 61 insertions(+), 25 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -268,7 +268,7 @@ static void free_more_memory(void)
 						gfp_zone(GFP_NOFS), NULL);
 		if (z->zone)
 			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
-						GFP_NOFS, NULL);
+					  GFP_NOFS, MIN_COMPACT_PRIORITY, NULL);
 	}
 }
 
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -97,7 +97,7 @@ extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
 		const struct alloc_context *ac, enum compact_priority prio);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern enum compact_result compaction_suitable(struct zone *zone, int order,
-		unsigned int alloc_flags, int classzone_idx);
+		unsigned int alloc_flags, int classzone_idx, bool sync);
 
 extern void defer_compaction(struct zone *zone, int order);
 extern bool compaction_deferred(struct zone *zone, int order);
@@ -171,7 +171,7 @@ static inline bool compaction_withdrawn(enum compact_result result)
 
 
 bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
-					int alloc_flags);
+				  int alloc_flags, enum compact_priority prio);
 
 extern int kcompactd_run(int nid);
 extern void kcompactd_stop(int nid);
@@ -182,8 +182,8 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat)
 {
 }
 
-static inline enum compact_result compaction_suitable(struct zone *zone, int order,
-					int alloc_flags, int classzone_idx)
+static inline enum compact_result compaction_suitable(struct zone *zone,
+		int order, int alloc_flags, int classzone_idx, bool sync)
 {
 	return COMPACT_SKIPPED;
 }
diff --git a/include/linux/swap.h b/include/linux/swap.h
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -11,6 +11,7 @@
 #include <linux/fs.h>
 #include <linux/atomic.h>
 #include <linux/page-flags.h>
+#include <linux/compaction.h>
 #include <asm/page.h>
 
 struct notifier_block;
@@ -315,7 +316,7 @@ extern void lru_cache_add_active_or_unevictable(struct page *page,
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
 extern unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat);
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
-					gfp_t gfp_mask, nodemask_t *mask);
+		gfp_t gfp_mask, enum compact_priority prio, nodemask_t *mask);
 extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 						  unsigned long nr_pages,
diff --git a/mm/compaction.c b/mm/compaction.c
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1377,7 +1377,7 @@ static enum compact_result compact_finished(struct zone *zone,
 static enum compact_result __compaction_suitable(struct zone *zone, int order,
 					unsigned int alloc_flags,
 					int classzone_idx,
-					unsigned long wmark_target)
+					unsigned long wmark_target, bool sync)
 {
 	unsigned long watermark;
 
@@ -1414,18 +1414,34 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
 						ALLOC_CMA, wmark_target))
 		return COMPACT_SKIPPED;
 
+	if (!sync) {
+		unsigned long free;
+
+		free = zone_page_state(zone, NR_FREE_CMA_PAGES) +
+		       zone_page_state(zone, NR_FREE_MOVABLE_PAGES);
+		/*
+		 * Page migration can only migrate pages to MIGRATE_MOVABLE or
+		 * MIGRATE_CMA pageblocks for async compaction.  If there is
+		 * insufficient free target memory, do not attempt compaction
+		 * since free scanning will become unnecessarily expensive.
+		 */
+		if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
+					 ALLOC_CMA, free))
+			return COMPACT_SKIPPED;
+	}
+
 	return COMPACT_CONTINUE;
 }
 
 enum compact_result compaction_suitable(struct zone *zone, int order,
 					unsigned int alloc_flags,
-					int classzone_idx)
+					int classzone_idx, bool sync)
 {
 	enum compact_result ret;
 	int fragindex;
 
 	ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx,
-				    zone_page_state(zone, NR_FREE_PAGES));
+				    zone_page_state(zone, NR_FREE_PAGES), sync);
 	/*
 	 * fragmentation index determines if allocation failures are due to
 	 * low memory or external fragmentation
@@ -1456,7 +1472,7 @@ enum compact_result compaction_suitable(struct zone *zone, int order,
 }
 
 bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
-		int alloc_flags)
+				  int alloc_flags, enum compact_priority prio)
 {
 	struct zone *zone;
 	struct zoneref *z;
@@ -1479,7 +1495,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
 		available = zone_reclaimable_pages(zone) / order;
 		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
 		compact_result = __compaction_suitable(zone, order, alloc_flags,
-				ac_classzone_idx(ac), available);
+				ac_classzone_idx(ac), available, prio);
 		if (compact_result != COMPACT_SKIPPED)
 			return true;
 	}
@@ -1496,7 +1512,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
 	const bool sync = cc->mode != MIGRATE_ASYNC;
 
 	ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
-							cc->classzone_idx);
+				  cc->classzone_idx, sync);
 	/* Compaction is likely to fail */
 	if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
 		return ret;
@@ -1869,13 +1885,16 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
 	enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx;
 
 	for (zoneid = 0; zoneid <= classzone_idx; zoneid++) {
+		enum compact_result result;
+
 		zone = &pgdat->node_zones[zoneid];
 
 		if (!populated_zone(zone))
 			continue;
 
-		if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
-					classzone_idx) == COMPACT_CONTINUE)
+		result = compaction_suitable(zone, pgdat->kcompactd_max_order,
+					     0, classzone_idx, true);
+		if (result == COMPACT_CONTINUE)
 			return true;
 	}
 
@@ -1911,7 +1930,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
 		if (compaction_deferred(zone, cc.order))
 			continue;
 
-		if (compaction_suitable(zone, cc.order, 0, zoneid) !=
+		if (compaction_suitable(zone, cc.order, 0, zoneid, true) !=
 							COMPACT_CONTINUE)
 			continue;
 
diff --git a/mm/internal.h b/mm/internal.h
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -451,6 +451,7 @@ extern unsigned long  __must_check vm_mmap_pgoff(struct file *, unsigned long,
 
 extern void set_pageblock_order(void);
 unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+					    enum migrate_mode mode,
 					    struct list_head *page_list);
 /* The ALLOC_WMARK bits are used as an index to zone->watermark */
 #define ALLOC_WMARK_MIN		WMARK_MIN
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3194,7 +3194,8 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
 	 * compaction.
 	 */
 	if (compaction_withdrawn(compact_result))
-		return compaction_zonelist_suitable(ac, order, alloc_flags);
+		return compaction_zonelist_suitable(ac, order, alloc_flags,
+						    *compact_priority);
 
 	/*
 	 * !costly requests are much more important than __GFP_REPEAT
@@ -3264,7 +3265,7 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla
 /* Perform direct synchronous page reclaim */
 static int
 __perform_reclaim(gfp_t gfp_mask, unsigned int order,
-					const struct alloc_context *ac)
+		  const struct alloc_context *ac, enum compact_priority prio)
 {
 	struct reclaim_state reclaim_state;
 	int progress;
@@ -3278,7 +3279,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 	reclaim_state.reclaimed_slab = 0;
 	current->reclaim_state = &reclaim_state;
 
-	progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
+	progress = try_to_free_pages(ac->zonelist, order, gfp_mask, prio,
 								ac->nodemask);
 
 	current->reclaim_state = NULL;
@@ -3294,12 +3295,12 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 static inline struct page *
 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 		unsigned int alloc_flags, const struct alloc_context *ac,
-		unsigned long *did_some_progress)
+		enum compact_priority prio, unsigned long *did_some_progress)
 {
 	struct page *page = NULL;
 	bool drained = false;
 
-	*did_some_progress = __perform_reclaim(gfp_mask, order, ac);
+	*did_some_progress = __perform_reclaim(gfp_mask, order, ac, prio);
 	if (unlikely(!(*did_some_progress)))
 		return NULL;
 
@@ -3641,7 +3642,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 
 	/* Try direct reclaim and then allocating */
 	page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
-							&did_some_progress);
+					compact_priority, &did_some_progress);
 	if (page)
 		goto got_pg;
 
@@ -7163,7 +7164,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 			break;
 		}
 
-		nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
+		nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, cc->mode,
 							&cc->migratepages);
 		cc->nr_migratepages -= nr_reclaimed;
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -84,6 +84,8 @@ struct scan_control {
 	/* Scan (total_size >> priority) pages at once */
 	int priority;
 
+	enum compact_priority compact_priority;
+
 	/* The highest zone to isolate pages for reclaim from */
 	enum zone_type reclaim_idx;
 
@@ -1267,11 +1269,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 }
 
 unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+					    enum migrate_mode mode,
 					    struct list_head *page_list)
 {
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
 		.priority = DEF_PRIORITY,
+		.compact_priority = mode == MIGRATE_ASYNC ?
+				    COMPACT_PRIO_ASYNC :
+				    COMPACT_PRIO_SYNC_LIGHT,
 		.may_unmap = 1,
 	};
 	unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5;
@@ -2492,7 +2498,8 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
 		if (!managed_zone(zone))
 			continue;
 
-		switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
+		switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx,
+					    sc->compact_priority)) {
 		case COMPACT_SUCCESS:
 		case COMPACT_CONTINUE:
 			return false;
@@ -2605,7 +2612,8 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
 	unsigned long watermark;
 	enum compact_result suitable;
 
-	suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
+	suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx,
+				       sc->compact_priority);
 	if (suitable == COMPACT_SUCCESS)
 		/* Allocation should succeed already. Don't reclaim. */
 		return true;
@@ -2934,7 +2942,8 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
 }
 
 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
-				gfp_t gfp_mask, nodemask_t *nodemask)
+				gfp_t gfp_mask, enum compact_priority prio,
+				nodemask_t *nodemask)
 {
 	unsigned long nr_reclaimed;
 	struct scan_control sc = {
@@ -2944,6 +2953,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.order = order,
 		.nodemask = nodemask,
 		.priority = DEF_PRIORITY,
+		.compact_priority = prio,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = 1,
@@ -3024,6 +3034,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.reclaim_idx = MAX_NR_ZONES - 1,
 		.target_mem_cgroup = memcg,
 		.priority = DEF_PRIORITY,
+		.compact_priority = DEF_COMPACT_PRIORITY,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = may_swap,
@@ -3195,6 +3206,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		.gfp_mask = GFP_KERNEL,
 		.order = order,
 		.priority = DEF_PRIORITY,
+		.compact_priority = DEF_COMPACT_PRIORITY,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = 1,
@@ -3528,6 +3540,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 		.gfp_mask = GFP_HIGHUSER_MOVABLE,
 		.reclaim_idx = MAX_NR_ZONES - 1,
 		.priority = DEF_PRIORITY,
+		.compact_priority = DEF_COMPACT_PRIORITY,
 		.may_writepage = 1,
 		.may_unmap = 1,
 		.may_swap = 1,
@@ -3716,6 +3729,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
 		.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
 		.order = order,
 		.priority = NODE_RECLAIM_PRIORITY,
+		.compact_priority = DEF_COMPACT_PRIORITY,
 		.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
 		.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
 		.may_swap = 1,


* Re: [patch v2 1/2] mm, zone: track number of movable free pages
  2016-11-30  0:16 ` [patch v2 1/2] mm, zone: track number of movable free pages David Rientjes
  2016-11-30  0:16   ` [patch v2 2/2] mm, compaction: avoid async compaction if most free memory is ineligible David Rientjes
@ 2016-11-30  7:34   ` Vlastimil Babka
  1 sibling, 0 replies; 8+ messages in thread
From: Vlastimil Babka @ 2016-11-30  7:34 UTC (permalink / raw)
  To: David Rientjes, Andrew Morton
  Cc: Mel Gorman, Michal Hocko, Joonsoo Kim, linux-kernel, linux-mm

On 11/30/2016 01:16 AM, David Rientjes wrote:
> An upcoming compaction change will need the number of movable free pages
> per zone to determine if async compaction will become unnecessarily
> expensive.
>
> This patch introduces no functional change or increased memory footprint.
> It simply tracks the number of free movable pages as a subset of the
> total number of free pages.  This is exported to userspace as part of a
> new /proc/vmstat field.
>
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>  v2: do not track free pages per migratetype since page allocator stress
>      testing reveals this tracking can impact workloads and there is no
>      substantial benefit when thp is disabled.  This is because entire
>      pageblocks can be converted to new migratetypes, which requires
>      iterating the free_areas in the hot paths for proper tracking.

Ah, right, forgot about the accuracy issue when focusing on the overhead 
issue. Unfortunately I'm afraid the NR_FREE_MOVABLE_PAGES in this patch 
will also drift uncontrollably over time. Stealing is one thing, and 
also buddy merging can silently move free pages between migratetypes. It 
already took some effort to make this accurate for MIGRATE_CMA and 
MIGRATE_ISOLATE, which has some overhead and works only thanks to 
additional constraints - CMA pageblocks don't ever get converted, and 
for ISOLATE we don't put them on pcplists, perform pcplists draining 
during isolation, and have extra code guarded by has_isolate_pageblock() 
in buddy merging. None of this would be directly viable for 
MIGRATE_MOVABLE I'm afraid.

>  include/linux/mmzone.h | 1 +
>  include/linux/vmstat.h | 2 ++
>  mm/page_alloc.c        | 8 +++++++-
>  mm/vmstat.c            | 1 +
>  4 files changed, 11 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -138,6 +138,7 @@ enum zone_stat_item {
>  	NUMA_OTHER,		/* allocation from other node */
>  #endif
>  	NR_FREE_CMA_PAGES,
> +	NR_FREE_MOVABLE_PAGES,
>  	NR_VM_ZONE_STAT_ITEMS };
>
>  enum node_stat_item {
> diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
> --- a/include/linux/vmstat.h
> +++ b/include/linux/vmstat.h
> @@ -347,6 +347,8 @@ static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages,
>  	__mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);
>  	if (is_migrate_cma(migratetype))
>  		__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
> +	if (migratetype == MIGRATE_MOVABLE)
> +		__mod_zone_page_state(zone, NR_FREE_MOVABLE_PAGES, nr_pages);
>  }
>
>  extern const char * const vmstat_text[];
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -2197,6 +2197,8 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
>  	spin_lock(&zone->lock);
>  	for (i = 0; i < count; ++i) {
>  		struct page *page = __rmqueue(zone, order, migratetype);
> +		int mt;
> +
>  		if (unlikely(page == NULL))
>  			break;
>
> @@ -2217,9 +2219,13 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
>  		else
>  			list_add_tail(&page->lru, list);
>  		list = &page->lru;
> -		if (is_migrate_cma(get_pcppage_migratetype(page)))
> +		mt = get_pcppage_migratetype(page);
> +		if (is_migrate_cma(mt))
>  			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
>  					      -(1 << order));
> +		if (mt == MIGRATE_MOVABLE)
> +			__mod_zone_page_state(zone, NR_FREE_MOVABLE_PAGES,
> +					      -(1 << order));
>  	}
>  	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
>  	spin_unlock(&zone->lock);
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -945,6 +945,7 @@ const char * const vmstat_text[] = {
>  	"numa_other",
>  #endif
>  	"nr_free_cma",
> +	"nr_free_movable",
>
>  	/* Node-based counters */
>  	"nr_inactive_anon",
>

