linux-kernel.vger.kernel.org archive mirror
* [PATCH] kswapd: no need reclaim cma pages triggered by unmovable allocation
@ 2021-03-13  8:31 zhou
  2021-03-13 10:50 ` kernel test robot
                   ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: zhou @ 2021-03-13  8:31 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, akpm, mhocko, mgorman, willy, rostedt, mingo,
	vbabka, rientjes, pankaj.gupta.linux, bhe, ying.huang,
	iamjoonsoo.kim, minchan, ruxian.feng, kai.cheng, zhao.xu,
	zhouxianrong, zhou xianrong

From: zhou xianrong <xianrong.zhou@transsion.com>

For better migration, CMA pages are only used to satisfy movable
allocations, as a fallback when such an allocation cannot be served
otherwise, and they then back file pages or anonymous pages.

In the reclaim path, when CMA is configured, many CMA pages are
reclaimed from the LRU lists, mainly by kswapd or by direct reclaim
triggered by unmovable or reclaimable allocations. But the reclaimed
CMA pages cannot be used by the unmovable or reclaimable allocations
that triggered the reclaim, so that reclaim is wasted work.

Unmovable or reclaimable allocations should therefore not trigger
reclaim of CMA pages. This patch adds the migratetype as a third
factor for kswapd to consider, next to the zone index and the order,
and the implementation follows the existing zone index handling.
Reclaim then simply skips CMA pages whenever it was triggered only
by unmovable or reclaimable allocations.

This optimization avoids ~3% of unnecessary isolations from CMA
(CMA isolated / total isolated) with a total of 100MB of CMA pages
configured.
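
A condensed view of the new decision flow, pieced together from the
hunks below (illustration only, not an additional hunk of the patch;
error handling and unrelated scan_control fields omitted):

	/* direct reclaim: permission follows the caller's gfp_mask */
	sc.may_cma = is_migrate_movable(gfp_migratetype(gfp_mask));

	/* kswapd: permission follows the migratetype recorded at wakeup */
	sc.may_cma = is_migrate_movable(migratetype);

	/* isolation: without permission, CMA pages stay on the LRU */
	isolate_mode_t mode = sc.may_cma ? 0 : ISOLATE_NONCMA;
	if ((mode & ISOLATE_NONCMA) &&
	    is_migrate_cma(get_pageblock_migratetype(page)))
		return ret;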

Signed-off-by: zhou xianrong <xianrong.zhou@transsion.com>
Signed-off-by: feng ruxian <ruxian.feng@transsion.com>
---
 include/linux/mmzone.h        |  6 ++--
 include/trace/events/vmscan.h | 20 +++++++----
 mm/page_alloc.c               |  5 +--
 mm/vmscan.c                   | 63 +++++++++++++++++++++++++++++------
 4 files changed, 73 insertions(+), 21 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b593316bff3d..7dd38d7372b9 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -301,6 +301,8 @@ struct lruvec {
 #define ISOLATE_ASYNC_MIGRATE	((__force isolate_mode_t)0x4)
 /* Isolate unevictable pages */
 #define ISOLATE_UNEVICTABLE	((__force isolate_mode_t)0x8)
+/* Isolate non-CMA pages */
+#define ISOLATE_NONCMA		((__force isolate_mode_t)0x10)
 
 /* LRU Isolation modes. */
 typedef unsigned __bitwise isolate_mode_t;
@@ -756,7 +758,7 @@ typedef struct pglist_data {
 	wait_queue_head_t pfmemalloc_wait;
 	struct task_struct *kswapd;	/* Protected by
 					   mem_hotplug_begin/end() */
-	int kswapd_order;
+	int kswapd_order, kswapd_migratetype;
 	enum zone_type kswapd_highest_zoneidx;
 
 	int kswapd_failures;		/* Number of 'reclaimed == 0' runs */
@@ -840,7 +842,7 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat)
 
 void build_all_zonelists(pg_data_t *pgdat);
 void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
-		   enum zone_type highest_zoneidx);
+		   int migratetype, enum zone_type highest_zoneidx);
 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
 			 int highest_zoneidx, unsigned int alloc_flags,
 			 long free_pages);
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 2070df64958e..41bbafdfde84 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -51,37 +51,41 @@ TRACE_EVENT(mm_vmscan_kswapd_sleep,
 
 TRACE_EVENT(mm_vmscan_kswapd_wake,
 
-	TP_PROTO(int nid, int zid, int order),
+	TP_PROTO(int nid, int zid, int order, int mt),
 
-	TP_ARGS(nid, zid, order),
+	TP_ARGS(nid, zid, order, mt),
 
 	TP_STRUCT__entry(
 		__field(	int,	nid	)
 		__field(	int,	zid	)
 		__field(	int,	order	)
+		__field(	int,	mt	)
 	),
 
 	TP_fast_assign(
 		__entry->nid	= nid;
 		__entry->zid    = zid;
 		__entry->order	= order;
+		__entry->mt	= mt;
 	),
 
-	TP_printk("nid=%d order=%d",
+	TP_printk("nid=%d order=%d migratetype=%d",
 		__entry->nid,
-		__entry->order)
+		__entry->order,
+		__entry->mt)
 );
 
 TRACE_EVENT(mm_vmscan_wakeup_kswapd,
 
-	TP_PROTO(int nid, int zid, int order, gfp_t gfp_flags),
+	TP_PROTO(int nid, int zid, int order, int mt, gfp_t gfp_flags),
 
-	TP_ARGS(nid, zid, order, gfp_flags),
+	TP_ARGS(nid, zid, order, mt, gfp_flags),
 
 	TP_STRUCT__entry(
 		__field(	int,	nid		)
 		__field(	int,	zid		)
 		__field(	int,	order		)
+		__field(	int,	mt		)
 		__field(	gfp_t,	gfp_flags	)
 	),
 
@@ -89,12 +93,14 @@ TRACE_EVENT(mm_vmscan_wakeup_kswapd,
 		__entry->nid		= nid;
 		__entry->zid		= zid;
 		__entry->order		= order;
+		__entry->mt		= mt;
 		__entry->gfp_flags	= gfp_flags;
 	),
 
-	TP_printk("nid=%d order=%d gfp_flags=%s",
+	TP_printk("nid=%d order=%d migratetype=%d gfp_flags=%s",
 		__entry->nid,
 		__entry->order,
+		__entry->mt,
 		show_gfp_flags(__entry->gfp_flags))
 );
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 519a60d5b6f7..45ceb15721b8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3517,7 +3517,7 @@ struct page *rmqueue(struct zone *preferred_zone,
 	/* Separate test+clear to avoid unnecessary atomics */
 	if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
 		clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
-		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
+		wakeup_kswapd(zone, 0, 0, migratetype, zone_idx(zone));
 	}
 
 	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
@@ -4426,11 +4426,12 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
 	struct zone *zone;
 	pg_data_t *last_pgdat = NULL;
 	enum zone_type highest_zoneidx = ac->highest_zoneidx;
+	int migratetype = ac->migratetype;
 
 	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
 					ac->nodemask) {
 		if (last_pgdat != zone->zone_pgdat)
-			wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
+			wakeup_kswapd(zone, gfp_mask, order, migratetype, highest_zoneidx);
 		last_pgdat = zone->zone_pgdat;
 	}
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b1b574ad199d..184f0c4c7151 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -99,6 +99,9 @@ struct scan_control {
 	/* Can pages be swapped as part of reclaim? */
 	unsigned int may_swap:1;
 
+	/* Can cma pages be reclaimed? */
+	unsigned int may_cma:1;
+
 	/*
 	 * Cgroups are not reclaimed below their configured memory.low,
 	 * unless we threaten to OOM. If any cgroups are skipped due to
@@ -286,6 +289,11 @@ static bool writeback_throttling_sane(struct scan_control *sc)
 }
 #endif
 
+static bool movable_reclaim(gfp_t gfp_mask)
+{
+	return is_migrate_movable(gfp_migratetype(gfp_mask));
+}
+
 /*
  * This misses isolated pages which are not accounted for to save counters.
  * As the data only determines if reclaim or compaction continues, it is
@@ -1499,6 +1507,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 		.gfp_mask = GFP_KERNEL,
 		.priority = DEF_PRIORITY,
 		.may_unmap = 1,
+		.may_cma = 1,
 	};
 	struct reclaim_stat stat;
 	unsigned int nr_reclaimed;
@@ -1593,6 +1602,9 @@ int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
 	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
 		return ret;
 
+	if ((mode & ISOLATE_NONCMA) && is_migrate_cma(get_pageblock_migratetype(page)))
+		return ret;
+
 	return 0;
 }
 
@@ -1647,7 +1659,10 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 	unsigned long skipped = 0;
 	unsigned long scan, total_scan, nr_pages;
 	LIST_HEAD(pages_skipped);
-	isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
+	isolate_mode_t mode;
+
+	mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
+	mode |= (sc->may_cma ? 0 : ISOLATE_NONCMA);
 
 	total_scan = 0;
 	scan = 0;
@@ -2125,6 +2140,7 @@ unsigned long reclaim_pages(struct list_head *page_list)
 		.may_writepage = 1,
 		.may_unmap = 1,
 		.may_swap = 1,
+		.may_cma = 1,
 	};
 
 	while (!list_empty(page_list)) {
@@ -3253,6 +3269,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = 1,
+		.may_cma = movable_reclaim(gfp_mask),
 	};
 
 	/*
@@ -3298,6 +3315,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 		.may_unmap = 1,
 		.reclaim_idx = MAX_NR_ZONES - 1,
 		.may_swap = !noswap,
+		.may_cma = 1,
 	};
 
 	WARN_ON_ONCE(!current->reclaim_state);
@@ -3341,6 +3359,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = may_swap,
+		.may_cma = 1,
 	};
 	/*
 	 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
@@ -3548,7 +3567,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
  * or lower is eligible for reclaim until at least one usable zone is
  * balanced.
  */
-static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
+static int balance_pgdat(pg_data_t *pgdat, int order, int migratetype, int highest_zoneidx)
 {
 	int i;
 	unsigned long nr_soft_reclaimed;
@@ -3650,6 +3669,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
 		 */
 		sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
 		sc.may_swap = !nr_boost_reclaim;
+		sc.may_cma = is_migrate_movable(migratetype);
 
 		/*
 		 * Do some background aging of the anon list, to give
@@ -3771,8 +3791,15 @@ static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
 	return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
 }
 
+static int kswapd_migratetype(pg_data_t *pgdat, int prev_migratetype)
+{
+	int curr_migratetype = READ_ONCE(pgdat->kswapd_migratetype);
+
+	return curr_migratetype == MIGRATE_TYPES ? prev_migratetype : curr_migratetype;
+}
+
 static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
-				unsigned int highest_zoneidx)
+				int migratetype, unsigned int highest_zoneidx)
 {
 	long remaining = 0;
 	DEFINE_WAIT(wait);
@@ -3807,8 +3834,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
 		remaining = schedule_timeout(HZ/10);
 
 		/*
-		 * If woken prematurely then reset kswapd_highest_zoneidx and
-		 * order. The values will either be from a wakeup request or
+		 * If woken prematurely then reset kswapd_highest_zoneidx, order
+		 * and migratetype. The values will either be from a wakeup request or
 		 * the previous request that slept prematurely.
 		 */
 		if (remaining) {
@@ -3818,6 +3845,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
 
 			if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
 				WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
+
+			if (!is_migrate_movable(READ_ONCE(pgdat->kswapd_migratetype)))
+				WRITE_ONCE(pgdat->kswapd_migratetype,
+						kswapd_migratetype(pgdat, migratetype));
 		}
 
 		finish_wait(&pgdat->kswapd_wait, &wait);
@@ -3870,6 +3901,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
  */
 static int kswapd(void *p)
 {
+	int migratetype = 0;
 	unsigned int alloc_order, reclaim_order;
 	unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
 	pg_data_t *pgdat = (pg_data_t*)p;
@@ -3895,23 +3927,27 @@ static int kswapd(void *p)
 	set_freezable();
 
 	WRITE_ONCE(pgdat->kswapd_order, 0);
+	WRITE_ONCE(pgdat->kswapd_migratetype, MIGRATE_TYPES);
 	WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
 	for ( ; ; ) {
 		bool ret;
 
 		alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
+		migratetype = kswapd_migratetype(pgdat, migratetype);
 		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
 							highest_zoneidx);
 
 kswapd_try_sleep:
 		kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
-					highest_zoneidx);
+					migratetype, highest_zoneidx);
 
 		/* Read the new order and highest_zoneidx */
 		alloc_order = READ_ONCE(pgdat->kswapd_order);
+		migratetype = kswapd_migratetype(pgdat, migratetype);
 		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
 							highest_zoneidx);
 		WRITE_ONCE(pgdat->kswapd_order, 0);
+		WRITE_ONCE(pgdat->kswapd_migratetype, MIGRATE_TYPES);
 		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
 
 		ret = try_to_freeze();
@@ -3934,8 +3970,8 @@ static int kswapd(void *p)
 		 * request (alloc_order).
 		 */
 		trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
-						alloc_order);
-		reclaim_order = balance_pgdat(pgdat, alloc_order,
+						alloc_order, migratetype);
+		reclaim_order = balance_pgdat(pgdat, alloc_order, migratetype,
 						highest_zoneidx);
 		if (reclaim_order < alloc_order)
 			goto kswapd_try_sleep;
@@ -3953,11 +3989,12 @@ static int kswapd(void *p)
  * has failed or is not needed, still wake up kcompactd if only compaction is
  * needed.
  */
-void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
+void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, int migratetype
 		   enum zone_type highest_zoneidx)
 {
 	pg_data_t *pgdat;
 	enum zone_type curr_idx;
+	int curr_migratetype;
 
 	if (!managed_zone(zone))
 		return;
@@ -3967,6 +4004,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
 
 	pgdat = zone->zone_pgdat;
 	curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
+	curr_migratetype = READ_ONCE(pgdat->kswapd_migratetype);
 
 	if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
 		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);
@@ -3974,6 +4012,9 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
 	if (READ_ONCE(pgdat->kswapd_order) < order)
 		WRITE_ONCE(pgdat->kswapd_order, order);
 
+	if (curr_migratetype == MIGRATE_TYPES || is_migrate_movable(migratetype))
+		WRITE_ONCE(pgdat->kswapd_migratetype, migratetype);
+
 	if (!waitqueue_active(&pgdat->kswapd_wait))
 		return;
 
@@ -3994,7 +4035,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
 	}
 
 	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
-				      gfp_flags);
+				      migratetype, gfp_flags);
 	wake_up_interruptible(&pgdat->kswapd_wait);
 }
 
@@ -4017,6 +4058,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 		.may_writepage = 1,
 		.may_unmap = 1,
 		.may_swap = 1,
+		.may_cma = 1,
 		.hibernation_mode = 1,
 	};
 	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
@@ -4176,6 +4218,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
 		.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
 		.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
 		.may_swap = 1,
+		.may_cma = movable_reclaim(gfp_mask),
 		.reclaim_idx = gfp_zone(gfp_mask),
 	};
 
-- 
2.25.1



* Re: [PATCH] kswapd: no need reclaim cma pages triggered by unmovable allocation
  2021-03-13  8:31 [PATCH] kswapd: no need reclaim cma pages triggered by unmovable allocation zhou
@ 2021-03-13 10:50 ` kernel test robot
  2021-03-13 13:37 ` zhou xianrong
  2021-03-15 15:46 ` David Hildenbrand
  2 siblings, 0 replies; 11+ messages in thread
From: kernel test robot @ 2021-03-13 10:50 UTC (permalink / raw)
  To: zhou, linux-mm
  Cc: kbuild-all, linux-kernel, akpm, mhocko, mgorman, willy, rostedt,
	mingo, vbabka, rientjes

[-- Attachment #1: Type: text/plain, Size: 6763 bytes --]

Hi zhou,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on tip/perf/core]
[cannot apply to linux/master linus/master hnaz-linux-mm/master v5.12-rc2 next-20210312]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/zhou/kswapd-no-need-reclaim-cma-pages-triggered-by-unmovable-allocation/20210313-163541
base:   https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 8bcfdd7cad3dffdd340f9a79098cbf331eb2cd53
config: m68k-randconfig-c023-20210313 (attached as .config)
compiler: m68k-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/f40216b4d0325cf640d1c3ebe448772d6430bc6a
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review zhou/kswapd-no-need-reclaim-cma-pages-triggered-by-unmovable-allocation/20210313-163541
        git checkout f40216b4d0325cf640d1c3ebe448772d6430bc6a
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=m68k 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

>> mm/vmscan.c:3991:6: error: expected ';', ',' or ')' before 'enum'
    3991 |      enum zone_type highest_zoneidx)
         |      ^~~~
   mm/vmscan.c:3405:13: warning: 'pgdat_watermark_boosted' defined but not used [-Wunused-function]
    3405 | static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx)
         |             ^~~~~~~~~~~~~~~~~~~~~~~


vim +3991 mm/vmscan.c

^1da177e4c3f41 Linus Torvalds 2005-04-16  3982  
^1da177e4c3f41 Linus Torvalds 2005-04-16  3983  /*
5ecd9d403ad081 David Rientjes 2018-04-05  3984   * A zone is low on free memory or too fragmented for high-order memory.  If
5ecd9d403ad081 David Rientjes 2018-04-05  3985   * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
5ecd9d403ad081 David Rientjes 2018-04-05  3986   * pgdat.  It will wake up kcompactd after reclaiming memory.  If kswapd reclaim
5ecd9d403ad081 David Rientjes 2018-04-05  3987   * has failed or is not needed, still wake up kcompactd if only compaction is
5ecd9d403ad081 David Rientjes 2018-04-05  3988   * needed.
^1da177e4c3f41 Linus Torvalds 2005-04-16  3989   */
f40216b4d0325c zhou xianrong  2021-03-13  3990  void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, int migratetype
97a225e69a1f88 Joonsoo Kim    2020-06-03 @3991  		   enum zone_type highest_zoneidx)
^1da177e4c3f41 Linus Torvalds 2005-04-16  3992  {
^1da177e4c3f41 Linus Torvalds 2005-04-16  3993  	pg_data_t *pgdat;
5644e1fbbfe15a Qian Cai       2020-04-01  3994  	enum zone_type curr_idx;
f40216b4d0325c zhou xianrong  2021-03-13  3995  	int curr_migratetype;
^1da177e4c3f41 Linus Torvalds 2005-04-16  3996  
6aa303defb7454 Mel Gorman     2016-09-01  3997  	if (!managed_zone(zone))
^1da177e4c3f41 Linus Torvalds 2005-04-16  3998  		return;
^1da177e4c3f41 Linus Torvalds 2005-04-16  3999  
5ecd9d403ad081 David Rientjes 2018-04-05  4000  	if (!cpuset_zone_allowed(zone, gfp_flags))
^1da177e4c3f41 Linus Torvalds 2005-04-16  4001  		return;
5644e1fbbfe15a Qian Cai       2020-04-01  4002  
88f5acf88ae6a9 Mel Gorman     2011-01-13  4003  	pgdat = zone->zone_pgdat;
97a225e69a1f88 Joonsoo Kim    2020-06-03  4004  	curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
f40216b4d0325c zhou xianrong  2021-03-13  4005  	curr_migratetype = READ_ONCE(pgdat->kswapd_migratetype);
5644e1fbbfe15a Qian Cai       2020-04-01  4006  
97a225e69a1f88 Joonsoo Kim    2020-06-03  4007  	if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
97a225e69a1f88 Joonsoo Kim    2020-06-03  4008  		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);
5644e1fbbfe15a Qian Cai       2020-04-01  4009  
5644e1fbbfe15a Qian Cai       2020-04-01  4010  	if (READ_ONCE(pgdat->kswapd_order) < order)
5644e1fbbfe15a Qian Cai       2020-04-01  4011  		WRITE_ONCE(pgdat->kswapd_order, order);
dffcac2cb88e4e Shakeel Butt   2019-07-04  4012  
f40216b4d0325c zhou xianrong  2021-03-13  4013  	if (curr_migratetype == MIGRATE_TYPES || is_migrate_movable(migratetype))
f40216b4d0325c zhou xianrong  2021-03-13  4014  		WRITE_ONCE(pgdat->kswapd_migratetype, migratetype);
f40216b4d0325c zhou xianrong  2021-03-13  4015  
8d0986e289a4b0 Con Kolivas    2005-09-13  4016  	if (!waitqueue_active(&pgdat->kswapd_wait))
^1da177e4c3f41 Linus Torvalds 2005-04-16  4017  		return;
e1a556374abc0d Mel Gorman     2016-07-28  4018  
5ecd9d403ad081 David Rientjes 2018-04-05  4019  	/* Hopeless node, leave it to direct reclaim if possible */
5ecd9d403ad081 David Rientjes 2018-04-05  4020  	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
97a225e69a1f88 Joonsoo Kim    2020-06-03  4021  	    (pgdat_balanced(pgdat, order, highest_zoneidx) &&
97a225e69a1f88 Joonsoo Kim    2020-06-03  4022  	     !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
5ecd9d403ad081 David Rientjes 2018-04-05  4023  		/*
5ecd9d403ad081 David Rientjes 2018-04-05  4024  		 * There may be plenty of free memory available, but it's too
5ecd9d403ad081 David Rientjes 2018-04-05  4025  		 * fragmented for high-order allocations.  Wake up kcompactd
5ecd9d403ad081 David Rientjes 2018-04-05  4026  		 * and rely on compaction_suitable() to determine if it's
5ecd9d403ad081 David Rientjes 2018-04-05  4027  		 * needed.  If it fails, it will defer subsequent attempts to
5ecd9d403ad081 David Rientjes 2018-04-05  4028  		 * ratelimit its work.
5ecd9d403ad081 David Rientjes 2018-04-05  4029  		 */
5ecd9d403ad081 David Rientjes 2018-04-05  4030  		if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
97a225e69a1f88 Joonsoo Kim    2020-06-03  4031  			wakeup_kcompactd(pgdat, order, highest_zoneidx);
88f5acf88ae6a9 Mel Gorman     2011-01-13  4032  		return;
5ecd9d403ad081 David Rientjes 2018-04-05  4033  	}
88f5acf88ae6a9 Mel Gorman     2011-01-13  4034  
97a225e69a1f88 Joonsoo Kim    2020-06-03  4035  	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
f40216b4d0325c zhou xianrong  2021-03-13  4036  				      migratetype, gfp_flags);
8d0986e289a4b0 Con Kolivas    2005-09-13  4037  	wake_up_interruptible(&pgdat->kswapd_wait);
^1da177e4c3f41 Linus Torvalds 2005-04-16  4038  }
^1da177e4c3f41 Linus Torvalds 2005-04-16  4039  
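
For reference, the error is the missing ',' after "int migratetype" in
the new wakeup_kswapd() declaration at mm/vmscan.c:3990; the corrected
prototype (as carried by the resend later in this thread) reads:

	void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
			   int migratetype, enum zone_type highest_zoneidx);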

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 23827 bytes --]


* [PATCH] kswapd: no need reclaim cma pages triggered by unmovable allocation
  2021-03-13  8:31 [PATCH] kswapd: no need reclaim cma pages triggered by unmovable allocation zhou
  2021-03-13 10:50 ` kernel test robot
@ 2021-03-13 13:37 ` zhou xianrong
  2021-03-15 15:46 ` David Hildenbrand
  2 siblings, 0 replies; 11+ messages in thread
From: zhou xianrong @ 2021-03-13 13:37 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, akpm, mhocko, mgorman, willy, rostedt, mingo,
	vbabka, rientjes, pankaj.gupta.linux, bhe, ying.huang,
	iamjoonsoo.kim, minchan, ruxian.feng, kai.cheng, zhao.xu,
	zhouxianrong, zhou xianrong

From: zhou xianrong <xianrong.zhou@transsion.com>

For better migration, CMA pages are only used to satisfy movable
allocations, as a fallback when such an allocation cannot be served
otherwise, and they then back file pages or anonymous pages.

In the reclaim path, when CMA is configured, many CMA pages are
reclaimed from the LRU lists, mainly by kswapd or by direct reclaim
triggered by unmovable or reclaimable allocations. But the reclaimed
CMA pages cannot be used by the unmovable or reclaimable allocations
that triggered the reclaim, so that reclaim is wasted work.

Unmovable or reclaimable allocations should therefore not trigger
reclaim of CMA pages. This patch adds the migratetype as a third
factor for kswapd to consider, next to the zone index and the order,
and the implementation follows the existing zone index handling.
Reclaim then simply skips CMA pages whenever it was triggered only
by unmovable or reclaimable allocations.

This optimization avoids ~3% of unnecessary isolations from CMA
(CMA isolated / total isolated) with a total of 100MB of CMA pages
configured.
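
The kswapd side mirrors the existing kswapd_order/kswapd_highest_zoneidx
handshake; condensed here for reference (illustration only, built from
the hunks below, with locking and the premature-wakeup reset omitted):

	/* producer, wakeup_kswapd(): record the requesting migratetype;
	 * MIGRATE_TYPES means "no request pending", and a movable
	 * request may overwrite a pending unmovable one */
	if (curr_migratetype == MIGRATE_TYPES || is_migrate_movable(migratetype))
		WRITE_ONCE(pgdat->kswapd_migratetype, migratetype);

	/* consumer, kswapd(): pick up the request, then reset it */
	migratetype = kswapd_migratetype(pgdat, migratetype);
	WRITE_ONCE(pgdat->kswapd_migratetype, MIGRATE_TYPES);
	reclaim_order = balance_pgdat(pgdat, alloc_order, migratetype,
					highest_zoneidx);
	/* ... where balance_pgdat() sets
	 * sc.may_cma = is_migrate_movable(migratetype) */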

Signed-off-by: zhou xianrong <xianrong.zhou@transsion.com>
Signed-off-by: feng ruxian <ruxian.feng@transsion.com>
---
 include/linux/mmzone.h        |  6 ++--
 include/trace/events/vmscan.h | 20 +++++++----
 mm/page_alloc.c               |  5 +--
 mm/vmscan.c                   | 63 +++++++++++++++++++++++++++++------
 4 files changed, 73 insertions(+), 21 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b593316bff3d..7dd38d7372b9 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -301,6 +301,8 @@ struct lruvec {
 #define ISOLATE_ASYNC_MIGRATE	((__force isolate_mode_t)0x4)
 /* Isolate unevictable pages */
 #define ISOLATE_UNEVICTABLE	((__force isolate_mode_t)0x8)
+/* Isolate non-CMA pages */
+#define ISOLATE_NONCMA		((__force isolate_mode_t)0x10)
 
 /* LRU Isolation modes. */
 typedef unsigned __bitwise isolate_mode_t;
@@ -756,7 +758,7 @@ typedef struct pglist_data {
 	wait_queue_head_t pfmemalloc_wait;
 	struct task_struct *kswapd;	/* Protected by
 					   mem_hotplug_begin/end() */
-	int kswapd_order;
+	int kswapd_order, kswapd_migratetype;
 	enum zone_type kswapd_highest_zoneidx;
 
 	int kswapd_failures;		/* Number of 'reclaimed == 0' runs */
@@ -840,7 +842,7 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat)
 
 void build_all_zonelists(pg_data_t *pgdat);
 void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
-		   enum zone_type highest_zoneidx);
+		   int migratetype, enum zone_type highest_zoneidx);
 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
 			 int highest_zoneidx, unsigned int alloc_flags,
 			 long free_pages);
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 2070df64958e..41bbafdfde84 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -51,37 +51,41 @@ TRACE_EVENT(mm_vmscan_kswapd_sleep,
 
 TRACE_EVENT(mm_vmscan_kswapd_wake,
 
-	TP_PROTO(int nid, int zid, int order),
+	TP_PROTO(int nid, int zid, int order, int mt),
 
-	TP_ARGS(nid, zid, order),
+	TP_ARGS(nid, zid, order, mt),
 
 	TP_STRUCT__entry(
 		__field(	int,	nid	)
 		__field(	int,	zid	)
 		__field(	int,	order	)
+		__field(	int,	mt	)
 	),
 
 	TP_fast_assign(
 		__entry->nid	= nid;
 		__entry->zid    = zid;
 		__entry->order	= order;
+		__entry->mt	= mt;
 	),
 
-	TP_printk("nid=%d order=%d",
+	TP_printk("nid=%d order=%d migratetype=%d",
 		__entry->nid,
-		__entry->order)
+		__entry->order,
+		__entry->mt)
 );
 
 TRACE_EVENT(mm_vmscan_wakeup_kswapd,
 
-	TP_PROTO(int nid, int zid, int order, gfp_t gfp_flags),
+	TP_PROTO(int nid, int zid, int order, int mt, gfp_t gfp_flags),
 
-	TP_ARGS(nid, zid, order, gfp_flags),
+	TP_ARGS(nid, zid, order, mt, gfp_flags),
 
 	TP_STRUCT__entry(
 		__field(	int,	nid		)
 		__field(	int,	zid		)
 		__field(	int,	order		)
+		__field(	int,	mt		)
 		__field(	gfp_t,	gfp_flags	)
 	),
 
@@ -89,12 +93,14 @@ TRACE_EVENT(mm_vmscan_wakeup_kswapd,
 		__entry->nid		= nid;
 		__entry->zid		= zid;
 		__entry->order		= order;
+		__entry->mt		= mt;
 		__entry->gfp_flags	= gfp_flags;
 	),
 
-	TP_printk("nid=%d order=%d gfp_flags=%s",
+	TP_printk("nid=%d order=%d migratetype=%d gfp_flags=%s",
 		__entry->nid,
 		__entry->order,
+		__entry->mt,
 		show_gfp_flags(__entry->gfp_flags))
 );
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 519a60d5b6f7..45ceb15721b8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3517,7 +3517,7 @@ struct page *rmqueue(struct zone *preferred_zone,
 	/* Separate test+clear to avoid unnecessary atomics */
 	if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
 		clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
-		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
+		wakeup_kswapd(zone, 0, 0, migratetype, zone_idx(zone));
 	}
 
 	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
@@ -4426,11 +4426,12 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
 	struct zone *zone;
 	pg_data_t *last_pgdat = NULL;
 	enum zone_type highest_zoneidx = ac->highest_zoneidx;
+	int migratetype = ac->migratetype;
 
 	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
 					ac->nodemask) {
 		if (last_pgdat != zone->zone_pgdat)
-			wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
+			wakeup_kswapd(zone, gfp_mask, order, migratetype, highest_zoneidx);
 		last_pgdat = zone->zone_pgdat;
 	}
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b1b574ad199d..e0a482cbba15 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -99,6 +99,9 @@ struct scan_control {
 	/* Can pages be swapped as part of reclaim? */
 	unsigned int may_swap:1;
 
+	/* Can cma pages be reclaimed? */
+	unsigned int may_cma:1;
+
 	/*
 	 * Cgroups are not reclaimed below their configured memory.low,
 	 * unless we threaten to OOM. If any cgroups are skipped due to
@@ -286,6 +289,11 @@ static bool writeback_throttling_sane(struct scan_control *sc)
 }
 #endif
 
+static bool movable_reclaim(gfp_t gfp_mask)
+{
+	return is_migrate_movable(gfp_migratetype(gfp_mask));
+}
+
 /*
  * This misses isolated pages which are not accounted for to save counters.
  * As the data only determines if reclaim or compaction continues, it is
@@ -1499,6 +1507,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 		.gfp_mask = GFP_KERNEL,
 		.priority = DEF_PRIORITY,
 		.may_unmap = 1,
+		.may_cma = 1,
 	};
 	struct reclaim_stat stat;
 	unsigned int nr_reclaimed;
@@ -1593,6 +1602,9 @@ int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
 	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
 		return ret;
 
+	if ((mode & ISOLATE_NONCMA) && is_migrate_cma(get_pageblock_migratetype(page)))
+		return ret;
+
 	return 0;
 }
 
@@ -1647,7 +1659,10 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 	unsigned long skipped = 0;
 	unsigned long scan, total_scan, nr_pages;
 	LIST_HEAD(pages_skipped);
-	isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
+	isolate_mode_t mode;
+
+	mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
+	mode |= (sc->may_cma ? 0 : ISOLATE_NONCMA);
 
 	total_scan = 0;
 	scan = 0;
@@ -2125,6 +2140,7 @@ unsigned long reclaim_pages(struct list_head *page_list)
 		.may_writepage = 1,
 		.may_unmap = 1,
 		.may_swap = 1,
+		.may_cma = 1,
 	};
 
 	while (!list_empty(page_list)) {
@@ -3253,6 +3269,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = 1,
+		.may_cma = movable_reclaim(gfp_mask),
 	};
 
 	/*
@@ -3298,6 +3315,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 		.may_unmap = 1,
 		.reclaim_idx = MAX_NR_ZONES - 1,
 		.may_swap = !noswap,
+		.may_cma = 1,
 	};
 
 	WARN_ON_ONCE(!current->reclaim_state);
@@ -3341,6 +3359,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = may_swap,
+		.may_cma = 1,
 	};
 	/*
 	 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
@@ -3548,7 +3567,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
  * or lower is eligible for reclaim until at least one usable zone is
  * balanced.
  */
-static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
+static int balance_pgdat(pg_data_t *pgdat, int order, int migratetype, int highest_zoneidx)
 {
 	int i;
 	unsigned long nr_soft_reclaimed;
@@ -3650,6 +3669,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
 		 */
 		sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
 		sc.may_swap = !nr_boost_reclaim;
+		sc.may_cma = is_migrate_movable(migratetype);
 
 		/*
 		 * Do some background aging of the anon list, to give
@@ -3771,8 +3791,15 @@ static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
 	return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
 }
 
+static int kswapd_migratetype(pg_data_t *pgdat, int prev_migratetype)
+{
+	int curr_migratetype = READ_ONCE(pgdat->kswapd_migratetype);
+
+	return curr_migratetype == MIGRATE_TYPES ? prev_migratetype : curr_migratetype;
+}
+
 static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
-				unsigned int highest_zoneidx)
+				int migratetype, unsigned int highest_zoneidx)
 {
 	long remaining = 0;
 	DEFINE_WAIT(wait);
@@ -3807,8 +3834,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
 		remaining = schedule_timeout(HZ/10);
 
 		/*
-		 * If woken prematurely then reset kswapd_highest_zoneidx and
-		 * order. The values will either be from a wakeup request or
+		 * If woken prematurely then reset kswapd_highest_zoneidx, order
+		 * and migratetype. The values will either be from a wakeup request or
 		 * the previous request that slept prematurely.
 		 */
 		if (remaining) {
@@ -3818,6 +3845,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
 
 			if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
 				WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
+
+			if (!is_migrate_movable(READ_ONCE(pgdat->kswapd_migratetype)))
+				WRITE_ONCE(pgdat->kswapd_migratetype,
+						kswapd_migratetype(pgdat, migratetype));
 		}
 
 		finish_wait(&pgdat->kswapd_wait, &wait);
@@ -3870,6 +3901,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
  */
 static int kswapd(void *p)
 {
+	int migratetype = 0;
 	unsigned int alloc_order, reclaim_order;
 	unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
 	pg_data_t *pgdat = (pg_data_t*)p;
@@ -3895,23 +3927,27 @@ static int kswapd(void *p)
 	set_freezable();
 
 	WRITE_ONCE(pgdat->kswapd_order, 0);
+	WRITE_ONCE(pgdat->kswapd_migratetype, MIGRATE_TYPES);
 	WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
 	for ( ; ; ) {
 		bool ret;
 
 		alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
+		migratetype = kswapd_migratetype(pgdat, migratetype);
 		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
 							highest_zoneidx);
 
 kswapd_try_sleep:
 		kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
-					highest_zoneidx);
+					migratetype, highest_zoneidx);
 
 		/* Read the new order and highest_zoneidx */
 		alloc_order = READ_ONCE(pgdat->kswapd_order);
+		migratetype = kswapd_migratetype(pgdat, migratetype);
 		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
 							highest_zoneidx);
 		WRITE_ONCE(pgdat->kswapd_order, 0);
+		WRITE_ONCE(pgdat->kswapd_migratetype, MIGRATE_TYPES);
 		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
 
 		ret = try_to_freeze();
@@ -3934,8 +3970,8 @@ static int kswapd(void *p)
 		 * request (alloc_order).
 		 */
 		trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
-						alloc_order);
-		reclaim_order = balance_pgdat(pgdat, alloc_order,
+						alloc_order, migratetype);
+		reclaim_order = balance_pgdat(pgdat, alloc_order, migratetype,
 						highest_zoneidx);
 		if (reclaim_order < alloc_order)
 			goto kswapd_try_sleep;
@@ -3953,11 +3989,12 @@ static int kswapd(void *p)
  * has failed or is not needed, still wake up kcompactd if only compaction is
  * needed.
  */
-void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
+void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, int migratetype,
 		   enum zone_type highest_zoneidx)
 {
 	pg_data_t *pgdat;
 	enum zone_type curr_idx;
+	int curr_migratetype;
 
 	if (!managed_zone(zone))
 		return;
@@ -3967,6 +4004,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
 
 	pgdat = zone->zone_pgdat;
 	curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
+	curr_migratetype = READ_ONCE(pgdat->kswapd_migratetype);
 
 	if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
 		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);
@@ -3974,6 +4012,9 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
 	if (READ_ONCE(pgdat->kswapd_order) < order)
 		WRITE_ONCE(pgdat->kswapd_order, order);
 
+	if (curr_migratetype == MIGRATE_TYPES || is_migrate_movable(migratetype))
+		WRITE_ONCE(pgdat->kswapd_migratetype, migratetype);
+
 	if (!waitqueue_active(&pgdat->kswapd_wait))
 		return;
 
@@ -3994,7 +4035,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
 	}
 
 	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
-				      gfp_flags);
+				      migratetype, gfp_flags);
 	wake_up_interruptible(&pgdat->kswapd_wait);
 }
 
@@ -4017,6 +4058,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 		.may_writepage = 1,
 		.may_unmap = 1,
 		.may_swap = 1,
+		.may_cma = 1,
 		.hibernation_mode = 1,
 	};
 	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
@@ -4176,6 +4218,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
 		.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
 		.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
 		.may_swap = 1,
+		.may_cma = movable_reclaim(gfp_mask),
 		.reclaim_idx = gfp_zone(gfp_mask),
 	};
 
-- 
2.25.1




* Re: [PATCH] kswapd: no need reclaim cma pages triggered by unmovable allocation
  2021-03-13  8:31 [PATCH] kswapd: no need reclaim cma pages triggered by unmovable allocation zhou
  2021-03-13 10:50 ` kernel test robot
  2021-03-13 13:37 ` zhou xianrong
@ 2021-03-15 15:46 ` David Hildenbrand
  2021-03-15 16:09   ` Michal Hocko
  2 siblings, 1 reply; 11+ messages in thread
From: David Hildenbrand @ 2021-03-15 15:46 UTC (permalink / raw)
  To: zhou, linux-mm
  Cc: linux-kernel, akpm, mhocko, mgorman, willy, rostedt, mingo,
	vbabka, rientjes, pankaj.gupta.linux, bhe, ying.huang,
	iamjoonsoo.kim, minchan, ruxian.feng, kai.cheng, zhao.xu,
	zhouxianrong, zhou xianrong

On 13.03.21 09:31, zhou wrote:
> From: zhou xianrong <xianrong.zhou@transsion.com>
> 
> For purpose of better migration cma pages are allocated after
> failure movalbe allocations and are used normally for file pages
> or anonymous pages.

I failed to parse that sentence.

"For better migration, CMA pages are allocated after failing allocation 
of movable allocations and are used for backing files or anonymous memory."

Still doesn't make any sense to me. Can you clarify?


> 
> In reclaim path many cma pages if configurated are reclaimed

s/configurated/configured/

> from lru lists in kswapd mainly or direct reclaim triggered by
> unmovable or reclaimable allocations. But these reclaimed cma
> pages can not be used by original unmovable or reclaimable
> allocations. So the reclaim are unnecessary.

Might be a dumb question, but why can't reclaimable allocations end up 
on CMA? (for unmovable allocations, this is clear) Or did I 
misunderstand what that paragraph was trying to tell me?

> 
> So the unmovable or reclaimable allocations should not trigger
> reclaiming cma pages. The patch adds third factor of migratetype
> which is just like factors of zone index or order kswapd need
> consider. The modification follows codes of zone index
> consideration. And it is straightforward that skips reclaiming
> cma pages in reclaim procedure which is triggered only by
> unmovable or reclaimable allocations.
> 
> This optimization can avoid ~3% unnecessary isolations from cma
> (cma isolated / total isolated) with configuration of total 100Mb
> cma pages.

Can you say a few words about interaction with ZONE_MOVABLE, which 
behaves similar to CMA? I.e., does the same apply to ZONE_MOVABLE? Is it 
already handled?

[...]


-- 
Thanks,

David / dhildenb



* Re: [PATCH] kswapd: no need reclaim cma pages triggered by unmovable allocation
  2021-03-15 15:46 ` David Hildenbrand
@ 2021-03-15 16:09   ` Michal Hocko
  2021-03-15 16:12     ` David Hildenbrand
  0 siblings, 1 reply; 11+ messages in thread
From: Michal Hocko @ 2021-03-15 16:09 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: zhou, linux-mm, linux-kernel, akpm, mgorman, willy, rostedt,
	mingo, vbabka, rientjes, pankaj.gupta.linux, bhe, ying.huang,
	iamjoonsoo.kim, minchan, ruxian.feng, kai.cheng, zhao.xu,
	zhouxianrong, zhou xianrong

On Mon 15-03-21 16:46:33, David Hildenbrand wrote:
> On 13.03.21 09:31, zhou wrote:
[...]
> > This optimization can avoid ~3% unnecessary isolations from cma
> > (cma isolated / total isolated) with configuration of total 100Mb
> > cma pages.
> 
> Can you say a few words about interaction with ZONE_MOVABLE, which behaves
> similar to CMA? I.e., does the same apply to ZONE_MOVABLE? Is it already
> handled?

No, the movable zone shouldn't be affected, as the reclaim is zone
aware. The problem is that CMA doesn't belong to any particular zone.
This is something Joonsoo worked on in the past, and I believe
following up on that work was recommended the last time a similar/same
approach to this patch was proposed.
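
Roughly, as a simplified sketch of the current behaviour (not taken
from the patch under discussion):

	/* An unmovable allocation never even looks at ZONE_MOVABLE,
	 * because reclaim is already bounded by the request's zone: */
	sc.reclaim_idx = gfp_zone(GFP_KERNEL);	/* at most ZONE_NORMAL */
	/* shrink_zones()/balance_pgdat() only walk zones up to that
	 * index, so ZONE_MOVABLE is never scanned for such a request.
	 * CMA pageblocks, by contrast, sit inside the ordinary zones
	 * and are only marked by the pageblock migratetype, which is
	 * why they need the separate may_cma/ISOLATE_NONCMA check. */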
-- 
Michal Hocko
SUSE Labs


* Re: [PATCH] kswapd: no need reclaim cma pages triggered by unmovable allocation
  2021-03-15 16:09   ` Michal Hocko
@ 2021-03-15 16:12     ` David Hildenbrand
  0 siblings, 0 replies; 11+ messages in thread
From: David Hildenbrand @ 2021-03-15 16:12 UTC (permalink / raw)
  To: Michal Hocko
  Cc: zhou, linux-mm, linux-kernel, akpm, mgorman, willy, rostedt,
	mingo, vbabka, rientjes, pankaj.gupta.linux, bhe, ying.huang,
	iamjoonsoo.kim, minchan, ruxian.feng, kai.cheng, zhao.xu,
	zhouxianrong, zhou xianrong

On 15.03.21 17:09, Michal Hocko wrote:
> On Mon 15-03-21 16:46:33, David Hildenbrand wrote:
>> On 13.03.21 09:31, zhou wrote:
> [...]
>>> This optimization can avoid ~3% unnecessary isolations from cma
>>> (cma isolated / total isolated) with configuration of total 100Mb
>>> cma pages.
>>
>> Can you say a few words about interaction with ZONE_MOVABLE, which behaves
>> similar to CMA? I.e., does the same apply to ZONE_MOVABLE? Is it already
>> handled?
> 
> No, the movable zone shouldn't be affected as the reclaim is zone aware.
> The problem is that CMA doesn't belong to any particular zone. This is
> something Joonsoo worked in the past and I believe following up on that
> work has been recommended last time a similar/same approach like this
> patch was proposed.

Okay, thanks - that's what I expected.

-- 
Thanks,

David / dhildenb



* Re: [PATCH] kswapd: no need reclaim cma pages triggered by unmovable allocation
  2021-02-10 13:14     ` Michal Hocko
@ 2021-02-11 11:01       ` zhou xianrong
  0 siblings, 0 replies; 11+ messages in thread
From: zhou xianrong @ 2021-02-11 11:01 UTC (permalink / raw)
  To: Michal Hocko
  Cc: iamjoonsoo.kim, linux-mm, linux-kernel, akpm, rostedt, mingo,
	vbabka, rientjes, willy, pankaj.gupta.linux, bhe, ying.huang,
	minchan, ruxian.feng, kai.cheng, zhao.xu, yunfeng.lan,
	zhouxianrong, zhou xianrong


On 2021/2/10 9:14 PM, Michal Hocko wrote:
> On Wed 10-02-21 12:07:57, zhou xianrong wrote:
>> On 2021/2/9 5:23 PM, Michal Hocko wrote:
>>> On Tue 09-02-21 16:23:13, zhou wrote:
>>>> From: zhou xianrong <xianrong.zhou@transsion.com>
>>>>
>>>> For the purpose of better migration, cma pages are allocated after
>>>> movable allocations fail and are used normally for file pages
>>>> or anonymous pages.
>>>>
>>>> In the reclaim path, if CMA is configured, many cma pages are
>>>> reclaimed from the lru lists, mainly in kswapd or in direct reclaim
>>>> triggered by unmovable or reclaimable allocations. But these cma
>>>> pages cannot be used by the original unmovable or reclaimable
>>>> allocations, so the reclaim is unnecessary.
>>>>
>>>> On the same system, if the cma area is configured to be large, then
>>>> more unmovable (vmalloc etc.) or reclaimable (slab etc.) allocations
>>>> fail, more kswapd rounds are triggered, and more cma pages are
>>>> reclaimed.
>>> Could you be more specific? Do you have any numbers and an example
>>> configuration when this is visible?
>> It should be implicit.
> Right but the scale of the problem is an important part of _any_ patch
> justification.
Sorry. That part of the description is not appropriate and should be removed.


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] kswapd: no need reclaim cma pages triggered by unmovable allocation
  2021-02-10  4:07   ` zhou xianrong
@ 2021-02-10 13:14     ` Michal Hocko
  2021-02-11 11:01       ` zhou xianrong
  0 siblings, 1 reply; 11+ messages in thread
From: Michal Hocko @ 2021-02-10 13:14 UTC (permalink / raw)
  To: zhou xianrong
  Cc: iamjoonsoo.kim, linux-mm, linux-kernel, akpm, rostedt, mingo,
	vbabka, rientjes, willy, pankaj.gupta.linux, bhe, ying.huang,
	minchan, ruxian.feng, kai.cheng, zhao.xu, yunfeng.lan,
	zhouxianrong, zhou xianrong

On Wed 10-02-21 12:07:57, zhou xianrong wrote:
> 
> On 2021/2/9 5:23 PM, Michal Hocko wrote:
> > On Tue 09-02-21 16:23:13, zhou wrote:
> > > From: zhou xianrong <xianrong.zhou@transsion.com>
> > > 
> > > For the purpose of better migration, cma pages are allocated after
> > > movable allocations fail and are used normally for file pages
> > > or anonymous pages.
> > >
> > > In the reclaim path, if CMA is configured, many cma pages are
> > > reclaimed from the lru lists, mainly in kswapd or in direct reclaim
> > > triggered by unmovable or reclaimable allocations. But these cma
> > > pages cannot be used by the original unmovable or reclaimable
> > > allocations, so the reclaim is unnecessary.
> > >
> > > On the same system, if the cma area is configured to be large, then
> > > more unmovable (vmalloc etc.) or reclaimable (slab etc.) allocations
> > > fail, more kswapd rounds are triggered, and more cma pages are
> > > reclaimed.
> > Could you be more specific? Do you have any numbers and an example
> > configuration when this is visible?
> It should be implicit.

Right but the scale of the problem is an important part of _any_ patch
justification.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] kswapd: no need reclaim cma pages triggered by unmovable allocation
  2021-02-09  9:23 ` Michal Hocko
@ 2021-02-10  4:07   ` zhou xianrong
  2021-02-10 13:14     ` Michal Hocko
  0 siblings, 1 reply; 11+ messages in thread
From: zhou xianrong @ 2021-02-10  4:07 UTC (permalink / raw)
  To: Michal Hocko, iamjoonsoo.kim
  Cc: linux-mm, linux-kernel, akpm, rostedt, mingo, vbabka, rientjes,
	willy, pankaj.gupta.linux, bhe, ying.huang, minchan, ruxian.feng,
	kai.cheng, zhao.xu, yunfeng.lan, zhouxianrong, zhou xianrong


On 2021/2/9 5:23 PM, Michal Hocko wrote:
> On Tue 09-02-21 16:23:13, zhou wrote:
>> From: zhou xianrong <xianrong.zhou@transsion.com>
>>
>> For the purpose of better migration, cma pages are allocated after
>> movable allocations fail and are used normally for file pages
>> or anonymous pages.
>>
>> In the reclaim path, if CMA is configured, many cma pages are
>> reclaimed from the lru lists, mainly in kswapd or in direct reclaim
>> triggered by unmovable or reclaimable allocations. But these cma
>> pages cannot be used by the original unmovable or reclaimable
>> allocations, so the reclaim is unnecessary.
>>
>> On the same system, if the cma area is configured to be large, then
>> more unmovable (vmalloc etc.) or reclaimable (slab etc.) allocations
>> fail, more kswapd rounds are triggered, and more cma pages are
>> reclaimed.
> Could you be more specific? Do you have any numbers and an example
> configuration when this is visible?
It should be implicit.
>> This may cause a vicious cycle: when we are under low memory, there
>> are still many cma pages that cannot be allocated, due to unnecessary
>> cma reclaim and cma fallback allocations, so cma pages are not used
>> sufficiently.
>>
>> The modification is straightforward: skip reclaiming cma pages when
>> the reclaim procedure is triggered only by unmovable or reclaimable
>> allocations. This optimization can avoid ~3% of unnecessary cma
>> isolations (cma isolated / total isolated).
> Joonsoo used to have a patch series to drop many of the hacks we have
> for CMA and make it part of the movable zone. That would solve many
> problems, including this one. I am not sure where the work stands now,
> but it would probably be better to revive that rather than adding more
> special casing on top of what we have right now.
Yes. This modification is simple and retains the existing cma logic.
>> Signed-off-by: zhou xianrong <xianrong.zhou@transsion.com>
>> ---
>>   include/linux/mmzone.h        |  6 ++--
>>   include/trace/events/vmscan.h | 20 +++++++----
>>   mm/page_alloc.c               |  5 +--
>>   mm/vmscan.c                   | 63 +++++++++++++++++++++++++++++------
>>   4 files changed, 73 insertions(+), 21 deletions(-)
>>
>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>> index b593316bff3d..7dd38d7372b9 100644
>> --- a/include/linux/mmzone.h
>> +++ b/include/linux/mmzone.h
>> @@ -301,6 +301,8 @@ struct lruvec {
>>   #define ISOLATE_ASYNC_MIGRATE	((__force isolate_mode_t)0x4)
>>   /* Isolate unevictable pages */
>>   #define ISOLATE_UNEVICTABLE	((__force isolate_mode_t)0x8)
>> +/* Isolate none cma pages */
>> +#define ISOLATE_NONCMA		((__force isolate_mode_t)0x10)
>>   
>>   /* LRU Isolation modes. */
>>   typedef unsigned __bitwise isolate_mode_t;
>> @@ -756,7 +758,7 @@ typedef struct pglist_data {
>>   	wait_queue_head_t pfmemalloc_wait;
>>   	struct task_struct *kswapd;	/* Protected by
>>   					   mem_hotplug_begin/end() */
>> -	int kswapd_order;
>> +	int kswapd_order, kswapd_migratetype;
>>   	enum zone_type kswapd_highest_zoneidx;
>>   
>>   	int kswapd_failures;		/* Number of 'reclaimed == 0' runs */
>> @@ -840,7 +842,7 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat)
>>   
>>   void build_all_zonelists(pg_data_t *pgdat);
>>   void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
>> -		   enum zone_type highest_zoneidx);
>> +		   int migratetype, enum zone_type highest_zoneidx);
>>   bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
>>   			 int highest_zoneidx, unsigned int alloc_flags,
>>   			 long free_pages);
>> diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
>> index 2070df64958e..41bbafdfde84 100644
>> --- a/include/trace/events/vmscan.h
>> +++ b/include/trace/events/vmscan.h
>> @@ -51,37 +51,41 @@ TRACE_EVENT(mm_vmscan_kswapd_sleep,
>>   
>>   TRACE_EVENT(mm_vmscan_kswapd_wake,
>>   
>> -	TP_PROTO(int nid, int zid, int order),
>> +	TP_PROTO(int nid, int zid, int order, int mt),
>>   
>> -	TP_ARGS(nid, zid, order),
>> +	TP_ARGS(nid, zid, order, mt),
>>   
>>   	TP_STRUCT__entry(
>>   		__field(	int,	nid	)
>>   		__field(	int,	zid	)
>>   		__field(	int,	order	)
>> +		__field(	int,	mt	)
>>   	),
>>   
>>   	TP_fast_assign(
>>   		__entry->nid	= nid;
>>   		__entry->zid    = zid;
>>   		__entry->order	= order;
>> +		__entry->mt	= mt;
>>   	),
>>   
>> -	TP_printk("nid=%d order=%d",
>> +	TP_printk("nid=%d order=%d migratetype=%d",
>>   		__entry->nid,
>> -		__entry->order)
>> +		__entry->order,
>> +		__entry->mt)
>>   );
>>   
>>   TRACE_EVENT(mm_vmscan_wakeup_kswapd,
>>   
>> -	TP_PROTO(int nid, int zid, int order, gfp_t gfp_flags),
>> +	TP_PROTO(int nid, int zid, int order, int mt, gfp_t gfp_flags),
>>   
>> -	TP_ARGS(nid, zid, order, gfp_flags),
>> +	TP_ARGS(nid, zid, order, mt, gfp_flags),
>>   
>>   	TP_STRUCT__entry(
>>   		__field(	int,	nid		)
>>   		__field(	int,	zid		)
>>   		__field(	int,	order		)
>> +		__field(	int,	mt		)
>>   		__field(	gfp_t,	gfp_flags	)
>>   	),
>>   
>> @@ -89,12 +93,14 @@ TRACE_EVENT(mm_vmscan_wakeup_kswapd,
>>   		__entry->nid		= nid;
>>   		__entry->zid		= zid;
>>   		__entry->order		= order;
>> +		__entry->mt		= mt;
>>   		__entry->gfp_flags	= gfp_flags;
>>   	),
>>   
>> -	TP_printk("nid=%d order=%d gfp_flags=%s",
>> +	TP_printk("nid=%d order=%d migratetype=%d gfp_flags=%s",
>>   		__entry->nid,
>>   		__entry->order,
>> +		__entry->mt,
>>   		show_gfp_flags(__entry->gfp_flags))
>>   );
>>   
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 519a60d5b6f7..45ceb15721b8 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -3517,7 +3517,7 @@ struct page *rmqueue(struct zone *preferred_zone,
>>   	/* Separate test+clear to avoid unnecessary atomics */
>>   	if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
>>   		clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
>> -		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
>> +		wakeup_kswapd(zone, 0, 0, migratetype, zone_idx(zone));
>>   	}
>>   
>>   	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
>> @@ -4426,11 +4426,12 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
>>   	struct zone *zone;
>>   	pg_data_t *last_pgdat = NULL;
>>   	enum zone_type highest_zoneidx = ac->highest_zoneidx;
>> +	int migratetype = ac->migratetype;
>>   
>>   	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
>>   					ac->nodemask) {
>>   		if (last_pgdat != zone->zone_pgdat)
>> -			wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
>> +			wakeup_kswapd(zone, gfp_mask, order, migratetype, highest_zoneidx);
>>   		last_pgdat = zone->zone_pgdat;
>>   	}
>>   }
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index b1b574ad199d..e61ec8747a40 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -99,6 +99,9 @@ struct scan_control {
>>   	/* Can pages be swapped as part of reclaim? */
>>   	unsigned int may_swap:1;
>>   
>> +	/* Can cma pages be reclaimed? */
>> +	unsigned int may_cma:1;
>> +
>>   	/*
>>   	 * Cgroups are not reclaimed below their configured memory.low,
>>   	 * unless we threaten to OOM. If any cgroups are skipped due to
>> @@ -286,6 +289,11 @@ static bool writeback_throttling_sane(struct scan_control *sc)
>>   }
>>   #endif
>>   
>> +static bool movable_reclaim(gfp_t gfp_mask)
>> +{
>> +	return is_migrate_movable(gfp_migratetype(gfp_mask));
>> +}
>> +
>>   /*
>>    * This misses isolated pages which are not accounted for to save counters.
>>    * As the data only determines if reclaim or compaction continues, it is
>> @@ -1499,6 +1507,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
>>   		.gfp_mask = GFP_KERNEL,
>>   		.priority = DEF_PRIORITY,
>>   		.may_unmap = 1,
>> +		.may_cma = 1,
>>   	};
>>   	struct reclaim_stat stat;
>>   	unsigned int nr_reclaimed;
>> @@ -1593,6 +1602,9 @@ int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
>>   	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
>>   		return ret;
>>   
>> +	if ((mode & ISOLATE_NONCMA) && is_migrate_cma(get_pageblock_migratetype(page)))
>> +		return ret;
>> +
>>   	return 0;
>>   }
>>   
>> @@ -1647,7 +1659,10 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
>>   	unsigned long skipped = 0;
>>   	unsigned long scan, total_scan, nr_pages;
>>   	LIST_HEAD(pages_skipped);
>> -	isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
>> +	isolate_mode_t mode;
>> +
>> +	mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
>> +	mode |= (sc->may_cma ? 0 : ISOLATE_NONCMA);
>>   
>>   	total_scan = 0;
>>   	scan = 0;
>> @@ -2125,6 +2140,7 @@ unsigned long reclaim_pages(struct list_head *page_list)
>>   		.may_writepage = 1,
>>   		.may_unmap = 1,
>>   		.may_swap = 1,
>> +		.may_cma = 1,
>>   	};
>>   
>>   	while (!list_empty(page_list)) {
>> @@ -3253,6 +3269,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
>>   		.may_writepage = !laptop_mode,
>>   		.may_unmap = 1,
>>   		.may_swap = 1,
>> +		.may_cma = movable_reclaim(gfp_mask),
>>   	};
>>   
>>   	/*
>> @@ -3298,6 +3315,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
>>   		.may_unmap = 1,
>>   		.reclaim_idx = MAX_NR_ZONES - 1,
>>   		.may_swap = !noswap,
>> +		.may_cma = 1,
>>   	};
>>   
>>   	WARN_ON_ONCE(!current->reclaim_state);
>> @@ -3341,6 +3359,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
>>   		.may_writepage = !laptop_mode,
>>   		.may_unmap = 1,
>>   		.may_swap = may_swap,
>> +		.may_cma = 1,
>>   	};
>>   	/*
>>   	 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
>> @@ -3548,7 +3567,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
>>    * or lower is eligible for reclaim until at least one usable zone is
>>    * balanced.
>>    */
>> -static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
>> +static int balance_pgdat(pg_data_t *pgdat, int order, int migratetype, int highest_zoneidx)
>>   {
>>   	int i;
>>   	unsigned long nr_soft_reclaimed;
>> @@ -3650,6 +3669,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
>>   		 */
>>   		sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
>>   		sc.may_swap = !nr_boost_reclaim;
>> +		sc.may_cma = is_migrate_movable(migratetype);
>>   
>>   		/*
>>   		 * Do some background aging of the anon list, to give
>> @@ -3771,8 +3791,15 @@ static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
>>   	return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
>>   }
>>   
>> +static int kswapd_migratetype(pg_data_t *pgdat, int prev_migratetype)
>> +{
>> +	int curr_migratetype = READ_ONCE(pgdat->kswapd_migratetype);
>> +
>> +	return curr_migratetype == MIGRATE_TYPES ? prev_migratetype : curr_migratetype;
>> +}
>> +
>>   static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
>> -				unsigned int highest_zoneidx)
>> +				int migratetype, unsigned int highest_zoneidx)
>>   {
>>   	long remaining = 0;
>>   	DEFINE_WAIT(wait);
>> @@ -3807,8 +3834,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
>>   		remaining = schedule_timeout(HZ/10);
>>   
>>   		/*
>> -		 * If woken prematurely then reset kswapd_highest_zoneidx and
>> -		 * order. The values will either be from a wakeup request or
>> +		 * If woken prematurely then reset kswapd_highest_zoneidx, order
>> +		 * and migratetype. The values will either be from a wakeup request or
>>   		 * the previous request that slept prematurely.
>>   		 */
>>   		if (remaining) {
>> @@ -3818,6 +3845,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
>>   
>>   			if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
>>   				WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
>> +
>> +			if (!is_migrate_movable(READ_ONCE(pgdat->kswapd_migratetype)))
>> +				WRITE_ONCE(pgdat->kswapd_migratetype,
>> +						kswapd_migratetype(pgdat, migratetype));
>>   		}
>>   
>>   		finish_wait(&pgdat->kswapd_wait, &wait);
>> @@ -3870,6 +3901,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
>>    */
>>   static int kswapd(void *p)
>>   {
>> +	int migratetype = 0;
>>   	unsigned int alloc_order, reclaim_order;
>>   	unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
>>   	pg_data_t *pgdat = (pg_data_t*)p;
>> @@ -3895,23 +3927,27 @@ static int kswapd(void *p)
>>   	set_freezable();
>>   
>>   	WRITE_ONCE(pgdat->kswapd_order, 0);
>> +	WRITE_ONCE(pgdat->kswapd_migratetype, MIGRATE_TYPES);
>>   	WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
>>   	for ( ; ; ) {
>>   		bool ret;
>>   
>>   		alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
>> +		migratetype = kswapd_migratetype(pgdat, migratetype);
>>   		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
>>   							highest_zoneidx);
>>   
>>   kswapd_try_sleep:
>>   		kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
>> -					highest_zoneidx);
>> +					migratetype, highest_zoneidx);
>>   
>>   		/* Read the new order and highest_zoneidx */
>>   		alloc_order = READ_ONCE(pgdat->kswapd_order);
>> +		migratetype = kswapd_migratetype(pgdat, migratetype);
>>   		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
>>   							highest_zoneidx);
>>   		WRITE_ONCE(pgdat->kswapd_order, 0);
>> +		WRITE_ONCE(pgdat->kswapd_migratetype, MIGRATE_TYPES);
>>   		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
>>   
>>   		ret = try_to_freeze();
>> @@ -3934,8 +3970,8 @@ static int kswapd(void *p)
>>   		 * request (alloc_order).
>>   		 */
>>   		trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
>> -						alloc_order);
>> -		reclaim_order = balance_pgdat(pgdat, alloc_order,
>> +						alloc_order, migratetype);
>> +		reclaim_order = balance_pgdat(pgdat, alloc_order, migratetype,
>>   						highest_zoneidx);
>>   		if (reclaim_order < alloc_order)
>>   			goto kswapd_try_sleep;
>> @@ -3954,10 +3990,11 @@ static int kswapd(void *p)
>>    * needed.
>>    */
>>   void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
>> -		   enum zone_type highest_zoneidx)
>> +		   int migratetype, enum zone_type highest_zoneidx)
>>   {
>>   	pg_data_t *pgdat;
>>   	enum zone_type curr_idx;
>> +	int curr_migratetype;
>>   
>>   	if (!managed_zone(zone))
>>   		return;
>> @@ -3967,6 +4004,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
>>   
>>   	pgdat = zone->zone_pgdat;
>>   	curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
>> +	curr_migratetype = READ_ONCE(pgdat->kswapd_migratetype);
>>   
>>   	if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
>>   		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);
>> @@ -3974,6 +4012,9 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
>>   	if (READ_ONCE(pgdat->kswapd_order) < order)
>>   		WRITE_ONCE(pgdat->kswapd_order, order);
>>   
>> +	if (curr_migratetype == MIGRATE_TYPES || is_migrate_movable(migratetype))
>> +		WRITE_ONCE(pgdat->kswapd_migratetype, migratetype);
>> +
>>   	if (!waitqueue_active(&pgdat->kswapd_wait))
>>   		return;
>>   
>> @@ -3994,7 +4035,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
>>   	}
>>   
>>   	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
>> -				      gfp_flags);
>> +				      migratetype, gfp_flags);
>>   	wake_up_interruptible(&pgdat->kswapd_wait);
>>   }
>>   
>> @@ -4017,6 +4058,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
>>   		.may_writepage = 1,
>>   		.may_unmap = 1,
>>   		.may_swap = 1,
>> +		.may_cma = 1,
>>   		.hibernation_mode = 1,
>>   	};
>>   	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
>> @@ -4176,6 +4218,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
>>   		.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
>>   		.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
>>   		.may_swap = 1,
>> +		.may_cma = movable_reclaim(gfp_mask),
>>   		.reclaim_idx = gfp_zone(gfp_mask),
>>   	};
>>   
>> -- 
>> 2.25.1


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] kswapd: no need reclaim cma pages triggered by unmovable allocation
  2021-02-09  8:23 zhou
@ 2021-02-09  9:23 ` Michal Hocko
  2021-02-10  4:07   ` zhou xianrong
  0 siblings, 1 reply; 11+ messages in thread
From: Michal Hocko @ 2021-02-09  9:23 UTC (permalink / raw)
  To: zhou, iamjoonsoo.kim
  Cc: linux-mm, linux-kernel, akpm, rostedt, mingo, vbabka, rientjes,
	willy, pankaj.gupta.linux, bhe, ying.huang, minchan, ruxian.feng,
	kai.cheng, zhao.xu, yunfeng.lan, zhouxianrong, zhou xianrong

On Tue 09-02-21 16:23:13, zhou wrote:
> From: zhou xianrong <xianrong.zhou@transsion.com>
> 
> For the purpose of better migration, cma pages are allocated after
> movable allocations fail and are used normally for file pages
> or anonymous pages.
>
> In the reclaim path, if CMA is configured, many cma pages are
> reclaimed from the lru lists, mainly in kswapd or in direct reclaim
> triggered by unmovable or reclaimable allocations. But these cma
> pages cannot be used by the original unmovable or reclaimable
> allocations, so the reclaim is unnecessary.
>
> On the same system, if the cma area is configured to be large, then
> more unmovable (vmalloc etc.) or reclaimable (slab etc.) allocations
> fail, more kswapd rounds are triggered, and more cma pages are
> reclaimed.

Could you be more specific? Do you have any numbers and an example
configuration when this is visible?

> This may cause a vicious cycle: when we are under low memory, there
> are still many cma pages that cannot be allocated, due to unnecessary
> cma reclaim and cma fallback allocations, so cma pages are not used
> sufficiently.
>
> The modification is straightforward: skip reclaiming cma pages when
> the reclaim procedure is triggered only by unmovable or reclaimable
> allocations. This optimization can avoid ~3% of unnecessary cma
> isolations (cma isolated / total isolated).

Joonsoo used to have a patch series to drop many of the hacks we have
for CMA and make it part of the movable zone. That would solve many
problems, including this one. I am not sure where the work stands now,
but it would probably be better to revive that rather than adding more
special casing on top of what we have right now.

> Signed-off-by: zhou xianrong <xianrong.zhou@transsion.com>
> ---
>  include/linux/mmzone.h        |  6 ++--
>  include/trace/events/vmscan.h | 20 +++++++----
>  mm/page_alloc.c               |  5 +--
>  mm/vmscan.c                   | 63 +++++++++++++++++++++++++++++------
>  4 files changed, 73 insertions(+), 21 deletions(-)
> 
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index b593316bff3d..7dd38d7372b9 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -301,6 +301,8 @@ struct lruvec {
>  #define ISOLATE_ASYNC_MIGRATE	((__force isolate_mode_t)0x4)
>  /* Isolate unevictable pages */
>  #define ISOLATE_UNEVICTABLE	((__force isolate_mode_t)0x8)
> +/* Isolate none cma pages */
> +#define ISOLATE_NONCMA		((__force isolate_mode_t)0x10)
>  
>  /* LRU Isolation modes. */
>  typedef unsigned __bitwise isolate_mode_t;
> @@ -756,7 +758,7 @@ typedef struct pglist_data {
>  	wait_queue_head_t pfmemalloc_wait;
>  	struct task_struct *kswapd;	/* Protected by
>  					   mem_hotplug_begin/end() */
> -	int kswapd_order;
> +	int kswapd_order, kswapd_migratetype;
>  	enum zone_type kswapd_highest_zoneidx;
>  
>  	int kswapd_failures;		/* Number of 'reclaimed == 0' runs */
> @@ -840,7 +842,7 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat)
>  
>  void build_all_zonelists(pg_data_t *pgdat);
>  void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
> -		   enum zone_type highest_zoneidx);
> +		   int migratetype, enum zone_type highest_zoneidx);
>  bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
>  			 int highest_zoneidx, unsigned int alloc_flags,
>  			 long free_pages);
> diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
> index 2070df64958e..41bbafdfde84 100644
> --- a/include/trace/events/vmscan.h
> +++ b/include/trace/events/vmscan.h
> @@ -51,37 +51,41 @@ TRACE_EVENT(mm_vmscan_kswapd_sleep,
>  
>  TRACE_EVENT(mm_vmscan_kswapd_wake,
>  
> -	TP_PROTO(int nid, int zid, int order),
> +	TP_PROTO(int nid, int zid, int order, int mt),
>  
> -	TP_ARGS(nid, zid, order),
> +	TP_ARGS(nid, zid, order, mt),
>  
>  	TP_STRUCT__entry(
>  		__field(	int,	nid	)
>  		__field(	int,	zid	)
>  		__field(	int,	order	)
> +		__field(	int,	mt	)
>  	),
>  
>  	TP_fast_assign(
>  		__entry->nid	= nid;
>  		__entry->zid    = zid;
>  		__entry->order	= order;
> +		__entry->mt	= mt;
>  	),
>  
> -	TP_printk("nid=%d order=%d",
> +	TP_printk("nid=%d order=%d migratetype=%d",
>  		__entry->nid,
> -		__entry->order)
> +		__entry->order,
> +		__entry->mt)
>  );
>  
>  TRACE_EVENT(mm_vmscan_wakeup_kswapd,
>  
> -	TP_PROTO(int nid, int zid, int order, gfp_t gfp_flags),
> +	TP_PROTO(int nid, int zid, int order, int mt, gfp_t gfp_flags),
>  
> -	TP_ARGS(nid, zid, order, gfp_flags),
> +	TP_ARGS(nid, zid, order, mt, gfp_flags),
>  
>  	TP_STRUCT__entry(
>  		__field(	int,	nid		)
>  		__field(	int,	zid		)
>  		__field(	int,	order		)
> +		__field(	int,	mt		)
>  		__field(	gfp_t,	gfp_flags	)
>  	),
>  
> @@ -89,12 +93,14 @@ TRACE_EVENT(mm_vmscan_wakeup_kswapd,
>  		__entry->nid		= nid;
>  		__entry->zid		= zid;
>  		__entry->order		= order;
> +		__entry->mt		= mt;
>  		__entry->gfp_flags	= gfp_flags;
>  	),
>  
> -	TP_printk("nid=%d order=%d gfp_flags=%s",
> +	TP_printk("nid=%d order=%d migratetype=%d gfp_flags=%s",
>  		__entry->nid,
>  		__entry->order,
> +		__entry->mt,
>  		show_gfp_flags(__entry->gfp_flags))
>  );
>  
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 519a60d5b6f7..45ceb15721b8 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3517,7 +3517,7 @@ struct page *rmqueue(struct zone *preferred_zone,
>  	/* Separate test+clear to avoid unnecessary atomics */
>  	if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
>  		clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
> -		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
> +		wakeup_kswapd(zone, 0, 0, migratetype, zone_idx(zone));
>  	}
>  
>  	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
> @@ -4426,11 +4426,12 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
>  	struct zone *zone;
>  	pg_data_t *last_pgdat = NULL;
>  	enum zone_type highest_zoneidx = ac->highest_zoneidx;
> +	int migratetype = ac->migratetype;
>  
>  	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
>  					ac->nodemask) {
>  		if (last_pgdat != zone->zone_pgdat)
> -			wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
> +			wakeup_kswapd(zone, gfp_mask, order, migratetype, highest_zoneidx);
>  		last_pgdat = zone->zone_pgdat;
>  	}
>  }
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index b1b574ad199d..e61ec8747a40 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -99,6 +99,9 @@ struct scan_control {
>  	/* Can pages be swapped as part of reclaim? */
>  	unsigned int may_swap:1;
>  
> +	/* Can cma pages be reclaimed? */
> +	unsigned int may_cma:1;
> +
>  	/*
>  	 * Cgroups are not reclaimed below their configured memory.low,
>  	 * unless we threaten to OOM. If any cgroups are skipped due to
> @@ -286,6 +289,11 @@ static bool writeback_throttling_sane(struct scan_control *sc)
>  }
>  #endif
>  
> +static bool movable_reclaim(gfp_t gfp_mask)
> +{
> +	return is_migrate_movable(gfp_migratetype(gfp_mask));
> +}
> +
>  /*
>   * This misses isolated pages which are not accounted for to save counters.
>   * As the data only determines if reclaim or compaction continues, it is
> @@ -1499,6 +1507,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
>  		.gfp_mask = GFP_KERNEL,
>  		.priority = DEF_PRIORITY,
>  		.may_unmap = 1,
> +		.may_cma = 1,
>  	};
>  	struct reclaim_stat stat;
>  	unsigned int nr_reclaimed;
> @@ -1593,6 +1602,9 @@ int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
>  	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
>  		return ret;
>  
> +	if ((mode & ISOLATE_NONCMA) && is_migrate_cma(get_pageblock_migratetype(page)))
> +		return ret;
> +
>  	return 0;
>  }
>  
> @@ -1647,7 +1659,10 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
>  	unsigned long skipped = 0;
>  	unsigned long scan, total_scan, nr_pages;
>  	LIST_HEAD(pages_skipped);
> -	isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
> +	isolate_mode_t mode;
> +
> +	mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
> +	mode |= (sc->may_cma ? 0 : ISOLATE_NONCMA);
>  
>  	total_scan = 0;
>  	scan = 0;
> @@ -2125,6 +2140,7 @@ unsigned long reclaim_pages(struct list_head *page_list)
>  		.may_writepage = 1,
>  		.may_unmap = 1,
>  		.may_swap = 1,
> +		.may_cma = 1,
>  	};
>  
>  	while (!list_empty(page_list)) {
> @@ -3253,6 +3269,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
>  		.may_writepage = !laptop_mode,
>  		.may_unmap = 1,
>  		.may_swap = 1,
> +		.may_cma = movable_reclaim(gfp_mask),
>  	};
>  
>  	/*
> @@ -3298,6 +3315,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
>  		.may_unmap = 1,
>  		.reclaim_idx = MAX_NR_ZONES - 1,
>  		.may_swap = !noswap,
> +		.may_cma = 1,
>  	};
>  
>  	WARN_ON_ONCE(!current->reclaim_state);
> @@ -3341,6 +3359,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
>  		.may_writepage = !laptop_mode,
>  		.may_unmap = 1,
>  		.may_swap = may_swap,
> +		.may_cma = 1,
>  	};
>  	/*
>  	 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
> @@ -3548,7 +3567,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
>   * or lower is eligible for reclaim until at least one usable zone is
>   * balanced.
>   */
> -static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
> +static int balance_pgdat(pg_data_t *pgdat, int order, int migratetype, int highest_zoneidx)
>  {
>  	int i;
>  	unsigned long nr_soft_reclaimed;
> @@ -3650,6 +3669,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
>  		 */
>  		sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
>  		sc.may_swap = !nr_boost_reclaim;
> +		sc.may_cma = is_migrate_movable(migratetype);
>  
>  		/*
>  		 * Do some background aging of the anon list, to give
> @@ -3771,8 +3791,15 @@ static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
>  	return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
>  }
>  
> +static int kswapd_migratetype(pg_data_t *pgdat, int prev_migratetype)
> +{
> +	int curr_migratetype = READ_ONCE(pgdat->kswapd_migratetype);
> +
> +	return curr_migratetype == MIGRATE_TYPES ? prev_migratetype : curr_migratetype;
> +}
> +
>  static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
> -				unsigned int highest_zoneidx)
> +				int migratetype, unsigned int highest_zoneidx)
>  {
>  	long remaining = 0;
>  	DEFINE_WAIT(wait);
> @@ -3807,8 +3834,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
>  		remaining = schedule_timeout(HZ/10);
>  
>  		/*
> -		 * If woken prematurely then reset kswapd_highest_zoneidx and
> -		 * order. The values will either be from a wakeup request or
> +		 * If woken prematurely then reset kswapd_highest_zoneidx, order
> +		 * and migratetype. The values will either be from a wakeup request or
>  		 * the previous request that slept prematurely.
>  		 */
>  		if (remaining) {
> @@ -3818,6 +3845,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
>  
>  			if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
>  				WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
> +
> +			if (!is_migrate_movable(READ_ONCE(pgdat->kswapd_migratetype)))
> +				WRITE_ONCE(pgdat->kswapd_migratetype,
> +						kswapd_migratetype(pgdat, migratetype));
>  		}
>  
>  		finish_wait(&pgdat->kswapd_wait, &wait);
> @@ -3870,6 +3901,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
>   */
>  static int kswapd(void *p)
>  {
> +	int migratetype = 0;
>  	unsigned int alloc_order, reclaim_order;
>  	unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
>  	pg_data_t *pgdat = (pg_data_t*)p;
> @@ -3895,23 +3927,27 @@ static int kswapd(void *p)
>  	set_freezable();
>  
>  	WRITE_ONCE(pgdat->kswapd_order, 0);
> +	WRITE_ONCE(pgdat->kswapd_migratetype, MIGRATE_TYPES);
>  	WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
>  	for ( ; ; ) {
>  		bool ret;
>  
>  		alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
> +		migratetype = kswapd_migratetype(pgdat, migratetype);
>  		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
>  							highest_zoneidx);
>  
>  kswapd_try_sleep:
>  		kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
> -					highest_zoneidx);
> +					migratetype, highest_zoneidx);
>  
>  		/* Read the new order and highest_zoneidx */
>  		alloc_order = READ_ONCE(pgdat->kswapd_order);
> +		migratetype = kswapd_migratetype(pgdat, migratetype);
>  		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
>  							highest_zoneidx);
>  		WRITE_ONCE(pgdat->kswapd_order, 0);
> +		WRITE_ONCE(pgdat->kswapd_migratetype, MIGRATE_TYPES);
>  		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
>  
>  		ret = try_to_freeze();
> @@ -3934,8 +3970,8 @@ static int kswapd(void *p)
>  		 * request (alloc_order).
>  		 */
>  		trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
> -						alloc_order);
> -		reclaim_order = balance_pgdat(pgdat, alloc_order,
> +						alloc_order, migratetype);
> +		reclaim_order = balance_pgdat(pgdat, alloc_order, migratetype,
>  						highest_zoneidx);
>  		if (reclaim_order < alloc_order)
>  			goto kswapd_try_sleep;
> @@ -3954,10 +3990,11 @@ static int kswapd(void *p)
>   * needed.
>   */
>  void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
> -		   enum zone_type highest_zoneidx)
> +		   int migratetype, enum zone_type highest_zoneidx)
>  {
>  	pg_data_t *pgdat;
>  	enum zone_type curr_idx;
> +	int curr_migratetype;
>  
>  	if (!managed_zone(zone))
>  		return;
> @@ -3967,6 +4004,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
>  
>  	pgdat = zone->zone_pgdat;
>  	curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
> +	curr_migratetype = READ_ONCE(pgdat->kswapd_migratetype);
>  
>  	if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
>  		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);
> @@ -3974,6 +4012,9 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
>  	if (READ_ONCE(pgdat->kswapd_order) < order)
>  		WRITE_ONCE(pgdat->kswapd_order, order);
>  
> +	if (curr_migratetype == MIGRATE_TYPES || is_migrate_movable(migratetype))
> +		WRITE_ONCE(pgdat->kswapd_migratetype, migratetype);
> +
>  	if (!waitqueue_active(&pgdat->kswapd_wait))
>  		return;
>  
> @@ -3994,7 +4035,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
>  	}
>  
>  	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
> -				      gfp_flags);
> +				      migratetype, gfp_flags);
>  	wake_up_interruptible(&pgdat->kswapd_wait);
>  }
>  
> @@ -4017,6 +4058,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
>  		.may_writepage = 1,
>  		.may_unmap = 1,
>  		.may_swap = 1,
> +		.may_cma = 1,
>  		.hibernation_mode = 1,
>  	};
>  	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
> @@ -4176,6 +4218,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
>  		.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
>  		.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
>  		.may_swap = 1,
> +		.may_cma = movable_reclaim(gfp_mask),
>  		.reclaim_idx = gfp_zone(gfp_mask),
>  	};
>  
> -- 
> 2.25.1

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH] kswapd: no need reclaim cma pages triggered by unmovable allocation
@ 2021-02-09  8:23 zhou
  2021-02-09  9:23 ` Michal Hocko
  0 siblings, 1 reply; 11+ messages in thread
From: zhou @ 2021-02-09  8:23 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, akpm, mhocko, rostedt, mingo, vbabka, rientjes,
	willy, pankaj.gupta.linux, bhe, ying.huang, iamjoonsoo.kim,
	minchan, ruxian.feng, kai.cheng, zhao.xu, yunfeng.lan,
	zhouxianrong, zhou xianrong

From: zhou xianrong <xianrong.zhou@transsion.com>

For the purpose of better migration, cma pages are allocated after
movable allocations fail and are used normally for file pages
or anonymous pages.

In the reclaim path, if CMA is configured, many cma pages are
reclaimed from the lru lists, mainly in kswapd or in direct reclaim
triggered by unmovable or reclaimable allocations. But these cma
pages cannot be used by the original unmovable or reclaimable
allocations, so the reclaim is unnecessary.

On the same system, if the cma area is configured to be large, then
more unmovable (vmalloc etc.) or reclaimable (slab etc.) allocations
fail, more kswapd rounds are triggered, and more cma pages are
reclaimed.

This may cause a vicious cycle: when we are under low memory, there
are still many cma pages that cannot be allocated, due to unnecessary
cma reclaim and cma fallback allocations, so cma pages are not used
sufficiently.

The modification is straightforward: skip reclaiming cma pages when
the reclaim procedure is triggered only by unmovable or reclaimable
allocations. This optimization can avoid ~3% of unnecessary cma
isolations (cma isolated / total isolated).

Signed-off-by: zhou xianrong <xianrong.zhou@transsion.com>
---
 include/linux/mmzone.h        |  6 ++--
 include/trace/events/vmscan.h | 20 +++++++----
 mm/page_alloc.c               |  5 +--
 mm/vmscan.c                   | 63 +++++++++++++++++++++++++++++------
 4 files changed, 73 insertions(+), 21 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b593316bff3d..7dd38d7372b9 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -301,6 +301,8 @@ struct lruvec {
 #define ISOLATE_ASYNC_MIGRATE	((__force isolate_mode_t)0x4)
 /* Isolate unevictable pages */
 #define ISOLATE_UNEVICTABLE	((__force isolate_mode_t)0x8)
+/* Isolate none cma pages */
+#define ISOLATE_NONCMA		((__force isolate_mode_t)0x10)
 
 /* LRU Isolation modes. */
 typedef unsigned __bitwise isolate_mode_t;
@@ -756,7 +758,7 @@ typedef struct pglist_data {
 	wait_queue_head_t pfmemalloc_wait;
 	struct task_struct *kswapd;	/* Protected by
 					   mem_hotplug_begin/end() */
-	int kswapd_order;
+	int kswapd_order, kswapd_migratetype;
 	enum zone_type kswapd_highest_zoneidx;
 
 	int kswapd_failures;		/* Number of 'reclaimed == 0' runs */
@@ -840,7 +842,7 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat)
 
 void build_all_zonelists(pg_data_t *pgdat);
 void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
-		   enum zone_type highest_zoneidx);
+		   int migratetype, enum zone_type highest_zoneidx);
 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
 			 int highest_zoneidx, unsigned int alloc_flags,
 			 long free_pages);
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 2070df64958e..41bbafdfde84 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -51,37 +51,41 @@ TRACE_EVENT(mm_vmscan_kswapd_sleep,
 
 TRACE_EVENT(mm_vmscan_kswapd_wake,
 
-	TP_PROTO(int nid, int zid, int order),
+	TP_PROTO(int nid, int zid, int order, int mt),
 
-	TP_ARGS(nid, zid, order),
+	TP_ARGS(nid, zid, order, mt),
 
 	TP_STRUCT__entry(
 		__field(	int,	nid	)
 		__field(	int,	zid	)
 		__field(	int,	order	)
+		__field(	int,	mt	)
 	),
 
 	TP_fast_assign(
 		__entry->nid	= nid;
 		__entry->zid    = zid;
 		__entry->order	= order;
+		__entry->mt	= mt;
 	),
 
-	TP_printk("nid=%d order=%d",
+	TP_printk("nid=%d order=%d migratetype=%d",
 		__entry->nid,
-		__entry->order)
+		__entry->order,
+		__entry->mt)
 );
 
 TRACE_EVENT(mm_vmscan_wakeup_kswapd,
 
-	TP_PROTO(int nid, int zid, int order, gfp_t gfp_flags),
+	TP_PROTO(int nid, int zid, int order, int mt, gfp_t gfp_flags),
 
-	TP_ARGS(nid, zid, order, gfp_flags),
+	TP_ARGS(nid, zid, order, mt, gfp_flags),
 
 	TP_STRUCT__entry(
 		__field(	int,	nid		)
 		__field(	int,	zid		)
 		__field(	int,	order		)
+		__field(	int,	mt		)
 		__field(	gfp_t,	gfp_flags	)
 	),
 
@@ -89,12 +93,14 @@ TRACE_EVENT(mm_vmscan_wakeup_kswapd,
 		__entry->nid		= nid;
 		__entry->zid		= zid;
 		__entry->order		= order;
+		__entry->mt		= mt;
 		__entry->gfp_flags	= gfp_flags;
 	),
 
-	TP_printk("nid=%d order=%d gfp_flags=%s",
+	TP_printk("nid=%d order=%d migratetype=%d gfp_flags=%s",
 		__entry->nid,
 		__entry->order,
+		__entry->mt,
 		show_gfp_flags(__entry->gfp_flags))
 );
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 519a60d5b6f7..45ceb15721b8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3517,7 +3517,7 @@ struct page *rmqueue(struct zone *preferred_zone,
 	/* Separate test+clear to avoid unnecessary atomics */
 	if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
 		clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
-		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
+		wakeup_kswapd(zone, 0, 0, migratetype, zone_idx(zone));
 	}
 
 	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
@@ -4426,11 +4426,12 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
 	struct zone *zone;
 	pg_data_t *last_pgdat = NULL;
 	enum zone_type highest_zoneidx = ac->highest_zoneidx;
+	int migratetype = ac->migratetype;
 
 	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
 					ac->nodemask) {
 		if (last_pgdat != zone->zone_pgdat)
-			wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
+			wakeup_kswapd(zone, gfp_mask, order, migratetype, highest_zoneidx);
 		last_pgdat = zone->zone_pgdat;
 	}
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b1b574ad199d..e61ec8747a40 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -99,6 +99,9 @@ struct scan_control {
 	/* Can pages be swapped as part of reclaim? */
 	unsigned int may_swap:1;
 
+	/* Can cma pages be reclaimed? */
+	unsigned int may_cma:1;
+
 	/*
 	 * Cgroups are not reclaimed below their configured memory.low,
 	 * unless we threaten to OOM. If any cgroups are skipped due to
@@ -286,6 +289,11 @@ static bool writeback_throttling_sane(struct scan_control *sc)
 }
 #endif
 
+static bool movable_reclaim(gfp_t gfp_mask)
+{
+	return is_migrate_movable(gfp_migratetype(gfp_mask));
+}
+
 /*
  * This misses isolated pages which are not accounted for to save counters.
  * As the data only determines if reclaim or compaction continues, it is
@@ -1499,6 +1507,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 		.gfp_mask = GFP_KERNEL,
 		.priority = DEF_PRIORITY,
 		.may_unmap = 1,
+		.may_cma = 1,
 	};
 	struct reclaim_stat stat;
 	unsigned int nr_reclaimed;
@@ -1593,6 +1602,9 @@ int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
 	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
 		return ret;
 
+	if ((mode & ISOLATE_NONCMA) && is_migrate_cma(get_pageblock_migratetype(page)))
+		return ret;
+
 	return 0;
 }
 
@@ -1647,7 +1659,10 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 	unsigned long skipped = 0;
 	unsigned long scan, total_scan, nr_pages;
 	LIST_HEAD(pages_skipped);
-	isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
+	isolate_mode_t mode;
+
+	mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
+	mode |= (sc->may_cma ? 0 : ISOLATE_NONCMA);
 
 	total_scan = 0;
 	scan = 0;
@@ -2125,6 +2140,7 @@ unsigned long reclaim_pages(struct list_head *page_list)
 		.may_writepage = 1,
 		.may_unmap = 1,
 		.may_swap = 1,
+		.may_cma = 1,
 	};
 
 	while (!list_empty(page_list)) {
@@ -3253,6 +3269,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = 1,
+		.may_cma = movable_reclaim(gfp_mask),
 	};
 
 	/*
@@ -3298,6 +3315,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 		.may_unmap = 1,
 		.reclaim_idx = MAX_NR_ZONES - 1,
 		.may_swap = !noswap,
+		.may_cma = 1,
 	};
 
 	WARN_ON_ONCE(!current->reclaim_state);
@@ -3341,6 +3359,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = may_swap,
+		.may_cma = 1,
 	};
 	/*
 	 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
@@ -3548,7 +3567,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
  * or lower is eligible for reclaim until at least one usable zone is
  * balanced.
  */
-static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
+static int balance_pgdat(pg_data_t *pgdat, int order, int migratetype, int highest_zoneidx)
 {
 	int i;
 	unsigned long nr_soft_reclaimed;
@@ -3650,6 +3669,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
 		 */
 		sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
 		sc.may_swap = !nr_boost_reclaim;
+		sc.may_cma = is_migrate_movable(migratetype);
 
 		/*
 		 * Do some background aging of the anon list, to give
@@ -3771,8 +3791,15 @@ static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
 	return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
 }
 
+static int kswapd_migratetype(pg_data_t *pgdat, int prev_migratetype)
+{
+	int curr_migratetype = READ_ONCE(pgdat->kswapd_migratetype);
+
+	return curr_migratetype == MIGRATE_TYPES ? prev_migratetype : curr_migratetype;
+}
+
 static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
-				unsigned int highest_zoneidx)
+				int migratetype, unsigned int highest_zoneidx)
 {
 	long remaining = 0;
 	DEFINE_WAIT(wait);
@@ -3807,8 +3834,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
 		remaining = schedule_timeout(HZ/10);
 
 		/*
-		 * If woken prematurely then reset kswapd_highest_zoneidx and
-		 * order. The values will either be from a wakeup request or
+		 * If woken prematurely then reset kswapd_highest_zoneidx, order
+		 * and migratetype. The values will either be from a wakeup request or
 		 * the previous request that slept prematurely.
 		 */
 		if (remaining) {
@@ -3818,6 +3845,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
 
 			if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
 				WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
+
+			if (!is_migrate_movable(READ_ONCE(pgdat->kswapd_migratetype)))
+				WRITE_ONCE(pgdat->kswapd_migratetype,
+						kswapd_migratetype(pgdat, migratetype));
 		}
 
 		finish_wait(&pgdat->kswapd_wait, &wait);
@@ -3870,6 +3901,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
  */
 static int kswapd(void *p)
 {
+	int migratetype = 0;
 	unsigned int alloc_order, reclaim_order;
 	unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
 	pg_data_t *pgdat = (pg_data_t*)p;
@@ -3895,23 +3927,27 @@ static int kswapd(void *p)
 	set_freezable();
 
 	WRITE_ONCE(pgdat->kswapd_order, 0);
+	WRITE_ONCE(pgdat->kswapd_migratetype, MIGRATE_TYPES);
 	WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
 	for ( ; ; ) {
 		bool ret;
 
 		alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
+		migratetype = kswapd_migratetype(pgdat, migratetype);
 		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
 							highest_zoneidx);
 
 kswapd_try_sleep:
 		kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
-					highest_zoneidx);
+					migratetype, highest_zoneidx);
 
 		/* Read the new order and highest_zoneidx */
 		alloc_order = READ_ONCE(pgdat->kswapd_order);
+		migratetype = kswapd_migratetype(pgdat, migratetype);
 		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
 							highest_zoneidx);
 		WRITE_ONCE(pgdat->kswapd_order, 0);
+		WRITE_ONCE(pgdat->kswapd_migratetype, MIGRATE_TYPES);
 		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
 
 		ret = try_to_freeze();
@@ -3934,8 +3970,8 @@ static int kswapd(void *p)
 		 * request (alloc_order).
 		 */
 		trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
-						alloc_order);
-		reclaim_order = balance_pgdat(pgdat, alloc_order,
+						alloc_order, migratetype);
+		reclaim_order = balance_pgdat(pgdat, alloc_order, migratetype,
 						highest_zoneidx);
 		if (reclaim_order < alloc_order)
 			goto kswapd_try_sleep;
@@ -3954,10 +3990,11 @@ static int kswapd(void *p)
  * needed.
  */
 void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
-		   enum zone_type highest_zoneidx)
+		   int migratetype, enum zone_type highest_zoneidx)
 {
 	pg_data_t *pgdat;
 	enum zone_type curr_idx;
+	int curr_migratetype;
 
 	if (!managed_zone(zone))
 		return;
@@ -3967,6 +4004,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
 
 	pgdat = zone->zone_pgdat;
 	curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
+	curr_migratetype = READ_ONCE(pgdat->kswapd_migratetype);
 
 	if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
 		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);
@@ -3974,6 +4012,9 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
 	if (READ_ONCE(pgdat->kswapd_order) < order)
 		WRITE_ONCE(pgdat->kswapd_order, order);
 
+	if (curr_migratetype == MIGRATE_TYPES || is_migrate_movable(migratetype))
+		WRITE_ONCE(pgdat->kswapd_migratetype, migratetype);
+
 	if (!waitqueue_active(&pgdat->kswapd_wait))
 		return;
 
@@ -3994,7 +4035,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
 	}
 
 	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
-				      gfp_flags);
+				      migratetype, gfp_flags);
 	wake_up_interruptible(&pgdat->kswapd_wait);
 }
 
@@ -4017,6 +4058,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 		.may_writepage = 1,
 		.may_unmap = 1,
 		.may_swap = 1,
+		.may_cma = 1,
 		.hibernation_mode = 1,
 	};
 	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
@@ -4176,6 +4218,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
 		.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
 		.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
 		.may_swap = 1,
+		.may_cma = movable_reclaim(gfp_mask),
 		.reclaim_idx = gfp_zone(gfp_mask),
 	};
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2021-03-15 16:13 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-13  8:31 [PATCH] kswapd: no need reclaim cma pages triggered by unmovable allocation zhou
2021-03-13 10:50 ` kernel test robot
2021-03-13 13:37 ` zhou xianrong
2021-03-15 15:46 ` David Hildenbrand
2021-03-15 16:09   ` Michal Hocko
2021-03-15 16:12     ` David Hildenbrand
  -- strict thread matches above, loose matches on Subject: below --
2021-02-09  8:23 zhou
2021-02-09  9:23 ` Michal Hocko
2021-02-10  4:07   ` zhou xianrong
2021-02-10 13:14     ` Michal Hocko
2021-02-11 11:01       ` zhou xianrong
