[PATCH 4/5] Reclaim to satisfy WMARK_DEMOTE on toptier nodes

From: Hasan Al Maruf <hasan3050@gmail.com>
To: dave.hansen@linux.intel.com, ying.huang@intel.com,
	yang.shi@linux.alibaba.com, mgorman@techsingularity.net,
	riel@surriel.com, hannes@cmpxchg.org
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org
Subject: [PATCH 4/5] Reclaim to satisfy WMARK_DEMOTE on toptier nodes
Date: Wed, 24 Nov 2021 13:58:29 -0500	[thread overview]
Message-ID: <cd42adda728e211be3f1d8719221dd02088b76bf.1637778851.git.hasanalmaruf@fb.com> (raw)
In-Reply-To: <cover.1637778851.git.hasanalmaruf@fb.com>

When kswapd is woken up on a toptier node in tiered-memory NUMA
balancing mode, it reclaims pages until the toptier node is balanced
and the number of free pages on the toptier node satisfies WMARK_DEMOTE.

When THP (Transparent Huge Page) is enabled, demotion/promotion between
the memory nodes may sometimes pause for several hundred seconds,
because the pages in the toptier node can become so hot that kswapd
fails to reclaim any page.  Eventually, the kswapd failure count
(pgdat->kswapd_failures) reaches its maximum value and kswapd will not
be woken up again until a direct reclaim succeeds. For the general use
case this isn't a big problem, as memory users will eventually do
direct reclaim, which either succeeds or triggers OOM, and that fixes
the issue. But in a memory tiering system, demotion and promotion avoid
creating too much memory pressure on the fast memory node, so direct
reclaim will not be triggered to resolve the issue. To resolve this,
when promotion is enabled and the failure count on a toptier node has
reached its maximum, kswapd sleeps with a 10-second timeout instead of
indefinitely, so that it is woken up periodically to try to free some
pages and recover from the kswapd failures.

Signed-off-by: Hasan Al Maruf <hasanalmaruf@fb.com>
---
 mm/vmscan.c | 42 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 36 insertions(+), 6 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index c39b217effa9..1e87221f2b58 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2386,8 +2386,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	unsigned long ap, fp;
 	enum lru_list lru;
 
-	/* If we have no swap space, do not bother scanning anon pages. */
-	if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
+	/*
+	 * If we have no swap space, do not bother scanning anon pages.
+	 * However, anon pages on a toptier node can be demoted via reclaim
+	 * when NUMA promotion is enabled, so skip this check in that mode;
+	 * otherwise a lack of swap space would prevent demotion.
+	 */
+	if (!numa_promotion_tiered_enabled &&
+		(!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc))) {
 		scan_balance = SCAN_FILE;
 		goto out;
 	}
@@ -2916,7 +2922,10 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 			if (!managed_zone(zone))
 				continue;
 
-			total_high_wmark += high_wmark_pages(zone);
+			if (numa_promotion_tiered_enabled && node_is_toptier(pgdat->node_id))
+				total_high_wmark += demote_wmark_pages(zone);
+			else
+				total_high_wmark += high_wmark_pages(zone);
 		}
 
 		/*
@@ -3574,6 +3583,9 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
 	unsigned long mark = -1;
 	struct zone *zone;
 
+	if (numa_promotion_tiered_enabled && node_is_toptier(pgdat->node_id) &&
+			highest_zoneidx >= ZONE_NORMAL)
+		return pgdat_toptier_balanced(pgdat, 0, highest_zoneidx);
 	/*
 	 * Check watermarks bottom-up as lower zones are more likely to
 	 * meet watermarks.
@@ -3692,7 +3704,10 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
 		if (!managed_zone(zone))
 			continue;
 
-		sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
+		if (numa_promotion_tiered_enabled && node_is_toptier(pgdat->node_id))
+			sc->nr_to_reclaim += max(demote_wmark_pages(zone), SWAP_CLUSTER_MAX);
+		else
+			sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
 	}
 
 	/*
@@ -4021,8 +4036,23 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
 		 */
 		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
 
-		if (!kthread_should_stop())
-			schedule();
+		if (!kthread_should_stop()) {
+			/*
+			 * In NUMA promotion mode, try harder to recover from
+			 * kswapd failures, because direct reclaim may not be
+			 * triggered.
+			 */
+			if (numa_promotion_tiered_enabled &&
+						node_is_toptier(pgdat->node_id) &&
+					pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) {
+				remaining = schedule_timeout(10 * HZ);
+				if (!remaining) {
+					pgdat->kswapd_highest_zoneidx = ZONE_MOVABLE;
+					pgdat->kswapd_order = 0;
+				}
+			} else
+				schedule();
+		}
 
 		set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
 	} else {
-- 
2.30.2