linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Yu Zhao <yuzhao@google.com>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-mm@kvack.org, Yu Zhao <yuzhao@google.com>
Subject: [PATCH v14-fix 10/11] mm: multi-gen LRU: fixed long-tailed direct reclaim latency
Date: Sun, 18 Sep 2022 14:47:54 -0600	[thread overview]
Message-ID: <20220918204755.3135720-10-yuzhao@google.com> (raw)
In-Reply-To: <20220918204755.3135720-1-yuzhao@google.com>

Long-tailed direct reclaim latency was seen on high-memory (TBs)
machines: MGLRU is better at the 99th percentile but worse at the
99.9th.

It turned out the old direct reclaim backoff, which tries to enforce a
minimum fairness among all eligible memcgs, over-swapped by about
(total_mem>>DEF_PRIORITY)-nr_to_reclaim:

    /* adjust priority if memcg is offline or the target is met */
    if (!mem_cgroup_online(memcg))
        priority = 0;
    else if (sc->nr_reclaimed - reclaimed >= sc->nr_to_reclaim)
        priority = DEF_PRIORITY;
    else
        priority = sc->priority;

The new backoff, which pulls the plug on swapping once the target is
met, trades some fairness for curtailed latency. Specifically, in
should_abort_scan():

    /* over-swapping can increase allocation latency */
    if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
        return true;

The fundamental problem is that the backoff requires a sophisticated
model and the previous one was oversimplified. The new one may still
be, but at least it can handle a couple more corner cases on top of
the above:

    /* age each memcg once to ensure fairness */
    if (max_seq - seq > 1)
        return true;

    The NR_FREE_PAGES check at the bottom of should_abort_scan().

Signed-off-by: Yu Zhao <yuzhao@google.com>
---
 mm/vmscan.c | 105 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 70 insertions(+), 35 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index f6eab73bdfb9..50764b2d462f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -135,10 +135,9 @@ struct scan_control {
 	unsigned int no_demotion:1;
 
 #ifdef CONFIG_LRU_GEN
-	/* help make better choices when multiple memcgs are available */
+	/* help kswapd make better choices among multiple memcgs */
 	unsigned int memcgs_need_aging:1;
-	unsigned int memcgs_need_swapping:1;
-	unsigned int memcgs_avoid_swapping:1;
+	unsigned long last_reclaimed;
 #endif
 
 	/* Allocation order */
@@ -4524,22 +4523,19 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 
 	VM_WARN_ON_ONCE(!current_is_kswapd());
 
+	sc->last_reclaimed = sc->nr_reclaimed;
+
 	/*
-	 * To reduce the chance of going into the aging path or swapping, which
-	 * can be costly, optimistically skip them unless their corresponding
-	 * flags were cleared in the eviction path. This improves the overall
-	 * performance when multiple memcgs are available.
+	 * To reduce the chance of going into the aging path, which can be
+	 * costly, optimistically skip it if the flag below was cleared in the
+	 * eviction path. This improves the overall performance when multiple
+	 * memcgs are available.
 	 */
 	if (!sc->memcgs_need_aging) {
 		sc->memcgs_need_aging = true;
-		sc->memcgs_avoid_swapping = !sc->memcgs_need_swapping;
-		sc->memcgs_need_swapping = true;
 		return;
 	}
 
-	sc->memcgs_need_swapping = true;
-	sc->memcgs_avoid_swapping = true;
-
 	set_mm_walk(pgdat);
 
 	memcg = mem_cgroup_iter(NULL, NULL, NULL);
@@ -5035,7 +5031,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
 
 	sc->nr_reclaimed += reclaimed;
 
-	if (type == LRU_GEN_ANON && need_swapping)
+	if (need_swapping && type == LRU_GEN_ANON)
 		*need_swapping = true;
 
 	return scanned;
@@ -5047,19 +5043,13 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
  *    reclaim.
  */
 static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
-				    bool can_swap, unsigned long reclaimed, bool *need_aging)
+				    bool can_swap, bool *need_aging)
 {
-	int priority;
 	unsigned long nr_to_scan;
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	DEFINE_MAX_SEQ(lruvec);
 	DEFINE_MIN_SEQ(lruvec);
 
-	if (fatal_signal_pending(current)) {
-		sc->nr_reclaimed += MIN_LRU_BATCH;
-		return 0;
-	}
-
 	if (mem_cgroup_below_min(memcg) ||
 	    (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
 		return 0;
@@ -5068,15 +5058,7 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
 	if (!nr_to_scan)
 		return 0;
 
-	/* adjust priority if memcg is offline or the target is met */
-	if (!mem_cgroup_online(memcg))
-		priority = 0;
-	else if (sc->nr_reclaimed - reclaimed >= sc->nr_to_reclaim)
-		priority = DEF_PRIORITY;
-	else
-		priority = sc->priority;
-
-	nr_to_scan >>= priority;
+	nr_to_scan >>= mem_cgroup_online(memcg) ? sc->priority : 0;
 	if (!nr_to_scan)
 		return 0;
 
@@ -5084,7 +5066,7 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
 		return nr_to_scan;
 
 	/* skip the aging path at the default priority */
-	if (priority == DEF_PRIORITY)
+	if (sc->priority == DEF_PRIORITY)
 		goto done;
 
 	/* leave the work to lru_gen_age_node() */
@@ -5097,6 +5079,60 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
 	return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
 }
 
+static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
+			      struct scan_control *sc, bool need_swapping)
+{
+	int i;
+	DEFINE_MAX_SEQ(lruvec);
+
+	if (!current_is_kswapd()) {
+		/* age each memcg once to ensure fairness */
+		if (max_seq - seq > 1)
+			return true;
+
+		/* over-swapping can increase allocation latency */
+		if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
+			return true;
+
+		/* give this thread a chance to exit and free its memory */
+		if (fatal_signal_pending(current)) {
+			sc->nr_reclaimed += MIN_LRU_BATCH;
+			return true;
+		}
+
+		if (cgroup_reclaim(sc))
+			return false;
+	} else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
+		return false;
+
+	/* keep scanning at low priorities to ensure fairness */
+	if (sc->priority > DEF_PRIORITY - 2)
+		return false;
+
+	/*
+	 * A minimum amount of work was done under global memory pressure. For
+	 * kswapd, it may be overshooting. For direct reclaim, the target isn't
+	 * met, and yet the allocation may still succeed, since kswapd may have
+	 * caught up. In either case, it's better to stop now, and restart if
+	 * necessary.
+	 */
+	for (i = 0; i <= sc->reclaim_idx; i++) {
+		unsigned long wmark;
+		struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
+
+		if (!managed_zone(zone))
+			continue;
+
+		wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
+		if (wmark > zone_page_state(zone, NR_FREE_PAGES))
+			return false;
+	}
+
+	sc->nr_reclaimed += MIN_LRU_BATCH;
+
+	return true;
+}
+
 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 {
 	struct blk_plug plug;
@@ -5104,6 +5140,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
 	bool need_swapping = false;
 	unsigned long scanned = 0;
 	unsigned long reclaimed = sc->nr_reclaimed;
+	DEFINE_MAX_SEQ(lruvec);
 
 	lru_add_drain();
 
@@ -5123,7 +5160,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
 		else
 			swappiness = 0;
 
-		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, reclaimed, &need_aging);
+		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
 		if (!nr_to_scan)
 			goto done;
 
@@ -5135,17 +5172,15 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
 		if (scanned >= nr_to_scan)
 			break;
 
-		if (sc->memcgs_avoid_swapping && swappiness < 200 && need_swapping)
+		if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
 			break;
 
 		cond_resched();
 	}
 
 	/* see the comment in lru_gen_age_node() */
-	if (!need_aging)
+	if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
 		sc->memcgs_need_aging = false;
-	if (!need_swapping)
-		sc->memcgs_need_swapping = false;
 done:
 	clear_mm_walk();
 
-- 
2.37.3.968.ga6b4b080e4-goog



  parent reply	other threads:[~2022-09-18 20:48 UTC|newest]

Thread overview: 64+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-08-15  7:13 [PATCH v14 00/14] Multi-Gen LRU Framework Yu Zhao
2022-08-15  7:13 ` [PATCH v14 01/14] mm: x86, arm64: add arch_has_hw_pte_young() Yu Zhao
2022-08-15  7:13 ` [PATCH v14 02/14] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG Yu Zhao
2022-08-15  7:13 ` [PATCH v14 03/14] mm/vmscan.c: refactor shrink_node() Yu Zhao
2022-08-15  7:13 ` [PATCH v14 04/14] Revert "include/linux/mm_inline.h: fold __update_lru_size() into its sole caller" Yu Zhao
2022-08-15  7:13 ` [PATCH v14 05/14] mm: multi-gen LRU: groundwork Yu Zhao
2022-08-15  7:13 ` [PATCH v14 06/14] mm: multi-gen LRU: minimal implementation Yu Zhao
2022-08-15  7:13 ` [PATCH v14 07/14] mm: multi-gen LRU: exploit locality in rmap Yu Zhao
2022-09-01  9:18   ` Nadav Amit
2022-09-02  1:17     ` Yu Zhao
2022-09-02  1:28       ` Yu Zhao
2022-08-15  7:13 ` [PATCH v14 08/14] mm: multi-gen LRU: support page table walks Yu Zhao
2022-10-13 15:04   ` Peter Zijlstra
2022-10-19  5:51     ` Yu Zhao
2022-10-19 17:40       ` Linus Torvalds
2022-10-20 14:13         ` Peter Zijlstra
2022-10-20 17:29           ` Yu Zhao
2022-10-20 17:35           ` Linus Torvalds
2022-10-20 18:55             ` Peter Zijlstra
2022-10-21  2:10               ` Linus Torvalds
2022-10-21  3:38                 ` Matthew Wilcox
2022-10-21 16:50                   ` Linus Torvalds
2022-10-23 14:44                     ` David Gow
2022-10-23 17:55                     ` Maciej W. Rozycki
2022-10-23 18:35                       ` Linus Torvalds
2022-10-24  7:30                         ` Arnd Bergmann
2022-10-25 16:28                         ` Maciej W. Rozycki
2022-10-26 15:43                           ` Arnd Bergmann
2022-10-27 23:08                             ` Maciej W. Rozycki
2022-10-28  7:27                               ` Arnd Bergmann
2022-10-21 10:12                 ` Peter Zijlstra
2022-10-24 18:20                 ` Gareth Poole
2022-10-24 19:28                 ` Serentty
2022-08-15  7:13 ` [PATCH v14 09/14] mm: multi-gen LRU: optimize multiple memcgs Yu Zhao
2022-08-15  7:13 ` [PATCH v14 10/14] mm: multi-gen LRU: kill switch Yu Zhao
2022-08-15  7:13 ` [PATCH v14 11/14] mm: multi-gen LRU: thrashing prevention Yu Zhao
2022-08-15  7:13 ` [PATCH v14 12/14] mm: multi-gen LRU: debugfs interface Yu Zhao
2022-08-15  7:13 ` [PATCH v14 13/14] mm: multi-gen LRU: admin guide Yu Zhao
2022-08-15  9:06   ` Bagas Sanjaya
2022-08-15  9:12   ` Mike Rapoport
2022-08-17 22:46     ` Yu Zhao
2022-09-20  7:43   ` Bagas Sanjaya
2022-08-15  7:13 ` [PATCH v14 14/14] mm: multi-gen LRU: design doc Yu Zhao
2022-08-15  9:07   ` Bagas Sanjaya
2022-08-31  4:17 ` OpenWrt / MIPS benchmark with MGLRU Yu Zhao
2022-08-31 15:13   ` Dave Hansen
2022-08-31 22:18   ` Yu Zhao
2022-09-12  0:08 ` [PATCH v14 00/14] Multi-Gen LRU Framework Andrew Morton
2022-09-15 17:56   ` Yu Zhao
2022-09-18 20:40     ` Yu Zhao
2022-09-18 20:47       ` [PATCH v14-fix 01/11] mm: multi-gen LRU: update admin guide Yu Zhao
2022-09-18 20:47         ` [PATCH v14-fix 02/11] mm: multi-gen LRU: add comment in lru_gen_use_mm() Yu Zhao
2022-09-18 20:47         ` [PATCH v14-fix 03/11] mm: multi-gen LRU: warn on !ptep_test_and_clear_young() Yu Zhao
2022-09-18 23:47           ` Andrew Morton
2022-09-18 23:53             ` Yu Zhao
2022-09-18 20:47         ` [PATCH v14-fix 04/11] mm: multi-gen LRU: fix warning from __rcu Yu Zhao
2022-09-18 20:47         ` [PATCH v14-fix 05/11] mm: multi-gen LRU: fix warning from seq_is_valid() Yu Zhao
2022-09-18 20:47         ` [PATCH v14-fix 06/11] mm: multi-gen LRU: delete overcautious VM_WARN_ON_ONCE() Yu Zhao
2022-09-18 20:47         ` [PATCH v14-fix 07/11] mm: multi-gen LRU: dial down MAX_LRU_BATCH Yu Zhao
2022-09-18 20:47         ` [PATCH v14-fix 08/11] mm: multi-gen LRU: delete newline in kswapd_age_node() Yu Zhao
2022-09-18 20:47         ` [PATCH v14-fix 09/11] mm: multi-gen LRU: add comment in lru_gen_look_around() Yu Zhao
2022-09-18 20:47         ` Yu Zhao [this message]
2022-09-18 20:47         ` [PATCH v14-fix 11/11] mm: multi-gen LRU: refactor get_nr_evictable() Yu Zhao
2022-09-18 23:47       ` [PATCH v14 00/14] Multi-Gen LRU Framework Andrew Morton

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220918204755.3135720-10-yuzhao@google.com \
    --to=yuzhao@google.com \
    --cc=akpm@linux-foundation.org \
    --cc=linux-mm@kvack.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).