From: Yu Zhao <yuzhao@google.com>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-mm@kvack.org, Yu Zhao <yuzhao@google.com>
Subject: [PATCH v14-fix 10/11] mm: multi-gen LRU: fix long-tailed direct reclaim latency
Date: Sun, 18 Sep 2022 14:47:54 -0600 [thread overview]
Message-ID: <20220918204755.3135720-10-yuzhao@google.com> (raw)
In-Reply-To: <20220918204755.3135720-1-yuzhao@google.com>
Long-tailed direct reclaim latency was seen on high-memory (TBs)
machines: MGLRU is better at the 99th percentile but worse at the
99.9th.
It turned out the old direct reclaim backoff, which tries to enforce a
minimum fairness among all eligible memcgs, over-swapped by about
(total_mem>>DEF_PRIORITY)-nr_to_reclaim:
/* adjust priority if memcg is offline or the target is met */
if (!mem_cgroup_online(memcg))
priority = 0;
else if (sc->nr_reclaimed - reclaimed >= sc->nr_to_reclaim)
priority = DEF_PRIORITY;
else
priority = sc->priority;
The new backoff, which pulls the plug on swapping once the target is
met, trades some fairness for curtailed latency. Specifically, in
should_abort_scan():
/* over-swapping can increase allocation latency */
if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
return true;
The fundamental problem is that the backoff requires a sophisticated
model and the previous one was oversimplified. The new one may still
be, but at least it can handle a couple more corner cases on top of
the above:
/* age each memcg once to ensure fairness */
if (max_seq - seq > 1)
return true;
In addition, the NR_FREE_PAGES check at the bottom of should_abort_scan()
stops scanning once the free pages in all eligible zones are above the
relevant watermarks.
Signed-off-by: Yu Zhao <yuzhao@google.com>
---
mm/vmscan.c | 105 ++++++++++++++++++++++++++++++++++------------------
1 file changed, 70 insertions(+), 35 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f6eab73bdfb9..50764b2d462f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -135,10 +135,9 @@ struct scan_control {
unsigned int no_demotion:1;
#ifdef CONFIG_LRU_GEN
- /* help make better choices when multiple memcgs are available */
+ /* help kswapd make better choices among multiple memcgs */
unsigned int memcgs_need_aging:1;
- unsigned int memcgs_need_swapping:1;
- unsigned int memcgs_avoid_swapping:1;
+ unsigned long last_reclaimed;
#endif
/* Allocation order */
@@ -4524,22 +4523,19 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
VM_WARN_ON_ONCE(!current_is_kswapd());
+ sc->last_reclaimed = sc->nr_reclaimed;
+
/*
- * To reduce the chance of going into the aging path or swapping, which
- * can be costly, optimistically skip them unless their corresponding
- * flags were cleared in the eviction path. This improves the overall
- * performance when multiple memcgs are available.
+ * To reduce the chance of going into the aging path, which can be
+ * costly, optimistically skip it if the flag below was cleared in the
+ * eviction path. This improves the overall performance when multiple
+ * memcgs are available.
*/
if (!sc->memcgs_need_aging) {
sc->memcgs_need_aging = true;
- sc->memcgs_avoid_swapping = !sc->memcgs_need_swapping;
- sc->memcgs_need_swapping = true;
return;
}
- sc->memcgs_need_swapping = true;
- sc->memcgs_avoid_swapping = true;
-
set_mm_walk(pgdat);
memcg = mem_cgroup_iter(NULL, NULL, NULL);
@@ -5035,7 +5031,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
sc->nr_reclaimed += reclaimed;
- if (type == LRU_GEN_ANON && need_swapping)
+ if (need_swapping && type == LRU_GEN_ANON)
*need_swapping = true;
return scanned;
@@ -5047,19 +5043,13 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
* reclaim.
*/
static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
- bool can_swap, unsigned long reclaimed, bool *need_aging)
+ bool can_swap, bool *need_aging)
{
- int priority;
unsigned long nr_to_scan;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
DEFINE_MIN_SEQ(lruvec);
- if (fatal_signal_pending(current)) {
- sc->nr_reclaimed += MIN_LRU_BATCH;
- return 0;
- }
-
if (mem_cgroup_below_min(memcg) ||
(mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
return 0;
@@ -5068,15 +5058,7 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
if (!nr_to_scan)
return 0;
- /* adjust priority if memcg is offline or the target is met */
- if (!mem_cgroup_online(memcg))
- priority = 0;
- else if (sc->nr_reclaimed - reclaimed >= sc->nr_to_reclaim)
- priority = DEF_PRIORITY;
- else
- priority = sc->priority;
-
- nr_to_scan >>= priority;
+ nr_to_scan >>= mem_cgroup_online(memcg) ? sc->priority : 0;
if (!nr_to_scan)
return 0;
@@ -5084,7 +5066,7 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
return nr_to_scan;
/* skip the aging path at the default priority */
- if (priority == DEF_PRIORITY)
+ if (sc->priority == DEF_PRIORITY)
goto done;
/* leave the work to lru_gen_age_node() */
@@ -5097,6 +5079,60 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
}
+static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
+ struct scan_control *sc, bool need_swapping)
+{
+ int i;
+ DEFINE_MAX_SEQ(lruvec);
+
+ if (!current_is_kswapd()) {
+ /* age each memcg once to ensure fairness */
+ if (max_seq - seq > 1)
+ return true;
+
+ /* over-swapping can increase allocation latency */
+ if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
+ return true;
+
+ /* give this thread a chance to exit and free its memory */
+ if (fatal_signal_pending(current)) {
+ sc->nr_reclaimed += MIN_LRU_BATCH;
+ return true;
+ }
+
+ if (cgroup_reclaim(sc))
+ return false;
+ } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
+ return false;
+
+ /* keep scanning at low priorities to ensure fairness */
+ if (sc->priority > DEF_PRIORITY - 2)
+ return false;
+
+ /*
+ * A minimum amount of work was done under global memory pressure. For
+ * kswapd, it may be overshooting. For direct reclaim, the target isn't
+ * met, and yet the allocation may still succeed, since kswapd may have
+ * caught up. In either case, it's better to stop now, and restart if
+ * necessary.
+ */
+ for (i = 0; i <= sc->reclaim_idx; i++) {
+ unsigned long wmark;
+ struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
+
+ if (!managed_zone(zone))
+ continue;
+
+ wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
+ if (wmark > zone_page_state(zone, NR_FREE_PAGES))
+ return false;
+ }
+
+ sc->nr_reclaimed += MIN_LRU_BATCH;
+
+ return true;
+}
+
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
struct blk_plug plug;
@@ -5104,6 +5140,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
bool need_swapping = false;
unsigned long scanned = 0;
unsigned long reclaimed = sc->nr_reclaimed;
+ DEFINE_MAX_SEQ(lruvec);
lru_add_drain();
@@ -5123,7 +5160,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
else
swappiness = 0;
- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, reclaimed, &need_aging);
+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
if (!nr_to_scan)
goto done;
@@ -5135,17 +5172,15 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
if (scanned >= nr_to_scan)
break;
- if (sc->memcgs_avoid_swapping && swappiness < 200 && need_swapping)
+ if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
break;
cond_resched();
}
/* see the comment in lru_gen_age_node() */
- if (!need_aging)
+ if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
sc->memcgs_need_aging = false;
- if (!need_swapping)
- sc->memcgs_need_swapping = false;
done:
clear_mm_walk();
--
2.37.3.968.ga6b4b080e4-goog
next prev parent reply other threads:[~2022-09-18 20:48 UTC|newest]
Thread overview: 64+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-08-15 7:13 [PATCH v14 00/14] Multi-Gen LRU Framework Yu Zhao
2022-08-15 7:13 ` [PATCH v14 01/14] mm: x86, arm64: add arch_has_hw_pte_young() Yu Zhao
2022-08-15 7:13 ` [PATCH v14 02/14] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG Yu Zhao
2022-08-15 7:13 ` [PATCH v14 03/14] mm/vmscan.c: refactor shrink_node() Yu Zhao
2022-08-15 7:13 ` [PATCH v14 04/14] Revert "include/linux/mm_inline.h: fold __update_lru_size() into its sole caller" Yu Zhao
2022-08-15 7:13 ` [PATCH v14 05/14] mm: multi-gen LRU: groundwork Yu Zhao
2022-08-15 7:13 ` [PATCH v14 06/14] mm: multi-gen LRU: minimal implementation Yu Zhao
2022-08-15 7:13 ` [PATCH v14 07/14] mm: multi-gen LRU: exploit locality in rmap Yu Zhao
2022-09-01 9:18 ` Nadav Amit
2022-09-02 1:17 ` Yu Zhao
2022-09-02 1:28 ` Yu Zhao
2022-08-15 7:13 ` [PATCH v14 08/14] mm: multi-gen LRU: support page table walks Yu Zhao
2022-10-13 15:04 ` Peter Zijlstra
2022-10-19 5:51 ` Yu Zhao
2022-10-19 17:40 ` Linus Torvalds
2022-10-20 14:13 ` Peter Zijlstra
2022-10-20 17:29 ` Yu Zhao
2022-10-20 17:35 ` Linus Torvalds
2022-10-20 18:55 ` Peter Zijlstra
2022-10-21 2:10 ` Linus Torvalds
2022-10-21 3:38 ` Matthew Wilcox
2022-10-21 16:50 ` Linus Torvalds
2022-10-23 14:44 ` David Gow
2022-10-23 17:55 ` Maciej W. Rozycki
2022-10-23 18:35 ` Linus Torvalds
2022-10-24 7:30 ` Arnd Bergmann
2022-10-25 16:28 ` Maciej W. Rozycki
2022-10-26 15:43 ` Arnd Bergmann
2022-10-27 23:08 ` Maciej W. Rozycki
2022-10-28 7:27 ` Arnd Bergmann
2022-10-21 10:12 ` Peter Zijlstra
2022-10-24 18:20 ` Gareth Poole
2022-10-24 19:28 ` Serentty
2022-08-15 7:13 ` [PATCH v14 09/14] mm: multi-gen LRU: optimize multiple memcgs Yu Zhao
2022-08-15 7:13 ` [PATCH v14 10/14] mm: multi-gen LRU: kill switch Yu Zhao
2022-08-15 7:13 ` [PATCH v14 11/14] mm: multi-gen LRU: thrashing prevention Yu Zhao
2022-08-15 7:13 ` [PATCH v14 12/14] mm: multi-gen LRU: debugfs interface Yu Zhao
2022-08-15 7:13 ` [PATCH v14 13/14] mm: multi-gen LRU: admin guide Yu Zhao
2022-08-15 9:06 ` Bagas Sanjaya
2022-08-15 9:12 ` Mike Rapoport
2022-08-17 22:46 ` Yu Zhao
2022-09-20 7:43 ` Bagas Sanjaya
2022-08-15 7:13 ` [PATCH v14 14/14] mm: multi-gen LRU: design doc Yu Zhao
2022-08-15 9:07 ` Bagas Sanjaya
2022-08-31 4:17 ` OpenWrt / MIPS benchmark with MGLRU Yu Zhao
2022-08-31 15:13 ` Dave Hansen
2022-08-31 22:18 ` Yu Zhao
2022-09-12 0:08 ` [PATCH v14 00/14] Multi-Gen LRU Framework Andrew Morton
2022-09-15 17:56 ` Yu Zhao
2022-09-18 20:40 ` Yu Zhao
2022-09-18 20:47 ` [PATCH v14-fix 01/11] mm: multi-gen LRU: update admin guide Yu Zhao
2022-09-18 20:47 ` [PATCH v14-fix 02/11] mm: multi-gen LRU: add comment in lru_gen_use_mm() Yu Zhao
2022-09-18 20:47 ` [PATCH v14-fix 03/11] mm: multi-gen LRU: warn on !ptep_test_and_clear_young() Yu Zhao
2022-09-18 23:47 ` Andrew Morton
2022-09-18 23:53 ` Yu Zhao
2022-09-18 20:47 ` [PATCH v14-fix 04/11] mm: multi-gen LRU: fix warning from __rcu Yu Zhao
2022-09-18 20:47 ` [PATCH v14-fix 05/11] mm: multi-gen LRU: fix warning from seq_is_valid() Yu Zhao
2022-09-18 20:47 ` [PATCH v14-fix 06/11] mm: multi-gen LRU: delete overcautious VM_WARN_ON_ONCE() Yu Zhao
2022-09-18 20:47 ` [PATCH v14-fix 07/11] mm: multi-gen LRU: dial down MAX_LRU_BATCH Yu Zhao
2022-09-18 20:47 ` [PATCH v14-fix 08/11] mm: multi-gen LRU: delete newline in kswapd_age_node() Yu Zhao
2022-09-18 20:47 ` [PATCH v14-fix 09/11] mm: multi-gen LRU: add comment in lru_gen_look_around() Yu Zhao
2022-09-18 20:47 ` Yu Zhao [this message]
2022-09-18 20:47 ` [PATCH v14-fix 11/11] mm: multi-gen LRU: refactor get_nr_evictable() Yu Zhao
2022-09-18 23:47 ` [PATCH v14 00/14] Multi-Gen LRU Framework Andrew Morton
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20220918204755.3135720-10-yuzhao@google.com \
--to=yuzhao@google.com \
--cc=akpm@linux-foundation.org \
--cc=linux-mm@kvack.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).