linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Yu Zhao <yuzhao@google.com>
To: linux-mm@kvack.org
Cc: Alex Shi <alexs@kernel.org>, Andi Kleen <ak@linux.intel.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Benjamin Manes <ben.manes@gmail.com>,
	Dave Chinner <david@fromorbit.com>,
	Dave Hansen <dave.hansen@linux.intel.com>,
	Hillf Danton <hdanton@sina.com>, Jens Axboe <axboe@kernel.dk>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Jonathan Corbet <corbet@lwn.net>,
	Joonsoo Kim <iamjoonsoo.kim@lge.com>,
	Matthew Wilcox <willy@infradead.org>,
	Mel Gorman <mgorman@suse.de>, Miaohe Lin <linmiaohe@huawei.com>,
	Michael Larabel <michael@michaellarabel.com>,
	Michal Hocko <mhocko@suse.com>,
	Michel Lespinasse <michel@lespinasse.org>,
	Rik van Riel <riel@surriel.com>, Roman Gushchin <guro@fb.com>,
	Rong Chen <rong.a.chen@intel.com>,
	SeongJae Park <sjpark@amazon.de>,
	Tim Chen <tim.c.chen@linux.intel.com>,
	Vlastimil Babka <vbabka@suse.cz>, Yang Shi <shy828301@gmail.com>,
	Ying Huang <ying.huang@intel.com>, Zi Yan <ziy@nvidia.com>,
	linux-kernel@vger.kernel.org, lkp@lists.01.org,
	page-reclaim@google.com, Yu Zhao <yuzhao@google.com>
Subject: [PATCH v2 13/16] mm: multigenerational lru: page reclaim
Date: Tue, 13 Apr 2021 00:56:30 -0600	[thread overview]
Message-ID: <20210413065633.2782273-14-yuzhao@google.com> (raw)
In-Reply-To: <20210413065633.2782273-1-yuzhao@google.com>

With the aging and the eviction in place, we can build the page
reclaim in a straightforward manner:
  1) In order to reduce the latency, direct reclaim only invokes the
  aging when both min_seq[2] reach max_seq-1; otherwise it invokes
  the eviction.
  2) In order to avoid the aging in the direct reclaim path, kswapd
  does the background aging more proactively. It invokes the aging
  when either of min_seq[2] reaches max_seq-1; otherwise it invokes
  the eviction.

And we add another optimization: pages mapped around a referenced PTE
may also have been referenced due to the spatial locality. In the
reclaim path, if the rmap finds the PTE mapping a page under reclaim
referenced, it calls a new function lru_gen_scan_around() to scan the
vicinity of the PTE. And if this new function finds other referenced
PTEs, it updates the generation number of the pages mapped by those
PTEs.

Signed-off-by: Yu Zhao <yuzhao@google.com>
---
 include/linux/mmzone.h |   6 ++
 mm/rmap.c              |   6 ++
 mm/vmscan.c            | 236 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 248 insertions(+)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index dcfadf6a8c07..a22e9e40083f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -292,6 +292,7 @@ enum lruvec_flags {
 };
 
 struct lruvec;
+struct page_vma_mapped_walk;
 
 #define LRU_GEN_MASK		((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
 #define LRU_USAGE_MASK		((BIT(LRU_USAGE_WIDTH) - 1) << LRU_USAGE_PGOFF)
@@ -384,6 +385,7 @@ struct lrugen {
 
 void lru_gen_init_lruvec(struct lruvec *lruvec);
 void lru_gen_set_state(bool enable, bool main, bool swap);
+void lru_gen_scan_around(struct page_vma_mapped_walk *pvmw);
 
 #else /* CONFIG_LRU_GEN */
 
@@ -395,6 +397,10 @@ static inline void lru_gen_set_state(bool enable, bool main, bool swap)
 {
 }
 
+static inline void lru_gen_scan_around(struct page_vma_mapped_walk *pvmw)
+{
+}
+
 #endif /* CONFIG_LRU_GEN */
 
 struct lruvec {
diff --git a/mm/rmap.c b/mm/rmap.c
index b0fc27e77d6d..d600b282ced5 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -72,6 +72,7 @@
 #include <linux/page_idle.h>
 #include <linux/memremap.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/mm_inline.h>
 
 #include <asm/tlbflush.h>
 
@@ -792,6 +793,11 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
 		}
 
 		if (pvmw.pte) {
+			/* the multigenerational lru exploits the spatial locality */
+			if (lru_gen_enabled() && pte_young(*pvmw.pte)) {
+				lru_gen_scan_around(&pvmw);
+				referenced++;
+			}
 			if (ptep_clear_flush_young_notify(vma, address,
 						pvmw.pte)) {
 				/*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6239b1acd84f..01c475386379 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1114,6 +1114,10 @@ static unsigned int shrink_page_list(struct list_head *page_list,
 		if (!sc->may_unmap && page_mapped(page))
 			goto keep_locked;
 
+		/* in case the page was found accessed by lru_gen_scan_around() */
+		if (lru_gen_enabled() && !ignore_references && PageReferenced(page))
+			goto keep_locked;
+
 		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
 			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
@@ -2233,6 +2237,10 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
 	unsigned long file;
 	struct lruvec *target_lruvec;
 
+	/* the multigenerational lru doesn't use these counters */
+	if (lru_gen_enabled())
+		return;
+
 	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
 
 	/*
@@ -2522,6 +2530,19 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	}
 }
 
+#ifdef CONFIG_LRU_GEN
+static void age_lru_gens(struct pglist_data *pgdat, struct scan_control *sc);
+static void shrink_lru_gens(struct lruvec *lruvec, struct scan_control *sc);
+#else /* CONFIG_LRU_GEN */
+static void age_lru_gens(struct pglist_data *pgdat, struct scan_control *sc)
+{
+}
+
+static void shrink_lru_gens(struct lruvec *lruvec, struct scan_control *sc)
+{
+}
+#endif /* CONFIG_LRU_GEN */
+
 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 {
 	unsigned long nr[NR_LRU_LISTS];
@@ -2533,6 +2554,11 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 	struct blk_plug plug;
 	bool scan_adjusted;
 
+	if (lru_gen_enabled()) {
+		shrink_lru_gens(lruvec, sc);
+		return;
+	}
+
 	get_scan_count(lruvec, sc, nr);
 
 	/* Record the original scan target for proportional adjustments later */
@@ -2999,6 +3025,10 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
 	struct lruvec *target_lruvec;
 	unsigned long refaults;
 
+	/* the multigenerational lru doesn't use these counters */
+	if (lru_gen_enabled())
+		return;
+
 	target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
 	refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
 	target_lruvec->refaults[0] = refaults;
@@ -3373,6 +3403,11 @@ static void age_active_anon(struct pglist_data *pgdat,
 	struct mem_cgroup *memcg;
 	struct lruvec *lruvec;
 
+	if (lru_gen_enabled()) {
+		age_lru_gens(pgdat, sc);
+		return;
+	}
+
 	if (!total_swap_pages)
 		return;
 
@@ -5468,6 +5503,57 @@ static bool walk_mm_list(struct lruvec *lruvec, unsigned long max_seq,
 	return true;
 }
 
+void lru_gen_scan_around(struct page_vma_mapped_walk *pvmw)
+{
+	pte_t *pte;
+	unsigned long start, end;
+	int old_gen, new_gen;
+	unsigned long flags;
+	struct lruvec *lruvec;
+	struct mem_cgroup *memcg;
+	struct pglist_data *pgdat = page_pgdat(pvmw->page);
+
+	lockdep_assert_held(pvmw->ptl);
+	/* start at the PMD boundary below the address, clamped to the VMA */
+	start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
+	end = pmd_addr_end(pvmw->address, pvmw->vma->vm_end);
+	pte = pvmw->pte - ((pvmw->address - start) >> PAGE_SHIFT);
+
+	memcg = lock_page_memcg(pvmw->page);
+	lruvec = lock_page_lruvec_irqsave(pvmw->page, &flags);
+	/* pages mapped by young PTEs are moved to the generation of max_seq */
+	new_gen = lru_gen_from_seq(lruvec->evictable.max_seq);
+
+	for (; start != end; pte++, start += PAGE_SIZE) {
+		struct page *page;
+		unsigned long pfn = pte_pfn(*pte);
+
+		if (!pte_present(*pte) || !pte_young(*pte) || is_zero_pfn(pfn))
+			continue;
+		/* skip PFNs outside the node span of pvmw->page */
+		if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+			continue;
+
+		page = compound_head(pfn_to_page(pfn));
+		if (page_to_nid(page) != pgdat->node_id)
+			continue;
+		/* skip pages from a memcg other than the one locked above */
+		if (page_memcg_rcu(page) != memcg)
+			continue;
+		/*
+		 * We may be holding many locks, so finish as fast as possible:
+		 * only update the generation here and leave the accessed and
+		 * the dirty bits to page table walks.
+		 */
+		old_gen = page_update_gen(page, new_gen);
+		if (old_gen >= 0 && old_gen != new_gen)
+			lru_gen_update_size(page, lruvec, old_gen, new_gen);
+	}
+
+	unlock_page_lruvec_irqrestore(lruvec, flags);
+	unlock_page_memcg(pvmw->page);
+}
+
 /******************************************************************************
  *                          the eviction
  ******************************************************************************/
@@ -5809,6 +5895,156 @@ static bool evict_lru_gen_pages(struct lruvec *lruvec, struct scan_control *sc,
 	return *nr_to_scan > 0 && sc->nr_reclaimed < sc->nr_to_reclaim;
 }
 
+/******************************************************************************
+ *                          page reclaim
+ ******************************************************************************/
+
+static int get_swappiness(struct lruvec *lruvec)
+{
+	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+	int swappiness = mem_cgroup_get_nr_swap_pages(memcg) >= (long)SWAP_CLUSTER_MAX ?
+			 mem_cgroup_swappiness(memcg) : 0;
+	/* the comparison is unsigned, so a negative value trips it too */
+	VM_BUG_ON(swappiness > 200U);
+
+	return swappiness;
+}
+
+static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
+				    int swappiness)
+{
+	int gen, file, zone;
+	long nr_to_scan = 0;
+	struct lrugen *lrugen = &lruvec->evictable;
+	DEFINE_MAX_SEQ();
+	DEFINE_MIN_SEQ();
+
+	lru_add_drain();
+	/* sum up all generations; type 0 is skipped when swappiness is 0 */
+	for (file = !swappiness; file < ANON_AND_FILE; file++) {
+		unsigned long seq;
+
+		for (seq = min_seq[file]; seq <= max_seq; seq++) {
+			gen = lru_gen_from_seq(seq);
+
+			for (zone = 0; zone <= sc->reclaim_idx; zone++)
+				nr_to_scan += READ_ONCE(lrugen->sizes[gen][file][zone]);
+		}
+	}
+	/* clamp a negative sum; shift by priority and round up to a batch */
+	nr_to_scan = max(nr_to_scan, 0L);
+	nr_to_scan = round_up(nr_to_scan >> sc->priority, SWAP_CLUSTER_MAX);
+	/* more than MIN_NR_GENS generations: evict without aging first */
+	if (max_nr_gens(max_seq, min_seq, swappiness) > MIN_NR_GENS)
+		return nr_to_scan;
+
+	/* kswapd uses age_lru_gens() */
+	if (current_is_kswapd())
+		return 0;
+	/* direct reclaim does the aging here; evict only if it returns true */
+	return walk_mm_list(lruvec, max_seq, sc, swappiness, NULL) ? nr_to_scan : 0;
+}
+
+static void shrink_lru_gens(struct lruvec *lruvec, struct scan_control *sc)
+{
+	struct blk_plug plug;
+	unsigned long scanned = 0;
+	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+
+	blk_start_plug(&plug);
+
+	while (true) {
+		long nr_to_scan;
+		int swappiness = sc->may_swap ? get_swappiness(lruvec) : 0;
+		/* the target is recomputed each time, less what was scanned */
+		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness) - scanned;
+		if (nr_to_scan < (long)SWAP_CLUSTER_MAX)
+			break;
+
+		scanned += nr_to_scan;
+
+		if (!evict_lru_gen_pages(lruvec, sc, swappiness, &nr_to_scan))
+			break;
+		/* presumably nr_to_scan now holds what was left unscanned */
+		scanned -= nr_to_scan;
+		/* stop if below min, or below low without memcg_low_reclaim */
+		if (mem_cgroup_below_min(memcg) ||
+		    (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
+			break;
+
+		cond_resched();
+	}
+
+	blk_finish_plug(&plug);
+}
+
+/******************************************************************************
+ *                          the background aging
+ ******************************************************************************/
+
+static int lru_gen_spread = MIN_NR_GENS;
+
+static void try_walk_mm_list(struct lruvec *lruvec, struct scan_control *sc)
+{
+	int gen, file, zone;
+	long old_and_young[2] = {};
+	struct mm_walk_args args = {};
+	int spread = READ_ONCE(lru_gen_spread);
+	int swappiness = get_swappiness(lruvec);
+	struct lrugen *lrugen = &lruvec->evictable;
+	DEFINE_MAX_SEQ();
+	DEFINE_MIN_SEQ();
+
+	lru_add_drain();
+	/* [1] sums the generation of max_seq and [0] sums all the others */
+	for (file = !swappiness; file < ANON_AND_FILE; file++) {
+		unsigned long seq;
+
+		for (seq = min_seq[file]; seq <= max_seq; seq++) {
+			gen = lru_gen_from_seq(seq);
+
+			for (zone = 0; zone < MAX_NR_ZONES; zone++)
+				old_and_young[seq == max_seq] +=
+					READ_ONCE(lrugen->sizes[gen][file][zone]);
+		}
+	}
+
+	old_and_young[0] = max(old_and_young[0], 0L);
+	old_and_young[1] = max(old_and_young[1], 0L);
+	/* nothing to age if the total is below SWAP_CLUSTER_MAX */
+	if (old_and_young[0] + old_and_young[1] < SWAP_CLUSTER_MAX)
+		return;
+
+	/* try to spread pages out across spread+1 generations */
+	if (old_and_young[0] >= old_and_young[1] * spread &&
+	    min_nr_gens(max_seq, min_seq, swappiness) > max(spread, MIN_NR_GENS))
+		return;
+
+	walk_mm_list(lruvec, max_seq, sc, swappiness, &args);
+}
+
+static void age_lru_gens(struct pglist_data *pgdat, struct scan_control *sc)
+{
+	struct mem_cgroup *memcg;
+
+	VM_BUG_ON(!current_is_kswapd());
+	/* visit every memcg that mem_cgroup_iter() returns */
+	memcg = mem_cgroup_iter(NULL, NULL, NULL);
+	do {
+		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+		struct lrugen *lrugen = &lruvec->evictable;
+		/* skip if below min, or below low without memcg_low_reclaim */
+		if (!mem_cgroup_below_min(memcg) &&
+		    (!mem_cgroup_below_low(memcg) || sc->memcg_low_reclaim))
+			try_walk_mm_list(lruvec, sc);
+		/* add one to lrugen->priority unless it equals DEF_PRIORITY */
+		if (!mem_cgroup_disabled())
+			atomic_add_unless(&lrugen->priority, 1, DEF_PRIORITY);
+
+		cond_resched();
+	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+}
+
 /******************************************************************************
  *                          state change
  ******************************************************************************/
-- 
2.31.1.295.g9ea45b61b8-goog


  parent reply	other threads:[~2021-04-13  6:57 UTC|newest]

Thread overview: 57+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-04-13  6:56 [PATCH v2 00/16] Multigenerational LRU Framework Yu Zhao
2021-04-13  6:56 ` [PATCH v2 01/16] include/linux/memcontrol.h: do not warn in page_memcg_rcu() if !CONFIG_MEMCG Yu Zhao
2021-04-13  6:56 ` [PATCH v2 02/16] include/linux/nodemask.h: define next_memory_node() if !CONFIG_NUMA Yu Zhao
2021-04-13  6:56 ` [PATCH v2 03/16] include/linux/huge_mm.h: define is_huge_zero_pmd() if !CONFIG_TRANSPARENT_HUGEPAGE Yu Zhao
2021-04-13  6:56 ` [PATCH v2 04/16] include/linux/cgroup.h: export cgroup_mutex Yu Zhao
2021-04-13  6:56 ` [PATCH v2 05/16] mm/swap.c: export activate_page() Yu Zhao
2021-04-13  6:56 ` [PATCH v2 06/16] mm, x86: support the access bit on non-leaf PMD entries Yu Zhao
2021-04-13  6:56 ` [PATCH v2 07/16] mm/vmscan.c: refactor shrink_node() Yu Zhao
2021-04-13  6:56 ` [PATCH v2 08/16] mm: multigenerational lru: groundwork Yu Zhao
2021-04-13  6:56 ` [PATCH v2 09/16] mm: multigenerational lru: activation Yu Zhao
2021-04-13  6:56 ` [PATCH v2 10/16] mm: multigenerational lru: mm_struct list Yu Zhao
2021-04-14 14:36   ` Matthew Wilcox
2021-04-13  6:56 ` [PATCH v2 11/16] mm: multigenerational lru: aging Yu Zhao
2021-04-13  6:56 ` [PATCH v2 12/16] mm: multigenerational lru: eviction Yu Zhao
2021-04-13  6:56 ` Yu Zhao [this message]
2021-04-13  6:56 ` [PATCH v2 14/16] mm: multigenerational lru: user interface Yu Zhao
2021-04-13  6:56 ` [PATCH v2 15/16] mm: multigenerational lru: Kconfig Yu Zhao
2021-04-13  6:56 ` [PATCH v2 16/16] mm: multigenerational lru: documentation Yu Zhao
2021-04-13  7:51 ` [PATCH v2 00/16] Multigenerational LRU Framework SeongJae Park
2021-04-13 16:13   ` Jens Axboe
2021-04-13 16:42     ` SeongJae Park
2021-04-13 23:14     ` Dave Chinner
2021-04-14  2:29       ` Rik van Riel
     [not found]         ` <CAOUHufafMcaG8sOS=1YMy2P_6p0R1FzP16bCwpUau7g1-PybBQ@mail.gmail.com>
2021-04-14  6:15           ` Huang, Ying
2021-04-14  7:58             ` Yu Zhao
2021-04-14  8:27               ` Huang, Ying
2021-04-14 13:51                 ` Rik van Riel
2021-04-14 15:56                   ` Andi Kleen
2021-04-14 15:58                   ` [page-reclaim] " Shakeel Butt
2021-04-14 18:45                   ` Yu Zhao
2021-04-14 15:51           ` Andi Kleen
2021-04-14 15:58             ` Rik van Riel
2021-04-14 19:14               ` Yu Zhao
2021-04-14 19:41                 ` Rik van Riel
2021-04-14 20:08                   ` Yu Zhao
2021-04-14 19:04             ` Yu Zhao
2021-04-15  3:00               ` Andi Kleen
2021-04-15  7:13                 ` Yu Zhao
2021-04-15  8:19                   ` Huang, Ying
2021-04-15  9:57                   ` Michel Lespinasse
2021-04-24  2:33                     ` Yu Zhao
2021-04-24  3:30                       ` Andi Kleen
2021-04-24  4:16                         ` Yu Zhao
2021-04-14  3:40       ` Yu Zhao
2021-04-14  4:50         ` Dave Chinner
2021-04-14  7:16           ` Yu Zhao
2021-04-14 10:00             ` Yu Zhao
2021-04-15  1:36             ` Dave Chinner
2021-04-24 21:21               ` Yu Zhao
2021-04-14 14:43       ` Jens Axboe
2021-04-14 19:42         ` Yu Zhao
2021-04-15  1:21         ` Dave Chinner
2021-04-14 17:43 ` Johannes Weiner
2021-04-27 10:35   ` Yu Zhao
2021-04-29 23:46 ` Konstantin Kharlamov
2021-04-30  6:37   ` Konstantin Kharlamov
2021-04-30 19:31     ` Yu Zhao

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210413065633.2782273-14-yuzhao@google.com \
    --to=yuzhao@google.com \
    --cc=ak@linux.intel.com \
    --cc=akpm@linux-foundation.org \
    --cc=alexs@kernel.org \
    --cc=axboe@kernel.dk \
    --cc=ben.manes@gmail.com \
    --cc=corbet@lwn.net \
    --cc=dave.hansen@linux.intel.com \
    --cc=david@fromorbit.com \
    --cc=guro@fb.com \
    --cc=hannes@cmpxchg.org \
    --cc=hdanton@sina.com \
    --cc=iamjoonsoo.kim@lge.com \
    --cc=linmiaohe@huawei.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=lkp@lists.01.org \
    --cc=mgorman@suse.de \
    --cc=mhocko@suse.com \
    --cc=michael@michaellarabel.com \
    --cc=michel@lespinasse.org \
    --cc=page-reclaim@google.com \
    --cc=riel@surriel.com \
    --cc=rong.a.chen@intel.com \
    --cc=shy828301@gmail.com \
    --cc=sjpark@amazon.de \
    --cc=tim.c.chen@linux.intel.com \
    --cc=vbabka@suse.cz \
    --cc=willy@infradead.org \
    --cc=ying.huang@intel.com \
    --cc=ziy@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).