* [PATCH 1/3] mm/lru_gen: Move some code around so that next patch is simpler
@ 2023-06-13 12:00 Aneesh Kumar K.V
2023-06-13 12:00 ` [PATCH 2/3] mm/lru_gen: lru_gen_look_around simplification Aneesh Kumar K.V
2023-06-13 12:00 ` [PATCH 3/3] mm/lru_gen: Don't build multi-gen LRU page table walk code on architecture not supported Aneesh Kumar K.V
0 siblings, 2 replies; 16+ messages in thread
From: Aneesh Kumar K.V @ 2023-06-13 12:00 UTC (permalink / raw)
To: linux-mm, akpm; +Cc: Yu Zhao, T . J . Alumbaugh, Aneesh Kumar K.V
Move lru_gen_add_folio() to a .c file. We will support arch-specific mapping
of page access counts to generations in a later patch and will use
that when adding a folio to the lruvec. This move enables that.
No functional change in this patch.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
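For illustration, a minimal sketch of the kind of arch hook this move is
meant to enable; the hook name and its fallback are hypothetical, not part
of this series:

	/* hypothetical arch override: map a folio's access count to a generation */
	#ifndef arch_folio_gen_seq
	static inline unsigned long arch_folio_gen_seq(struct lru_gen_folio *lrugen,
						       struct folio *folio)
	{
		/* default: today's placement, hot folios go to the youngest generation */
		int type = folio_is_file_lru(folio);

		return folio_test_active(folio) ? lrugen->max_seq : lrugen->min_seq[type];
	}
	#endif

With lru_gen_add_folio() in mm/vmscan.c, plugging in such a helper no longer
requires exposing the generation internals through mm_inline.h.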
include/linux/mm_inline.h | 47 +----------
mm/vmscan.c | 172 ++++++++++++++++++++++++--------------
2 files changed, 110 insertions(+), 109 deletions(-)
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 0e1d239a882c..2a86dc4d96ab 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -217,52 +217,7 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli
VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
}
-static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
-{
- unsigned long seq;
- unsigned long flags;
- int gen = folio_lru_gen(folio);
- int type = folio_is_file_lru(folio);
- int zone = folio_zonenum(folio);
- struct lru_gen_folio *lrugen = &lruvec->lrugen;
-
- VM_WARN_ON_ONCE_FOLIO(gen != -1, folio);
-
- if (folio_test_unevictable(folio) || !lrugen->enabled)
- return false;
- /*
- * There are three common cases for this page:
- * 1. If it's hot, e.g., freshly faulted in or previously hot and
- * migrated, add it to the youngest generation.
- * 2. If it's cold but can't be evicted immediately, i.e., an anon page
- * not in swapcache or a dirty page pending writeback, add it to the
- * second oldest generation.
- * 3. Everything else (clean, cold) is added to the oldest generation.
- */
- if (folio_test_active(folio))
- seq = lrugen->max_seq;
- else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) ||
- (folio_test_reclaim(folio) &&
- (folio_test_dirty(folio) || folio_test_writeback(folio))))
- seq = lrugen->min_seq[type] + 1;
- else
- seq = lrugen->min_seq[type];
-
- gen = lru_gen_from_seq(seq);
- flags = (gen + 1UL) << LRU_GEN_PGOFF;
- /* see the comment on MIN_NR_GENS about PG_active */
- set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags);
-
- lru_gen_update_size(lruvec, folio, -1, gen);
- /* for folio_rotate_reclaimable() */
- if (reclaiming)
- list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
- else
- list_add(&folio->lru, &lrugen->folios[gen][type][zone]);
-
- return true;
-}
-
+bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming);
static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
unsigned long flags;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6d0cd2840cf0..edfe073b475e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3748,29 +3748,6 @@ static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
* the aging
******************************************************************************/
-/* promote pages accessed through page tables */
-static int folio_update_gen(struct folio *folio, int gen)
-{
- unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
-
- VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
- VM_WARN_ON_ONCE(!rcu_read_lock_held());
-
- do {
- /* lru_gen_del_folio() has isolated this page? */
- if (!(old_flags & LRU_GEN_MASK)) {
- /* for shrink_folio_list() */
- new_flags = old_flags | BIT(PG_referenced);
- continue;
- }
-
- new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
- new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
- } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
-
- return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
-}
-
/* protect pages accessed multiple times through file descriptors */
static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
@@ -3801,6 +3778,70 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
return new_gen;
}
+static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
+{
+ unsigned long pfn = pte_pfn(pte);
+
+ VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
+
+ if (!pte_present(pte) || is_zero_pfn(pfn))
+ return -1;
+
+ if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
+ return -1;
+
+ if (WARN_ON_ONCE(!pfn_valid(pfn)))
+ return -1;
+
+ return pfn;
+}
+
+static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
+ struct pglist_data *pgdat, bool can_swap)
+{
+ struct folio *folio;
+
+ /* try to avoid unnecessary memory loads */
+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+ return NULL;
+
+ folio = pfn_folio(pfn);
+ if (folio_nid(folio) != pgdat->node_id)
+ return NULL;
+
+ if (folio_memcg_rcu(folio) != memcg)
+ return NULL;
+
+ /* file VMAs can contain anon pages from COW */
+ if (!folio_is_file_lru(folio) && !can_swap)
+ return NULL;
+
+ return folio;
+}
+
+/* promote pages accessed through page tables */
+static int folio_update_gen(struct folio *folio, int gen)
+{
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+
+ VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
+ VM_WARN_ON_ONCE(!rcu_read_lock_held());
+
+ do {
+ /* lru_gen_del_folio() has isolated this page? */
+ if (!(old_flags & LRU_GEN_MASK)) {
+ /* for shrink_folio_list() */
+ new_flags = old_flags | BIT(PG_referenced);
+ continue;
+ }
+
+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
+ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
+ } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+
+ return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+}
+
static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
int old_gen, int new_gen)
{
@@ -3910,23 +3951,6 @@ static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk
return false;
}
-static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
-{
- unsigned long pfn = pte_pfn(pte);
-
- VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
-
- if (!pte_present(pte) || is_zero_pfn(pfn))
- return -1;
-
- if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
- return -1;
-
- if (WARN_ON_ONCE(!pfn_valid(pfn)))
- return -1;
-
- return pfn;
-}
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr)
@@ -3948,29 +3972,6 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned
}
#endif
-static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
- struct pglist_data *pgdat, bool can_swap)
-{
- struct folio *folio;
-
- /* try to avoid unnecessary memory loads */
- if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
- return NULL;
-
- folio = pfn_folio(pfn);
- if (folio_nid(folio) != pgdat->node_id)
- return NULL;
-
- if (folio_memcg_rcu(folio) != memcg)
- return NULL;
-
- /* file VMAs can contain anon pages from COW */
- if (!folio_is_file_lru(folio) && !can_swap)
- return NULL;
-
- return folio;
-}
-
static bool suitable_to_scan(int total, int young)
{
int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8);
@@ -5557,6 +5558,51 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *
pgdat->kswapd_failures = 0;
}
+bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+{
+ unsigned long seq;
+ unsigned long flags;
+ int gen = folio_lru_gen(folio);
+ int type = folio_is_file_lru(folio);
+ int zone = folio_zonenum(folio);
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
+
+ VM_WARN_ON_ONCE_FOLIO(gen != -1, folio);
+
+ if (folio_test_unevictable(folio) || !lrugen->enabled)
+ return false;
+ /*
+ * There are three common cases for this page:
+ * 1. If it's hot, e.g., freshly faulted in or previously hot and
+ * migrated, add it to the youngest generation.
+ * 2. If it's cold but can't be evicted immediately, i.e., an anon page
+ * not in swapcache or a dirty page pending writeback, add it to the
+ * second oldest generation.
+ * 3. Everything else (clean, cold) is added to the oldest generation.
+ */
+ if (folio_test_active(folio))
+ seq = lrugen->max_seq;
+ else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) ||
+ (folio_test_reclaim(folio) &&
+ (folio_test_dirty(folio) || folio_test_writeback(folio))))
+ seq = lrugen->min_seq[type] + 1;
+ else
+ seq = lrugen->min_seq[type];
+
+ gen = lru_gen_from_seq(seq);
+ flags = (gen + 1UL) << LRU_GEN_PGOFF;
+ /* see the comment on MIN_NR_GENS about PG_active */
+ set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags);
+
+ lru_gen_update_size(lruvec, folio, -1, gen);
+ /* for folio_rotate_reclaimable() */
+ if (reclaiming)
+ list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
+ else
+ list_add(&folio->lru, &lrugen->folios[gen][type][zone]);
+
+ return true;
+}
/******************************************************************************
* state change
******************************************************************************/
--
2.40.1
* [PATCH 2/3] mm/lru_gen: lru_gen_look_around simplification
2023-06-13 12:00 [PATCH 1/3] mm/lru_gen: Move some code around so that next patch is simpler Aneesh Kumar K.V
@ 2023-06-13 12:00 ` Aneesh Kumar K.V
2023-06-13 12:00 ` [PATCH 3/3] mm/lru_gen: Don't build multi-gen LRU page table walk code on architecture not supported Aneesh Kumar K.V
1 sibling, 0 replies; 16+ messages in thread
From: Aneesh Kumar K.V @ 2023-06-13 12:00 UTC (permalink / raw)
To: linux-mm, akpm; +Cc: Yu Zhao, T . J . Alumbaugh, Aneesh Kumar K.V
To store generation details in folio flags, we need the lru_gen_mm_walk
structure, in which we batch the nr_pages updates. A follow-up patch wants to
avoid compiling all the lru_gen mm-walk-related code on architectures that
don't support it. Split the look-around generation update that marks a
folio active into a separate helper, which will be used in that case.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
mm/vmscan.c | 57 +++++++++++++++++++++++++++++++++++------------------
1 file changed, 38 insertions(+), 19 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index edfe073b475e..f277beba556c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4619,6 +4619,39 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
* rmap/PT walk feedback
******************************************************************************/
+static void __look_around_gen_update(struct folio *folio, int new_gen)
+{
+ int old_gen;
+
+ old_gen = folio_lru_gen(folio);
+ if (old_gen < 0)
+ folio_set_referenced(folio);
+ else if (old_gen != new_gen)
+ folio_activate(folio);
+}
+
+static inline bool current_reclaim_state_can_swap(void)
+{
+ if (current->reclaim_state && current->reclaim_state->mm_walk)
+ return current->reclaim_state->mm_walk->can_swap;
+ return true;
+}
+
+static void look_around_gen_update(struct folio *folio, int new_gen)
+{
+ int old_gen;
+ struct lru_gen_mm_walk *walk;
+
+ walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
+ if (walk) {
+ old_gen = folio_update_gen(folio, new_gen);
+ if (old_gen >= 0 && old_gen != new_gen)
+ update_batch_size(walk, folio, old_gen, new_gen);
+ return;
+ }
+ return __look_around_gen_update(folio, new_gen);
+}
+
/*
* This function exploits spatial locality when shrink_folio_list() walks the
* rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If
@@ -4631,7 +4664,6 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
int i;
unsigned long start;
unsigned long end;
- struct lru_gen_mm_walk *walk;
int young = 0;
pte_t *pte = pvmw->pte;
unsigned long addr = pvmw->address;
@@ -4640,7 +4672,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
struct pglist_data *pgdat = folio_pgdat(folio);
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
DEFINE_MAX_SEQ(lruvec);
- int old_gen, new_gen = lru_gen_from_seq(max_seq);
+ int new_gen = lru_gen_from_seq(max_seq);
lockdep_assert_held(pvmw->ptl);
VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
@@ -4648,9 +4680,6 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
if (spin_is_contended(pvmw->ptl))
return;
- /* avoid taking the LRU lock under the PTL when possible */
- walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
-
start = max(addr & PMD_MASK, pvmw->vma->vm_start);
end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
@@ -4683,7 +4712,9 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
if (!pte_young(pte[i]))
continue;
- folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap);
+ folio = get_pfn_folio(pfn, memcg, pgdat,
+ current_reclaim_state_can_swap());
+
if (!folio)
continue;
@@ -4697,19 +4728,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
!folio_test_swapcache(folio)))
folio_mark_dirty(folio);
- if (walk) {
- old_gen = folio_update_gen(folio, new_gen);
- if (old_gen >= 0 && old_gen != new_gen)
- update_batch_size(walk, folio, old_gen, new_gen);
-
- continue;
- }
-
- old_gen = folio_lru_gen(folio);
- if (old_gen < 0)
- folio_set_referenced(folio);
- else if (old_gen != new_gen)
- folio_activate(folio);
+ look_around_gen_update(folio, new_gen);
}
arch_leave_lazy_mmu_mode();
--
2.40.1
* [PATCH 3/3] mm/lru_gen: Don't build multi-gen LRU page table walk code on architecture not supported
2023-06-13 12:00 [PATCH 1/3] mm/lru_gen: Move some code around so that next patch is simpler Aneesh Kumar K.V
2023-06-13 12:00 ` [PATCH 2/3] mm/lru_gen: lru_gen_look_around simplification Aneesh Kumar K.V
@ 2023-06-13 12:00 ` Aneesh Kumar K.V
2023-06-13 12:23 ` Matthew Wilcox
` (2 more replies)
1 sibling, 3 replies; 16+ messages in thread
From: Aneesh Kumar K.V @ 2023-06-13 12:00 UTC (permalink / raw)
To: linux-mm, akpm; +Cc: Yu Zhao, T . J . Alumbaugh, Aneesh Kumar K.V
Not all architectures support hardware atomic updates of access bits. On
such architectures, we don't use page table walks to classify pages into
generations. Add a kernel config option and avoid building the page
table walk code on such architectures.
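The option is consumed both through preprocessor guards and, where the code
must stay buildable everywhere, via IS_ENABLED(); for example, mirroring the
kernel/fork.c hunk below:

	if (IS_ENABLED(CONFIG_LRU_TASK_PAGE_AGING) && !(clone_flags & CLONE_VM)) {
		/* lock the task to synchronize with memcg migration */
		task_lock(p);
		lru_gen_add_mm(p->mm);
		task_unlock(p);
	}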
No performance change observed with the mongodb ycsb test:
Patch details Throughput(Ops/sec)
without patch 93278
With patch 93400
Without patch:
$ size mm/vmscan.o
text data bss dec hex filename
112102 42721 40 154863 25cef mm/vmscan.o
With patch
$ size mm/vmscan.o
text data bss dec hex filename
105430 41333 24 146787 23d63 mm/vmscan.o
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
arch/Kconfig | 3 +
arch/arm64/Kconfig | 1 +
arch/x86/Kconfig | 1 +
include/linux/memcontrol.h | 2 +-
include/linux/mm_types.h | 8 +--
include/linux/mmzone.h | 8 +++
include/linux/swap.h | 2 +-
kernel/fork.c | 2 +-
mm/memcontrol.c | 2 +-
mm/vmscan.c | 128 +++++++++++++++++++++++++++++++++----
10 files changed, 137 insertions(+), 20 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index 205fd23e0cad..5cdd98731298 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1458,6 +1458,9 @@ config DYNAMIC_SIGFRAME
config HAVE_ARCH_NODE_DEV_GROUP
bool
+config LRU_TASK_PAGE_AGING
+ bool
+
config ARCH_HAS_NONLEAF_PMD_YOUNG
bool
help
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index b1201d25a8a4..e0994fb3504b 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -225,6 +225,7 @@ config ARM64
select IRQ_DOMAIN
select IRQ_FORCED_THREADING
select KASAN_VMALLOC if KASAN
+ select LRU_TASK_PAGE_AGING if LRU_GEN
select MODULES_USE_ELF_RELA
select NEED_DMA_MAP_STATE
select NEED_SG_DMA_LENGTH
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 53bab123a8ee..bde9e6f33b22 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -276,6 +276,7 @@ config X86
select HAVE_GENERIC_VDSO
select HOTPLUG_SMT if SMP
select IRQ_FORCED_THREADING
+ select LRU_TASK_PAGE_AGING if LRU_GEN
select NEED_PER_CPU_EMBED_FIRST_CHUNK
select NEED_PER_CPU_PAGE_FIRST_CHUNK
select NEED_SG_DMA_LENGTH
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 43d4ec8445d4..ea5d1d7bfb8b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -321,7 +321,7 @@ struct mem_cgroup {
struct deferred_split deferred_split_queue;
#endif
-#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
/* per-memcg mm_struct list */
struct lru_gen_mm_list mm_list;
#endif
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 306a3d1a0fa6..f90a4860a792 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -786,7 +786,7 @@ struct mm_struct {
*/
unsigned long ksm_rmap_items;
#endif
-#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
struct {
/* this mm_struct is on lru_gen_mm_list */
struct list_head list;
@@ -801,7 +801,7 @@ struct mm_struct {
struct mem_cgroup *memcg;
#endif
} lru_gen;
-#endif /* CONFIG_LRU_GEN */
+#endif /* CONFIG_LRU_TASK_PAGE_AGING */
} __randomize_layout;
/*
@@ -830,7 +830,7 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
return (struct cpumask *)&mm->cpu_bitmap;
}
-#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
struct lru_gen_mm_list {
/* mm_struct list for page table walkers */
@@ -864,7 +864,7 @@ static inline void lru_gen_use_mm(struct mm_struct *mm)
WRITE_ONCE(mm->lru_gen.bitmap, -1);
}
-#else /* !CONFIG_LRU_GEN */
+#else /* !CONFIG_LRU_TASK_PAGE_AGING */
static inline void lru_gen_add_mm(struct mm_struct *mm)
{
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a4889c9d4055..b35698148d3c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -427,6 +427,7 @@ struct lru_gen_folio {
#endif
};
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
enum {
MM_LEAF_TOTAL, /* total leaf entries */
MM_LEAF_OLD, /* old leaf entries */
@@ -469,6 +470,7 @@ struct lru_gen_mm_walk {
bool can_swap;
bool force_scan;
};
+#endif
void lru_gen_init_lruvec(struct lruvec *lruvec);
void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
@@ -613,8 +615,12 @@ struct lruvec {
#ifdef CONFIG_LRU_GEN
/* evictable pages divided into generations */
struct lru_gen_folio lrugen;
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
/* to concurrently iterate lru_gen_mm_list */
struct lru_gen_mm_state mm_state;
+#else
+ bool seq_update_progress;
+#endif
#endif
#ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
@@ -1354,8 +1360,10 @@ typedef struct pglist_data {
unsigned long flags;
#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
/* kswap mm walk data */
struct lru_gen_mm_walk mm_walk;
+#endif
/* lru_gen_folio list */
struct lru_gen_memcg memcg_lru;
#endif
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3c69cb653cb9..ce09b1e44275 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -155,7 +155,7 @@ union swap_header {
struct reclaim_state {
/* pages reclaimed outside of LRU-based reclaim */
unsigned long reclaimed;
-#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
/* per-thread mm walk data */
struct lru_gen_mm_walk *mm_walk;
#endif
diff --git a/kernel/fork.c b/kernel/fork.c
index ed4e01daccaa..2c9e21e39f84 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2939,7 +2939,7 @@ pid_t kernel_clone(struct kernel_clone_args *args)
get_task_struct(p);
}
- if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
+ if (IS_ENABLED(CONFIG_LRU_TASK_PAGE_AGING) && !(clone_flags & CLONE_VM)) {
/* lock the task to synchronize with memcg migration */
task_lock(p);
lru_gen_add_mm(p->mm);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 959d6a27e23d..d8fe30d880c6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6404,7 +6404,7 @@ static void mem_cgroup_move_task(void)
}
#endif
-#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
static void mem_cgroup_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f277beba556c..207e62d42888 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3304,6 +3304,7 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
}
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
/******************************************************************************
* Bloom filters
******************************************************************************/
@@ -3650,6 +3651,7 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
return success;
}
+#endif
/******************************************************************************
* PID controller
@@ -3819,6 +3821,8 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
return folio;
}
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
+
/* promote pages accessed through page tables */
static int folio_update_gen(struct folio *folio, int gen)
{
@@ -3882,6 +3886,16 @@ static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk
}
}
+static void reset_current_reclaim_batch_size(struct lruvec *lruvec)
+{
+ struct lru_gen_mm_walk *walk;
+
+ walk = current->reclaim_state->mm_walk;
+ if (walk && walk->batched)
+ return reset_batch_size(lruvec, walk);
+
+}
+
static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args)
{
struct address_space *mapping;
@@ -4304,7 +4318,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_
} while (err == -EAGAIN);
}
-static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc)
+static void *set_mm_walk(struct pglist_data *pgdat, bool force_alloc)
{
struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
@@ -4335,6 +4349,23 @@ static void clear_mm_walk(void)
if (!current_is_kswapd())
kfree(walk);
}
+#else
+
+static void reset_current_reclaim_batch_size(struct lruvec *lruvec)
+{
+
+}
+
+static inline void *set_mm_walk(struct pglist_data *pgdat, bool force_alloc)
+{
+ return NULL;
+}
+
+static inline void clear_mm_walk(void)
+{
+
+}
+#endif
static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
{
@@ -4468,11 +4499,15 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
/* make sure preceding modifications appear */
smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
+#ifndef CONFIG_LRU_TASK_PAGE_AGING
+ lruvec->seq_update_progress = false;
+#endif
spin_unlock_irq(&lruvec->lru_lock);
}
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
- struct scan_control *sc, bool can_swap, bool force_scan)
+ bool can_swap, bool force_scan)
{
bool success;
struct lru_gen_mm_walk *walk;
@@ -4498,7 +4533,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
goto done;
}
- walk = set_mm_walk(NULL, true);
+ walk = (struct lru_gen_mm_walk *)set_mm_walk(NULL, true);
if (!walk) {
success = iterate_mm_list_nowalk(lruvec, max_seq);
goto done;
@@ -4520,6 +4555,51 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
return success;
}
+#else
+
+/*
+ * inc_max_seq() can drop the lru_lock in between, so use the seq_update_progress
+ * flag to serialize concurrent updaters.
+ */
+bool __try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
+ bool can_swap, bool force_scan)
+{
+ bool success = false;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
+
+ VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
+
+ /* see the comment in iterate_mm_list() */
+ if (lruvec->seq_update_progress)
+ success = false;
+ else {
+ spin_lock_irq(&lruvec->lru_lock);
+
+ if (max_seq != lrugen->max_seq)
+ goto done;
+
+ if (lruvec->seq_update_progress)
+ goto done;
+
+ success = true;
+ lruvec->seq_update_progress = true;
+done:
+ spin_unlock_irq(&lruvec->lru_lock);
+ }
+
+ if (success)
+ inc_max_seq(lruvec, can_swap, force_scan);
+
+ return success;
+}
+
+static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
+ bool can_swap, bool force_scan)
+{
+ return __try_to_inc_max_seq(lruvec, max_seq, can_swap, force_scan);
+}
+#endif
+
/******************************************************************************
* working set protection
@@ -4630,6 +4710,7 @@ static void __look_around_gen_update(struct folio *folio, int new_gen)
folio_activate(folio);
}
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
static inline bool current_reclaim_state_can_swap(void)
{
if (current->reclaim_state && current->reclaim_state->mm_walk)
@@ -4651,6 +4732,18 @@ static void look_around_gen_update(struct folio *folio, int new_gen)
}
return __look_around_gen_update(folio, new_gen);
}
+#else
+
+static inline bool current_reclaim_state_can_swap(void)
+{
+ return true;
+}
+
+static inline void look_around_gen_update(struct folio *folio, int new_gen)
+{
+ return __look_around_gen_update(folio, new_gen);
+}
+#endif
/*
* This function exploits spatial locality when shrink_folio_list() walks the
@@ -4714,7 +4807,6 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
folio = get_pfn_folio(pfn, memcg, pgdat,
current_reclaim_state_can_swap());
-
if (!folio)
continue;
@@ -4734,9 +4826,11 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
arch_leave_lazy_mmu_mode();
mem_cgroup_unlock_pages();
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
/* feedback from rmap walkers to page table walkers */
if (suitable_to_scan(i, young))
update_bloom_filter(lruvec, max_seq, pvmw->pmd);
+#endif
}
/******************************************************************************
@@ -5156,7 +5250,6 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
struct folio *next;
enum vm_event_item item;
struct reclaim_stat stat;
- struct lru_gen_mm_walk *walk;
bool skip_retry = false;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
@@ -5211,9 +5304,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
move_folios_to_lru(lruvec, &list);
- walk = current->reclaim_state->mm_walk;
- if (walk && walk->batched)
- reset_batch_size(lruvec, walk);
+ reset_current_reclaim_batch_size(lruvec);
item = PGSTEAL_KSWAPD + reclaimer_offset();
if (!cgroup_reclaim(sc))
@@ -5321,7 +5412,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool
return nr_to_scan;
/* skip this lruvec as it's low on cold folios */
- return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
+ return try_to_inc_max_seq(lruvec, max_seq, can_swap, false) ? -1 : 0;
}
static unsigned long get_nr_to_reclaim(struct scan_control *sc)
@@ -5929,6 +6020,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
seq_putc(m, '\n');
}
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
seq_puts(m, " ");
for (i = 0; i < NR_MM_STATS; i++) {
const char *s = " ";
@@ -5945,6 +6037,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
seq_printf(m, " %10lu%c", n, s[i]);
}
seq_putc(m, '\n');
+#endif
}
/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
@@ -6026,7 +6119,7 @@ static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_contr
if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
return -ERANGE;
- try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan);
+ try_to_inc_max_seq(lruvec, max_seq, can_swap, force_scan);
return 0;
}
@@ -6218,7 +6311,12 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
for_each_gen_type_zone(gen, type, zone)
INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]);
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
lruvec->mm_state.seq = MIN_NR_GENS;
+#else
+ lruvec->seq_update_progress = false;
+#endif
+
}
#ifdef CONFIG_MEMCG
@@ -6237,16 +6335,20 @@ void lru_gen_init_pgdat(struct pglist_data *pgdat)
void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
INIT_LIST_HEAD(&memcg->mm_list.fifo);
spin_lock_init(&memcg->mm_list.lock);
+
+#endif
}
void lru_gen_exit_memcg(struct mem_cgroup *memcg)
{
- int i;
int nid;
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
VM_WARN_ON_ONCE(!list_empty(&memcg->mm_list.fifo));
+#endif
for_each_node(nid) {
struct lruvec *lruvec = get_lruvec(memcg, nid);
@@ -6256,10 +6358,12 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg)
lruvec->lrugen.list.next = LIST_POISON1;
- for (i = 0; i < NR_BLOOM_FILTERS; i++) {
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
+ for (int i = 0; i < NR_BLOOM_FILTERS; i++) {
bitmap_free(lruvec->mm_state.filters[i]);
lruvec->mm_state.filters[i] = NULL;
}
+#endif
}
}
--
2.40.1
* Re: [PATCH 3/3] mm/lru_gen: Don't build multi-gen LRU page table walk code on architecture not supported
2023-06-13 12:00 ` [PATCH 3/3] mm/lru_gen: Don't build multi-gen LRU page table walk code on architecture not supported Aneesh Kumar K.V
@ 2023-06-13 12:23 ` Matthew Wilcox
2023-06-13 13:28 ` Aneesh Kumar K V
2023-06-21 2:27 ` kernel test robot
2023-06-24 14:53 ` Aneesh Kumar K.V
2 siblings, 1 reply; 16+ messages in thread
From: Matthew Wilcox @ 2023-06-13 12:23 UTC (permalink / raw)
To: Aneesh Kumar K.V; +Cc: linux-mm, akpm, Yu Zhao, T . J . Alumbaugh
On Tue, Jun 13, 2023 at 05:30:47PM +0530, Aneesh Kumar K.V wrote:
> @@ -4498,7 +4533,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
> goto done;
> }
>
> - walk = set_mm_walk(NULL, true);
> + walk = (struct lru_gen_mm_walk *)set_mm_walk(NULL, true);
This isn't C++.
* Re: [PATCH 3/3] mm/lru_gen: Don't build multi-gen LRU page table walk code on architecture not supported
2023-06-13 12:23 ` Matthew Wilcox
@ 2023-06-13 13:28 ` Aneesh Kumar K V
2023-06-13 13:36 ` Matthew Wilcox
0 siblings, 1 reply; 16+ messages in thread
From: Aneesh Kumar K V @ 2023-06-13 13:28 UTC (permalink / raw)
To: Matthew Wilcox; +Cc: linux-mm, akpm, Yu Zhao, T . J . Alumbaugh
On 6/13/23 5:53 PM, Matthew Wilcox wrote:
> On Tue, Jun 13, 2023 at 05:30:47PM +0530, Aneesh Kumar K.V wrote:
>> @@ -4498,7 +4533,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
>> goto done;
>> }
>>
>> - walk = set_mm_walk(NULL, true);
>> + walk = (struct lru_gen_mm_walk *)set_mm_walk(NULL, true);
>
> This isn't C++.
>
We have a similar pattern for things like kmalloc()? I understand the desire to have functions return
the correct type, but the amount of code that we are able to avoid with this patch for certain architectures is
really large.
-aneesh
* Re: [PATCH 3/3] mm/lru_gen: Don't build multi-gen LRU page table walk code on architecture not supported
2023-06-13 13:28 ` Aneesh Kumar K V
@ 2023-06-13 13:36 ` Matthew Wilcox
2023-06-13 13:47 ` Aneesh Kumar K V
0 siblings, 1 reply; 16+ messages in thread
From: Matthew Wilcox @ 2023-06-13 13:36 UTC (permalink / raw)
To: Aneesh Kumar K V; +Cc: linux-mm, akpm, Yu Zhao, T . J . Alumbaugh
On Tue, Jun 13, 2023 at 06:58:41PM +0530, Aneesh Kumar K V wrote:
> On 6/13/23 5:53 PM, Matthew Wilcox wrote:
> > On Tue, Jun 13, 2023 at 05:30:47PM +0530, Aneesh Kumar K.V wrote:
> >> @@ -4498,7 +4533,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
> >> goto done;
> >> }
> >>
> >> - walk = set_mm_walk(NULL, true);
> >> + walk = (struct lru_gen_mm_walk *)set_mm_walk(NULL, true);
> >
> > This isn't C++.
> >
>
> We have a similar pattern for things like kmalloc()?
No. No, we don't. Nobody does that. Perhaps some really crappy code
in staging. DO NOT USE CASTS.
> I understand the desire to have functions return
> the correct type, but the amount of code that we are able to avoid with this patch for certain architectures is
> really large.
There's probably a better way to do what you're trying to do, but the
simple fact remains that the cast you added is needed in C++ and not in C.
Linux is not written in C++. Do not add the cast.
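For reference, the distinction in question (standard C, given some struct
foo; not code from the patch):

	void *p = kmalloc(sizeof(struct foo), GFP_KERNEL);
	struct foo *a = p;			/* C: void * converts implicitly, no cast */
	struct foo *b = (struct foo *)p;	/* needed in C++, redundant in C */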
* Re: [PATCH 3/3] mm/lru_gen: Don't build multi-gen LRU page table walk code on architecture not supported
2023-06-13 13:36 ` Matthew Wilcox
@ 2023-06-13 13:47 ` Aneesh Kumar K V
0 siblings, 0 replies; 16+ messages in thread
From: Aneesh Kumar K V @ 2023-06-13 13:47 UTC (permalink / raw)
To: Matthew Wilcox; +Cc: linux-mm, akpm, Yu Zhao, T . J . Alumbaugh
On 6/13/23 7:06 PM, Matthew Wilcox wrote:
> On Tue, Jun 13, 2023 at 06:58:41PM +0530, Aneesh Kumar K V wrote:
>> On 6/13/23 5:53 PM, Matthew Wilcox wrote:
>>> On Tue, Jun 13, 2023 at 05:30:47PM +0530, Aneesh Kumar K.V wrote:
>>>> @@ -4498,7 +4533,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
>>>> goto done;
>>>> }
>>>>
>>>> - walk = set_mm_walk(NULL, true);
>>>> + walk = (struct lru_gen_mm_walk *)set_mm_walk(NULL, true);
>>>
>>> This isn't C++.
>>>
>>
> >> We have a similar pattern for things like kmalloc()?
>
> No. No, we don't. Nobody does that. Perhaps some really crappy code
> in staging. DO NOT USE CASTS.
>
>> I understand the desire to have functions return
> >> the correct type, but the amount of code that we are able to avoid with this patch for certain architectures is
>> really large.
>
> There's probably a better way to do what you're trying to do, but the
> simple fact remains that the cast you added is needed in C++ and not in C.
> Linux is not written in C++. Do not add the cast.
What I want is to allow the usage of set_mm_walk() such that I don't need to have that #ifdef throughout the code.
I also want to keep the definition of struct lru_gen_mm_walk {} within that #ifdef as below.
#ifdef CONFIG_LRU_TASK_PAGE_AGING
struct lru_gen_mm_walk {
/* the lruvec under reclaim */
...
};
static void *set_mm_walk(struct pglist_data *pgdat, bool force_alloc)
{
struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
...
return walk;
}
#else
static inline void *set_mm_walk(struct pglist_data *pgdat, bool force_alloc)
{
return NULL;
}
#endif
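With that shape, the call site needs no cast either, since a void * assigns
to any object pointer type in C (a sketch mirroring the hunk quoted above):

	struct lru_gen_mm_walk *walk;

	walk = set_mm_walk(NULL, true);
	if (!walk) {
		success = iterate_mm_list_nowalk(lruvec, max_seq);
		goto done;
	}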
* Re: [PATCH 3/3] mm/lru_gen: Don't build multi-gen LRU page table walk code on architecture not supported
2023-06-13 12:00 ` [PATCH 3/3] mm/lru_gen: Don't build multi-gen LRU page table walk code on architecture not supported Aneesh Kumar K.V
2023-06-13 12:23 ` Matthew Wilcox
@ 2023-06-21 2:27 ` kernel test robot
2023-06-24 14:53 ` Aneesh Kumar K.V
2 siblings, 0 replies; 16+ messages in thread
From: kernel test robot @ 2023-06-21 2:27 UTC (permalink / raw)
To: Aneesh Kumar K.V, linux-mm, akpm
Cc: oe-kbuild-all, Yu Zhao, T . J . Alumbaugh, Aneesh Kumar K.V
Hi Aneesh,
kernel test robot noticed the following build warnings:
[auto build test WARNING on arm64/for-next/core]
[also build test WARNING on linus/master tip/x86/core v6.4-rc7 next-20230620]
[cannot apply to akpm-mm/mm-everything]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Aneesh-Kumar-K-V/mm-lru_gen-lru_gen_look_around-simplification/20230613-200408
base: https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git for-next/core
patch link: https://lore.kernel.org/r/20230613120047.149573-3-aneesh.kumar%40linux.ibm.com
patch subject: [PATCH 3/3] mm/lru_gen: Don't build multi-gen LRU page table walk code on architecture not supported
config: alpha-allmodconfig (https://download.01.org/0day-ci/archive/20230621/202306211018.fodsNZaR-lkp@intel.com/config)
compiler: alpha-linux-gcc (GCC) 12.3.0
reproduce: (https://download.01.org/0day-ci/archive/20230621/202306211018.fodsNZaR-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202306211018.fodsNZaR-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> mm/vmscan.c:4564:6: warning: no previous prototype for '__try_to_inc_max_seq' [-Wmissing-prototypes]
4564 | bool __try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
| ^~~~~~~~~~~~~~~~~~~~
vim +/__try_to_inc_max_seq +4564 mm/vmscan.c
4559
4560 /*
4561 * inc_max_seq() can drop the lru_lock in between, so use the seq_update_progress
4562 * flag to serialize concurrent updaters.
4563 */
> 4564 bool __try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
4565 bool can_swap, bool force_scan)
4566 {
4567 bool success = false;
4568 struct lru_gen_folio *lrugen = &lruvec->lrugen;
4569
4570 VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
4571
4572 /* see the comment in iterate_mm_list() */
4573 if (lruvec->seq_update_progress)
4574 success = false;
4575 else {
4576 spin_lock_irq(&lruvec->lru_lock);
4577
4578 if (max_seq != lrugen->max_seq)
4579 goto done;
4580
4581 if (lruvec->seq_update_progress)
4582 goto done;
4583
4584 success = true;
4585 lruvec->seq_update_progress = true;
4586 done:
4587 spin_unlock_irq(&lruvec->lru_lock);
4588 }
4589
4590 if (success)
4591 inc_max_seq(lruvec, can_swap, force_scan);
4592
4593 return success;
4594 }
4595
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
* Re: [PATCH 3/3] mm/lru_gen: Don't build multi-gen LRU page table walk code on architecture not supported
2023-06-13 12:00 ` [PATCH 3/3] mm/lru_gen: Don't build multi-gen LRU page table walk code on architecture not supported Aneesh Kumar K.V
2023-06-13 12:23 ` Matthew Wilcox
2023-06-21 2:27 ` kernel test robot
@ 2023-06-24 14:53 ` Aneesh Kumar K.V
2023-06-25 19:34 ` Yu Zhao
2 siblings, 1 reply; 16+ messages in thread
From: Aneesh Kumar K.V @ 2023-06-24 14:53 UTC (permalink / raw)
To: linux-mm, akpm; +Cc: Yu Zhao, T . J . Alumbaugh
Hi Yu Zhao,
"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
> Not all architectures support hardware atomic updates of access bits. On
> such architectures, we don't use page table walks to classify pages into
> generations. Add a kernel config option and avoid building the page
> table walk code on such architectures.
>
> No performance change observed with the mongodb ycsb test:
>
> Patch details Throughput(Ops/sec)
> without patch 93278
> With patch 93400
>
> Without patch:
> $ size mm/vmscan.o
> text data bss dec hex filename
> 112102 42721 40 154863 25cef mm/vmscan.o
>
> With patch
>
> $ size mm/vmscan.o
> text data bss dec hex filename
> 105430 41333 24 146787 23d63 mm/vmscan.o
>
Any feedback on this patch? Can we look at merging this change?
-aneesh
* Re: [PATCH 3/3] mm/lru_gen: Don't build multi-gen LRU page table walk code on architecture not supported
2023-06-24 14:53 ` Aneesh Kumar K.V
@ 2023-06-25 19:34 ` Yu Zhao
2023-06-26 10:52 ` Aneesh Kumar K V
0 siblings, 1 reply; 16+ messages in thread
From: Yu Zhao @ 2023-06-25 19:34 UTC (permalink / raw)
To: Aneesh Kumar K.V; +Cc: linux-mm, akpm
[-- Attachment #1: Type: text/plain, Size: 2193 bytes --]
On Sat, Jun 24, 2023 at 8:54 AM Aneesh Kumar K.V
<aneesh.kumar@linux.ibm.com> wrote:
>
> Hi Yu Zhao,
>
> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>
> > Not all architectures support hardware atomic updates of access bits. On
> > such architectures, we don't use page table walks to classify pages into
> > generations. Add a kernel config option and avoid building the page
> > table walk code on such architectures.
> >
> > No performance change observed with the mongodb ycsb test:
> >
> > Patch details Throughput(Ops/sec)
> > without patch 93278
> > With patch 93400
> >
> > Without patch:
> > $ size mm/vmscan.o
> > text data bss dec hex filename
> > 112102 42721 40 154863 25cef mm/vmscan.o
> >
> > With patch
> >
> > $ size mm/vmscan.o
> > text data bss dec hex filename
> > 105430 41333 24 146787 23d63 mm/vmscan.o
> >
>
> Any feedback on this patch? Can we look at merging this change?
Just want to make sure I fully understand the motivation: are there
any other end goals besides reducing the footprint mentioned above?
E.g., preparing for HCA, etc. (My current understanding is that HCA
shouldn't care about it, since it's already runtime disabled if HCA
doesn't want to use it.)
Also as explained offline, solely relying on folio_activate() in
lru_gen_look_around() can cause a measurable regression on powerpc,
because
1. PAGEVEC_SIZE is 15 whereas pglist_data->mm_walk.batched is
virtually unlimited.
2. Once folio_activate() reaches that limit, it takes the LRU lock on
top of the PTL, which can be shared by multiple page tables on
powerpc.
In fact, I think we should try the opposite direction first, before arriving
at any conclusions, i.e.,
#define arch_has_hw_pte_young() radix_enabled()
on powerpc. This might benefit platforms with the A-bit but not HCA,
e.g., POWER9. I just ran a quick test (memcached/memtier I previously
shared with you) and it showed far less PTL contention in kswapd. I'm
attaching the flamegraphs for you to analyze. Could you try some
benchmarks with the above change on your end as well?
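A sketch of where such an override could live; the exact placement below is
hypothetical:

	/* e.g. in arch/powerpc/include/asm/book3s/64/pgtable.h */
	#define arch_has_hw_pte_young()	radix_enabled()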
Thanks.
[-- Attachment #2: a.svg --]
[-- Type: image/svg+xml, Size: 347341 bytes --]
[-- Attachment #3: b.svg --]
[-- Type: image/svg+xml, Size: 342754 bytes --]
* Re: [PATCH 3/3] mm/lru_gen: Don't build multi-gen LRU page table walk code on architecture not supported
2023-06-25 19:34 ` Yu Zhao
@ 2023-06-26 10:52 ` Aneesh Kumar K V
2023-06-26 17:04 ` Yu Zhao
0 siblings, 1 reply; 16+ messages in thread
From: Aneesh Kumar K V @ 2023-06-26 10:52 UTC (permalink / raw)
To: Yu Zhao; +Cc: linux-mm, akpm
On 6/26/23 1:04 AM, Yu Zhao wrote:
> On Sat, Jun 24, 2023 at 8:54 AM Aneesh Kumar K.V
> <aneesh.kumar@linux.ibm.com> wrote:
>>
>> Hi Yu Zhao,
>>
>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>>
>>> Not all architectures support hardware atomic updates of access bits. On
>>> such architectures, we don't use page table walks to classify pages into
>>> generations. Add a kernel config option and avoid building the page
>>> table walk code on such architectures.
>>>
>>> No performance change observed with the mongodb ycsb test:
>>>
>>> Patch details Throughput(Ops/sec)
>>> without patch 93278
>>> With patch 93400
>>>
>>> Without patch:
>>> $ size mm/vmscan.o
>>> text data bss dec hex filename
>>> 112102 42721 40 154863 25cef mm/vmscan.o
>>>
>>> With patch
>>>
>>> $ size mm/vmscan.o
>>> text data bss dec hex filename
>>> 105430 41333 24 146787 23d63 mm/vmscan.o
>>>
>>
>> Any feedback on this patch? Can we look at merging this change?
>
> Just want to make sure I fully understand the motivation: are there
> any other end goals besides reducing the footprint mentioned above?
> E.g., preparing for HCA, etc. (My current understanding is that HCA
> shouldn't care about it, since it's already runtime disabled if HCA
> doesn't want to use it.)
>
My goal with this change was to remove all that dead code from getting compiled
in for ppc64.
> Also as explained offline, solely relying on folio_activate() in
> lru_gen_look_around() can cause a measure regression on powerpc,
> because
> 1. PAGEVEC_SIZE is 15 whereas pglist_data->mm_walk.batched is
> virtually unlimited.
> 2. Once folio_activate() reaches that limit, it takes the LRU lock on
> top of the PTL, which can be shared by multiple page tables on
> powerpc.
>
> In fact, I think we should try the opposite direction first, before arriving
> at any conclusions, i.e.,
> #define arch_has_hw_pte_young() radix_enabled()
The reason it is disabled on powerpc is that a reference bit update takes a page fault
on powerpc, irrespective of the translation mode.
> on powerpc. This might benefit platforms with the A-bit but not HCA,
> e.g., POWER9. I just ran a quick test (memcached/memtier I previously
> shared with you) and it showed far less PTL contention in kswapd. I'm
> attaching the flamegraphs for you to analyze. Could you try some
> benchmarks with the above change on your end as well?
>
The PTL lock is a valid concern, even though I didn't observe the contention increasing with
the change. I will rerun the test to verify. We have possibly two options here:
1) Delay the lruvec->nr_pages update until the sort phase. But as you explained earlier, that
can impact should_run_aging().
2) Add another batching mechanism, similar to pglist_data->mm_walk, to be used only by
lru_gen_look_around() on architectures that don't support hardware updates of the access/reference bit.
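A rough sketch of what option 2 could look like; the structure and field
names here are hypothetical:

	/*
	 * Hypothetical per-look-around batch: accumulate folios and apply
	 * the generation/nr_pages updates under a single lru_lock
	 * acquisition, the way reset_batch_size() does for the mm-walk
	 * path, instead of paying folio_activate()'s flush under the
	 * shared PTL every PAGEVEC_SIZE folios.
	 */
	struct look_around_batch {
		int nr;
		int new_gen;
		struct folio *folios[PAGEVEC_SIZE];
	};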
-aneesh
* Re: [PATCH 3/3] mm/lru_gen: Don't build multi-gen LRU page table walk code on architecture not supported
2023-06-26 10:52 ` Aneesh Kumar K V
@ 2023-06-26 17:04 ` Yu Zhao
2023-06-27 11:48 ` Aneesh Kumar K V
0 siblings, 1 reply; 16+ messages in thread
From: Yu Zhao @ 2023-06-26 17:04 UTC (permalink / raw)
To: Aneesh Kumar K V; +Cc: linux-mm, akpm
On Mon, Jun 26, 2023 at 4:52 AM Aneesh Kumar K V
<aneesh.kumar@linux.ibm.com> wrote:
>
> On 6/26/23 1:04 AM, Yu Zhao wrote:
> > On Sat, Jun 24, 2023 at 8:54 AM Aneesh Kumar K.V
> > <aneesh.kumar@linux.ibm.com> wrote:
> >>
> >> Hi Yu Zhao,
> >>
> >> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
> >>
> >>> Not all architectures support hardware atomic updates of access bits. On
> >>> such architectures, we don't use page table walks to classify pages into
> >>> generations. Add a kernel config option and avoid building the page
> >>> table walk code on such architectures.
> >>>
> >>> No performance change observed with the mongodb ycsb test:
> >>>
> >>> Patch details Throughput(Ops/sec)
> >>> without patch 93278
> >>> With patch 93400
> >>>
> >>> Without patch:
> >>> $ size mm/vmscan.o
> >>> text data bss dec hex filename
> >>> 112102 42721 40 154863 25cef mm/vmscan.o
> >>>
> >>> With patch
> >>>
> >>> $ size mm/vmscan.o
> >>> text data bss dec hex filename
> >>> 105430 41333 24 146787 23d63 mm/vmscan.o
> >>>
> >>
> >> Any feedback on this patch? Can we look at merging this change?
> >
> > Just want to make sure I fully understand the motivation: are there
> > any other end goals besides reducing the footprint mentioned above?
> > E.g., preparing for HCA, etc. (My current understanding is that HCA
> > shouldn't care about it, since it's already runtime disabled if HCA
> > doesn't want to use it.)
> >
>
> >> My goal with this change was to remove all that dead code from getting compiled
> >> in for ppc64.
I see. But the first thing (lru_gen_add_folio()) you moved has nothing
to do with this goal, because it's still compiled after the entire
series.
> > Also as explained offline, solely relying on folio_activate() in
> > lru_gen_look_around() can cause a measurable regression on powerpc,
> > because
> > 1. PAGEVEC_SIZE is 15 whereas pglist_data->mm_walk.batched is
> > virtually unlimited.
> > 2. Once folio_activate() reaches that limit, it takes the LRU lock on
> > top of the PTL, which can be shared by multiple page tables on
> > powerpc.
> >
> > In fact, I think we should try the opposite direction first, before arriving
> > at any conclusions, i.e.,
> > #define arch_has_hw_pte_young() radix_enabled()
>
> The reason it is disabled on powerpc is that a reference bit update takes a page fault
> on powerpc, irrespective of the translation mode.
This is not true.
From "IBM POWER9 Processor User Manual":
https://openpowerfoundation.org/resources/ibmpower9usermanual/
4.10.14 Reference and Change Bits
...
When performing HPT translation, the hardware performs the R and C
bit updates nonatomically.
...
The radix case is more complex, and I'll leave it to you to interpret
what it means:
From "Power ISA Version 3.0 B":
https://openpowerfoundation.org/specifications/isa/
5.7.12 Reference and Change Recording
...
For Radix Tree translation, the Reference and Change bits are set atomically.
...
> > on powerpc. This might benefit platforms with the A-bit but not HCA,
> > e.g., POWER9. I just ran a quick test (memcached/memtier I previously
> > shared with you) and it showed far less PTL contention in kswapd. I'm
> > attaching the flamegraphs for you to analyze. Could you try some
> > benchmarks with the above change on your end as well?
> >
>
> The PTL lock is a valid concern, even though I didn't observe the contention increasing with
> the change. I will rerun the test to verify. We have possibly two options here:
>
> 1) Delay the lruvec->nr_pages update until the sort phase. But as you explained earlier, that
> can impact should_run_aging().
>
>
> 2) Add another batching mechanism, similar to pglist_data->mm_walk, to be used only by
> lru_gen_look_around() on architectures that don't support hardware updates of the access/reference bit.
Sounds good. Thanks.
* Re: [PATCH 3/3] mm/lru_gen: Don't build multi-gen LRU page table walk code on architecture not supported
2023-06-26 17:04 ` Yu Zhao
@ 2023-06-27 11:48 ` Aneesh Kumar K V
2023-06-27 19:10 ` Yu Zhao
0 siblings, 1 reply; 16+ messages in thread
From: Aneesh Kumar K V @ 2023-06-27 11:48 UTC (permalink / raw)
To: Yu Zhao; +Cc: linux-mm, akpm, npiggin
On 6/26/23 10:34 PM, Yu Zhao wrote:
> On Mon, Jun 26, 2023 at 4:52 AM Aneesh Kumar K V
> <aneesh.kumar@linux.ibm.com> wrote:
>>
>> On 6/26/23 1:04 AM, Yu Zhao wrote:
>>> On Sat, Jun 24, 2023 at 8:54 AM Aneesh Kumar K.V
>>> <aneesh.kumar@linux.ibm.com> wrote:
>>>>
>>>> Hi Yu Zhao,
>>>>
>>>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
>>>>
>>>>> Not all architectures support hardware atomic updates of access bits. On
>>>>> such architectures, we don't use page table walks to classify pages into
>>>>> generations. Add a kernel config option and avoid building the page
>>>>> table walk code on such architectures.
>>>>>
>>>>> No performance change observed with the mongodb ycsb test:
>>>>>
>>>>> Patch details Throughput(Ops/sec)
>>>>> without patch 93278
>>>>> With patch 93400
>>>>>
>>>>> Without patch:
>>>>> $ size mm/vmscan.o
>>>>> text data bss dec hex filename
>>>>> 112102 42721 40 154863 25cef mm/vmscan.o
>>>>>
>>>>> With patch
>>>>>
>>>>> $ size mm/vmscan.o
>>>>> text data bss dec hex filename
>>>>> 105430 41333 24 146787 23d63 mm/vmscan.o
>>>>>
>>>>
>>>> Any feedback on this patch? Can we look at merging this change?
>>>
>>> Just want to make sure I fully understand the motivation: are there
>>> any other end goals besides reducing the footprint mentioned above?
>>> E.g., preparing for HCA, etc. (My current understanding is that HCA
>>> shouldn't care about it, since it's already runtime disabled if HCA
>>> doesn't want to use it.)
>>>
>>
>> My goal with this change was to remove all that dead code from getting compiled
>> in for ppc64.
>
> I see. But the first thing (lru_gen_add_folio()) you moved has nothing
> to do with this goal, because it's still compiled after the entire
> series.
>
Sure. Will drop that change.
>>> Also as explained offline, solely relying on folio_activate() in
>>> lru_gen_look_around() can cause a measurable regression on powerpc,
>>> because
>>> 1. PAGEVEC_SIZE is 15 whereas pglist_data->mm_walk.batched is
>>> virtually unlimited.
>>> 2. Once folio_activate() reaches that limit, it takes the LRU lock on
>>> top of the PTL, which can be shared by multiple page tables on
>>> powerpc.
>>>
>>> In fact, I think we should try the opposite direction first, before arriving
>>> at any conclusions, i.e.,
>>> #define arch_has_hw_pte_young() radix_enabled()
>>
>> The reason it is disabled on powerpc is that a reference bit update takes a page fault
>> on powerpc, irrespective of the translation mode.
>
> This is not true.
>
> From "IBM POWER9 Processor User Manual":
> https://openpowerfoundation.org/resources/ibmpower9usermanual/
>
> 4.10.14 Reference and Change Bits
> ...
> When performing HPT translation, the hardware performs the R and C
> bit updates nonatomically.
> ...
>
> The radix case is more complex, and I'll leave it to you to interpret
> what it means:
>
> From "Power ISA Version 3.0 B":
> https://openpowerfoundation.org/specifications/isa/
>
> 5.7.12 Reference and Change Recording
> ...
> For Radix Tree translation, the Reference and Change bits are set atomically.
> ...
>
It is atomic in that software uses ldarx/stdcx to update these bits. Hardware/core won't
update these directly, even though the Nest can update them directly without taking a fault. So
for all purposes we can assume that on radix the R/C bits are updated by the page fault handler.
Generic page table update sequences are slightly different with hash translation, in that
some page table field updates require marking the page table entry invalid.
>>> on powerpc. This might benefit platforms with the A-bit but not HCA,
>>> e.g., POWER9. I just ran a quick test (memcached/memtier I previously
>>> shared with you) and it showed far less PTL contention in kswapd. I'm
>>> attaching the flamegraphs for you to analyze. Could you try some
>>> benchmarks with the above change on your end as well?
>>>
>>
>> The PTL lock is a valid concern, even though I didn't observe the contention increasing with
>> the change. I will rerun the test to verify. We have possibly two options here:
>>
>> 1) Delay the lruvec->nr_pages update until the sort phase. But as you explained earlier, that
>> can impact should_run_aging().
>>
>>
>> 2) Add another batching mechanism, similar to pglist_data->mm_walk, to be used only by
>> lru_gen_look_around() on architectures that don't support hardware updates of the access/reference bit.
>
> Sounds good. Thanks.
I will go ahead and work on approach 2 as I outlined above?
-aneesh
* Re: [PATCH 3/3] mm/lru_gen: Don't build multi-gen LRU page table walk code on architecture not supported
2023-06-27 11:48 ` Aneesh Kumar K V
@ 2023-06-27 19:10 ` Yu Zhao
0 siblings, 0 replies; 16+ messages in thread
From: Yu Zhao @ 2023-06-27 19:10 UTC (permalink / raw)
To: Aneesh Kumar K V; +Cc: linux-mm, akpm, npiggin, linuxppc-dev
On Tue, Jun 27, 2023 at 5:48 AM Aneesh Kumar K V
<aneesh.kumar@linux.ibm.com> wrote:
>
> On 6/26/23 10:34 PM, Yu Zhao wrote:
> > On Mon, Jun 26, 2023 at 4:52 AM Aneesh Kumar K V
> > <aneesh.kumar@linux.ibm.com> wrote:
> >>
> >> On 6/26/23 1:04 AM, Yu Zhao wrote:
> >>> On Sat, Jun 24, 2023 at 8:54 AM Aneesh Kumar K.V
> >>> <aneesh.kumar@linux.ibm.com> wrote:
> >>>>
> >>>> Hi Yu Zhao,
> >>>>
> >>>> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:
> >>>>
> >>>>> Not all architectures support hardware atomic updates of access bits. On
> >>>>> such architectures, we don't use page table walks to classify pages into
> >>>>> generations. Add a kernel config option and avoid building the page
> >>>>> table walk code on such architectures.
> >>>>>
> >>>>> No performance change observed with the mongodb ycsb test:
> >>>>>
> >>>>> Patch details Throughput(Ops/sec)
> >>>>> without patch 93278
> >>>>> With patch 93400
> >>>>>
> >>>>> Without patch:
> >>>>> $ size mm/vmscan.o
> >>>>> text data bss dec hex filename
> >>>>> 112102 42721 40 154863 25cef mm/vmscan.o
> >>>>>
> >>>>> With patch
> >>>>>
> >>>>> $ size mm/vmscan.o
> >>>>> text data bss dec hex filename
> >>>>> 105430 41333 24 146787 23d63 mm/vmscan.o
> >>>>>
> >>>>
> >>>> Any feedback on this patch? Can we look at merging this change?
> >>>
> >>> Just want to make sure I fully understand the motivation: are there
> >>> any other end goals besides reducing the footprint mentioned above?
> >>> E.g., preparing for HCA, etc. (My current understanding is that HCA
> >>> shouldn't care about it, since it's already runtime disabled if HCA
> >>> doesn't want to use it.)
> >>>
> >>
> >> My goal with this change was to remove all that dead code from getting compiled
> >> in for ppc64.
> >
> > I see. But the first thing (lru_gen_add_folio()) you moved has nothing
> > to do with this goal, because it's still compiled after the entire
> > series.
> >
>
> Sure. Will drop that change.
>
> >>> Also as explained offline, solely relying on folio_activate() in
> >>> lru_gen_look_around() can cause a measurable regression on powerpc,
> >>> because
> >>> 1. PAGEVEC_SIZE is 15 whereas pglist_data->mm_walk.batched is
> >>> virtually unlimited.
> >>> 2. Once folio_activate() reaches that limit, it takes the LRU lock on
> >>> top of the PTL, which can be shared by multiple page tables on
> >>> powerpc.
> >>>
> >>> In fact, I think we should try the opposite direction first, before arriving
> >>> at any conclusions, i.e.,
> >>> #define arch_has_hw_pte_young() radix_enabled()
> >>
> >> The reason it is disabled on powerpc is that a reference bit update takes a page fault
> >> on powerpc, irrespective of the translation mode.
> >
> > This is not true.
> >
> > From "IBM POWER9 Processor User Manual":
> > https://openpowerfoundation.org/resources/ibmpower9usermanual/
> >
> > 4.10.14 Reference and Change Bits
> > ...
> > When performing HPT translation, the hardware performs the R and C
> > bit updates nonatomically.
> > ...
> >
> > The radix case is more complex, and I'll leave it to you to interpret
> > what it means:
> >
> > From "Power ISA Version 3.0 B":
> > https://openpowerfoundation.org/specifications/isa/
> >
> > 5.7.12 Reference and Change Recording
> > ...
> > For Radix Tree translation, the Reference and Change bits are set atomically.
> > ...
> >
>
> It is atomic in that software uses ldarx/stdcx to update these bits. The hardware/core won't
> update them directly, even though the Nest can update them directly without taking a fault. So
> for all practical purposes we can assume that on radix the R/C bits are updated by the page
> fault handler.
Thanks. To me, it sounds like a statement of a capability provided by h/w, not
a requirement for s/w. (IMO, the latter would be something like
"software must/should set the bits atomically.") But I'll take your
word for it.
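For reference, an untested sketch of what the override suggested above might look
like on powerpc. arch_has_hw_pte_young() and radix_enabled() exist upstream; the
placement in the book3s/64 header is my assumption, and the comment marks the open
question:

/* arch/powerpc/include/asm/book3s/64/pgtable.h */
#define arch_has_hw_pte_young arch_has_hw_pte_young
static inline bool arch_has_hw_pte_young(void)
{
	/*
	 * Whether the core updates the R bit without taking a fault
	 * under radix (as opposed to HPT) is exactly what is being
	 * debated in this thread.
	 */
	return radix_enabled();
}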
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 3/3] mm/lru_gen: Don't build multi-gen LRU page table walk code on architecture not supported
@ 2023-06-21 0:40 kernel test robot
0 siblings, 0 replies; 16+ messages in thread
From: kernel test robot @ 2023-06-21 0:40 UTC (permalink / raw)
To: oe-kbuild; +Cc: lkp
::::::
:::::: Manual check reason: "um arch report"
::::::
BCC: lkp@intel.com
CC: llvm@lists.linux.dev
CC: oe-kbuild-all@lists.linux.dev
In-Reply-To: <20230613120047.149573-3-aneesh.kumar@linux.ibm.com>
References: <20230613120047.149573-3-aneesh.kumar@linux.ibm.com>
TO: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
TO: linux-mm@kvack.org
TO: akpm@linux-foundation.org
CC: Yu Zhao <yuzhao@google.com>
CC: "T . J . Alumbaugh" <talumbau@google.com>
CC: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Hi Aneesh,
kernel test robot noticed the following build warnings:
[auto build test WARNING on arm64/for-next/core]
[also build test WARNING on linus/master tip/x86/core v6.4-rc7 next-20230620]
[cannot apply to akpm-mm/mm-everything]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patches, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Aneesh-Kumar-K-V/mm-lru_gen-lru_gen_look_around-simplification/20230613-200408
base: https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git for-next/core
patch link: https://lore.kernel.org/r/20230613120047.149573-3-aneesh.kumar%40linux.ibm.com
patch subject: [PATCH 3/3] mm/lru_gen: Don't build multi-gen LRU page table walk code on architecture not supported
:::::: branch date: 8 days ago
:::::: commit date: 8 days ago
config: um-randconfig-r026-20230620 (https://download.01.org/0day-ci/archive/20230621/202306210854.KhzSokFU-lkp@intel.com/config)
compiler: clang version 17.0.0 (https://github.com/llvm/llvm-project.git 4a5ac14ee968ff0ad5d2cc1ffa0299048db4c88a)
reproduce: (https://download.01.org/0day-ci/archive/20230621/202306210854.KhzSokFU-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/r/202306210854.KhzSokFU-lkp@intel.com/
All warnings (new ones prefixed by >>):
In file included from mm/vmscan.c:19:
In file included from include/linux/kernel_stat.h:9:
In file included from include/linux/interrupt.h:11:
In file included from include/linux/hardirq.h:11:
In file included from arch/um/include/asm/hardirq.h:5:
In file included from include/asm-generic/hardirq.h:17:
In file included from include/linux/irq.h:20:
In file included from include/linux/io.h:13:
In file included from arch/um/include/asm/io.h:24:
include/asm-generic/io.h:547:31: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
547 | val = __raw_readb(PCI_IOBASE + addr);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:560:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
560 | val = __le16_to_cpu((__le16 __force)__raw_readw(PCI_IOBASE + addr));
| ~~~~~~~~~~ ^
include/uapi/linux/byteorder/little_endian.h:37:51: note: expanded from macro '__le16_to_cpu'
37 | #define __le16_to_cpu(x) ((__force __u16)(__le16)(x))
| ^
In file included from mm/vmscan.c:19:
In file included from include/linux/kernel_stat.h:9:
In file included from include/linux/interrupt.h:11:
In file included from include/linux/hardirq.h:11:
In file included from arch/um/include/asm/hardirq.h:5:
In file included from include/asm-generic/hardirq.h:17:
In file included from include/linux/irq.h:20:
In file included from include/linux/io.h:13:
In file included from arch/um/include/asm/io.h:24:
include/asm-generic/io.h:573:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
573 | val = __le32_to_cpu((__le32 __force)__raw_readl(PCI_IOBASE + addr));
| ~~~~~~~~~~ ^
include/uapi/linux/byteorder/little_endian.h:35:51: note: expanded from macro '__le32_to_cpu'
35 | #define __le32_to_cpu(x) ((__force __u32)(__le32)(x))
| ^
In file included from mm/vmscan.c:19:
In file included from include/linux/kernel_stat.h:9:
In file included from include/linux/interrupt.h:11:
In file included from include/linux/hardirq.h:11:
In file included from arch/um/include/asm/hardirq.h:5:
In file included from include/asm-generic/hardirq.h:17:
In file included from include/linux/irq.h:20:
In file included from include/linux/io.h:13:
In file included from arch/um/include/asm/io.h:24:
include/asm-generic/io.h:584:33: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
584 | __raw_writeb(value, PCI_IOBASE + addr);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:594:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
594 | __raw_writew((u16 __force)cpu_to_le16(value), PCI_IOBASE + addr);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:604:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
604 | __raw_writel((u32 __force)cpu_to_le32(value), PCI_IOBASE + addr);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:692:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
692 | readsb(PCI_IOBASE + addr, buffer, count);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:700:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
700 | readsw(PCI_IOBASE + addr, buffer, count);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:708:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
708 | readsl(PCI_IOBASE + addr, buffer, count);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:717:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
717 | writesb(PCI_IOBASE + addr, buffer, count);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:726:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
726 | writesw(PCI_IOBASE + addr, buffer, count);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:735:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
735 | writesl(PCI_IOBASE + addr, buffer, count);
| ~~~~~~~~~~ ^
>> mm/vmscan.c:4564:6: warning: no previous prototype for function '__try_to_inc_max_seq' [-Wmissing-prototypes]
4564 | bool __try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
| ^
mm/vmscan.c:4564:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
4564 | bool __try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
| ^
| static
>> mm/vmscan.c:4760:6: warning: variable 'young' set but not used [-Wunused-but-set-variable]
4760 | int young = 0;
| ^
14 warnings generated.
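Both new warnings point at straightforward fixes. A sketch, assuming
__try_to_inc_max_seq() has no users outside mm/vmscan.c and that 'young' only
feeds the walk code this config compiles out:

/* 1) Give the function internal linkage, matching its only callers: */
static bool __try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
				 bool can_swap, bool force_scan)
{
	...
}

/* 2) Declare and update 'young' only when something consumes the count: */
#ifdef CONFIG_LRU_TASK_PAGE_AGING
	int young = 0;
#endif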
vim +/__try_to_inc_max_seq +4564 mm/vmscan.c
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4559
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4560 /*
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4561 * inc_max_seq can drop the lru_lock in between. So use a waitqueue seq_update_progress
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4562 * to allow concurrent access.
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4563 */
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 @4564 bool __try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4565 bool can_swap, bool force_scan)
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4566 {
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4567 bool success = false;
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4568 struct lru_gen_folio *lrugen = &lruvec->lrugen;
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4569
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4570 VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4571
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4572 /* see the comment in iterate_mm_list() */
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4573 if (lruvec->seq_update_progress)
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4574 success = false;
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4575 else {
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4576 spin_lock_irq(&lruvec->lru_lock);
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4577
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4578 if (max_seq != lrugen->max_seq)
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4579 goto done;
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4580
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4581 if (lruvec->seq_update_progress)
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4582 goto done;
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4583
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4584 success = true;
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4585 lruvec->seq_update_progress = true;
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4586 done:
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4587 spin_unlock_irq(&lruvec->lru_lock);
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4588 }
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4589
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4590 if (success)
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4591 inc_max_seq(lruvec, can_swap, force_scan);
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4592
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4593 return success;
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4594 }
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4595
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4596 static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4597 bool can_swap, bool force_scan)
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4598 {
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4599 return __try_to_inc_max_seq(lruvec, max_seq, can_swap, force_scan);
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4600 }
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4601 #endif
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4602
bd74fdaea14602 Yu Zhao 2022-09-18 4603
7b8144e63d8471 T.J. Alumbaugh 2023-01-18 4604 /******************************************************************************
7b8144e63d8471 T.J. Alumbaugh 2023-01-18 4605 * working set protection
7b8144e63d8471 T.J. Alumbaugh 2023-01-18 4606 ******************************************************************************/
7b8144e63d8471 T.J. Alumbaugh 2023-01-18 4607
7348cc91821b0c Yu Zhao 2022-12-21 4608 static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
ac35a490237446 Yu Zhao 2022-09-18 4609 {
7348cc91821b0c Yu Zhao 2022-12-21 4610 int gen, type, zone;
7348cc91821b0c Yu Zhao 2022-12-21 4611 unsigned long total = 0;
7348cc91821b0c Yu Zhao 2022-12-21 4612 bool can_swap = get_swappiness(lruvec, sc);
7348cc91821b0c Yu Zhao 2022-12-21 4613 struct lru_gen_folio *lrugen = &lruvec->lrugen;
ac35a490237446 Yu Zhao 2022-09-18 4614 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
ac35a490237446 Yu Zhao 2022-09-18 4615 DEFINE_MAX_SEQ(lruvec);
ac35a490237446 Yu Zhao 2022-09-18 4616 DEFINE_MIN_SEQ(lruvec);
ac35a490237446 Yu Zhao 2022-09-18 4617
7348cc91821b0c Yu Zhao 2022-12-21 4618 for (type = !can_swap; type < ANON_AND_FILE; type++) {
7348cc91821b0c Yu Zhao 2022-12-21 4619 unsigned long seq;
ac35a490237446 Yu Zhao 2022-09-18 4620
7348cc91821b0c Yu Zhao 2022-12-21 4621 for (seq = min_seq[type]; seq <= max_seq; seq++) {
7348cc91821b0c Yu Zhao 2022-12-21 4622 gen = lru_gen_from_seq(seq);
ac35a490237446 Yu Zhao 2022-09-18 4623
7348cc91821b0c Yu Zhao 2022-12-21 4624 for (zone = 0; zone < MAX_NR_ZONES; zone++)
7348cc91821b0c Yu Zhao 2022-12-21 4625 total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
7348cc91821b0c Yu Zhao 2022-12-21 4626 }
7348cc91821b0c Yu Zhao 2022-12-21 4627 }
7348cc91821b0c Yu Zhao 2022-12-21 4628
7348cc91821b0c Yu Zhao 2022-12-21 4629 /* whether the size is big enough to be helpful */
7348cc91821b0c Yu Zhao 2022-12-21 4630 return mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
7348cc91821b0c Yu Zhao 2022-12-21 4631 }
7348cc91821b0c Yu Zhao 2022-12-21 4632
7348cc91821b0c Yu Zhao 2022-12-21 4633 static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc,
7348cc91821b0c Yu Zhao 2022-12-21 4634 unsigned long min_ttl)
7348cc91821b0c Yu Zhao 2022-12-21 4635 {
7348cc91821b0c Yu Zhao 2022-12-21 4636 int gen;
7348cc91821b0c Yu Zhao 2022-12-21 4637 unsigned long birth;
7348cc91821b0c Yu Zhao 2022-12-21 4638 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
7348cc91821b0c Yu Zhao 2022-12-21 4639 DEFINE_MIN_SEQ(lruvec);
ac35a490237446 Yu Zhao 2022-09-18 4640
7348cc91821b0c Yu Zhao 2022-12-21 4641 /* see the comment on lru_gen_folio */
7348cc91821b0c Yu Zhao 2022-12-21 4642 gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
7348cc91821b0c Yu Zhao 2022-12-21 4643 birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
1332a809d95a4f Yu Zhao 2022-09-18 4644
1332a809d95a4f Yu Zhao 2022-09-18 4645 if (time_is_after_jiffies(birth + min_ttl))
1332a809d95a4f Yu Zhao 2022-09-18 4646 return false;
1332a809d95a4f Yu Zhao 2022-09-18 4647
7348cc91821b0c Yu Zhao 2022-12-21 4648 if (!lruvec_is_sizable(lruvec, sc))
1332a809d95a4f Yu Zhao 2022-09-18 4649 return false;
1332a809d95a4f Yu Zhao 2022-09-18 4650
7348cc91821b0c Yu Zhao 2022-12-21 4651 mem_cgroup_calculate_protection(NULL, memcg);
1332a809d95a4f Yu Zhao 2022-09-18 4652
7348cc91821b0c Yu Zhao 2022-12-21 4653 return !mem_cgroup_below_min(NULL, memcg);
ac35a490237446 Yu Zhao 2022-09-18 4654 }
ac35a490237446 Yu Zhao 2022-09-18 4655
1332a809d95a4f Yu Zhao 2022-09-18 4656 /* to protect the working set of the last N jiffies */
1332a809d95a4f Yu Zhao 2022-09-18 4657 static unsigned long lru_gen_min_ttl __read_mostly;
1332a809d95a4f Yu Zhao 2022-09-18 4658
ac35a490237446 Yu Zhao 2022-09-18 4659 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
ac35a490237446 Yu Zhao 2022-09-18 4660 {
ac35a490237446 Yu Zhao 2022-09-18 4661 struct mem_cgroup *memcg;
1332a809d95a4f Yu Zhao 2022-09-18 4662 unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
ac35a490237446 Yu Zhao 2022-09-18 4663
ac35a490237446 Yu Zhao 2022-09-18 4664 VM_WARN_ON_ONCE(!current_is_kswapd());
ac35a490237446 Yu Zhao 2022-09-18 4665
7348cc91821b0c Yu Zhao 2022-12-21 4666 /* check the order to exclude compaction-induced reclaim */
7348cc91821b0c Yu Zhao 2022-12-21 4667 if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
f76c83378851f8 Yu Zhao 2022-09-18 4668 return;
bd74fdaea14602 Yu Zhao 2022-09-18 4669
ac35a490237446 Yu Zhao 2022-09-18 4670 memcg = mem_cgroup_iter(NULL, NULL, NULL);
ac35a490237446 Yu Zhao 2022-09-18 4671 do {
ac35a490237446 Yu Zhao 2022-09-18 4672 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
ac35a490237446 Yu Zhao 2022-09-18 4673
7348cc91821b0c Yu Zhao 2022-12-21 4674 if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
7348cc91821b0c Yu Zhao 2022-12-21 4675 mem_cgroup_iter_break(NULL, memcg);
7348cc91821b0c Yu Zhao 2022-12-21 4676 return;
7348cc91821b0c Yu Zhao 2022-12-21 4677 }
ac35a490237446 Yu Zhao 2022-09-18 4678
ac35a490237446 Yu Zhao 2022-09-18 4679 cond_resched();
ac35a490237446 Yu Zhao 2022-09-18 4680 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
bd74fdaea14602 Yu Zhao 2022-09-18 4681
1332a809d95a4f Yu Zhao 2022-09-18 4682 /*
1332a809d95a4f Yu Zhao 2022-09-18 4683 * The main goal is to OOM kill if every generation from all memcgs is
1332a809d95a4f Yu Zhao 2022-09-18 4684 * younger than min_ttl. However, another possibility is all memcgs are
7348cc91821b0c Yu Zhao 2022-12-21 4685 * either too small or below min.
1332a809d95a4f Yu Zhao 2022-09-18 4686 */
1332a809d95a4f Yu Zhao 2022-09-18 4687 if (mutex_trylock(&oom_lock)) {
1332a809d95a4f Yu Zhao 2022-09-18 4688 struct oom_control oc = {
1332a809d95a4f Yu Zhao 2022-09-18 4689 .gfp_mask = sc->gfp_mask,
1332a809d95a4f Yu Zhao 2022-09-18 4690 };
1332a809d95a4f Yu Zhao 2022-09-18 4691
1332a809d95a4f Yu Zhao 2022-09-18 4692 out_of_memory(&oc);
1332a809d95a4f Yu Zhao 2022-09-18 4693
1332a809d95a4f Yu Zhao 2022-09-18 4694 mutex_unlock(&oom_lock);
1332a809d95a4f Yu Zhao 2022-09-18 4695 }
ac35a490237446 Yu Zhao 2022-09-18 4696 }
ac35a490237446 Yu Zhao 2022-09-18 4697
db19a43d9b3a88 T.J. Alumbaugh 2023-01-18 4698 /******************************************************************************
db19a43d9b3a88 T.J. Alumbaugh 2023-01-18 4699 * rmap/PT walk feedback
db19a43d9b3a88 T.J. Alumbaugh 2023-01-18 4700 ******************************************************************************/
db19a43d9b3a88 T.J. Alumbaugh 2023-01-18 4701
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4702 static void __look_around_gen_update(struct folio *folio, int new_gen)
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4703 {
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4704 int old_gen;
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4705
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4706 old_gen = folio_lru_gen(folio);
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4707 if (old_gen < 0)
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4708 folio_set_referenced(folio);
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4709 else if (old_gen != new_gen)
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4710 folio_activate(folio);
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4711 }
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4712
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4713 #ifdef CONFIG_LRU_TASK_PAGE_AGING
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4714 static inline bool current_reclaim_state_can_swap(void)
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4715 {
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4716 if (current->reclaim_state)
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4717 return current->reclaim_state->mm_walk->can_swap;
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4718 return true;
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4719 }
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4720
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4721 static void look_around_gen_update(struct folio *folio, int new_gen)
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4722 {
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4723 int old_gen;
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4724 struct lru_gen_mm_walk *walk;
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4725
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4726 walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4727 if (walk) {
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4728 old_gen = folio_update_gen(folio, new_gen);
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4729 if (old_gen >= 0 && old_gen != new_gen)
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4730 update_batch_size(walk, folio, old_gen, new_gen);
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4731 return;
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4732 }
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4733 return __look_around_gen_update(folio, new_gen);
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4734 }
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4735 #else
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4736
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4737 static inline bool current_reclaim_state_can_swap(void)
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4738 {
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4739 return true;
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4740 }
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4741
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4742 static inline void look_around_gen_update(struct folio *folio, int new_gen)
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4743 {
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4744 return __look_around_gen_update(folio, new_gen);
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4745 }
4bfcfc7496afb0 Aneesh Kumar K.V 2023-06-13 4746 #endif
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4747
018ee47f14893d Yu Zhao 2022-09-18 4748 /*
49fd9b6df54e61 Matthew Wilcox (Oracle 2022-09-02 4749) * This function exploits spatial locality when shrink_folio_list() walks the
bd74fdaea14602 Yu Zhao 2022-09-18 4750 * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If
bd74fdaea14602 Yu Zhao 2022-09-18 4751 * the scan was done cacheline efficiently, it adds the PMD entry pointing to
bd74fdaea14602 Yu Zhao 2022-09-18 4752 * the PTE table to the Bloom filter. This forms a feedback loop between the
bd74fdaea14602 Yu Zhao 2022-09-18 4753 * eviction and the aging.
018ee47f14893d Yu Zhao 2022-09-18 4754 */
018ee47f14893d Yu Zhao 2022-09-18 4755 void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
018ee47f14893d Yu Zhao 2022-09-18 4756 {
018ee47f14893d Yu Zhao 2022-09-18 4757 int i;
018ee47f14893d Yu Zhao 2022-09-18 4758 unsigned long start;
018ee47f14893d Yu Zhao 2022-09-18 4759 unsigned long end;
bd74fdaea14602 Yu Zhao 2022-09-18 @4760 int young = 0;
abf086721a2f1e T.J. Alumbaugh 2023-01-18 4761 pte_t *pte = pvmw->pte;
abf086721a2f1e T.J. Alumbaugh 2023-01-18 4762 unsigned long addr = pvmw->address;
018ee47f14893d Yu Zhao 2022-09-18 4763 struct folio *folio = pfn_folio(pvmw->pfn);
018ee47f14893d Yu Zhao 2022-09-18 4764 struct mem_cgroup *memcg = folio_memcg(folio);
018ee47f14893d Yu Zhao 2022-09-18 4765 struct pglist_data *pgdat = folio_pgdat(folio);
018ee47f14893d Yu Zhao 2022-09-18 4766 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
018ee47f14893d Yu Zhao 2022-09-18 4767 DEFINE_MAX_SEQ(lruvec);
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4768 int new_gen = lru_gen_from_seq(max_seq);
018ee47f14893d Yu Zhao 2022-09-18 4769
018ee47f14893d Yu Zhao 2022-09-18 4770 lockdep_assert_held(pvmw->ptl);
018ee47f14893d Yu Zhao 2022-09-18 4771 VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
018ee47f14893d Yu Zhao 2022-09-18 4772
018ee47f14893d Yu Zhao 2022-09-18 4773 if (spin_is_contended(pvmw->ptl))
018ee47f14893d Yu Zhao 2022-09-18 4774 return;
018ee47f14893d Yu Zhao 2022-09-18 4775
abf086721a2f1e T.J. Alumbaugh 2023-01-18 4776 start = max(addr & PMD_MASK, pvmw->vma->vm_start);
abf086721a2f1e T.J. Alumbaugh 2023-01-18 4777 end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
018ee47f14893d Yu Zhao 2022-09-18 4778
018ee47f14893d Yu Zhao 2022-09-18 4779 if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
abf086721a2f1e T.J. Alumbaugh 2023-01-18 4780 if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
018ee47f14893d Yu Zhao 2022-09-18 4781 end = start + MIN_LRU_BATCH * PAGE_SIZE;
abf086721a2f1e T.J. Alumbaugh 2023-01-18 4782 else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2)
018ee47f14893d Yu Zhao 2022-09-18 4783 start = end - MIN_LRU_BATCH * PAGE_SIZE;
018ee47f14893d Yu Zhao 2022-09-18 4784 else {
abf086721a2f1e T.J. Alumbaugh 2023-01-18 4785 start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2;
abf086721a2f1e T.J. Alumbaugh 2023-01-18 4786 end = addr + MIN_LRU_BATCH * PAGE_SIZE / 2;
018ee47f14893d Yu Zhao 2022-09-18 4787 }
018ee47f14893d Yu Zhao 2022-09-18 4788 }
018ee47f14893d Yu Zhao 2022-09-18 4789
abf086721a2f1e T.J. Alumbaugh 2023-01-18 4790 /* folio_update_gen() requires stable folio_memcg() */
abf086721a2f1e T.J. Alumbaugh 2023-01-18 4791 if (!mem_cgroup_trylock_pages(memcg))
abf086721a2f1e T.J. Alumbaugh 2023-01-18 4792 return;
018ee47f14893d Yu Zhao 2022-09-18 4793
018ee47f14893d Yu Zhao 2022-09-18 4794 arch_enter_lazy_mmu_mode();
018ee47f14893d Yu Zhao 2022-09-18 4795
abf086721a2f1e T.J. Alumbaugh 2023-01-18 4796 pte -= (addr - start) / PAGE_SIZE;
abf086721a2f1e T.J. Alumbaugh 2023-01-18 4797
018ee47f14893d Yu Zhao 2022-09-18 4798 for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
018ee47f14893d Yu Zhao 2022-09-18 4799 unsigned long pfn;
018ee47f14893d Yu Zhao 2022-09-18 4800
018ee47f14893d Yu Zhao 2022-09-18 4801 pfn = get_pte_pfn(pte[i], pvmw->vma, addr);
018ee47f14893d Yu Zhao 2022-09-18 4802 if (pfn == -1)
018ee47f14893d Yu Zhao 2022-09-18 4803 continue;
018ee47f14893d Yu Zhao 2022-09-18 4804
018ee47f14893d Yu Zhao 2022-09-18 4805 if (!pte_young(pte[i]))
018ee47f14893d Yu Zhao 2022-09-18 4806 continue;
018ee47f14893d Yu Zhao 2022-09-18 4807
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4808 folio = get_pfn_folio(pfn, memcg, pgdat,
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4809 current_reclaim_state_can_swap());
018ee47f14893d Yu Zhao 2022-09-18 4810 if (!folio)
018ee47f14893d Yu Zhao 2022-09-18 4811 continue;
018ee47f14893d Yu Zhao 2022-09-18 4812
018ee47f14893d Yu Zhao 2022-09-18 4813 if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
018ee47f14893d Yu Zhao 2022-09-18 4814 VM_WARN_ON_ONCE(true);
018ee47f14893d Yu Zhao 2022-09-18 4815
bd74fdaea14602 Yu Zhao 2022-09-18 4816 young++;
bd74fdaea14602 Yu Zhao 2022-09-18 4817
018ee47f14893d Yu Zhao 2022-09-18 4818 if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
018ee47f14893d Yu Zhao 2022-09-18 4819 !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
018ee47f14893d Yu Zhao 2022-09-18 4820 !folio_test_swapcache(folio)))
018ee47f14893d Yu Zhao 2022-09-18 4821 folio_mark_dirty(folio);
018ee47f14893d Yu Zhao 2022-09-18 4822
b94907270ac7be Aneesh Kumar K.V 2023-06-13 4823 look_around_gen_update(folio, new_gen);
018ee47f14893d Yu Zhao 2022-09-18 4824 }
018ee47f14893d Yu Zhao 2022-09-18 4825
018ee47f14893d Yu Zhao 2022-09-18 4826 arch_leave_lazy_mmu_mode();
abf086721a2f1e T.J. Alumbaugh 2023-01-18 4827 mem_cgroup_unlock_pages();
018ee47f14893d Yu Zhao 2022-09-18 4828
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 16+ messages in thread
end of thread, other threads:[~2023-06-27 19:12 UTC | newest]
Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-06-13 12:00 [PATCH 1/3] mm/lru_gen: Move some code around so that next patch is simpler Aneesh Kumar K.V
2023-06-13 12:00 ` [PATCH 2/3] mm/lru_gen: lru_gen_look_around simplification Aneesh Kumar K.V
2023-06-13 12:00 ` [PATCH 3/3] mm/lru_gen: Don't build multi-gen LRU page table walk code on architecture not supported Aneesh Kumar K.V
2023-06-13 12:23 ` Matthew Wilcox
2023-06-13 13:28 ` Aneesh Kumar K V
2023-06-13 13:36 ` Matthew Wilcox
2023-06-13 13:47 ` Aneesh Kumar K V
2023-06-21 2:27 ` kernel test robot
2023-06-24 14:53 ` Aneesh Kumar K.V
2023-06-25 19:34 ` Yu Zhao
2023-06-26 10:52 ` Aneesh Kumar K V
2023-06-26 17:04 ` Yu Zhao
2023-06-27 11:48 ` Aneesh Kumar K V
2023-06-27 19:10             ` Yu Zhao
2023-06-21 0:40 kernel test robot