On 22 Mar 2019, at 21:44, Yang Shi wrote:

> Since PMEM provides larger capacity than DRAM and has much lower
> access latency than disk, it is a good choice to use as a middle
> tier between DRAM and disk in the page reclaim path.
>
> With PMEM nodes, the demotion path of anonymous pages could be:
>
> DRAM -> PMEM -> swap device
>
> This patch demotes anonymous pages only for the time being and demotes
> THP to PMEM as a whole. However, this may cause expensive page reclaim
> and/or compaction on the PMEM node if there is memory pressure on it.
> But, considering the capacity of PMEM and that allocation only happens
> on PMEM when PMEM is specified explicitly, such cases should not happen
> that often. So, it seems worth keeping THP as a whole instead of
> splitting it.
>
> Demote pages to the closest non-DRAM node even though the system is
> swapless. The current page reclaim logic only scans the anon LRU when
> swap is on and swappiness is set properly. Demoting to PMEM doesn't
> need to care whether swap is available or not. But, reclaiming from
> PMEM still skips the anon LRU if swap is not available.
>
> The demotion just happens between a DRAM node and its closest PMEM
> node. Demoting to a remote PMEM node is not allowed for now.
>
> And, define a new migration reason for demotion, called MR_DEMOTE.
> Demote pages via async migration to avoid blocking.
>
> Signed-off-by: Yang Shi
> ---
>  include/linux/migrate.h        |  1 +
>  include/trace/events/migrate.h |  3 +-
>  mm/debug.c                     |  1 +
>  mm/internal.h                  | 22 ++++++++++
>  mm/vmscan.c                    | 99 ++++++++++++++++++++++++++++++++++--------
>  5 files changed, 107 insertions(+), 19 deletions(-)
>
> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> index e13d9bf..78c8dda 100644
> --- a/include/linux/migrate.h
> +++ b/include/linux/migrate.h
> @@ -25,6 +25,7 @@ enum migrate_reason {
>  	MR_MEMPOLICY_MBIND,
>  	MR_NUMA_MISPLACED,
>  	MR_CONTIG_RANGE,
> +	MR_DEMOTE,
>  	MR_TYPES
>  };
>
> diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
> index 705b33d..c1d5b36 100644
> --- a/include/trace/events/migrate.h
> +++ b/include/trace/events/migrate.h
> @@ -20,7 +20,8 @@
>  	EM( MR_SYSCALL,		"syscall_or_cpuset")		\
>  	EM( MR_MEMPOLICY_MBIND,	"mempolicy_mbind")		\
>  	EM( MR_NUMA_MISPLACED,	"numa_misplaced")		\
> -	EMe(MR_CONTIG_RANGE,	"contig_range")
> +	EM( MR_CONTIG_RANGE,	"contig_range")			\
> +	EMe(MR_DEMOTE,		"demote")
>
>  /*
>   * First define the enums in the above macros to be exported to userspace
> diff --git a/mm/debug.c b/mm/debug.c
> index c0b31b6..cc0d7df 100644
> --- a/mm/debug.c
> +++ b/mm/debug.c
> @@ -25,6 +25,7 @@
>  	"mempolicy_mbind",
>  	"numa_misplaced",
>  	"cma",
> +	"demote",
>  };
>
>  const struct trace_print_flags pageflag_names[] = {
> diff --git a/mm/internal.h b/mm/internal.h
> index 46ad0d8..0152300 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -303,6 +303,19 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask,
>  }
>  #endif
>
> +static inline bool has_nonram_online(void)
> +{
> +	int i = 0;
> +
> +	for_each_online_node(i) {
> +		/* Have PMEM node online? */
> +		if (!node_isset(i, def_alloc_nodemask))
> +			return true;
> +	}
> +
> +	return false;
> +}
> +
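
A small aside on has_nonram_online(): if I understand def_alloc_nodemask
correctly (it holds only the DRAM nodes), the loop above boils down to a
single mask test. A rough sketch of that reading, purely for illustration
(the _equiv name is mine, not something in your patch):

	/*
	 * Equivalent check, assuming def_alloc_nodemask contains exactly
	 * the DRAM nodes: a demotion target exists iff some online node
	 * falls outside that mask.
	 */
	static inline bool has_nonram_online_equiv(void)
	{
		return !nodes_subset(node_states[N_ONLINE], def_alloc_nodemask);
	}

Not asking for a change here, just checking that my reading of
def_alloc_nodemask matches your intent.
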
>  /* mm/util.c */
>  void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
>  		struct vm_area_struct *prev, struct rb_node *rb_parent);
> @@ -565,5 +578,14 @@ static inline bool is_migrate_highatomic_page(struct page *page)
>  }
>
>  void setup_zone_pageset(struct zone *zone);
> +
> +#ifdef CONFIG_NUMA
>  extern struct page *alloc_new_node_page(struct page *page, unsigned long node);
> +#else
> +static inline struct page *alloc_new_node_page(struct page *page,
> +					       unsigned long node)
> +{
> +	return NULL;
> +}
> +#endif
>  #endif /* __MM_INTERNAL_H */
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index a5ad0b3..bdcab6b 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1094,6 +1094,19 @@ static void page_check_dirty_writeback(struct page *page,
>  		mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
>  }
>
> +static inline bool is_demote_ok(struct pglist_data *pgdat)
> +{
> +	/* Current node is not DRAM node */
> +	if (!node_isset(pgdat->node_id, def_alloc_nodemask))
> +		return false;
> +
> +	/* No online PMEM node */
> +	if (!has_nonram_online())
> +		return false;
> +
> +	return true;
> +}
> +
>  /*
>   * shrink_page_list() returns the number of reclaimed pages
>   */
> @@ -1106,6 +1119,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
>  {
>  	LIST_HEAD(ret_pages);
>  	LIST_HEAD(free_pages);
> +	LIST_HEAD(demote_pages);
>  	unsigned nr_reclaimed = 0;
>
>  	memset(stat, 0, sizeof(*stat));
> @@ -1262,6 +1276,22 @@ static unsigned long shrink_page_list(struct list_head *page_list,
>  		}
>
>  		/*
> +		 * Demote DRAM pages regardless the mempolicy.
> +		 * Demot anonymous pages only for now and skip MADV_FREE

s/Demot/Demote

> +		 * pages.
> +		 */
> +		if (PageAnon(page) && !PageSwapCache(page) &&
> +		    (node_isset(page_to_nid(page), def_alloc_nodemask)) &&
> +		    PageSwapBacked(page)) {
> +
> +			if (has_nonram_online()) {
> +				list_add(&page->lru, &demote_pages);
> +				unlock_page(page);
> +				continue;
> +			}
> +		}
> +
> +		/*
>  		 * Anonymous process memory has backing store?
>  		 * Try to allocate it some swap space here.
>  		 * Lazyfree page could be freed directly
> @@ -1477,6 +1507,25 @@ static unsigned long shrink_page_list(struct list_head *page_list,
>  		VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
>  	}
>
> +	/* Demote pages to PMEM */
> +	if (!list_empty(&demote_pages)) {
> +		int err, target_nid;
> +		nodemask_t used_mask;
> +
> +		nodes_clear(used_mask);
> +		target_nid = find_next_best_node(pgdat->node_id, &used_mask,
> +						 true);
> +
> +		err = migrate_pages(&demote_pages, alloc_new_node_page, NULL,
> +				    target_nid, MIGRATE_ASYNC, MR_DEMOTE);
> +
> +		if (err) {
> +			putback_movable_pages(&demote_pages);
> +
> +			list_splice(&ret_pages, &demote_pages);
> +		}
> +	}
> +

I like your approach here. It reuses the existing migrate_pages()
interface without adding extra code. I would also like to be CC'd on
future versions of this patch.

Thank you.

--
Best Regards,
Yan Zi