On 19 Oct 2023, at 16:39, Hugh Dickins wrote:

> Shrink shmem's stack usage by eliminating the pseudo-vma from its folio
> allocation.  alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the
> principal actor for passing mempolicy choice down to __alloc_pages(),
> rather than vma_alloc_folio(gfp, order, vma, addr, hugepage).
>
> vma_alloc_folio() and alloc_pages() remain, but as wrappers around
> alloc_pages_mpol().  alloc_pages_bulk_*() untouched, except to provide
> the additional args to policy_nodemask(), which subsumes policy_node().
> Cleanup throughout, cutting out some unhelpful "helpers".
>
> It would all be much simpler without MPOL_INTERLEAVE, but that adds a
> dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd
> ("tmpfs: distribute interleave better across nodes"), which added ino
> bias to the interleave, hidden from mm/mempolicy.c until this commit.
>
> Hence "ilx" throughout, the "interleave index".  Originally I thought
> it could be done just with nid, but that's wrong: the nodemask may come
> from the shared policy layer below a shmem vma, or it may come from the
> task layer above a shmem vma; and without the final nodemask the nodeid
> cannot be decided.  And how ilx is applied depends also on page order.
>
> The interleave index is almost always irrelevant unless MPOL_INTERLEAVE:
> with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX
> passed down from vma-less alloc_pages() is also used as a hint not to
> use THP-style hugepage allocation - to avoid the overhead of a hugepage
> arg (though I don't understand why we never just added a GFP bit for
> THP - if it actually needs a different allocation strategy from other
> pages of the same order).  vma_alloc_folio() still carries its hugepage
> arg here, but it is not used, and should be removed when agreed.
>
> get_vma_policy() no longer allows a NULL vma: over time I believe we've
> eradicated all the places which used to need it, e.g. swapoff and
> madvise used to pass NULL vma to read_swap_cache_async(), but now know
> the vma.
>
> Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com
> Signed-off-by: Hugh Dickins
> Cc: Andi Kleen
> Cc: Christoph Lameter
> Cc: David Hildenbrand
> Cc: Greg Kroah-Hartman
> Cc: Huang Ying
> Cc: Kefeng Wang
> Cc: Matthew Wilcox (Oracle)
> Cc: Mel Gorman
> Cc: Michal Hocko
> Cc: Mike Kravetz
> Cc: Nhat Pham
> Cc: Sidhartha Kumar
> Cc: Suren Baghdasaryan
> Cc: Tejun Heo
> Cc: Vishal Moola (Oracle)
> Cc: Yang Shi
> Cc: Yosry Ahmed
> ---
> Rebased to mm.git's current mm-stable, to resolve with removal of
> vma_policy() from include/linux/mempolicy.h, and temporary omission
> of Nhat's ZSWAP mods from mm/swap_state.c: no other changes.
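For anyone skimming the description above: the surviving vma entry point
comes out roughly as a thin wrapper of this shape. This is my paraphrase
of the patch, not Hugh's exact hunk - the folio conversion and the
large-folio/THP details are elided, so treat it as a sketch only:

	/* sketch only: vma path resolves mempolicy and interleave index,
	 * then delegates everything to alloc_pages_mpol()
	 */
	struct folio *vma_alloc_folio(gfp_t gfp, int order,
			struct vm_area_struct *vma,
			unsigned long addr, bool hugepage)
	{
		struct mempolicy *pol;
		pgoff_t ilx;
		struct page *page;

		pol = get_vma_policy(vma, addr, order, &ilx);
		page = alloc_pages_mpol(gfp | __GFP_COMP, order,
					pol, ilx, numa_node_id());
		mpol_cond_put(pol);
		/* large-folio prep elided in this sketch */
		return (struct folio *)page;
	}

Note the hugepage arg is indeed dead weight here, as the description says.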
>
> git cherry-pick 800caf44af25^..237d4ce921f0 # applies mm-unstable's 01-09
> then apply this "mempolicy: alloc_pages_mpol() for NUMA policy without vma"
> git cherry-pick e4fb3362b782^..ec6412928b8e # applies mm-unstable's 11-12
>
>  fs/proc/task_mmu.c        |   5 +-
>  include/linux/gfp.h       |  10 +-
>  include/linux/mempolicy.h |  13 +-
>  include/linux/mm.h        |   2 +-
>  ipc/shm.c                 |  21 +--
>  mm/mempolicy.c            | 383 +++++++++++++++++++---------------------
>  mm/shmem.c                |  92 ++++++-----
>  mm/swap.h                 |   9 +-
>  mm/swap_state.c           |  86 +++++++----
>  9 files changed, 299 insertions(+), 322 deletions(-)
>
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 1d99450..66ae1c2 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -2673,8 +2673,9 @@ static int show_numa_map(struct seq_file *m, void *v)
>  	struct numa_maps *md = &numa_priv->md;
>  	struct file *file = vma->vm_file;
>  	struct mm_struct *mm = vma->vm_mm;
> -	struct mempolicy *pol;
>  	char buffer[64];
> +	struct mempolicy *pol;
> +	pgoff_t ilx;
>  	int nid;
>
>  	if (!mm)
> @@ -2683,7 +2684,7 @@ static int show_numa_map(struct seq_file *m, void *v)
>  	/* Ensure we start with an empty set of numa_maps statistics. */
>  	memset(md, 0, sizeof(*md));
>
> -	pol = __get_vma_policy(vma, vma->vm_start);
> +	pol = __get_vma_policy(vma, vma->vm_start, &ilx);
>  	if (pol) {
>  		mpol_to_str(buffer, sizeof(buffer), pol);
>  		mpol_cond_put(pol);
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index 665f066..f74f8d0 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -8,6 +8,7 @@
>  #include <linux/topology.h>
>
>  struct vm_area_struct;
> +struct mempolicy;
>
>  /* Convert GFP flags to their corresponding migrate type */
>  #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
> @@ -262,7 +263,9 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
>
>  #ifdef CONFIG_NUMA
>  struct page *alloc_pages(gfp_t gfp, unsigned int order);
> -struct folio *folio_alloc(gfp_t gfp, unsigned order);
> +struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
> +		struct mempolicy *mpol, pgoff_t ilx, int nid);
> +struct folio *folio_alloc(gfp_t gfp, unsigned int order);
>  struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
>  		unsigned long addr, bool hugepage);
>  #else
> @@ -270,6 +273,11 @@ static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
>  {
>  	return alloc_pages_node(numa_node_id(), gfp_mask, order);
>  }
> +static inline struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
> +		struct mempolicy *mpol, pgoff_t ilx, int nid)
> +{
> +	return alloc_pages(gfp, order);
> +}
>  static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order)
>  {
>  	return __folio_alloc_node(gfp, order, numa_node_id());
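The vma-less side is the mirror image: alloc_pages() resolves the task
(or system default) policy itself and passes NO_INTERLEAVE_INDEX down,
which alloc_pages_mpol() then doubles up as the hint not to use THP-style
hugepage allocation, as the description says. Again a sketch of my
reading, not the exact hunk:

	/* sketch only: vma-less path, task/system policy,
	 * no interleave index
	 */
	struct page *alloc_pages(gfp_t gfp, unsigned int order)
	{
		struct mempolicy *pol = &default_policy;

		/* no task policy in interrupt context
		 * or for explicit-node requests
		 */
		if (!in_interrupt() && !(gfp & __GFP_THISNODE))
			pol = get_task_policy(current);

		return alloc_pages_mpol(gfp, order, pol,
					NO_INTERLEAVE_INDEX, numa_node_id());
	}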
> diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
> index acdb12f..2801d5b 100644
> --- a/include/linux/mempolicy.h
> +++ b/include/linux/mempolicy.h
> @@ -126,7 +126,9 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
>
>  struct mempolicy *get_task_policy(struct task_struct *p);
>  struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
> -		unsigned long addr);
> +		unsigned long addr, pgoff_t *ilx);
> +struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
> +		unsigned long addr, int order, pgoff_t *ilx);
>  bool vma_policy_mof(struct vm_area_struct *vma);
>
>  extern void numa_default_policy(void);
> @@ -140,8 +142,6 @@ extern int huge_node(struct vm_area_struct *vma,
>  extern bool init_nodemask_of_mempolicy(nodemask_t *mask);
>  extern bool mempolicy_in_oom_domain(struct task_struct *tsk,
>  				const nodemask_t *mask);
> -extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy);
> -
>  extern unsigned int mempolicy_slab_node(void);
>
>  extern enum zone_type policy_zone;
> @@ -213,6 +213,13 @@ static inline void mpol_free_shared_policy(struct shared_policy *sp)
>  	return NULL;
>  }
>
> +static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
> +		unsigned long addr, int order, pgoff_t *ilx)
> +{
> +	*ilx = 0;
> +	return NULL;
> +}
> +
>  static inline int
>  vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
>  {
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 86e040e..b4d67a8 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -619,7 +619,7 @@ struct vm_operations_struct {
>  	 * policy.
>  	 */
>  	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
> -					unsigned long addr);
> +					unsigned long addr, pgoff_t *ilx);
>  #endif
>  	/*
>  	 * Called by vm_normal_page() for special PTEs to find the
> diff --git a/ipc/shm.c b/ipc/shm.c
> index 576a543..222aaf0 100644
> --- a/ipc/shm.c
> +++ b/ipc/shm.c
> @@ -562,30 +562,25 @@ static unsigned long shm_pagesize(struct vm_area_struct *vma)
>  }
>
>  #ifdef CONFIG_NUMA
> -static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
> +static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
>  {
> -	struct file *file = vma->vm_file;
> -	struct shm_file_data *sfd = shm_file_data(file);
> +	struct shm_file_data *sfd = shm_file_data(vma->vm_file);
>  	int err = 0;
>
>  	if (sfd->vm_ops->set_policy)
> -		err = sfd->vm_ops->set_policy(vma, new);
> +		err = sfd->vm_ops->set_policy(vma, mpol);
>  	return err;
>  }
>
>  static struct mempolicy *shm_get_policy(struct vm_area_struct *vma,
> -		unsigned long addr)
> +		unsigned long addr, pgoff_t *ilx)
>  {
> -	struct file *file = vma->vm_file;
> -	struct shm_file_data *sfd = shm_file_data(file);
> -	struct mempolicy *pol = NULL;
> +	struct shm_file_data *sfd = shm_file_data(vma->vm_file);
> +	struct mempolicy *mpol = vma->vm_policy;
>
>  	if (sfd->vm_ops->get_policy)
> -		pol = sfd->vm_ops->get_policy(vma, addr);
> -	else if (vma->vm_policy)
> -		pol = vma->vm_policy;
> -
> -	return pol;
> +		mpol = sfd->vm_ops->get_policy(vma, addr, ilx);
> +	return mpol;
>  }
>  #endif
>
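One note before the mm/mempolicy.c hunks, on how I read "ilx": under
MPOL_INTERLEAVE it is reduced modulo the nodemask weight to pick the
ilx'th allowed node, much as the existing offset_il_node() logic does.
Something like this (helper name hypothetical, large-order handling
omitted):

	/* sketch only: map an interleave index onto the ilx'th node
	 * of the policy's nodemask, wrapping around
	 */
	static unsigned int ilx_to_nid(struct mempolicy *pol, pgoff_t ilx)
	{
		unsigned int nnodes = nodes_weight(pol->nodes);
		unsigned int target;
		int nid;

		if (!nnodes)
			return numa_node_id();
		target = ilx % nnodes;
		nid = first_node(pol->nodes);
		while (target--)
			nid = next_node(nid, pol->nodes);
		return nid;
	}

which is why, as the description argues, the nodemask has to be finalized
(shared policy vs task policy) before any nodeid can be decided.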
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 596d580..8df0503 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -114,6 +114,8 @@
>  #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
>  #define MPOL_MF_WRLOCK (MPOL_MF_INTERNAL << 2)	/* Write-lock walked vmas */
>
> +#define NO_INTERLEAVE_INDEX (-1UL)
> +
>  static struct kmem_cache *policy_cache;
>  static struct kmem_cache *sn_cache;
>
> @@ -898,6 +900,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
>  	}
>
>  	if (flags & MPOL_F_ADDR) {
> +		pgoff_t ilx;		/* ignored here */
>  		/*
>  		 * Do NOT fall back to task policy if the
>  		 * vma/shared policy at addr is NULL.  We
> @@ -909,10 +912,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
>  			mmap_read_unlock(mm);
>  			return -EFAULT;
>  		}
> -		if (vma->vm_ops && vma->vm_ops->get_policy)
> -			pol = vma->vm_ops->get_policy(vma, addr);
> -		else
> -			pol = vma->vm_policy;
> +		pol = __get_vma_policy(vma, addr, &ilx);
>  	} else if (addr)
>  		return -EINVAL;
>
> @@ -1170,6 +1170,15 @@ static struct folio *new_folio(struct folio *src, unsigned long start)
>  			break;
>  	}
>
> +	/*
> +	 * __get_vma_policy() now expects a genuine non-NULL vma.  Return NULL
> +	 * when the page can no longer be located in a vma: that is not ideal
> +	 * (migrate_pages() will give up early, presuming ENOMEM), but good
> +	 * enough to avoid a crash by syzkaller or concurrent holepunch.
> +	 */
> +	if (!vma)
> +		return NULL;
> +

How often would this happen? I just want to point out that an ENOMEM here
can cause the src THPs or large folios to be split by migrate_pages().

>  	if (folio_test_hugetlb(src)) {
>  		return alloc_hugetlb_folio_vma(folio_hstate(src),
>  				vma, address);
> @@ -1178,9 +1187,6 @@
>  	if (folio_test_large(src))
>  		gfp = GFP_TRANSHUGE;
>
> -	/*
> -	 * if !vma, vma_alloc_folio() will use task or system default policy
> -	 */
>  	return vma_alloc_folio(gfp, folio_order(src), vma, address,
>  			folio_test_large(src));
>  }

--
Best Regards,
Yan, Zi