On 19 Oct 2023, at 16:39, Hugh Dickins wrote:

> Shrink shmem's stack usage by eliminating the pseudo-vma from its folio
> allocation.  alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the
> principal actor for passing mempolicy choice down to __alloc_pages(),
> rather than vma_alloc_folio(gfp, order, vma, addr, hugepage).
>
> vma_alloc_folio() and alloc_pages() remain, but as wrappers around
> alloc_pages_mpol().  alloc_pages_bulk_*() untouched, except to provide
> the additional args to policy_nodemask(), which subsumes policy_node().
> Cleanup throughout, cutting out some unhelpful "helpers".
>
> It would all be much simpler without MPOL_INTERLEAVE, but that adds a
> dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd
> ("tmpfs: distribute interleave better across nodes"), which added ino
> bias to the interleave, hidden from mm/mempolicy.c until this commit.
>
> Hence "ilx" throughout, the "interleave index".  Originally I thought
> it could be done just with nid, but that's wrong: the nodemask may come
> from the shared policy layer below a shmem vma, or it may come from the
> task layer above a shmem vma; and without the final nodemask the nodeid
> cannot be decided.  And how ilx is applied depends also on page order.
>
> The interleave index is almost always irrelevant unless MPOL_INTERLEAVE:
> with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX
> passed down from vma-less alloc_pages() is also used as a hint not to
> use THP-style hugepage allocation - to avoid the overhead of a hugepage
> arg (though I don't understand why we never just added a GFP bit for
> THP - if it actually needs a different allocation strategy from other
> pages of the same order).  vma_alloc_folio() still carries its hugepage
> arg here, but it is not used, and should be removed when agreed.
>
> get_vma_policy() no longer allows a NULL vma: over time I believe we've
> eradicated all the places which used to need it, e.g. swapoff and
> madvise used to pass NULL vma to read_swap_cache_async(), but now know
> the vma.
>
> Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com
> Signed-off-by: Hugh Dickins
> Cc: Andi Kleen
> Cc: Christoph Lameter
> Cc: David Hildenbrand
> Cc: Greg Kroah-Hartman
> Cc: Huang Ying
> Cc: Kefeng Wang
> Cc: Matthew Wilcox (Oracle)
> Cc: Mel Gorman
> Cc: Michal Hocko
> Cc: Mike Kravetz
> Cc: Nhat Pham
> Cc: Sidhartha Kumar
> Cc: Suren Baghdasaryan
> Cc: Tejun Heo
> Cc: Vishal Moola (Oracle)
> Cc: Yang Shi
> Cc: Yosry Ahmed
> ---
> Rebased to mm.git's current mm-stable, to resolve with removal of
> vma_policy() from include/linux/mempolicy.h, and temporary omission
> of Nhat's ZSWAP mods from mm/swap_state.c: no other changes.
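For anyone skimming the description above: the surviving vma entry point
comes out roughly as a thin wrapper of this shape. This is my paraphrase
of the patch, not Hugh's exact hunk - the folio conversion and the
large-folio/THP details are elided, so treat it as a sketch only:

	/* sketch only: vma path resolves mempolicy and interleave index,
	 * then delegates everything to alloc_pages_mpol()
	 */
	struct folio *vma_alloc_folio(gfp_t gfp, int order,
			struct vm_area_struct *vma,
			unsigned long addr, bool hugepage)
	{
		struct mempolicy *pol;
		pgoff_t ilx;
		struct page *page;

		pol = get_vma_policy(vma, addr, order, &ilx);
		page = alloc_pages_mpol(gfp | __GFP_COMP, order,
					pol, ilx, numa_node_id());
		mpol_cond_put(pol);
		/* large-folio prep elided in this sketch */
		return (struct folio *)page;
	}

Note the hugepage arg is indeed dead weight here, as the description says.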
>
> git cherry-pick 800caf44af25^..237d4ce921f0 # applies mm-unstable's 01-09
> then apply this "mempolicy: alloc_pages_mpol() for NUMA policy without vma"
> git cherry-pick e4fb3362b782^..ec6412928b8e # applies mm-unstable's 11-12
>
>  fs/proc/task_mmu.c        |   5 +-
>  include/linux/gfp.h       |  10 +-
>  include/linux/mempolicy.h |  13 +-
>  include/linux/mm.h        |   2 +-
>  ipc/shm.c                 |  21 +--
>  mm/mempolicy.c            | 383 +++++++++++++++++++---------------------
>  mm/shmem.c                |  92 ++++++-----
>  mm/swap.h                 |   9 +-
>  mm/swap_state.c           |  86 +++++++----
>  9 files changed, 299 insertions(+), 322 deletions(-)
>
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 1d99450..66ae1c2 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -2673,8 +2673,9 @@ static int show_numa_map(struct seq_file *m, void *v)
>  	struct numa_maps *md = &numa_priv->md;
>  	struct file *file = vma->vm_file;
>  	struct mm_struct *mm = vma->vm_mm;
> -	struct mempolicy *pol;
>  	char buffer[64];
> +	struct mempolicy *pol;
> +	pgoff_t ilx;
>  	int nid;
>
>  	if (!mm)
> @@ -2683,7 +2684,7 @@ static int show_numa_map(struct seq_file *m, void *v)
>  	/* Ensure we start with an empty set of numa_maps statistics. */
>  	memset(md, 0, sizeof(*md));
>
> -	pol = __get_vma_policy(vma, vma->vm_start);
> +	pol = __get_vma_policy(vma, vma->vm_start, &ilx);
>  	if (pol) {
>  		mpol_to_str(buffer, sizeof(buffer), pol);
>  		mpol_cond_put(pol);
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index 665f066..f74f8d0 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -8,6 +8,7 @@
>  #include <linux/topology.h>
>
>  struct vm_area_struct;
> +struct mempolicy;
>
>  /* Convert GFP flags to their corresponding migrate type */
>  #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
> @@ -262,7 +263,9 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
>
>  #ifdef CONFIG_NUMA
>  struct page *alloc_pages(gfp_t gfp, unsigned int order);
> -struct folio *folio_alloc(gfp_t gfp, unsigned order);
> +struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
> +		struct mempolicy *mpol, pgoff_t ilx, int nid);
> +struct folio *folio_alloc(gfp_t gfp, unsigned int order);
>  struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
>  		unsigned long addr, bool hugepage);
>  #else
> @@ -270,6 +273,11 @@ static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
>  {
>  	return alloc_pages_node(numa_node_id(), gfp_mask, order);
>  }
> +static inline struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
> +		struct mempolicy *mpol, pgoff_t ilx, int nid)
> +{
> +	return alloc_pages(gfp, order);
> +}
>  static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order)
>  {
>  	return __folio_alloc_node(gfp, order, numa_node_id());
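The vma-less side is the mirror image: alloc_pages() resolves the task
(or system default) policy itself and passes NO_INTERLEAVE_INDEX down,
which alloc_pages_mpol() then doubles up as the hint not to use THP-style
hugepage allocation, as the description says. Again a sketch of my
reading, not the exact hunk:

	/* sketch only: vma-less path, task/system policy,
	 * no interleave index
	 */
	struct page *alloc_pages(gfp_t gfp, unsigned int order)
	{
		struct mempolicy *pol = &default_policy;

		/* no task policy in interrupt context
		 * or for explicit-node requests
		 */
		if (!in_interrupt() && !(gfp & __GFP_THISNODE))
			pol = get_task_policy(current);

		return alloc_pages_mpol(gfp, order, pol,
					NO_INTERLEAVE_INDEX, numa_node_id());
	}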
> diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
> index acdb12f..2801d5b 100644
> --- a/include/linux/mempolicy.h
> +++ b/include/linux/mempolicy.h
> @@ -126,7 +126,9 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
>
>  struct mempolicy *get_task_policy(struct task_struct *p);
>  struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
> -		unsigned long addr);
> +		unsigned long addr, pgoff_t *ilx);
> +struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
> +		unsigned long addr, int order, pgoff_t *ilx);
>  bool vma_policy_mof(struct vm_area_struct *vma);
>
>  extern void numa_default_policy(void);
> @@ -140,8 +142,6 @@ extern int huge_node(struct vm_area_struct *vma,
>  extern bool init_nodemask_of_mempolicy(nodemask_t *mask);
>  extern bool mempolicy_in_oom_domain(struct task_struct *tsk,
>  				const nodemask_t *mask);
> -extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy);
> -
>  extern unsigned int mempolicy_slab_node(void);
>
>  extern enum zone_type policy_zone;
> @@ -213,6 +213,13 @@ static inline void mpol_free_shared_policy(struct shared_policy *sp)
>  	return NULL;
>  }
>
> +static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
> +		unsigned long addr, int order, pgoff_t *ilx)
> +{
> +	*ilx = 0;
> +	return NULL;
> +}
> +
>  static inline int
>  vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
>  {
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 86e040e..b4d67a8 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -619,7 +619,7 @@ struct vm_operations_struct {
>  	 * policy.
>  	 */
>  	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
> -					unsigned long addr);
> +					unsigned long addr, pgoff_t *ilx);
>  #endif
>  	/*
>  	 * Called by vm_normal_page() for special PTEs to find the
> diff --git a/ipc/shm.c b/ipc/shm.c
> index 576a543..222aaf0 100644
> --- a/ipc/shm.c
> +++ b/ipc/shm.c
> @@ -562,30 +562,25 @@ static unsigned long shm_pagesize(struct vm_area_struct *vma)
>  }
>
>  #ifdef CONFIG_NUMA
> -static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
> +static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
>  {
> -	struct file *file = vma->vm_file;
> -	struct shm_file_data *sfd = shm_file_data(file);
> +	struct shm_file_data *sfd = shm_file_data(vma->vm_file);
>  	int err = 0;
>
>  	if (sfd->vm_ops->set_policy)
> -		err = sfd->vm_ops->set_policy(vma, new);
> +		err = sfd->vm_ops->set_policy(vma, mpol);
>  	return err;
>  }
>
>  static struct mempolicy *shm_get_policy(struct vm_area_struct *vma,
> -		unsigned long addr)
> +		unsigned long addr, pgoff_t *ilx)
>  {
> -	struct file *file = vma->vm_file;
> -	struct shm_file_data *sfd = shm_file_data(file);
> -	struct mempolicy *pol = NULL;
> +	struct shm_file_data *sfd = shm_file_data(vma->vm_file);
> +	struct mempolicy *mpol = vma->vm_policy;
>
>  	if (sfd->vm_ops->get_policy)
> -		pol = sfd->vm_ops->get_policy(vma, addr);
> -	else if (vma->vm_policy)
> -		pol = vma->vm_policy;
> -
> -	return pol;
> +		mpol = sfd->vm_ops->get_policy(vma, addr, ilx);
> +	return mpol;
>  }
>  #endif
>
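One note before the mm/mempolicy.c hunks, on how I read "ilx": under
MPOL_INTERLEAVE it is reduced modulo the nodemask weight to pick the
ilx'th allowed node, much as the existing offset_il_node() logic does.
Something like this (helper name hypothetical, large-order handling
omitted):

	/* sketch only: map an interleave index onto the ilx'th node
	 * of the policy's nodemask, wrapping around
	 */
	static unsigned int ilx_to_nid(struct mempolicy *pol, pgoff_t ilx)
	{
		unsigned int nnodes = nodes_weight(pol->nodes);
		unsigned int target;
		int nid;

		if (!nnodes)
			return numa_node_id();
		target = ilx % nnodes;
		nid = first_node(pol->nodes);
		while (target--)
			nid = next_node(nid, pol->nodes);
		return nid;
	}

which is why, as the description argues, the nodemask has to be finalized
(shared policy vs task policy) before any nodeid can be decided.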
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 596d580..8df0503 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -114,6 +114,8 @@
>  #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
>  #define MPOL_MF_WRLOCK (MPOL_MF_INTERNAL << 2)	/* Write-lock walked vmas */
>
> +#define NO_INTERLEAVE_INDEX (-1UL)
> +
>  static struct kmem_cache *policy_cache;
>  static struct kmem_cache *sn_cache;
>
> @@ -898,6 +900,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
>  	}
>
>  	if (flags & MPOL_F_ADDR) {
> +		pgoff_t ilx;		/* ignored here */
>  		/*
>  		 * Do NOT fall back to task policy if the
>  		 * vma/shared policy at addr is NULL.  We
> @@ -909,10 +912,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
>  			mmap_read_unlock(mm);
>  			return -EFAULT;
>  		}
> -		if (vma->vm_ops && vma->vm_ops->get_policy)
> -			pol = vma->vm_ops->get_policy(vma, addr);
> -		else
> -			pol = vma->vm_policy;
> +		pol = __get_vma_policy(vma, addr, &ilx);
>  	} else if (addr)
>  		return -EINVAL;
>
> @@ -1170,6 +1170,15 @@ static struct folio *new_folio(struct folio *src, unsigned long start)
>  			break;
>  	}
>
> +	/*
> +	 * __get_vma_policy() now expects a genuine non-NULL vma.  Return NULL
> +	 * when the page can no longer be located in a vma: that is not ideal
> +	 * (migrate_pages() will give up early, presuming ENOMEM), but good
> +	 * enough to avoid a crash by syzkaller or concurrent holepunch.
> +	 */
> +	if (!vma)
> +		return NULL;
> +

How often would this happen? I just want to point out that an ENOMEM here
can cause the src THPs or large folios to be split by migrate_pages().

>  	if (folio_test_hugetlb(src)) {
>  		return alloc_hugetlb_folio_vma(folio_hstate(src),
>  				vma, address);
> @@ -1178,9 +1187,6 @@
>  	if (folio_test_large(src))
>  		gfp = GFP_TRANSHUGE;
>
> -	/*
> -	 * if !vma, vma_alloc_folio() will use task or system default policy
> -	 */
>  	return vma_alloc_folio(gfp, folio_order(src), vma, address,
>  			folio_test_large(src));
>  }

--
Best Regards,
Yan, Zi