Hi Michal,

> 
> Fixes: 5265047ac301 ("mm, thp: really limit transparent hugepage allocation to local node")
> Reported-by: Stefan Priebe
> Debugged-by: Andrea Arcangeli
> Signed-off-by: Michal Hocko
> ---
>  include/linux/mempolicy.h |  2 ++
>  mm/huge_memory.c          | 25 +++++++++++++++++--------
>  mm/mempolicy.c            | 28 +---------------------------
>  3 files changed, 20 insertions(+), 35 deletions(-)
> 
> diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
> index 5228c62af416..bac395f1d00a 100644
> --- a/include/linux/mempolicy.h
> +++ b/include/linux/mempolicy.h
> @@ -139,6 +139,8 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
>  struct mempolicy *get_task_policy(struct task_struct *p);
>  struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
>                  unsigned long addr);
> +struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
> +                unsigned long addr);
>  bool vma_policy_mof(struct vm_area_struct *vma);
> 
>  extern void numa_default_policy(void);
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index c3bc7e9c9a2a..94472bf9a31b 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -629,21 +629,30 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
>   *          available
>   * never: never stall for any thp allocation
>   */
> -static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
> +static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr)
>  {
>          const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
> +        gfp_t this_node = 0;
> +        struct mempolicy *pol;
> +
> +#ifdef CONFIG_NUMA
> +        /* __GFP_THISNODE makes sense only if there is no explicit binding */
> +        pol = get_vma_policy(vma, addr);
> +        if (pol->mode != MPOL_BIND)
> +                this_node = __GFP_THISNODE;
> +#endif
> 
>          if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
> -                return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
> +                return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY | this_node);
>          if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
> -                return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
> +                return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node;
>          if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
>                  return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
> -                                                             __GFP_KSWAPD_RECLAIM);
> +                                                             __GFP_KSWAPD_RECLAIM | this_node);
>          if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
>                  return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
> -                                                             0);
> -        return GFP_TRANSHUGE_LIGHT;
> +                                                             this_node);
> +        return GFP_TRANSHUGE_LIGHT | this_node;
>  }
> 
>  /* Caller must hold page table lock.
>   */
> @@ -715,7 +724,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>                  pte_free(vma->vm_mm, pgtable);
>                  return ret;
>          }
> -        gfp = alloc_hugepage_direct_gfpmask(vma);
> +        gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
>          page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
>          if (unlikely(!page)) {
>                  count_vm_event(THP_FAULT_FALLBACK);
> @@ -1290,7 +1299,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
>  alloc:
>          if (transparent_hugepage_enabled(vma) &&
>              !transparent_hugepage_debug_cow()) {
> -                huge_gfp = alloc_hugepage_direct_gfpmask(vma);
> +                huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
>                  new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
>          } else
>                  new_page = NULL;
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index da858f794eb6..75bbfc3d6233 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -1648,7 +1648,7 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
>   * freeing by another task. It is the caller's responsibility to free the
>   * extra reference for shared policies.
>   */
> -static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
> +struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
>                  unsigned long addr)
>  {
>          struct mempolicy *pol = __get_vma_policy(vma, addr);
> @@ -2026,32 +2026,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
>                  goto out;
>          }
> 
> -        if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
> -                int hpage_node = node;
> -
> -                /*
> -                 * For hugepage allocation and non-interleave policy which
> -                 * allows the current node (or other explicitly preferred
> -                 * node) we only try to allocate from the current/preferred
> -                 * node and don't fall back to other nodes, as the cost of
> -                 * remote accesses would likely offset THP benefits.
> -                 *
> -                 * If the policy is interleave, or does not allow the current
> -                 * node in its nodemask, we allocate the standard way.
> -                 */
> -                if (pol->mode == MPOL_PREFERRED &&
> -                    !(pol->flags & MPOL_F_LOCAL))
> -                        hpage_node = pol->v.preferred_node;
> -
> -                nmask = policy_nodemask(gfp, pol);
> -                if (!nmask || node_isset(hpage_node, *nmask)) {
> -                        mpol_cond_put(pol);
> -                        page = __alloc_pages_node(hpage_node,
> -                                                gfp | __GFP_THISNODE, order);
> -                        goto out;
> -                }
> -        }
> -
>          nmask = policy_nodemask(gfp, pol);
>          preferred_nid = policy_node(gfp, pol, node);
>          page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
> -- 
> 2.18.0
> 

Thanks for your patch. I tested it against Linus's tree with "memhog -r3 130g" on a two-socket machine with 128GB of memory on each node and got the results below. I expected this test to fill one node and then fall back to the other.

1. madvise(MADV_HUGEPAGE) + defrag = {always, madvise, defer+madvise}: no swap, THPs are allocated in the fallback node.
2. madvise(MADV_HUGEPAGE) + defrag = defer: pages got swapped to disk instead of being allocated in the fallback node.
3. no madvise, THP is on by default + defrag = {always, defer, defer+madvise}: pages got swapped to disk instead of being allocated in the fallback node.
4. no madvise, THP is on by default + defrag = madvise: no swap, base pages are allocated in the fallback node.

Results 2 and 3 seem unexpected, since the pages should have been allocated in the fallback node. The reason, as Andrea mentioned in his email, is the combination of __GFP_THISNODE and __GFP_DIRECT_RECLAIM (plus __GFP_KSWAPD_RECLAIM in this experiment).
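For reference, the place where __GFP_THISNODE gets its no-fallback behavior is the zonelist selection in include/linux/gfp.h. Paraphrasing it from memory (a simplified sketch; the exact code in the tree may differ slightly):

/*
 * __GFP_THISNODE switches the allocation to the per-node zonelist that
 * contains only the local node's zones, so there is nothing to fall
 * back to.
 */
static inline int gfp_zonelist(gfp_t flags)
{
#ifdef CONFIG_NUMA
        if (unlikely(flags & __GFP_THISNODE))
                return ZONELIST_NOFALLBACK;
#endif
        return ZONELIST_FALLBACK;
}

static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
        /* with ZONELIST_NOFALLBACK the allocator never sees remote zones */
        return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
}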
__GFP_THISNODE uses ZONELIST_NOFALLBACK, which removes the possibility of falling back to another node, and __GFP_*_RECLAIM then triggers page reclaim on the first (local) allocation node because the fallback nodes have been removed by ZONELIST_NOFALLBACK.

IMHO, __GFP_THISNODE should not be used for user memory allocations at all, since it fights against most memory policies. Kernel memory allocations, on the other hand, may still need it; there it effectively acts as an MPOL_BIND-style policy for kernel pages.

Comments?

--
Best Regards,
Yan Zi