linux-kernel.vger.kernel.org archive mirror
* [PATCH] mm, memcg: sync allocation and memcg charge gfp flags for THP
@ 2015-03-16 14:08 Michal Hocko
  2015-03-18 14:34 ` Vlastimil Babka
  0 siblings, 1 reply; 13+ messages in thread
From: Michal Hocko @ 2015-03-16 14:08 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Johannes Weiner, linux-mm, LKML

memcg currently uses hardcoded GFP_TRANSHUGE gfp flags for all THP
charges. THP allocations, however, might be using different flags
depending on /sys/kernel/mm/transparent_hugepage/{,khugepaged/}defrag
and the current allocation context.

The primary difference is that defrag configured to the "madvise" value
will clear the __GFP_WAIT flag from the core gfp mask to make the
allocation lighter for all mappings which are not backed by VM_HUGEPAGE
vmas. If the memcg charge path ignores this fact we will get a light
allocation, but a potential memcg reclaim would kill the whole point of
the configuration.
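
For context, the defrag-dependent mask comes from a small helper in
mm/huge_memory.c; the sketch below only illustrates its logic at the
time and is not guaranteed to match the source verbatim:

	static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
	{
		/* drop __GFP_WAIT from GFP_TRANSHUGE unless defrag is allowed */
		return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
	}

With defrag set to "madvise", transparent_hugepage_defrag(vma) boils down
to a VM_HUGEPAGE check, so all other mappings get a mask without
__GFP_WAIT while the memcg charge kept using the full GFP_TRANSHUGE.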

Fix the mismatch by providing the same gfp mask used for the
allocation to the charge functions. This is quite easy for all
paths except for the khugepaged kernel thread with !CONFIG_NUMA, which
does a pre-allocation long before the allocated page is used in
collapse_huge_page via khugepaged_alloc_page. To avoid cluttering
the whole code path from khugepaged_do_scan, we simply return the
current flags as per the khugepaged_defrag() value, which might have
changed since the preallocation. If somebody changed the value of the
knob we would charge differently, but this shouldn't happen often and
it is definitely not critical because it would only lead to a reduced
success rate of one-off THP promotion.

Signed-off-by: Michal Hocko <mhocko@suse.cz>
---
 mm/huge_memory.c | 36 ++++++++++++++++++++----------------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 625eb556b509..91898b010406 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -708,7 +708,7 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
 static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 					struct vm_area_struct *vma,
 					unsigned long haddr, pmd_t *pmd,
-					struct page *page)
+					struct page *page, gfp_t gfp)
 {
 	struct mem_cgroup *memcg;
 	pgtable_t pgtable;
@@ -716,7 +716,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 
 	VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-	if (mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg))
+	if (mem_cgroup_try_charge(page, mm, gfp, &memcg))
 		return VM_FAULT_OOM;
 
 	pgtable = pte_alloc_one(mm, haddr);
@@ -822,7 +822,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
 	}
-	if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) {
+	if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) {
 		put_page(page);
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
@@ -1080,6 +1080,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long haddr;
 	unsigned long mmun_start;	/* For mmu_notifiers */
 	unsigned long mmun_end;		/* For mmu_notifiers */
+	gfp_t huge_gfp = GFP_TRANSHUGE;	/* for allocation and charge */
 
 	ptl = pmd_lockptr(mm, pmd);
 	VM_BUG_ON_VMA(!vma->anon_vma, vma);
@@ -1106,10 +1107,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 alloc:
 	if (transparent_hugepage_enabled(vma) &&
 	    !transparent_hugepage_debug_cow()) {
-		gfp_t gfp;
-
-		gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
-		new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
+		huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
 	} else
 		new_page = NULL;
 
@@ -1131,7 +1130,7 @@ alloc:
 	}
 
 	if (unlikely(mem_cgroup_try_charge(new_page, mm,
-					   GFP_TRANSHUGE, &memcg))) {
+					   huge_gfp, &memcg))) {
 		put_page(new_page);
 		if (page) {
 			split_huge_page(page);
@@ -2354,16 +2353,14 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
 }
 
 static struct page
-*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+*khugepaged_alloc_page(struct page **hpage, gfp_t *gfp, struct mm_struct *mm,
 		       struct vm_area_struct *vma, unsigned long address,
 		       int node)
 {
-	gfp_t flags;
-
 	VM_BUG_ON_PAGE(*hpage, *hpage);
 
 	/* Only allocate from the target node */
-	flags = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
+	*gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
 	        __GFP_THISNODE;
 
 	/*
@@ -2374,7 +2371,7 @@ static struct page
 	 */
 	up_read(&mm->mmap_sem);
 
-	*hpage = alloc_pages_exact_node(node, flags, HPAGE_PMD_ORDER);
+	*hpage = alloc_pages_exact_node(node, *gfp, HPAGE_PMD_ORDER);
 	if (unlikely(!*hpage)) {
 		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
 		*hpage = ERR_PTR(-ENOMEM);
@@ -2428,12 +2425,18 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
 }
 
 static struct page
-*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+*khugepaged_alloc_page(struct page **hpage, gfp_t *gfp, struct mm_struct *mm,
 		       struct vm_area_struct *vma, unsigned long address,
 		       int node)
 {
 	up_read(&mm->mmap_sem);
 	VM_BUG_ON(!*hpage);
+
+	/*
+	 * khugepaged_alloc_hugepage is doing the preallocation, use the same
+	 * gfp flags here.
+	 */
+	*gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), 0);
 	return  *hpage;
 }
 #endif
@@ -2468,16 +2471,17 @@ static void collapse_huge_page(struct mm_struct *mm,
 	struct mem_cgroup *memcg;
 	unsigned long mmun_start;	/* For mmu_notifiers */
 	unsigned long mmun_end;		/* For mmu_notifiers */
+	gfp_t gfp;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
 	/* release the mmap_sem read lock. */
-	new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
+	new_page = khugepaged_alloc_page(hpage, &gfp, mm, vma, address, node);
 	if (!new_page)
 		return;
 
 	if (unlikely(mem_cgroup_try_charge(new_page, mm,
-					   GFP_TRANSHUGE, &memcg)))
+					   gfp, &memcg)))
 		return;
 
 	/*
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH] mm, memcg: sync allocation and memcg charge gfp flags for THP
  2015-03-16 14:08 [PATCH] mm, memcg: sync allocation and memcg charge gfp flags for THP Michal Hocko
@ 2015-03-18 14:34 ` Vlastimil Babka
  2015-03-18 15:02   ` Michal Hocko
  0 siblings, 1 reply; 13+ messages in thread
From: Vlastimil Babka @ 2015-03-18 14:34 UTC (permalink / raw)
  To: Michal Hocko, Andrew Morton; +Cc: Johannes Weiner, linux-mm, LKML

On 03/16/2015 03:08 PM, Michal Hocko wrote:
> memcg currently uses hardcoded GFP_TRANSHUGE gfp flags for all THP
> charges. THP allocations, however, might be using different flags
> depending on /sys/kernel/mm/transparent_hugepage/{,khugepaged/}defrag
> and the current allocation context.
>
> The primary difference is that defrag configured to "madvise" value will
> clear __GFP_WAIT flag from the core gfp mask to make the allocation
> lighter for all mappings which are not backed by VM_HUGEPAGE vmas.
> If memcg charge path ignores this fact we will get light allocation but
> a potential memcg reclaim would kill the whole point of the
> configuration.
>
> Fix the mismatch by providing the same gfp mask used for the
> allocation to the charge functions. This is quite easy for all
> paths except for the khugepaged kernel thread with !CONFIG_NUMA which is
> doing a pre-allocation long before the allocated page is used in
> collapse_huge_page via khugepaged_alloc_page. To prevent from cluttering
> the whole code path from khugepaged_do_scan we simply return the current
> flags as per khugepaged_defrag() value which might have changed since
> the preallocation. If somebody changed the value of the knob we would
> charge differently but this shouldn't happen often and it is definitely
> not critical because it would only lead to a reduced success rate of
> one-off THP promotion.
>
> Signed-off-by: Michal Hocko <mhocko@suse.cz>

Acked-by: Vlastimil Babka <vbabka@suse.cz>

(a nitpick below)

> ---
>   mm/huge_memory.c | 36 ++++++++++++++++++++----------------
>   1 file changed, 20 insertions(+), 16 deletions(-)
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 625eb556b509..91898b010406 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -708,7 +708,7 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
>   static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
>   					struct vm_area_struct *vma,
>   					unsigned long haddr, pmd_t *pmd,
> -					struct page *page)
> +					struct page *page, gfp_t gfp)
>   {
>   	struct mem_cgroup *memcg;
>   	pgtable_t pgtable;
> @@ -716,7 +716,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
>
>   	VM_BUG_ON_PAGE(!PageCompound(page), page);
>
> -	if (mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg))
> +	if (mem_cgroup_try_charge(page, mm, gfp, &memcg))
>   		return VM_FAULT_OOM;
>
>   	pgtable = pte_alloc_one(mm, haddr);
> @@ -822,7 +822,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
>   		count_vm_event(THP_FAULT_FALLBACK);
>   		return VM_FAULT_FALLBACK;
>   	}
> -	if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) {
> +	if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) {
>   		put_page(page);
>   		count_vm_event(THP_FAULT_FALLBACK);
>   		return VM_FAULT_FALLBACK;
> @@ -1080,6 +1080,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
>   	unsigned long haddr;
>   	unsigned long mmun_start;	/* For mmu_notifiers */
>   	unsigned long mmun_end;		/* For mmu_notifiers */
> +	gfp_t huge_gfp = GFP_TRANSHUGE;	/* for allocation and charge */

This value is actually never used. Is it here because the compiler emits 
a spurious non-initialized value warning otherwise? It should be easy 
for it to prove that setting new_page to something non-null implies 
initializing huge_gfp (in the hunk below), and NULL new_page means it 
doesn't reach the mem_cgroup_try_charge() call?

>
>   	ptl = pmd_lockptr(mm, pmd);
>   	VM_BUG_ON_VMA(!vma->anon_vma, vma);
> @@ -1106,10 +1107,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
>   alloc:
>   	if (transparent_hugepage_enabled(vma) &&
>   	    !transparent_hugepage_debug_cow()) {
> -		gfp_t gfp;
> -
> -		gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
> -		new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
> +		huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
> +		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
>   	} else
>   		new_page = NULL;
>
> @@ -1131,7 +1130,7 @@ alloc:
>   	}
>
>   	if (unlikely(mem_cgroup_try_charge(new_page, mm,
> -					   GFP_TRANSHUGE, &memcg))) {
> +					   huge_gfp, &memcg))) {
>   		put_page(new_page);
>   		if (page) {
>   			split_huge_page(page);


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] mm, memcg: sync allocation and memcg charge gfp flags for THP
  2015-03-18 14:34 ` Vlastimil Babka
@ 2015-03-18 15:02   ` Michal Hocko
  2015-03-18 15:40     ` Vlastimil Babka
  0 siblings, 1 reply; 13+ messages in thread
From: Michal Hocko @ 2015-03-18 15:02 UTC (permalink / raw)
  To: Vlastimil Babka; +Cc: Andrew Morton, Johannes Weiner, linux-mm, LKML

On Wed 18-03-15 15:34:50, Vlastimil Babka wrote:
> On 03/16/2015 03:08 PM, Michal Hocko wrote:
> >memcg currently uses hardcoded GFP_TRANSHUGE gfp flags for all THP
> >charges. THP allocations, however, might be using different flags
> >depending on /sys/kernel/mm/transparent_hugepage/{,khugepaged/}defrag
> >and the current allocation context.
> >
> >The primary difference is that defrag configured to "madvise" value will
> >clear __GFP_WAIT flag from the core gfp mask to make the allocation
> >lighter for all mappings which are not backed by VM_HUGEPAGE vmas.
> >If memcg charge path ignores this fact we will get light allocation but
> >a potential memcg reclaim would kill the whole point of the
> >configuration.
> >
> >Fix the mismatch by providing the same gfp mask used for the
> >allocation to the charge functions. This is quite easy for all
> >paths except for the khugepaged kernel thread with !CONFIG_NUMA which is
> >doing a pre-allocation long before the allocated page is used in
> >collapse_huge_page via khugepaged_alloc_page. To prevent from cluttering
> >the whole code path from khugepaged_do_scan we simply return the current
> >flags as per khugepaged_defrag() value which might have changed since
> >the preallocation. If somebody changed the value of the knob we would
> >charge differently but this shouldn't happen often and it is definitely
> >not critical because it would only lead to a reduced success rate of
> >one-off THP promotion.
> >
> >Signed-off-by: Michal Hocko <mhocko@suse.cz>
> 
> Acked-by: Vlastimil Babka <vbabka@suse.cz>

Thanks!

[...]
> >@@ -1080,6 +1080,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
> >  	unsigned long haddr;
> >  	unsigned long mmun_start;	/* For mmu_notifiers */
> >  	unsigned long mmun_end;		/* For mmu_notifiers */
> >+	gfp_t huge_gfp = GFP_TRANSHUGE;	/* for allocation and charge */
> 
> This value is actually never used. Is it here because the compiler emits a
> spurious non-initialized value warning otherwise? It should be easy for it
> to prove that setting new_page to something non-null implies initializing
> huge_gfp (in the hunk below), and NULL new_page means it doesn't reach the
> mem_cgroup_try_charge() call?

No, I haven't tried to work around the compiler. It just made the code
more obvious to me. I can remove the initialization if you prefer, of
course.

> >  	ptl = pmd_lockptr(mm, pmd);
> >  	VM_BUG_ON_VMA(!vma->anon_vma, vma);
> >@@ -1106,10 +1107,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
> >  alloc:
> >  	if (transparent_hugepage_enabled(vma) &&
> >  	    !transparent_hugepage_debug_cow()) {
> >-		gfp_t gfp;
> >-
> >-		gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
> >-		new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
> >+		huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
> >+		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
> >  	} else
> >  		new_page = NULL;
> >
> >@@ -1131,7 +1130,7 @@ alloc:
> >  	}
> >
> >  	if (unlikely(mem_cgroup_try_charge(new_page, mm,
> >-					   GFP_TRANSHUGE, &memcg))) {
> >+					   huge_gfp, &memcg))) {
> >  		put_page(new_page);
> >  		if (page) {
> >  			split_huge_page(page);
> 

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] mm, memcg: sync allocation and memcg charge gfp flags for THP
  2015-03-18 15:02   ` Michal Hocko
@ 2015-03-18 15:40     ` Vlastimil Babka
  2015-03-18 15:59       ` Michal Hocko
  0 siblings, 1 reply; 13+ messages in thread
From: Vlastimil Babka @ 2015-03-18 15:40 UTC (permalink / raw)
  To: Michal Hocko; +Cc: Andrew Morton, Johannes Weiner, linux-mm, LKML

On 03/18/2015 04:02 PM, Michal Hocko wrote:
> On Wed 18-03-15 15:34:50, Vlastimil Babka wrote:
>> On 03/16/2015 03:08 PM, Michal Hocko wrote:
>>> @@ -1080,6 +1080,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
>>>   	unsigned long haddr;
>>>   	unsigned long mmun_start;	/* For mmu_notifiers */
>>>   	unsigned long mmun_end;		/* For mmu_notifiers */
>>> +	gfp_t huge_gfp = GFP_TRANSHUGE;	/* for allocation and charge */
>>
>> This value is actually never used. Is it here because the compiler emits a
>> spurious non-initialized value warning otherwise? It should be easy for it
>> to prove that setting new_page to something non-null implies initializing
>> huge_gfp (in the hunk below), and NULL new_page means it doesn't reach the
>> mem_cgroup_try_charge() call?
>
> No, I haven't tried to workaround the compiler. It just made the code
> more obvious to me. I can remove the initialization if you prefer, of
> course.

Yeah, IMHO it would be better to remove it, if possible. Leaving it has
the (albeit small) chance that a future patch will again use the value in
the code before it's determined based on the defrag setting.


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] mm, memcg: sync allocation and memcg charge gfp flags for THP
  2015-03-18 15:40     ` Vlastimil Babka
@ 2015-03-18 15:59       ` Michal Hocko
  2015-03-18 16:09         ` Vlastimil Babka
  0 siblings, 1 reply; 13+ messages in thread
From: Michal Hocko @ 2015-03-18 15:59 UTC (permalink / raw)
  To: Vlastimil Babka; +Cc: Andrew Morton, Johannes Weiner, linux-mm, LKML

On Wed 18-03-15 16:40:34, Vlastimil Babka wrote:
> On 03/18/2015 04:02 PM, Michal Hocko wrote:
> >On Wed 18-03-15 15:34:50, Vlastimil Babka wrote:
> >>On 03/16/2015 03:08 PM, Michal Hocko wrote:
> >>>@@ -1080,6 +1080,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
> >>>  	unsigned long haddr;
> >>>  	unsigned long mmun_start;	/* For mmu_notifiers */
> >>>  	unsigned long mmun_end;		/* For mmu_notifiers */
> >>>+	gfp_t huge_gfp = GFP_TRANSHUGE;	/* for allocation and charge */
> >>
> >>This value is actually never used. Is it here because the compiler emits a
> >>spurious non-initialized value warning otherwise? It should be easy for it
> >>to prove that setting new_page to something non-null implies initializing
> >>huge_gfp (in the hunk below), and NULL new_page means it doesn't reach the
> >>mem_cgroup_try_charge() call?
> >
> >No, I haven't tried to workaround the compiler. It just made the code
> >more obvious to me. I can remove the initialization if you prefer, of
> >course.
> 
> Yeah IMHO it would be better to remove it, if possible. Leaving it has the
> (albeit small) chance that future patch will again use the value in the code
> before it's determined based on defrag setting.
 
Wouldn't an uninitialized value be used in such a case?

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] mm, memcg: sync allocation and memcg charge gfp flags for THP
  2015-03-18 15:59       ` Michal Hocko
@ 2015-03-18 16:09         ` Vlastimil Babka
  2015-03-18 16:14           ` [PATCH -v2] " Michal Hocko
  0 siblings, 1 reply; 13+ messages in thread
From: Vlastimil Babka @ 2015-03-18 16:09 UTC (permalink / raw)
  To: Michal Hocko; +Cc: Andrew Morton, Johannes Weiner, linux-mm, LKML

On 03/18/2015 04:59 PM, Michal Hocko wrote:
> On Wed 18-03-15 16:40:34, Vlastimil Babka wrote:
>> On 03/18/2015 04:02 PM, Michal Hocko wrote:
>>> On Wed 18-03-15 15:34:50, Vlastimil Babka wrote:
>>>> On 03/16/2015 03:08 PM, Michal Hocko wrote:
>>>>> @@ -1080,6 +1080,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
>>>>>   	unsigned long haddr;
>>>>>   	unsigned long mmun_start;	/* For mmu_notifiers */
>>>>>   	unsigned long mmun_end;		/* For mmu_notifiers */
>>>>> +	gfp_t huge_gfp = GFP_TRANSHUGE;	/* for allocation and charge */
>>>>
>>>> This value is actually never used. Is it here because the compiler emits a
>>>> spurious non-initialized value warning otherwise? It should be easy for it
>>>> to prove that setting new_page to something non-null implies initializing
>>>> huge_gfp (in the hunk below), and NULL new_page means it doesn't reach the
>>>> mem_cgroup_try_charge() call?
>>>
>>> No, I haven't tried to workaround the compiler. It just made the code
>>> more obvious to me. I can remove the initialization if you prefer, of
>>> course.
>>
>> Yeah IMHO it would be better to remove it, if possible. Leaving it has the
>> (albeit small) chance that future patch will again use the value in the code
>> before it's determined based on defrag setting.
>
> Wouldn't an uninitialized value be used in such a case?

Yeah, but then you should get a (correct) warning :)
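
A minimal stand-alone sketch of the pattern under discussion (hypothetical
names, not kernel code): leaving the variable uninitialized lets the
compiler flag a future read on a path that never assigns it, whereas a
dummy GFP_TRANSHUGE initializer would silence that warning.

	typedef unsigned int gfp_t;		/* stand-in for the kernel type */

	extern int want_new_page(void);		/* hypothetical helpers */
	extern gfp_t compute_huge_gfp(void);
	extern int charge(gfp_t gfp);

	int demo(void)
	{
		gfp_t huge_gfp;			/* intentionally no initializer */

		if (!want_new_page())
			return -1;		/* huge_gfp is never read here */

		huge_gfp = compute_huge_gfp();	/* the only assignment */
		return charge(huge_gfp);	/* always preceded by the assignment */
	}

If a later change added a read of huge_gfp before the assignment, gcc's
-Wmaybe-uninitialized (with optimization enabled) could report it -- the
"correct warning" mentioned above.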

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH -v2] mm, memcg: sync allocation and memcg charge gfp flags for THP
  2015-03-18 16:09         ` Vlastimil Babka
@ 2015-03-18 16:14           ` Michal Hocko
  2015-04-03  1:41             ` [patch] mm, memcg: sync allocation and memcg charge gfp flags for thp fix fix David Rientjes
  2015-04-04  1:34             ` [PATCH -v2] mm, memcg: sync allocation and memcg charge gfp flags for THP David Rientjes
  0 siblings, 2 replies; 13+ messages in thread
From: Michal Hocko @ 2015-03-18 16:14 UTC (permalink / raw)
  To: Vlastimil Babka; +Cc: Andrew Morton, Johannes Weiner, linux-mm, LKML

Updated version
---
From 3d16e584223b17b55a232fe876a4c86abcc85c5a Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Wed, 18 Mar 2015 17:12:55 +0100
Subject: [PATCH] mm, memcg: sync allocation and memcg charge gfp flags for THP

memcg currently uses hardcoded GFP_TRANSHUGE gfp flags for all THP
charges. THP allocations, however, might be using different flags
depending on /sys/kernel/mm/transparent_hugepage/{,khugepaged/}defrag
and the current allocation context.

The primary difference is that defrag configured to the "madvise" value
will clear the __GFP_WAIT flag from the core gfp mask to make the
allocation lighter for all mappings which are not backed by VM_HUGEPAGE
vmas. If the memcg charge path ignores this fact we will get a light
allocation, but a potential memcg reclaim would kill the whole point of
the configuration.

Fix the mismatch by providing the same gfp mask used for the
allocation to the charge functions. This is quite easy for all
paths except for the khugepaged kernel thread with !CONFIG_NUMA, which
does a pre-allocation long before the allocated page is used in
collapse_huge_page via khugepaged_alloc_page. To avoid cluttering
the whole code path from khugepaged_do_scan, we simply return the
current flags as per the khugepaged_defrag() value, which might have
changed since the preallocation. If somebody changed the value of the
knob we would charge differently, but this shouldn't happen often and
it is definitely not critical because it would only lead to a reduced
success rate of one-off THP promotion.

Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Michal Hocko <mhocko@suse.cz>
---
 mm/huge_memory.c | 36 ++++++++++++++++++++----------------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 625eb556b509..03e8d12d499c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -708,7 +708,7 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
 static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 					struct vm_area_struct *vma,
 					unsigned long haddr, pmd_t *pmd,
-					struct page *page)
+					struct page *page, gfp_t gfp)
 {
 	struct mem_cgroup *memcg;
 	pgtable_t pgtable;
@@ -716,7 +716,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 
 	VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-	if (mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg))
+	if (mem_cgroup_try_charge(page, mm, gfp, &memcg))
 		return VM_FAULT_OOM;
 
 	pgtable = pte_alloc_one(mm, haddr);
@@ -822,7 +822,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
 	}
-	if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) {
+	if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) {
 		put_page(page);
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
@@ -1080,6 +1080,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long haddr;
 	unsigned long mmun_start;	/* For mmu_notifiers */
 	unsigned long mmun_end;		/* For mmu_notifiers */
+	gfp_t huge_gfp;			/* for allocation and charge */
 
 	ptl = pmd_lockptr(mm, pmd);
 	VM_BUG_ON_VMA(!vma->anon_vma, vma);
@@ -1106,10 +1107,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 alloc:
 	if (transparent_hugepage_enabled(vma) &&
 	    !transparent_hugepage_debug_cow()) {
-		gfp_t gfp;
-
-		gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
-		new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
+		huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
 	} else
 		new_page = NULL;
 
@@ -1131,7 +1130,7 @@ alloc:
 	}
 
 	if (unlikely(mem_cgroup_try_charge(new_page, mm,
-					   GFP_TRANSHUGE, &memcg))) {
+					   huge_gfp, &memcg))) {
 		put_page(new_page);
 		if (page) {
 			split_huge_page(page);
@@ -2354,16 +2353,14 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
 }
 
 static struct page
-*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+*khugepaged_alloc_page(struct page **hpage, gfp_t *gfp, struct mm_struct *mm,
 		       struct vm_area_struct *vma, unsigned long address,
 		       int node)
 {
-	gfp_t flags;
-
 	VM_BUG_ON_PAGE(*hpage, *hpage);
 
 	/* Only allocate from the target node */
-	flags = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
+	*gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
 	        __GFP_THISNODE;
 
 	/*
@@ -2374,7 +2371,7 @@ static struct page
 	 */
 	up_read(&mm->mmap_sem);
 
-	*hpage = alloc_pages_exact_node(node, flags, HPAGE_PMD_ORDER);
+	*hpage = alloc_pages_exact_node(node, *gfp, HPAGE_PMD_ORDER);
 	if (unlikely(!*hpage)) {
 		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
 		*hpage = ERR_PTR(-ENOMEM);
@@ -2428,12 +2425,18 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
 }
 
 static struct page
-*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+*khugepaged_alloc_page(struct page **hpage, gfp_t *gfp, struct mm_struct *mm,
 		       struct vm_area_struct *vma, unsigned long address,
 		       int node)
 {
 	up_read(&mm->mmap_sem);
 	VM_BUG_ON(!*hpage);
+
+	/*
+	 * khugepaged_alloc_hugepage is doing the preallocation, use the same
+	 * gfp flags here.
+	 */
+	*gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), 0);
 	return  *hpage;
 }
 #endif
@@ -2468,16 +2471,17 @@ static void collapse_huge_page(struct mm_struct *mm,
 	struct mem_cgroup *memcg;
 	unsigned long mmun_start;	/* For mmu_notifiers */
 	unsigned long mmun_end;		/* For mmu_notifiers */
+	gfp_t gfp;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
 	/* release the mmap_sem read lock. */
-	new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
+	new_page = khugepaged_alloc_page(hpage, &gfp, mm, vma, address, node);
 	if (!new_page)
 		return;
 
 	if (unlikely(mem_cgroup_try_charge(new_page, mm,
-					   GFP_TRANSHUGE, &memcg)))
+					   gfp, &memcg)))
 		return;
 
 	/*
-- 
2.1.4

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [patch] mm, memcg: sync allocation and memcg charge gfp flags for thp fix fix
  2015-03-18 16:14           ` [PATCH -v2] " Michal Hocko
@ 2015-04-03  1:41             ` David Rientjes
  2015-04-03  8:38               ` Vlastimil Babka
  2015-04-03 10:50               ` Michal Hocko
  2015-04-04  1:34             ` [PATCH -v2] mm, memcg: sync allocation and memcg charge gfp flags for THP David Rientjes
  1 sibling, 2 replies; 13+ messages in thread
From: David Rientjes @ 2015-04-03  1:41 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Michal Hocko, Vlastimil Babka, Johannes Weiner, linux-mm, linux-kernel

"mm, memcg: sync allocation and memcg charge gfp flags for THP" in -mm 
introduces a formal to pass the gfp mask for khugepaged's hugepage 
allocation.  This is just too ugly to live.

alloc_hugepage_gfpmask() cannot differ between NUMA and UMA configs by 
anything in GFP_RECLAIM_MASK, which is the only thing that matters for 
memcg reclaim, so just determine the gfp flags once in 
collapse_huge_page() and avoid the complexity.
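
Illustration of the argument, not part of the patch; a sketch that assumes
the mm/ internal symbols (GFP_RECLAIM_MASK and friends) are in scope, e.g.
inside mm/huge_memory.c:

	static void __maybe_unused check_khugepaged_gfp(void)
	{
		gfp_t numa_gfp = alloc_hugepage_gfpmask(khugepaged_defrag(),
					__GFP_OTHER_NODE) | __GFP_THISNODE;
		gfp_t uma_gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), 0);

		/* node placement hints are outside GFP_RECLAIM_MASK */
		VM_BUG_ON((numa_gfp & GFP_RECLAIM_MASK) !=
			  (uma_gfp & GFP_RECLAIM_MASK));
	}

Only the reclaim-relevant bits matter to mem_cgroup_try_charge(), so
computing the mask once in collapse_huge_page() loses nothing.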

Signed-off-by: David Rientjes <rientjes@google.com>
---
 -mm: intended to be folded into
      mm-memcg-sync-allocation-and-memcg-charge-gfp-flags-for-thp.patch

 mm/huge_memory.c | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2373,16 +2373,12 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
 }
 
 static struct page *
-khugepaged_alloc_page(struct page **hpage, gfp_t *gfp, struct mm_struct *mm,
+khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
 		       struct vm_area_struct *vma, unsigned long address,
 		       int node)
 {
 	VM_BUG_ON_PAGE(*hpage, *hpage);
 
-	/* Only allocate from the target node */
-	*gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
-	        __GFP_THISNODE;
-
 	/*
 	 * Before allocating the hugepage, release the mmap_sem read lock.
 	 * The allocation can take potentially a long time if it involves
@@ -2391,7 +2387,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t *gfp, struct mm_struct *mm,
 	 */
 	up_read(&mm->mmap_sem);
 
-	*hpage = alloc_pages_exact_node(node, *gfp, HPAGE_PMD_ORDER);
+	*hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER);
 	if (unlikely(!*hpage)) {
 		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
 		*hpage = ERR_PTR(-ENOMEM);
@@ -2445,18 +2441,13 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
 }
 
 static struct page *
-khugepaged_alloc_page(struct page **hpage, gfp_t *gfp, struct mm_struct *mm,
+khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
 		       struct vm_area_struct *vma, unsigned long address,
 		       int node)
 {
 	up_read(&mm->mmap_sem);
 	VM_BUG_ON(!*hpage);
 
-	/*
-	 * khugepaged_alloc_hugepage is doing the preallocation, use the same
-	 * gfp flags here.
-	 */
-	*gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), 0);
 	return  *hpage;
 }
 #endif
@@ -2495,8 +2486,12 @@ static void collapse_huge_page(struct mm_struct *mm,
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
+	/* Only allocate from the target node */
+	gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
+		__GFP_THISNODE;
+
 	/* release the mmap_sem read lock. */
-	new_page = khugepaged_alloc_page(hpage, &gfp, mm, vma, address, node);
+	new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node);
 	if (!new_page)
 		return;
 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch] mm, memcg: sync allocation and memcg charge gfp flags for thp fix fix
  2015-04-03  1:41             ` [patch] mm, memcg: sync allocation and memcg charge gfp flags for thp fix fix David Rientjes
@ 2015-04-03  8:38               ` Vlastimil Babka
  2015-04-03 10:50               ` Michal Hocko
  1 sibling, 0 replies; 13+ messages in thread
From: Vlastimil Babka @ 2015-04-03  8:38 UTC (permalink / raw)
  To: David Rientjes, Andrew Morton
  Cc: Michal Hocko, Johannes Weiner, linux-mm, linux-kernel

On 04/03/2015 03:41 AM, David Rientjes wrote:
> "mm, memcg: sync allocation and memcg charge gfp flags for THP" in -mm
> introduces a formal to pass the gfp mask for khugepaged's hugepage
> allocation.  This is just too ugly to live.
>
> alloc_hugepage_gfpmask() cannot differ between NUMA and UMA configs by
> anything in GFP_RECLAIM_MASK, which is the only thing that matters for
> memcg reclaim, so just determine the gfp flags once in
> collapse_huge_page() and avoid the complexity.
>
> Signed-off-by: David Rientjes <rientjes@google.com>

Acked-by: Vlastimil Babka <vbabka@suse.cz>


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch] mm, memcg: sync allocation and memcg charge gfp flags for thp fix fix
  2015-04-03  1:41             ` [patch] mm, memcg: sync allocation and memcg charge gfp flags for thp fix fix David Rientjes
  2015-04-03  8:38               ` Vlastimil Babka
@ 2015-04-03 10:50               ` Michal Hocko
  1 sibling, 0 replies; 13+ messages in thread
From: Michal Hocko @ 2015-04-03 10:50 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Vlastimil Babka, Johannes Weiner, linux-mm, linux-kernel

On Thu 02-04-15 18:41:18, David Rientjes wrote:
> "mm, memcg: sync allocation and memcg charge gfp flags for THP" in -mm 
> introduces a formal to pass the gfp mask for khugepaged's hugepage 
> allocation.  This is just too ugly to live.
> 
> alloc_hugepage_gfpmask() cannot differ between NUMA and UMA configs by 
> anything in GFP_RECLAIM_MASK, which is the only thing that matters for 
> memcg reclaim, so just determine the gfp flags once in 
> collapse_huge_page() and avoid the complexity.
> 
> Signed-off-by: David Rientjes <rientjes@google.com>

Thanks for this cleanup!

Acked-by: Michal Hocko <mhocko@suse.cz>
> ---
>  -mm: intended to be folded into
>       mm-memcg-sync-allocation-and-memcg-charge-gfp-flags-for-thp.patch
> 
>  mm/huge_memory.c | 21 ++++++++-------------
>  1 file changed, 8 insertions(+), 13 deletions(-)
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2373,16 +2373,12 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
>  }
>  
>  static struct page *
> -khugepaged_alloc_page(struct page **hpage, gfp_t *gfp, struct mm_struct *mm,
> +khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
>  		       struct vm_area_struct *vma, unsigned long address,
>  		       int node)
>  {
>  	VM_BUG_ON_PAGE(*hpage, *hpage);
>  
> -	/* Only allocate from the target node */
> -	*gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
> -	        __GFP_THISNODE;
> -
>  	/*
>  	 * Before allocating the hugepage, release the mmap_sem read lock.
>  	 * The allocation can take potentially a long time if it involves
> @@ -2391,7 +2387,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t *gfp, struct mm_struct *mm,
>  	 */
>  	up_read(&mm->mmap_sem);
>  
> -	*hpage = alloc_pages_exact_node(node, *gfp, HPAGE_PMD_ORDER);
> +	*hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER);
>  	if (unlikely(!*hpage)) {
>  		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
>  		*hpage = ERR_PTR(-ENOMEM);
> @@ -2445,18 +2441,13 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
>  }
>  
>  static struct page *
> -khugepaged_alloc_page(struct page **hpage, gfp_t *gfp, struct mm_struct *mm,
> +khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
>  		       struct vm_area_struct *vma, unsigned long address,
>  		       int node)
>  {
>  	up_read(&mm->mmap_sem);
>  	VM_BUG_ON(!*hpage);
>  
> -	/*
> -	 * khugepaged_alloc_hugepage is doing the preallocation, use the same
> -	 * gfp flags here.
> -	 */
> -	*gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), 0);
>  	return  *hpage;
>  }
>  #endif
> @@ -2495,8 +2486,12 @@ static void collapse_huge_page(struct mm_struct *mm,
>  
>  	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
>  
> +	/* Only allocate from the target node */
> +	gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
> +		__GFP_THISNODE;
> +
>  	/* release the mmap_sem read lock. */
> -	new_page = khugepaged_alloc_page(hpage, &gfp, mm, vma, address, node);
> +	new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node);
>  	if (!new_page)
>  		return;
>  
> 

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH -v2] mm, memcg: sync allocation and memcg charge gfp flags for THP
  2015-03-18 16:14           ` [PATCH -v2] " Michal Hocko
  2015-04-03  1:41             ` [patch] mm, memcg: sync allocation and memcg charge gfp flags for thp fix fix David Rientjes
@ 2015-04-04  1:34             ` David Rientjes
  2015-04-07 12:19               ` Michal Hocko
  1 sibling, 1 reply; 13+ messages in thread
From: David Rientjes @ 2015-04-04  1:34 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Vlastimil Babka, Andrew Morton, Johannes Weiner, linux-mm, LKML

On Wed, 18 Mar 2015, Michal Hocko wrote:

> memcg currently uses hardcoded GFP_TRANSHUGE gfp flags for all THP
> charges. THP allocations, however, might be using different flags
> depending on /sys/kernel/mm/transparent_hugepage/{,khugepaged/}defrag
> and the current allocation context.
> 
> The primary difference is that defrag configured to "madvise" value will
> clear __GFP_WAIT flag from the core gfp mask to make the allocation
> lighter for all mappings which are not backed by VM_HUGEPAGE vmas.
> If memcg charge path ignores this fact we will get light allocation but
> a potential memcg reclaim would kill the whole point of the
> configuration.
> 
> Fix the mismatch by providing the same gfp mask used for the
> allocation to the charge functions. This is quite easy for all
> paths except for the khugepaged kernel thread with !CONFIG_NUMA which is
> doing a pre-allocation long before the allocated page is used in
> collapse_huge_page via khugepaged_alloc_page. To prevent from cluttering
> the whole code path from khugepaged_do_scan we simply return the current
> flags as per khugepaged_defrag() value which might have changed since
> the preallocation. If somebody changed the value of the knob we would
> charge differently but this shouldn't happen often and it is definitely
> not critical because it would only lead to a reduced success rate of
> one-off THP promotion.
> 
> Acked-by: Vlastimil Babka <vbabka@suse.cz>
> Signed-off-by: Michal Hocko <mhocko@suse.cz>

Acked-by: David Rientjes <rientjes@google.com>

I'm slightly surprised that this issue never got reported before.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH -v2] mm, memcg: sync allocation and memcg charge gfp flags for THP
  2015-04-04  1:34             ` [PATCH -v2] mm, memcg: sync allocation and memcg charge gfp flags for THP David Rientjes
@ 2015-04-07 12:19               ` Michal Hocko
  0 siblings, 0 replies; 13+ messages in thread
From: Michal Hocko @ 2015-04-07 12:19 UTC (permalink / raw)
  To: David Rientjes
  Cc: Vlastimil Babka, Andrew Morton, Johannes Weiner, linux-mm, LKML

On Fri 03-04-15 18:34:18, David Rientjes wrote:
> On Wed, 18 Mar 2015, Michal Hocko wrote:
> 
> > memcg currently uses hardcoded GFP_TRANSHUGE gfp flags for all THP
> > charges. THP allocations, however, might be using different flags
> > depending on /sys/kernel/mm/transparent_hugepage/{,khugepaged/}defrag
> > and the current allocation context.
> > 
> > The primary difference is that defrag configured to "madvise" value will
> > clear __GFP_WAIT flag from the core gfp mask to make the allocation
> > lighter for all mappings which are not backed by VM_HUGEPAGE vmas.
> > If memcg charge path ignores this fact we will get light allocation but
> > a potential memcg reclaim would kill the whole point of the
> > configuration.
> > 
> > Fix the mismatch by providing the same gfp mask used for the
> > allocation to the charge functions. This is quite easy for all
> > paths except for hugepaged kernel thread with !CONFIG_NUMA which is
> > doing a pre-allocation long before the allocated page is used in
> > collapse_huge_page via khugepaged_alloc_page. To prevent from cluttering
> > the whole code path from khugepaged_do_scan we simply return the current
> > flags as per khugepaged_defrag() value which might have changed since
> > the preallocation. If somebody changed the value of the knob we would
> > charge differently but this shouldn't happen often and it is definitely
> > not critical because it would only lead to a reduced success rate of
> > one-off THP promotion.
> > 
> > Acked-by: Vlastimil Babka <vbabka@suse.cz>
> > Signed-off-by: Michal Hocko <mhocko@suse.cz>
> 
> Acked-by: David Rientjes <rientjes@google.com>

Thanks!

> I'm slightly surprised that this issue never got reported before.

I am afraid not many people are familiar with the effect of
/sys/kernel/mm/transparent_hugepage/{,khugepaged/}defrag knob(s).

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch] mm, memcg: sync allocation and memcg charge gfp flags for thp fix fix
       [not found] <058201d06de5$9e15edc0$da41c940$@alibaba-inc.com>
@ 2015-04-03  8:14 ` Hillf Danton
  0 siblings, 0 replies; 13+ messages in thread
From: Hillf Danton @ 2015-04-03  8:14 UTC (permalink / raw)
  To: David Rientjes; +Cc: linux-mm, linux-kernel, Andrew Morton

> 
> "mm, memcg: sync allocation and memcg charge gfp flags for THP" in -mm
> introduces a formal to pass the gfp mask for khugepaged's hugepage
> allocation.  This is just too ugly to live.
> 
> alloc_hugepage_gfpmask() cannot differ between NUMA and UMA configs by
> anything in GFP_RECLAIM_MASK, which is the only thing that matters for
> memcg reclaim, so just determine the gfp flags once in
> collapse_huge_page() and avoid the complexity.
> 
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>

>  -mm: intended to be folded into
>       mm-memcg-sync-allocation-and-memcg-charge-gfp-flags-for-thp.patch
> 
>  mm/huge_memory.c | 21 ++++++++-------------
>  1 file changed, 8 insertions(+), 13 deletions(-)
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2373,16 +2373,12 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
>  }
> 
>  static struct page *
> -khugepaged_alloc_page(struct page **hpage, gfp_t *gfp, struct mm_struct *mm,
> +khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
>  		       struct vm_area_struct *vma, unsigned long address,
>  		       int node)
>  {
>  	VM_BUG_ON_PAGE(*hpage, *hpage);
> 
> -	/* Only allocate from the target node */
> -	*gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
> -	        __GFP_THISNODE;
> -
>  	/*
>  	 * Before allocating the hugepage, release the mmap_sem read lock.
>  	 * The allocation can take potentially a long time if it involves
> @@ -2391,7 +2387,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t *gfp, struct mm_struct *mm,
>  	 */
>  	up_read(&mm->mmap_sem);
> 
> -	*hpage = alloc_pages_exact_node(node, *gfp, HPAGE_PMD_ORDER);
> +	*hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER);
>  	if (unlikely(!*hpage)) {
>  		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
>  		*hpage = ERR_PTR(-ENOMEM);
> @@ -2445,18 +2441,13 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
>  }
> 
>  static struct page *
> -khugepaged_alloc_page(struct page **hpage, gfp_t *gfp, struct mm_struct *mm,
> +khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
>  		       struct vm_area_struct *vma, unsigned long address,
>  		       int node)
>  {
>  	up_read(&mm->mmap_sem);
>  	VM_BUG_ON(!*hpage);
> 
> -	/*
> -	 * khugepaged_alloc_hugepage is doing the preallocation, use the same
> -	 * gfp flags here.
> -	 */
> -	*gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), 0);
>  	return  *hpage;
>  }
>  #endif
> @@ -2495,8 +2486,12 @@ static void collapse_huge_page(struct mm_struct *mm,
> 
>  	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> 
> +	/* Only allocate from the target node */
> +	gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
> +		__GFP_THISNODE;
> +
>  	/* release the mmap_sem read lock. */
> -	new_page = khugepaged_alloc_page(hpage, &gfp, mm, vma, address, node);
> +	new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node);
>  	if (!new_page)
>  		return;
> 
> 



^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread

Thread overview: 13+ messages
2015-03-16 14:08 [PATCH] mm, memcg: sync allocation and memcg charge gfp flags for THP Michal Hocko
2015-03-18 14:34 ` Vlastimil Babka
2015-03-18 15:02   ` Michal Hocko
2015-03-18 15:40     ` Vlastimil Babka
2015-03-18 15:59       ` Michal Hocko
2015-03-18 16:09         ` Vlastimil Babka
2015-03-18 16:14           ` [PATCH -v2] " Michal Hocko
2015-04-03  1:41             ` [patch] mm, memcg: sync allocation and memcg charge gfp flags for thp fix fix David Rientjes
2015-04-03  8:38               ` Vlastimil Babka
2015-04-03 10:50               ` Michal Hocko
2015-04-04  1:34             ` [PATCH -v2] mm, memcg: sync allocation and memcg charge gfp flags for THP David Rientjes
2015-04-07 12:19               ` Michal Hocko
     [not found] <058201d06de5$9e15edc0$da41c940$@alibaba-inc.com>
2015-04-03  8:14 ` [patch] mm, memcg: sync allocation and memcg charge gfp flags for thp fix fix Hillf Danton
