linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [patch for-3.17] mm, thp: fix collapsing of hugepages on madvise
@ 2014-10-05  2:48 David Rientjes
  2014-10-05 17:15 ` Linus Torvalds
                   ` (3 more replies)
  0 siblings, 4 replies; 8+ messages in thread
From: David Rientjes @ 2014-10-05  2:48 UTC (permalink / raw)
  To: Andrew Morton, Linus Torvalds
  Cc: Kirill A. Shutemov, Andrea Arcangeli, Suleiman Souhlal, stable,
	linux-kernel, linux-mm

If an anonymous mapping is not allowed to fault thp memory and then
madvise(MADV_HUGEPAGE) is used after fault, khugepaged will never
collapse this memory into thp memory.

This occurs because the madvise(2) handler for thp, hugepage_advise(),
clears VM_NOHUGEPAGE on the stack and it isn't stored in vma->vm_flags
until the final action of madvise_behavior().  This causes the
khugepaged_enter_vma_merge() to be a no-op in hugepage_advise() when the
vma had previously had VM_NOHUGEPAGE set.

Fix this by passing the correct vma flags to the khugepaged mm slot
handler.  There's no chance khugepaged can run on this vma until after
madvise_behavior() returns since we hold mm->mmap_sem.

It would be possible to clear VM_NOHUGEPAGE directly from vma->vm_flags
in hugepage_advise(), but I didn't want to introduce special case
behavior into madvise_behavior().  I think it's best to just let it
always set vma->vm_flags itself.

Cc: <stable@vger.kernel.org>
Reported-by: Suleiman Souhlal <suleiman@google.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/khugepaged.h | 17 ++++++++++-------
 mm/huge_memory.c           | 11 ++++++-----
 mm/mmap.c                  |  8 ++++----
 3 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -6,7 +6,8 @@
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern int __khugepaged_enter(struct mm_struct *mm);
 extern void __khugepaged_exit(struct mm_struct *mm);
-extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma);
+extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
+				      unsigned long vm_flags);
 
 #define khugepaged_enabled()					       \
 	(transparent_hugepage_flags &				       \
@@ -35,13 +36,13 @@ static inline void khugepaged_exit(struct mm_struct *mm)
 		__khugepaged_exit(mm);
 }
 
-static inline int khugepaged_enter(struct vm_area_struct *vma)
+static inline int khugepaged_enter(struct vm_area_struct *vma,
+				   unsigned long vm_flags)
 {
 	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags))
 		if ((khugepaged_always() ||
-		     (khugepaged_req_madv() &&
-		      vma->vm_flags & VM_HUGEPAGE)) &&
-		    !(vma->vm_flags & VM_NOHUGEPAGE))
+		     (khugepaged_req_madv() && (vm_flags & VM_HUGEPAGE))) &&
+		    !(vm_flags & VM_NOHUGEPAGE))
 			if (__khugepaged_enter(vma->vm_mm))
 				return -ENOMEM;
 	return 0;
@@ -54,11 +55,13 @@ static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 static inline void khugepaged_exit(struct mm_struct *mm)
 {
 }
-static inline int khugepaged_enter(struct vm_area_struct *vma)
+static inline int khugepaged_enter(struct vm_area_struct *vma,
+				   unsigned long vm_flags)
 {
 	return 0;
 }
-static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
+static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
+					     unsigned long vm_flags)
 {
 	return 0;
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -803,7 +803,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		return VM_FAULT_FALLBACK;
 	if (unlikely(anon_vma_prepare(vma)))
 		return VM_FAULT_OOM;
-	if (unlikely(khugepaged_enter(vma)))
+	if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
 		return VM_FAULT_OOM;
 	if (!(flags & FAULT_FLAG_WRITE) &&
 			transparent_hugepage_use_zero_page()) {
@@ -1970,7 +1970,7 @@ int hugepage_madvise(struct vm_area_struct *vma,
 		 * register it here without waiting a page fault that
 		 * may not happen any time soon.
 		 */
-		if (unlikely(khugepaged_enter_vma_merge(vma)))
+		if (unlikely(khugepaged_enter_vma_merge(vma, *vm_flags)))
 			return -ENOMEM;
 		break;
 	case MADV_NOHUGEPAGE:
@@ -2071,7 +2071,8 @@ int __khugepaged_enter(struct mm_struct *mm)
 	return 0;
 }
 
-int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
+int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
+			       unsigned long vm_flags)
 {
 	unsigned long hstart, hend;
 	if (!vma->anon_vma)
@@ -2083,11 +2084,11 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
 	if (vma->vm_ops)
 		/* khugepaged not yet working on file or special mappings */
 		return 0;
-	VM_BUG_ON(vma->vm_flags & VM_NO_THP);
+	VM_BUG_ON(vm_flags & VM_NO_THP);
 	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
 	hend = vma->vm_end & HPAGE_PMD_MASK;
 	if (hstart < hend)
-		return khugepaged_enter(vma);
+		return khugepaged_enter(vma, vm_flags);
 	return 0;
 }
 
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1056,7 +1056,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 				end, prev->vm_pgoff, NULL);
 		if (err)
 			return NULL;
-		khugepaged_enter_vma_merge(prev);
+		khugepaged_enter_vma_merge(prev, vm_flags);
 		return prev;
 	}
 
@@ -1075,7 +1075,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 				next->vm_pgoff - pglen, NULL);
 		if (err)
 			return NULL;
-		khugepaged_enter_vma_merge(area);
+		khugepaged_enter_vma_merge(area, vm_flags);
 		return area;
 	}
 
@@ -2192,7 +2192,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 		}
 	}
 	vma_unlock_anon_vma(vma);
-	khugepaged_enter_vma_merge(vma);
+	khugepaged_enter_vma_merge(vma, vma->vm_flags);
 	validate_mm(vma->vm_mm);
 	return error;
 }
@@ -2261,7 +2261,7 @@ int expand_downwards(struct vm_area_struct *vma,
 		}
 	}
 	vma_unlock_anon_vma(vma);
-	khugepaged_enter_vma_merge(vma);
+	khugepaged_enter_vma_merge(vma, vma->vm_flags);
 	validate_mm(vma->vm_mm);
 	return error;
 }

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [patch for-3.17] mm, thp: fix collapsing of hugepages on madvise
  2014-10-05  2:48 [patch for-3.17] mm, thp: fix collapsing of hugepages on madvise David Rientjes
@ 2014-10-05 17:15 ` Linus Torvalds
  2014-10-05 18:41 ` Kirill A. Shutemov
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 8+ messages in thread
From: Linus Torvalds @ 2014-10-05 17:15 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Kirill A. Shutemov, Andrea Arcangeli,
	Suleiman Souhlal, stable, Linux Kernel Mailing List, linux-mm

On Sat, Oct 4, 2014 at 7:48 PM, David Rientjes <rientjes@google.com> wrote:
>
> This occurs because the madvise(2) handler for thp, hugepage_advise(),
> clears VM_NOHUGEPAGE on the stack and it isn't stored in vma->vm_flags
> until the final action of madvise_behavior().  This causes the
> khugepaged_enter_vma_merge() to be a no-op in hugepage_advise() when the
> vma had previously had VM_NOHUGEPAGE set.

So color me confused, and when I'm confused I don't apply patches. But
there's no "hugepage_advise()" in my source tree, and quite frankly, I
also don't like how you now separately pass in vm_flags that always
*should* be the same as vma->vm_flags.

Maybe this is against -mm, but it's marked for stable and sent to me,
so I'm piping up about my lack of applying this.

                Linus

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [patch for-3.17] mm, thp: fix collapsing of hugepages on madvise
  2014-10-05  2:48 [patch for-3.17] mm, thp: fix collapsing of hugepages on madvise David Rientjes
  2014-10-05 17:15 ` Linus Torvalds
@ 2014-10-05 18:41 ` Kirill A. Shutemov
  2014-10-05 18:51   ` Linus Torvalds
  2014-10-06  9:42   ` David Rientjes
  2014-10-06 15:03 ` Kirill A. Shutemov
  2014-10-15 21:13 ` [patch resend] " David Rientjes
  3 siblings, 2 replies; 8+ messages in thread
From: Kirill A. Shutemov @ 2014-10-05 18:41 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Linus Torvalds, Kirill A. Shutemov,
	Andrea Arcangeli, Suleiman Souhlal, stable, linux-kernel,
	linux-mm

On Sat, Oct 04, 2014 at 07:48:04PM -0700, David Rientjes wrote:
> If an anonymous mapping is not allowed to fault thp memory and then
> madvise(MADV_HUGEPAGE) is used after fault, khugepaged will never
> collapse this memory into thp memory.
> 
> This occurs because the madvise(2) handler for thp, hugepage_advise(),
> clears VM_NOHUGEPAGE on the stack and it isn't stored in vma->vm_flags
> until the final action of madvise_behavior().  This causes the
> khugepaged_enter_vma_merge() to be a no-op in hugepage_advise() when the
> vma had previously had VM_NOHUGEPAGE set.
> 
> Fix this by passing the correct vma flags to the khugepaged mm slot
> handler.  There's no chance khugepaged can run on this vma until after
> madvise_behavior() returns since we hold mm->mmap_sem.
> 
> It would be possible to clear VM_NOHUGEPAGE directly from vma->vm_flags
> in hugepage_advise(), but I didn't want to introduce special case
> behavior into madvise_behavior().  I think it's best to just let it
> always set vma->vm_flags itself.
> 
> Cc: <stable@vger.kernel.org>
> Reported-by: Suleiman Souhlal <suleiman@google.com>
> Signed-off-by: David Rientjes <rientjes@google.com>

Looks like a rather complex fix for a not-that-complex bug.
What about untested patch below?

From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Sun, 5 Oct 2014 21:22:43 +0300
Subject: [PATCH] thp: fix registering VMA into khugepaged on
 madvise(MADV_HUGEPAGE)

hugepage_madvise() tries to register VMA into khugepaged with
khugepaged_enter_vma_merge() on madvise(MADV_HUGEPAGE). Unfortunately
it's effectively a nop, since khugepaged_enter_vma_merge() relies on
vma->vm_flags, which has not yet been updated by the time of
hugepage_madvise().

Let's move khugepaged_enter_vma_merge() to the end of madvise_behavior().
Now we also have a chance to catch VMAs which become good for THP after
vma_merge().

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 mm/huge_memory.c | 8 +++-----
 mm/madvise.c     | 6 ++++++
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f8ffd9412ec5..f84d52158a66 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1966,12 +1966,10 @@ int hugepage_madvise(struct vm_area_struct *vma,
 		*vm_flags &= ~VM_NOHUGEPAGE;
 		*vm_flags |= VM_HUGEPAGE;
 		/*
-		 * If the vma become good for khugepaged to scan,
-		 * register it here without waiting a page fault that
-		 * may not happen any time soon.
+		 * vma->vm_flags is not yet updated here. madvise_behavior()
+		 * will take care to register it in khugepaged once flags
+		 * updated.
 		 */
-		if (unlikely(khugepaged_enter_vma_merge(vma)))
-			return -ENOMEM;
 		break;
 	case MADV_NOHUGEPAGE:
 		/*
diff --git a/mm/madvise.c b/mm/madvise.c
index 0938b30da4ab..60effd2c5e9c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -128,6 +128,12 @@ success:
 	 */
 	vma->vm_flags = new_flags;
 
+	/*
+	 * If the vma become good for khugepaged to scan, register it here
+	 * without waiting a page fault that may not happen any time soon.
+	 */
+	if (unlikely(khugepaged_enter_vma_merge(vma)))
+		error = -ENOMEM;
 out:
 	if (error == -ENOMEM)
 		error = -EAGAIN;
-- 
 Kirill A. Shutemov

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [patch for-3.17] mm, thp: fix collapsing of hugepages on madvise
  2014-10-05 18:41 ` Kirill A. Shutemov
@ 2014-10-05 18:51   ` Linus Torvalds
  2014-10-06  9:42   ` David Rientjes
  1 sibling, 0 replies; 8+ messages in thread
From: Linus Torvalds @ 2014-10-05 18:51 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: David Rientjes, Andrew Morton, Kirill A. Shutemov,
	Andrea Arcangeli, Suleiman Souhlal, stable,
	Linux Kernel Mailing List, linux-mm

On Sun, Oct 5, 2014 at 11:41 AM, Kirill A. Shutemov
<kirill@shutemov.name> wrote:
>
> Look like rather complex fix for a not that complex bug.
> What about untested patch below?

Much nicer. This will miss 3.17 because I'm cutting it today and it's
too late to have this discussion for that, but I think I'd prefer to
merge this instead later..

              Linus

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [patch for-3.17] mm, thp: fix collapsing of hugepages on madvise
  2014-10-05 18:41 ` Kirill A. Shutemov
  2014-10-05 18:51   ` Linus Torvalds
@ 2014-10-06  9:42   ` David Rientjes
  1 sibling, 0 replies; 8+ messages in thread
From: David Rientjes @ 2014-10-06  9:42 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Andrew Morton, Linus Torvalds, Kirill A. Shutemov,
	Andrea Arcangeli, Suleiman Souhlal, stable, linux-kernel,
	linux-mm

On Sun, 5 Oct 2014, Kirill A. Shutemov wrote:

> On Sat, Oct 04, 2014 at 07:48:04PM -0700, David Rientjes wrote:
> > If an anonymous mapping is not allowed to fault thp memory and then
> > madvise(MADV_HUGEPAGE) is used after fault, khugepaged will never
> > collapse this memory into thp memory.
> > 
> > This occurs because the madvise(2) handler for thp, hugepage_advise(),
> > clears VM_NOHUGEPAGE on the stack and it isn't stored in vma->vm_flags
> > until the final action of madvise_behavior().  This causes the
> > khugepaged_enter_vma_merge() to be a no-op in hugepage_advise() when the

This should be hugepage_madvise().

> > vma had previously had VM_NOHUGEPAGE set.
> > 
> > Fix this by passing the correct vma flags to the khugepaged mm slot
> > handler.  There's no chance khugepaged can run on this vma until after
> > madvise_behavior() returns since we hold mm->mmap_sem.
> > 
> > It would be possible to clear VM_NOHUGEPAGE directly from vma->vm_flags
> > in hugepage_advise(), but I didn't want to introduce special case
> > behavior into madvise_behavior().  I think it's best to just let it
> > always set vma->vm_flags itself.
> > 
> > Cc: <stable@vger.kernel.org>
> > Reported-by: Suleiman Souhlal <suleiman@google.com>
> > Signed-off-by: David Rientjes <rientjes@google.com>
> 
> Look like rather complex fix for a not that complex bug.
> What about untested patch below?
> 
> From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> Date: Sun, 5 Oct 2014 21:22:43 +0300
> Subject: [PATCH] thp: fix registering VMA into khugepaged on
>  madvise(MADV_HUGEPAGE)
> 
> hugepage_madvise() tries to register VMA into khugepaged with
> khugepaged_enter_vma_merge() on madvise(MADV_HUGEPAGE). Unfortunately
> it's effectevely nop, since khugepaged_enter_vma_merge() rely on
> vma->vm_flags which has not yet updated by the time of
> hugepage_madvise().
> 
> Let's move khugepaged_enter_vma_merge() to the end of madvise_behavior().
> Now we also have chance to catch VMAs which become good for THP after
> vma_merge().
> 
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>  mm/huge_memory.c | 8 +++-----
>  mm/madvise.c     | 6 ++++++
>  2 files changed, 9 insertions(+), 5 deletions(-)
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index f8ffd9412ec5..f84d52158a66 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1966,12 +1966,10 @@ int hugepage_madvise(struct vm_area_struct *vma,
>  		*vm_flags &= ~VM_NOHUGEPAGE;
>  		*vm_flags |= VM_HUGEPAGE;
>  		/*
> -		 * If the vma become good for khugepaged to scan,
> -		 * register it here without waiting a page fault that
> -		 * may not happen any time soon.
> +		 * vma->vm_flags is not yet updated here. madvise_behavior()
> +		 * will take care to register it in khugepaged once flags
> +		 * updated.
>  		 */
> -		if (unlikely(khugepaged_enter_vma_merge(vma)))
> -			return -ENOMEM;
>  		break;
>  	case MADV_NOHUGEPAGE:
>  		/*
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 0938b30da4ab..60effd2c5e9c 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -128,6 +128,12 @@ success:
>  	 */
>  	vma->vm_flags = new_flags;
>  
> +	/*
> +	 * If the vma become good for khugepaged to scan, register it here
> +	 * without waiting a page fault that may not happen any time soon.
> +	 */
> +	if (unlikely(khugepaged_enter_vma_merge(vma)))
> +		error = -ENOMEM;
>  out:
>  	if (error == -ENOMEM)
>  		error = -EAGAIN;

I'm pretty sure this won't compile, but I'm also pretty sure it's easy to 
come up with an madvise() bit for anon vmas that would cause the BUG_ON() 
to trigger for CONFIG_DEBUG_VM and unnecessarily do alloc_mm_slot() for 
madvise() calls that aren't MADV_HUGEPAGE with this that go through the 
madvise_behavior() path, and for that reason it's probably not as 
extendable as we'd like.  I can verify this tomorrow if you'd like.  This 
is the point of the last paragraph of my changelog to isolate all thp 
behavior changes to MADV_HUGEPAGE and MADV_NOHUGEPAGE in one place as it's 
currently done and not add any special handling in madvise_behavior().

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [patch for-3.17] mm, thp: fix collapsing of hugepages on madvise
  2014-10-05  2:48 [patch for-3.17] mm, thp: fix collapsing of hugepages on madvise David Rientjes
  2014-10-05 17:15 ` Linus Torvalds
  2014-10-05 18:41 ` Kirill A. Shutemov
@ 2014-10-06 15:03 ` Kirill A. Shutemov
  2014-10-06 20:53   ` David Rientjes
  2014-10-15 21:13 ` [patch resend] " David Rientjes
  3 siblings, 1 reply; 8+ messages in thread
From: Kirill A. Shutemov @ 2014-10-06 15:03 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Linus Torvalds, Kirill A. Shutemov,
	Andrea Arcangeli, Suleiman Souhlal, stable, linux-kernel,
	linux-mm

On Sat, Oct 04, 2014 at 07:48:04PM -0700, David Rientjes wrote:
> If an anonymous mapping is not allowed to fault thp memory and then
> madvise(MADV_HUGEPAGE) is used after fault, khugepaged will never
> collapse this memory into thp memory.
> 
> This occurs because the madvise(2) handler for thp, hugepage_advise(),
> clears VM_NOHUGEPAGE on the stack and it isn't stored in vma->vm_flags
> until the final action of madvise_behavior().  This causes the
> khugepaged_enter_vma_merge() to be a no-op in hugepage_advise() when the
> vma had previously had VM_NOHUGEPAGE set.
> 
> Fix this by passing the correct vma flags to the khugepaged mm slot
> handler.  There's no chance khugepaged can run on this vma until after
> madvise_behavior() returns since we hold mm->mmap_sem.
> 
> It would be possible to clear VM_NOHUGEPAGE directly from vma->vm_flags
> in hugepage_advise(), but I didn't want to introduce special case
> behavior into madvise_behavior().  I think it's best to just let it
> always set vma->vm_flags itself.
> 
> Cc: <stable@vger.kernel.org>
> Reported-by: Suleiman Souhlal <suleiman@google.com>
> Signed-off-by: David Rientjes <rientjes@google.com>

Okay, I've looked once again and it seems your approach is better.
Although, I don't like that we need to pass down vma->vm_flags to every
khugepaged_enter() and khugepaged_enter_vma_merge().

My proposal is below. Build-tested only.

And I don't think this is subject for stable@: no crash or serious
misbehaviour. Registering to khugepaged is postponed until first page
fault. Not a big deal.

>From 6434d87a313317bcbc98313794840c366ba39ba1 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Mon, 6 Oct 2014 17:52:17 +0300
Subject: [PATCH] thp: fix registering VMA into khugepaged on 
 madvise(MADV_HUGEPAGE)

hugepage_madvise() tries to register VMA into khugepaged with
khugepaged_enter_vma_merge() on madvise(MADV_HUGEPAGE). Unfortunately
it's effectively a nop, since khugepaged_enter_vma_merge() relies on
vma->vm_flags, which has not yet been updated by the time of
hugepage_madvise().

Let's create a variant of khugepaged_enter_vma_merge() which takes
external vm_flags to consider. At the moment there's only two users for
such function: hugepage_madvise() and vma_merge().

Reported-by: Suleiman Souhlal <suleiman@google.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 include/linux/khugepaged.h |  9 +++++++++
 mm/huge_memory.c           | 27 ++++++++++++++++++++-------
 mm/mmap.c                  |  6 +++---
 3 files changed, 32 insertions(+), 10 deletions(-)

diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index 6b394f0b5148..bb25e323ee2d 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -7,6 +7,8 @@
 extern int __khugepaged_enter(struct mm_struct *mm);
 extern void __khugepaged_exit(struct mm_struct *mm);
 extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma);
+extern int __khugepaged_enter_vma_merge(struct vm_area_struct *vma,
+		vm_flags_t vm_flags);
 
 #define khugepaged_enabled()					       \
 	(transparent_hugepage_flags &				       \
@@ -62,6 +64,13 @@ static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
 {
 	return 0;
 }
+
+static inline int __khugepaged_enter_vma_merge(struct vm_area_struct *vma,
+		vm_flags_t vm_flags)
+{
+	return 0;
+}
+
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #endif /* _LINUX_KHUGEPAGED_H */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 017aff657ef5..4b0afc076827 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1944,7 +1944,7 @@ out:
 #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
 
 int hugepage_madvise(struct vm_area_struct *vma,
-		     unsigned long *vm_flags, int advice)
+		     vm_flags_t *vm_flags, int advice)
 {
 	switch (advice) {
 	case MADV_HUGEPAGE:
@@ -1969,8 +1969,7 @@ int hugepage_madvise(struct vm_area_struct *vma,
 		 * register it here without waiting a page fault that
 		 * may not happen any time soon.
 		 */
-		if (unlikely(khugepaged_enter_vma_merge(vma)))
-			return -ENOMEM;
+		return __khugepaged_enter_vma_merge(vma, *vm_flags);
 		break;
 	case MADV_NOHUGEPAGE:
 		/*
@@ -2070,7 +2069,8 @@ int __khugepaged_enter(struct mm_struct *mm)
 	return 0;
 }
 
-int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
+int __khugepaged_enter_vma_merge(struct vm_area_struct *vma,
+		vm_flags_t vm_flags)
 {
 	unsigned long hstart, hend;
 	if (!vma->anon_vma)
@@ -2082,14 +2082,27 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
 	if (vma->vm_ops)
 		/* khugepaged not yet working on file or special mappings */
 		return 0;
-	VM_BUG_ON(vma->vm_flags & VM_NO_THP);
+	VM_BUG_ON(vm_flags & VM_NO_THP);
 	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
 	hend = vma->vm_end & HPAGE_PMD_MASK;
-	if (hstart < hend)
-		return khugepaged_enter(vma);
+	if (hstart >= hend)
+		return 0;
+	if (test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags))
+		return 0;
+	if (vm_flags & VM_NOHUGEPAGE)
+		return 0;
+	if (khugepaged_always())
+		return __khugepaged_enter(vma->vm_mm);
+	if (khugepaged_req_madv() && vm_flags & VM_HUGEPAGE)
+		return __khugepaged_enter(vma->vm_mm);
 	return 0;
 }
 
+int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
+{
+	return __khugepaged_enter_vma_merge(vma, vma->vm_flags);
+}
+
 void __khugepaged_exit(struct mm_struct *mm)
 {
 	struct mm_slot *mm_slot;
diff --git a/mm/mmap.c b/mm/mmap.c
index c0a3637cdb64..19fee85f0b12 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1009,8 +1009,8 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
  */
 struct vm_area_struct *vma_merge(struct mm_struct *mm,
 			struct vm_area_struct *prev, unsigned long addr,
-			unsigned long end, unsigned long vm_flags,
-		     	struct anon_vma *anon_vma, struct file *file,
+			unsigned long end, vm_flags_t vm_flags,
+			struct anon_vma *anon_vma, struct file *file,
 			pgoff_t pgoff, struct mempolicy *policy)
 {
 	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
@@ -1056,7 +1056,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 				end, prev->vm_pgoff, NULL);
 		if (err)
 			return NULL;
-		khugepaged_enter_vma_merge(prev);
+		__khugepaged_enter_vma_merge(prev, vm_flags);
 		return prev;
 	}
 
-- 
 Kirill A. Shutemov

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [patch for-3.17] mm, thp: fix collapsing of hugepages on madvise
  2014-10-06 15:03 ` Kirill A. Shutemov
@ 2014-10-06 20:53   ` David Rientjes
  0 siblings, 0 replies; 8+ messages in thread
From: David Rientjes @ 2014-10-06 20:53 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Andrew Morton, Linus Torvalds, Kirill A. Shutemov,
	Andrea Arcangeli, Suleiman Souhlal, stable, linux-kernel,
	linux-mm

On Mon, 6 Oct 2014, Kirill A. Shutemov wrote:

> Okay, I've looked once again and it seems your approach is better.
> Although, I don't like that we need to pass down vma->vm_flags to every
> khugepaged_enter() and khugepaged_enter_vma_merge().
> 
> My proposal is below. Build-tested only.
> 
> And I don't think this is subject for stable@: no crash or serious
> misbehaviour. Registering to khugepaged is postponed until first page
> fault. Not a big deal.
> 

The simple testcase that first discovered this issue does an 
MADV_NOHUGEPAGE over a large length of memory prior to fault.  There are 
users that do this same thing, do mprotect(PROT_NONE) on areas (spurious 
or otherwise) for guard regions, and then do madvise(MADV_HUGEPAGE) over 
the non-guard regions.  The motivation is to prevent faulting memory due 
to thp that will later have its permissions changed to PROT_NONE.  Indeed, 
glibc's stack allocator faults the thread descriptor prior to setting 
permissions for the guard itself.  The motivation for MADV_NOHUGEPAGE is 
to specify that the range really doesn't want thp and the usecase is 
perfectly valid.  Never backing that memory with thp again upon 
MADV_HUGEPAGE seems like something that should be fixed in stable kernels 
since relying on refaulting the memory later is never guaranteed to 
happen.  Andrea's comment itself in the code is right on: the page fault 
may not happen anytime soon, which is the reason for calling into 
khugepaged_enter_vma_merge() in the first place; that comment could be 
extended to also say "the page fault may not happen anytime soon (or 
ever)".

I'd prefer that this is backported to stable unless Andrew objects or 
there's a compelling reason it doesn't meet the stable criteria (I 
couldn't find one).

> From 6434d87a313317bcbc98313794840c366ba39ba1 Mon Sep 17 00:00:00 2001
> From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> Date: Mon, 6 Oct 2014 17:52:17 +0300
> Subject: [PATCH] thp: fix registering VMA into khugepaged on 
>  madvise(MADV_HUGEPAGE)
> 
> hugepage_madvise() tries to register VMA into khugepaged with
> khugepaged_enter_vma_merge() on madvise(MADV_HUGEPAGE). Unfortunately
> it's effectevely nop, since khugepaged_enter_vma_merge() rely on
> vma->vm_flags which has not yet updated by the time of
> hugepage_madvise().
> 
> Let's create a variant of khugepaged_enter_vma_merge() which takes
> external vm_flags to consider. At the moment there's only two users for
> such function: hugepage_madvise() and vma_merge().
> 
> Reported-by: Suleiman Souhlal <suleiman@google.com>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>  include/linux/khugepaged.h |  9 +++++++++
>  mm/huge_memory.c           | 27 ++++++++++++++++++++-------
>  mm/mmap.c                  |  6 +++---
>  3 files changed, 32 insertions(+), 10 deletions(-)
> 
> diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
> index 6b394f0b5148..bb25e323ee2d 100644
> --- a/include/linux/khugepaged.h
> +++ b/include/linux/khugepaged.h
> @@ -7,6 +7,8 @@
>  extern int __khugepaged_enter(struct mm_struct *mm);
>  extern void __khugepaged_exit(struct mm_struct *mm);
>  extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma);
> +extern int __khugepaged_enter_vma_merge(struct vm_area_struct *vma,
> +		vm_flags_t vm_flags);
>  
>  #define khugepaged_enabled()					       \
>  	(transparent_hugepage_flags &				       \
> @@ -62,6 +64,13 @@ static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
>  {
>  	return 0;
>  }
> +
> +static inline int __khugepaged_enter_vma_merge(struct vm_area_struct *vma,
> +		vm_flags_t vm_flags)
> +{
> +	return 0;
> +}
> +
>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>  
>  #endif /* _LINUX_KHUGEPAGED_H */
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 017aff657ef5..4b0afc076827 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1944,7 +1944,7 @@ out:
>  #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
>  
>  int hugepage_madvise(struct vm_area_struct *vma,
> -		     unsigned long *vm_flags, int advice)
> +		     vm_flags_t *vm_flags, int advice)
>  {
>  	switch (advice) {
>  	case MADV_HUGEPAGE:
> @@ -1969,8 +1969,7 @@ int hugepage_madvise(struct vm_area_struct *vma,
>  		 * register it here without waiting a page fault that
>  		 * may not happen any time soon.
>  		 */
> -		if (unlikely(khugepaged_enter_vma_merge(vma)))
> -			return -ENOMEM;
> +		return __khugepaged_enter_vma_merge(vma, *vm_flags);
>  		break;
>  	case MADV_NOHUGEPAGE:
>  		/*
> @@ -2070,7 +2069,8 @@ int __khugepaged_enter(struct mm_struct *mm)
>  	return 0;
>  }
>  
> -int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
> +int __khugepaged_enter_vma_merge(struct vm_area_struct *vma,
> +		vm_flags_t vm_flags)
>  {
>  	unsigned long hstart, hend;
>  	if (!vma->anon_vma)
> @@ -2082,14 +2082,27 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
>  	if (vma->vm_ops)
>  		/* khugepaged not yet working on file or special mappings */
>  		return 0;
> -	VM_BUG_ON(vma->vm_flags & VM_NO_THP);
> +	VM_BUG_ON(vm_flags & VM_NO_THP);
>  	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
>  	hend = vma->vm_end & HPAGE_PMD_MASK;
> -	if (hstart < hend)
> -		return khugepaged_enter(vma);
> +	if (hstart >= hend)
> +		return 0;
> +	if (test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags))
> +		return 0;
> +	if (vm_flags & VM_NOHUGEPAGE)
> +		return 0;
> +	if (khugepaged_always())
> +		return __khugepaged_enter(vma->vm_mm);
> +	if (khugepaged_req_madv() && vm_flags & VM_HUGEPAGE)
> +		return __khugepaged_enter(vma->vm_mm);

I don't believe Linus would consider your patch to be "much nicer" than 
mine anymore with such checks added to khugepaged_enter_vma_merge().

I also don't understand your objection to passing in the effective 
vma->vm_flags to a function that relies upon vm_flags to decide whether it 
is eligible for thp or not.  If you'd prefer a comment in the code 
(incremental on my patch) then I'm sure the maintainer can judge if it's 
necessary.  But I don't think my solution, which ended up adding four 
lines of code, is more "complex" as you said in comparison to yours that 
adds 22 lines and obviously does checks in khugepaged_enter_vma_merge() 
that are completely unnecessary with my patch.

Anyway, I believe all has been said on this matter and the bike-shedding 
can come later if necessary.  I'd prefer my patch is merged with the 
s/hugepage_advise/hugepage_madvise/ fix for the changelog.

>  	return 0;
>  }
>  
> +int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
> +{
> +	return __khugepaged_enter_vma_merge(vma, vma->vm_flags);
> +}
> +
>  void __khugepaged_exit(struct mm_struct *mm)
>  {
>  	struct mm_slot *mm_slot;
> diff --git a/mm/mmap.c b/mm/mmap.c
> index c0a3637cdb64..19fee85f0b12 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1009,8 +1009,8 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
>   */
>  struct vm_area_struct *vma_merge(struct mm_struct *mm,
>  			struct vm_area_struct *prev, unsigned long addr,
> -			unsigned long end, unsigned long vm_flags,
> -		     	struct anon_vma *anon_vma, struct file *file,
> +			unsigned long end, vm_flags_t vm_flags,
> +			struct anon_vma *anon_vma, struct file *file,
>  			pgoff_t pgoff, struct mempolicy *policy)
>  {
>  	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
> @@ -1056,7 +1056,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
>  				end, prev->vm_pgoff, NULL);
>  		if (err)
>  			return NULL;
> -		khugepaged_enter_vma_merge(prev);
> +		__khugepaged_enter_vma_merge(prev, vm_flags);
>  		return prev;
>  	}
>  

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [patch resend] mm, thp: fix collapsing of hugepages on madvise
  2014-10-05  2:48 [patch for-3.17] mm, thp: fix collapsing of hugepages on madvise David Rientjes
                   ` (2 preceding siblings ...)
  2014-10-06 15:03 ` Kirill A. Shutemov
@ 2014-10-15 21:13 ` David Rientjes
  3 siblings, 0 replies; 8+ messages in thread
From: David Rientjes @ 2014-10-15 21:13 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, Kirill A. Shutemov, Andrea Arcangeli,
	Suleiman Souhlal, stable, linux-kernel, linux-mm

If an anonymous mapping is not allowed to fault thp memory and then
madvise(MADV_HUGEPAGE) is used after fault, khugepaged will never
collapse this memory into thp memory.

This occurs because the madvise(2) handler for thp, hugepage_madvise(),
clears VM_NOHUGEPAGE on the stack and it isn't stored in vma->vm_flags
until the final action of madvise_behavior().  This causes the
khugepaged_enter_vma_merge() to be a no-op in hugepage_madvise() when the
vma had previously had VM_NOHUGEPAGE set.

Fix this by passing the correct vma flags to the khugepaged mm slot
handler.  There's no chance khugepaged can run on this vma until after
madvise_behavior() returns since we hold mm->mmap_sem.

It would be possible to clear VM_NOHUGEPAGE directly from vma->vm_flags
in hugepage_madvise(), but I didn't want to introduce special case
behavior into madvise_behavior().  I think it's best to just let it
always set vma->vm_flags itself.

Cc: <stable@vger.kernel.org>
Reported-by: Suleiman Souhlal <suleiman@google.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/khugepaged.h | 17 ++++++++++-------
 mm/huge_memory.c           | 11 ++++++-----
 mm/mmap.c                  |  8 ++++----
 3 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -6,7 +6,8 @@
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern int __khugepaged_enter(struct mm_struct *mm);
 extern void __khugepaged_exit(struct mm_struct *mm);
-extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma);
+extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
+				      unsigned long vm_flags);
 
 #define khugepaged_enabled()					       \
 	(transparent_hugepage_flags &				       \
@@ -35,13 +36,13 @@ static inline void khugepaged_exit(struct mm_struct *mm)
 		__khugepaged_exit(mm);
 }
 
-static inline int khugepaged_enter(struct vm_area_struct *vma)
+static inline int khugepaged_enter(struct vm_area_struct *vma,
+				   unsigned long vm_flags)
 {
 	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags))
 		if ((khugepaged_always() ||
-		     (khugepaged_req_madv() &&
-		      vma->vm_flags & VM_HUGEPAGE)) &&
-		    !(vma->vm_flags & VM_NOHUGEPAGE))
+		     (khugepaged_req_madv() && (vm_flags & VM_HUGEPAGE))) &&
+		    !(vm_flags & VM_NOHUGEPAGE))
 			if (__khugepaged_enter(vma->vm_mm))
 				return -ENOMEM;
 	return 0;
@@ -54,11 +55,13 @@ static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 static inline void khugepaged_exit(struct mm_struct *mm)
 {
 }
-static inline int khugepaged_enter(struct vm_area_struct *vma)
+static inline int khugepaged_enter(struct vm_area_struct *vma,
+				   unsigned long vm_flags)
 {
 	return 0;
 }
-static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
+static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
+					     unsigned long vm_flags)
 {
 	return 0;
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -803,7 +803,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		return VM_FAULT_FALLBACK;
 	if (unlikely(anon_vma_prepare(vma)))
 		return VM_FAULT_OOM;
-	if (unlikely(khugepaged_enter(vma)))
+	if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
 		return VM_FAULT_OOM;
 	if (!(flags & FAULT_FLAG_WRITE) &&
 			transparent_hugepage_use_zero_page()) {
@@ -1970,7 +1970,7 @@ int hugepage_madvise(struct vm_area_struct *vma,
 		 * register it here without waiting a page fault that
 		 * may not happen any time soon.
 		 */
-		if (unlikely(khugepaged_enter_vma_merge(vma)))
+		if (unlikely(khugepaged_enter_vma_merge(vma, *vm_flags)))
 			return -ENOMEM;
 		break;
 	case MADV_NOHUGEPAGE:
@@ -2071,7 +2071,8 @@ int __khugepaged_enter(struct mm_struct *mm)
 	return 0;
 }
 
-int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
+int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
+			       unsigned long vm_flags)
 {
 	unsigned long hstart, hend;
 	if (!vma->anon_vma)
@@ -2083,11 +2084,11 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
 	if (vma->vm_ops)
 		/* khugepaged not yet working on file or special mappings */
 		return 0;
-	VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma);
+	VM_BUG_ON_VMA(vm_flags & VM_NO_THP, vma);
 	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
 	hend = vma->vm_end & HPAGE_PMD_MASK;
 	if (hstart < hend)
-		return khugepaged_enter(vma);
+		return khugepaged_enter(vma, vm_flags);
 	return 0;
 }
 
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1080,7 +1080,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 				end, prev->vm_pgoff, NULL);
 		if (err)
 			return NULL;
-		khugepaged_enter_vma_merge(prev);
+		khugepaged_enter_vma_merge(prev, vm_flags);
 		return prev;
 	}
 
@@ -1099,7 +1099,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 				next->vm_pgoff - pglen, NULL);
 		if (err)
 			return NULL;
-		khugepaged_enter_vma_merge(area);
+		khugepaged_enter_vma_merge(area, vm_flags);
 		return area;
 	}
 
@@ -2208,7 +2208,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 		}
 	}
 	vma_unlock_anon_vma(vma);
-	khugepaged_enter_vma_merge(vma);
+	khugepaged_enter_vma_merge(vma, vma->vm_flags);
 	validate_mm(vma->vm_mm);
 	return error;
 }
@@ -2277,7 +2277,7 @@ int expand_downwards(struct vm_area_struct *vma,
 		}
 	}
 	vma_unlock_anon_vma(vma);
-	khugepaged_enter_vma_merge(vma);
+	khugepaged_enter_vma_merge(vma, vma->vm_flags);
 	validate_mm(vma->vm_mm);
 	return error;
 }

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2014-10-15 21:13 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-10-05  2:48 [patch for-3.17] mm, thp: fix collapsing of hugepages on madvise David Rientjes
2014-10-05 17:15 ` Linus Torvalds
2014-10-05 18:41 ` Kirill A. Shutemov
2014-10-05 18:51   ` Linus Torvalds
2014-10-06  9:42   ` David Rientjes
2014-10-06 15:03 ` Kirill A. Shutemov
2014-10-06 20:53   ` David Rientjes
2014-10-15 21:13 ` [patch resend] " David Rientjes

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).