linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* Re: [PATCHv9-rebased2 01/37] mm, thp: make swapin readahead under down_read of mmap_sem
       [not found] <04f701d1c797$1ebe6b80$5c3b4280$@alibaba-inc.com>
@ 2016-06-16  6:52 ` Hillf Danton
  2016-06-16 10:08   ` Kirill A. Shutemov
  0 siblings, 1 reply; 7+ messages in thread
From: Hillf Danton @ 2016-06-16  6:52 UTC (permalink / raw)
  To: 'Ebru Akagunduz', Kirill A. Shutemov; +Cc: linux-kernel, linux-mm

> 
> From: Ebru Akagunduz <ebru.akagunduz@gmail.com>
> 
> Currently khugepaged makes swapin readahead under down_write.  This patch
> supplies to make swapin readahead under down_read instead of down_write.
> 
> The patch was tested with a test program that allocates 800MB of memory,
> writes to it, and then sleeps.  The system was forced to swap out all.
> Afterwards, the test program touches the area by writing, it skips a page
> in each 20 pages of the area.
> 
> Link: http://lkml.kernel.org/r/1464335964-6510-4-git-send-email-ebru.akagunduz@gmail.com
> Signed-off-by: Ebru Akagunduz <ebru.akagunduz@gmail.com>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: Rik van Riel <riel@redhat.com>
> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Cyrill Gorcunov <gorcunov@openvz.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: David Rientjes <rientjes@google.com>
> Cc: Vlastimil Babka <vbabka@suse.cz>
> Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Michal Hocko <mhocko@suse.cz>
> Cc: Minchan Kim <minchan.kim@gmail.com>
> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> ---
>  mm/huge_memory.c | 92 ++++++++++++++++++++++++++++++++++++++------------------
>  1 file changed, 63 insertions(+), 29 deletions(-)
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index f2bc57c45d2f..96dfe3f09bf6 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2378,6 +2378,35 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
>  }
> 
>  /*
> + * If mmap_sem temporarily dropped, revalidate vma
> + * before taking mmap_sem.

See below

> + * Return 0 if succeeds, otherwise return none-zero
> + * value (scan code).
> + */
> +
> +static int hugepage_vma_revalidate(struct mm_struct *mm,
> +				   struct vm_area_struct *vma,
> +				   unsigned long address)
> +{
> +	unsigned long hstart, hend;
> +
> +	if (unlikely(khugepaged_test_exit(mm)))
> +		return SCAN_ANY_PROCESS;
> +
> +	vma = find_vma(mm, address);
> +	if (!vma)
> +		return SCAN_VMA_NULL;
> +
> +	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
> +	hend = vma->vm_end & HPAGE_PMD_MASK;
> +	if (address < hstart || address + HPAGE_PMD_SIZE > hend)
> +		return SCAN_ADDRESS_RANGE;
> +	if (!hugepage_vma_check(vma))
> +		return SCAN_VMA_CHECK;
> +	return 0;
> +}
> +
> +/*
>   * Bring missing pages in from swap, to complete THP collapse.
>   * Only done if khugepaged_scan_pmd believes it is worthwhile.
>   *
> @@ -2385,7 +2414,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
>   * but with mmap_sem held to protect against vma changes.
>   */
> 
> -static void __collapse_huge_page_swapin(struct mm_struct *mm,
> +static bool __collapse_huge_page_swapin(struct mm_struct *mm,
>  					struct vm_area_struct *vma,
>  					unsigned long address, pmd_t *pmd)
>  {
> @@ -2401,11 +2430,18 @@ static void __collapse_huge_page_swapin(struct mm_struct *mm,
>  			continue;
>  		swapped_in++;
>  		ret = do_swap_page(mm, vma, _address, pte, pmd,
> -				   FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT,
> +				   FAULT_FLAG_ALLOW_RETRY,

Add a description in change log for it please.

>  				   pteval);
> +		/* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
> +		if (ret & VM_FAULT_RETRY) {
> +			down_read(&mm->mmap_sem);
> +			/* vma is no longer available, don't continue to swapin */
> +			if (hugepage_vma_revalidate(mm, vma, address))
> +				return false;

Revalidate vma _after_ acquiring mmap_sem, but the above comment says _before_.

> +		}
>  		if (ret & VM_FAULT_ERROR) {
>  			trace_mm_collapse_huge_page_swapin(mm, swapped_in, 0);
> -			return;
> +			return false;
>  		}
>  		/* pte is unmapped now, we need to map it */
>  		pte = pte_offset_map(pmd, _address);
> @@ -2413,6 +2449,7 @@ static void __collapse_huge_page_swapin(struct mm_struct *mm,
>  	pte--;
>  	pte_unmap(pte);
>  	trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1);
> +	return true;
>  }
> 
>  static void collapse_huge_page(struct mm_struct *mm,
> @@ -2427,7 +2464,6 @@ static void collapse_huge_page(struct mm_struct *mm,
>  	struct page *new_page;
>  	spinlock_t *pmd_ptl, *pte_ptl;
>  	int isolated = 0, result = 0;
> -	unsigned long hstart, hend;
>  	struct mem_cgroup *memcg;
>  	unsigned long mmun_start;	/* For mmu_notifiers */
>  	unsigned long mmun_end;		/* For mmu_notifiers */
> @@ -2450,39 +2486,37 @@ static void collapse_huge_page(struct mm_struct *mm,
>  		goto out_nolock;
>  	}
> 
> -	/*
> -	 * Prevent all access to pagetables with the exception of
> -	 * gup_fast later hanlded by the ptep_clear_flush and the VM
> -	 * handled by the anon_vma lock + PG_lock.
> -	 */
> -	down_write(&mm->mmap_sem);
> -	if (unlikely(khugepaged_test_exit(mm))) {
> -		result = SCAN_ANY_PROCESS;
> +	down_read(&mm->mmap_sem);
> +	result = hugepage_vma_revalidate(mm, vma, address);
> +	if (result)
>  		goto out;
> -	}
> 
> -	vma = find_vma(mm, address);
> -	if (!vma) {
> -		result = SCAN_VMA_NULL;
> -		goto out;
> -	}
> -	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
> -	hend = vma->vm_end & HPAGE_PMD_MASK;
> -	if (address < hstart || address + HPAGE_PMD_SIZE > hend) {
> -		result = SCAN_ADDRESS_RANGE;
> -		goto out;
> -	}
> -	if (!hugepage_vma_check(vma)) {
> -		result = SCAN_VMA_CHECK;
> -		goto out;
> -	}
>  	pmd = mm_find_pmd(mm, address);
>  	if (!pmd) {
>  		result = SCAN_PMD_NULL;
>  		goto out;
>  	}
> 
> -	__collapse_huge_page_swapin(mm, vma, address, pmd);
> +	/*
> +	 * __collapse_huge_page_swapin always returns with mmap_sem
> +	 * locked. If it fails, release mmap_sem and jump directly
> +	 * label out. Continuing to collapse causes inconsistency.
> +	 */
> +	if (!__collapse_huge_page_swapin(mm, vma, address, pmd)) {
> +		up_read(&mm->mmap_sem);
> +		goto out;

Jump out with mmap_sem released, 

> +	}
> +
> +	up_read(&mm->mmap_sem);
> +	/*
> +	 * Prevent all access to pagetables with the exception of
> +	 * gup_fast later handled by the ptep_clear_flush and the VM
> +	 * handled by the anon_vma lock + PG_lock.
> +	 */
> +	down_write(&mm->mmap_sem);
> +	result = hugepage_vma_revalidate(mm, vma, address);
> +	if (result)
> +		goto out;

but jump out again with mmap_sem held.

They are cleaned up in subsequent darns?

> 
>  	anon_vma_lock_write(vma->anon_vma);
> 
> --
> 2.8.1

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCHv9-rebased2 01/37] mm, thp: make swapin readahead under down_read of mmap_sem
  2016-06-16  6:52 ` [PATCHv9-rebased2 01/37] mm, thp: make swapin readahead under down_read of mmap_sem Hillf Danton
@ 2016-06-16 10:08   ` Kirill A. Shutemov
  2016-06-18 19:09     ` Ebru Akagunduz
  0 siblings, 1 reply; 7+ messages in thread
From: Kirill A. Shutemov @ 2016-06-16 10:08 UTC (permalink / raw)
  To: Hillf Danton
  Cc: 'Ebru Akagunduz', Kirill A. Shutemov, linux-kernel, linux-mm

On Thu, Jun 16, 2016 at 02:52:52PM +0800, Hillf Danton wrote:
> > 
> > From: Ebru Akagunduz <ebru.akagunduz@gmail.com>
> > 
> > Currently khugepaged makes swapin readahead under down_write.  This patch
> > supplies to make swapin readahead under down_read instead of down_write.
> > 
> > The patch was tested with a test program that allocates 800MB of memory,
> > writes to it, and then sleeps.  The system was forced to swap out all.
> > Afterwards, the test program touches the area by writing, it skips a page
> > in each 20 pages of the area.
> > 
> > Link: http://lkml.kernel.org/r/1464335964-6510-4-git-send-email-ebru.akagunduz@gmail.com
> > Signed-off-by: Ebru Akagunduz <ebru.akagunduz@gmail.com>
> > Cc: Hugh Dickins <hughd@google.com>
> > Cc: Rik van Riel <riel@redhat.com>
> > Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> > Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> > Cc: Andrea Arcangeli <aarcange@redhat.com>
> > Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> > Cc: Cyrill Gorcunov <gorcunov@openvz.org>
> > Cc: Mel Gorman <mgorman@suse.de>
> > Cc: David Rientjes <rientjes@google.com>
> > Cc: Vlastimil Babka <vbabka@suse.cz>
> > Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> > Cc: Johannes Weiner <hannes@cmpxchg.org>
> > Cc: Michal Hocko <mhocko@suse.cz>
> > Cc: Minchan Kim <minchan.kim@gmail.com>
> > Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> > ---
> >  mm/huge_memory.c | 92 ++++++++++++++++++++++++++++++++++++++------------------
> >  1 file changed, 63 insertions(+), 29 deletions(-)
> > 
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index f2bc57c45d2f..96dfe3f09bf6 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -2378,6 +2378,35 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
> >  }
> > 
> >  /*
> > + * If mmap_sem temporarily dropped, revalidate vma
> > + * before taking mmap_sem.
> 
> See below

> > @@ -2401,11 +2430,18 @@ static void __collapse_huge_page_swapin(struct mm_struct *mm,
> >  			continue;
> >  		swapped_in++;
> >  		ret = do_swap_page(mm, vma, _address, pte, pmd,
> > -				   FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT,
> > +				   FAULT_FLAG_ALLOW_RETRY,
> 
> Add a description in change log for it please.

Ebru, would you address it?

> >  				   pteval);
> > +		/* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
> > +		if (ret & VM_FAULT_RETRY) {
> > +			down_read(&mm->mmap_sem);
> > +			/* vma is no longer available, don't continue to swapin */
> > +			if (hugepage_vma_revalidate(mm, vma, address))
> > +				return false;
> 
> Revalidate vma _after_ acquiring mmap_sem, but the above comment says _before_.

Ditto.

> > +	if (!__collapse_huge_page_swapin(mm, vma, address, pmd)) {
> > +		up_read(&mm->mmap_sem);
> > +		goto out;
> 
> Jump out with mmap_sem released, 
> 
> > +	result = hugepage_vma_revalidate(mm, vma, address);
> > +	if (result)
> > +		goto out;
> 
> but jump out again with mmap_sem held.
> 
> They are cleaned up in subsequent darns?

I didn't fold fixups for these
> 

-- 
 Kirill A. Shutemov

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCHv9-rebased2 01/37] mm, thp: make swapin readahead under down_read of mmap_sem
  2016-06-16 10:08   ` Kirill A. Shutemov
@ 2016-06-18 19:09     ` Ebru Akagunduz
  2016-06-20  2:51       ` Hillf Danton
  2016-06-20 11:15       ` Michal Hocko
  0 siblings, 2 replies; 7+ messages in thread
From: Ebru Akagunduz @ 2016-06-18 19:09 UTC (permalink / raw)
  To: Kirill A. Shutemov, Hillf Danton; +Cc: linux-kernel, linux-mm

On Thu, Jun 16, 2016 at 01:08:54PM +0300, Kirill A. Shutemov wrote:
> On Thu, Jun 16, 2016 at 02:52:52PM +0800, Hillf Danton wrote:
> > > 
> > > From: Ebru Akagunduz <ebru.akagunduz@gmail.com>
> > > 
> > > Currently khugepaged makes swapin readahead under down_write.  This patch
> > > supplies to make swapin readahead under down_read instead of down_write.
> > > 
> > > The patch was tested with a test program that allocates 800MB of memory,
> > > writes to it, and then sleeps.  The system was forced to swap out all.
> > > Afterwards, the test program touches the area by writing, it skips a page
> > > in each 20 pages of the area.
> > > 
> > > Link: http://lkml.kernel.org/r/1464335964-6510-4-git-send-email-ebru.akagunduz@gmail.com
> > > Signed-off-by: Ebru Akagunduz <ebru.akagunduz@gmail.com>
> > > Cc: Hugh Dickins <hughd@google.com>
> > > Cc: Rik van Riel <riel@redhat.com>
> > > Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> > > Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
> > > Cc: Andrea Arcangeli <aarcange@redhat.com>
> > > Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> > > Cc: Cyrill Gorcunov <gorcunov@openvz.org>
> > > Cc: Mel Gorman <mgorman@suse.de>
> > > Cc: David Rientjes <rientjes@google.com>
> > > Cc: Vlastimil Babka <vbabka@suse.cz>
> > > Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> > > Cc: Johannes Weiner <hannes@cmpxchg.org>
> > > Cc: Michal Hocko <mhocko@suse.cz>
> > > Cc: Minchan Kim <minchan.kim@gmail.com>
> > > Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> > > ---
> > >  mm/huge_memory.c | 92 ++++++++++++++++++++++++++++++++++++++------------------
> > >  1 file changed, 63 insertions(+), 29 deletions(-)
> > > 
> > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > > index f2bc57c45d2f..96dfe3f09bf6 100644
> > > --- a/mm/huge_memory.c
> > > +++ b/mm/huge_memory.c
> > > @@ -2378,6 +2378,35 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
> > >  }
> > > 
> > >  /*
> > > + * If mmap_sem temporarily dropped, revalidate vma
> > > + * before taking mmap_sem.
> > 
> > See below
> 
> > > @@ -2401,11 +2430,18 @@ static void __collapse_huge_page_swapin(struct mm_struct *mm,
> > >  			continue;
> > >  		swapped_in++;
> > >  		ret = do_swap_page(mm, vma, _address, pte, pmd,
> > > -				   FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT,
> > > +				   FAULT_FLAG_ALLOW_RETRY,
> > 
> > Add a description in change log for it please.
> 
> Ebru, would you address it?
> 
This changelog really seems poor.
Is there a way to update only changelog of the commit?
I tried to use git rebase to amend commit, however
I could not rebase. This patch only needs better changelog.

I would like to update it as follows, if you would like to too:

"
Currently khugepaged makes swapin readahead under down_write.  This patch
supplies to make swapin readahead under down_read instead of down_write.

Along swapin, we can need to drop and re-take mmap_sem. Therefore we
have to be sure vma is consistent. This patch adds a helper function
to validate vma and also supplies that async swapin should not be
performed without waiting.

The patch was tested with a test program that allocates 800MB of memory,
writes to it, and then sleeps.  The system was forced to swap out all.
Afterwards, the test program touches the area by writing, it skips a page
in each 20 pages of the area.
"

Could you please suggest me a way to replace above changelog with the old?

> > >  				   pteval);
> > > +		/* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
> > > +		if (ret & VM_FAULT_RETRY) {
> > > +			down_read(&mm->mmap_sem);
> > > +			/* vma is no longer available, don't continue to swapin */
> > > +			if (hugepage_vma_revalidate(mm, vma, address))
> > > +				return false;
> > 
> > Revalidate vma _after_ acquiring mmap_sem, but the above comment says _before_.
> 
> Ditto.
> 
> > > +	if (!__collapse_huge_page_swapin(mm, vma, address, pmd)) {
> > > +		up_read(&mm->mmap_sem);
> > > +		goto out;
> > 
> > Jump out with mmap_sem released, 
> > 
> > > +	result = hugepage_vma_revalidate(mm, vma, address);
> > > +	if (result)
> > > +		goto out;
> > 
> > but jump out again with mmap_sem held.
> > 
> > They are cleaned up in subsequent darns?
> 
Yes, that is reported and fixed here:
http://git.kernel.org/cgit/linux/kernel/git/next/linux-next.git/commit/?id=fc7038a69cee6b817261f7cd805e9663fdc1075c

However, the above comment inconsistency still there.
I've added a fix patch:

>From 404438ff1b0617cbf7434cba0c5a08f79ccb8a5d Mon Sep 17 00:00:00 2001
From: Ebru Akagunduz <ebru.akagunduz@gmail.com>
Date: Sat, 18 Jun 2016 21:07:22 +0300
Subject: [PATCH] mm, thp: fix comment inconsistency for swapin readahead
 functions

Signed-off-by: Ebru Akagunduz <ebru.akagunduz@gmail.com>
---
 mm/huge_memory.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index acd374e..f0d528e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2436,9 +2436,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 		/* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
 		if (ret & VM_FAULT_RETRY) {
 			down_read(&mm->mmap_sem);
-			/* vma is no longer available, don't continue to swapin */
-			if (hugepage_vma_revalidate(mm, address))
+			if (hugepage_vma_revalidate(mm, address)) {
+				/* vma is no longer available, don't continue to swapin */
 				return false;
+			}
 		}
 		if (ret & VM_FAULT_ERROR) {
 			trace_mm_collapse_huge_page_swapin(mm, swapped_in, 0);
@@ -2513,8 +2514,8 @@ static void collapse_huge_page(struct mm_struct *mm,
 	if (allocstall == curr_allocstall && swap != 0) {
 		/*
 		 * __collapse_huge_page_swapin always returns with mmap_sem
-		 * locked.  If it fails, release mmap_sem and jump directly
-		 * out.  Continuing to collapse causes inconsistency.
+		 * locked. If it fails, we release mmap_sem and jump out_nolock.
+		 * Continuing to collapse causes inconsistency.
 		 */
 		if (!__collapse_huge_page_swapin(mm, vma, address, pmd)) {
 			mem_cgroup_cancel_charge(new_page, memcg, true);
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCHv9-rebased2 01/37] mm, thp: make swapin readahead under down_read of mmap_sem
  2016-06-18 19:09     ` Ebru Akagunduz
@ 2016-06-20  2:51       ` Hillf Danton
  2016-06-22 11:24         ` Ebru Akagunduz
  2016-06-20 11:15       ` Michal Hocko
  1 sibling, 1 reply; 7+ messages in thread
From: Hillf Danton @ 2016-06-20  2:51 UTC (permalink / raw)
  To: 'Ebru Akagunduz', 'Kirill A. Shutemov'
  Cc: linux-kernel, linux-mm, Andrew Morton

> > > > @@ -2401,11 +2430,18 @@ static void __collapse_huge_page_swapin(struct mm_struct *mm,
> > > >  			continue;
> > > >  		swapped_in++;
> > > >  		ret = do_swap_page(mm, vma, _address, pte, pmd,
> > > > -				   FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT,
> > > > +				   FAULT_FLAG_ALLOW_RETRY,
> > >
> > > Add a description in change log for it please.
> >
> > Ebru, would you address it?
> >
> This changelog really seems poor.
> Is there a way to update only changelog of the commit?
> I tried to use git rebase to amend commit, however
> I could not rebase. This patch only needs better changelog.
> 
> I would like to update it as follows, if you would like to too:
> 
> "
> Currently khugepaged makes swapin readahead under down_write.  This patch
> supplies to make swapin readahead under down_read instead of down_write.
> 
> Along swapin, we can need to drop and re-take mmap_sem. Therefore we
> have to be sure vma is consistent. This patch adds a helper function
> to validate vma and also supplies that async swapin should not be
> performed without waiting.
> 
> The patch was tested with a test program that allocates 800MB of memory,
> writes to it, and then sleeps.  The system was forced to swap out all.
> Afterwards, the test program touches the area by writing, it skips a page
> in each 20 pages of the area.
> "
> 
I like to ask again, why is FAULT_FLAG_RETRY_NOWAIT dropped?

> Could you please suggest me a way to replace above changelog with the old?
> 
We can ask Andrew for some advices.

> > >
> > > They are cleaned up in subsequent darns?
> >
> Yes, that is reported and fixed here:
> http://git.kernel.org/cgit/linux/kernel/git/next/linux-next.git/commit/?id=fc7038a69cee6b817261f7cd805e9663fdc1075c
> 
> However, the above comment inconsistency still there.
> I've added a fix patch:
> 
> From 404438ff1b0617cbf7434cba0c5a08f79ccb8a5d Mon Sep 17 00:00:00 2001
> From: Ebru Akagunduz <ebru.akagunduz@gmail.com>
> Date: Sat, 18 Jun 2016 21:07:22 +0300
> Subject: [PATCH] mm, thp: fix comment inconsistency for swapin readahead
>  functions
>
Fill in change log please.

thanks
Hillf
 
> Signed-off-by: Ebru Akagunduz <ebru.akagunduz@gmail.com>
> ---
>  mm/huge_memory.c | 9 +++++----
>  1 file changed, 5 insertions(+), 4 deletions(-)
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index acd374e..f0d528e 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2436,9 +2436,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
>  		/* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
>  		if (ret & VM_FAULT_RETRY) {
>  			down_read(&mm->mmap_sem);
> -			/* vma is no longer available, don't continue to swapin */
> -			if (hugepage_vma_revalidate(mm, address))
> +			if (hugepage_vma_revalidate(mm, address)) {
> +				/* vma is no longer available, don't continue to swapin */
>  				return false;
> +			}
>  		}
>  		if (ret & VM_FAULT_ERROR) {
>  			trace_mm_collapse_huge_page_swapin(mm, swapped_in, 0);
> @@ -2513,8 +2514,8 @@ static void collapse_huge_page(struct mm_struct *mm,
>  	if (allocstall == curr_allocstall && swap != 0) {
>  		/*
>  		 * __collapse_huge_page_swapin always returns with mmap_sem
> -		 * locked.  If it fails, release mmap_sem and jump directly
> -		 * out.  Continuing to collapse causes inconsistency.
> +		 * locked. If it fails, we release mmap_sem and jump out_nolock.
> +		 * Continuing to collapse causes inconsistency.
>  		 */
>  		if (!__collapse_huge_page_swapin(mm, vma, address, pmd)) {
>  			mem_cgroup_cancel_charge(new_page, memcg, true);
> --
> 1.9.1

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCHv9-rebased2 01/37] mm, thp: make swapin readahead under down_read of mmap_sem
  2016-06-18 19:09     ` Ebru Akagunduz
  2016-06-20  2:51       ` Hillf Danton
@ 2016-06-20 11:15       ` Michal Hocko
  1 sibling, 0 replies; 7+ messages in thread
From: Michal Hocko @ 2016-06-20 11:15 UTC (permalink / raw)
  To: Ebru Akagunduz; +Cc: Kirill A. Shutemov, Hillf Danton, linux-kernel, linux-mm

On Sat 18-06-16 22:09:51, Ebru Akagunduz wrote:
[...]
> This changelog really seems poor.
> Is there a way to update only changelog of the commit?

git commit --amend would do that for the current commit. You can also
tell git rebase -i to 'reword' a particular commits.
> Could you please suggest me a way to replace above changelog with the old?

Just tell Andrew, he can replace the changelog in his tree.

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCHv9-rebased2 01/37] mm, thp: make swapin readahead under down_read of mmap_sem
  2016-06-20  2:51       ` Hillf Danton
@ 2016-06-22 11:24         ` Ebru Akagunduz
  0 siblings, 0 replies; 7+ messages in thread
From: Ebru Akagunduz @ 2016-06-22 11:24 UTC (permalink / raw)
  To: Hillf Danton, Kirill A. Shutemov; +Cc: hughd, linux-kernel, linux-mm, akpm

On Mon, Jun 20, 2016 at 10:51:25AM +0800, Hillf Danton wrote:
> > > > > @@ -2401,11 +2430,18 @@ static void __collapse_huge_page_swapin(struct mm_struct *mm,
> > > > >  			continue;
> > > > >  		swapped_in++;
> > > > >  		ret = do_swap_page(mm, vma, _address, pte, pmd,
> > > > > -				   FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT,
> > > > > +				   FAULT_FLAG_ALLOW_RETRY,
> > > >
> > > > Add a description in change log for it please.
> > >
> > > Ebru, would you address it?
> > >
> > This changelog really seems poor.
> > Is there a way to update only changelog of the commit?
> > I tried to use git rebase to amend commit, however
> > I could not rebase. This patch only needs better changelog.
> > 
> > I would like to update it as follows, if you would like to too:
> > 
> > "
> > Currently khugepaged makes swapin readahead under down_write.  This patch
> > supplies to make swapin readahead under down_read instead of down_write.
> > 
> > Along swapin, we can need to drop and re-take mmap_sem. Therefore we
> > have to be sure vma is consistent. This patch adds a helper function
> > to validate vma and also supplies that async swapin should not be
> > performed without waiting.
> > 
> > The patch was tested with a test program that allocates 800MB of memory,
> > writes to it, and then sleeps.  The system was forced to swap out all.
> > Afterwards, the test program touches the area by writing, it skips a page
> > in each 20 pages of the area.
> > "
> > 
> I like to ask again, why is FAULT_FLAG_RETRY_NOWAIT dropped?
> 
I dropped it regarding to Hugh's concern. If I understood correctly,
he said async swapin without waiting can cause waste. When I looked the
code path, it seemed subtle to me. There was a large discussion,
below is Hugh's concern:

"
Doesn't this imply that __collapse_huge_page_swapin() will initiate all
the necessary swapins for a THP, then (given the FAULT_FLAG_ALLOW_RETRY)
not wait for them to complete, so khugepaged will give up on that extent
and move on to another; then after another full circuit of all the mms
it needs to examine, it will arrive back at this extent and build a THP
from the swapins it arranged last time.

Which may work well when a system transitions from busy+swappingout
to idle+swappingin, but isn't that rather a special case?  It feels
(meaning, I've not measured at all) as if the inbetween busyish case
will waste a lot of I/O and memory on swapins that have to be discarded
again before khugepaged has made its sedate way back to slotting them in.
"

Cc'ed Hugh. Maybe I misunderstood him.

> > Could you please suggest me a way to replace above changelog with the old?
> > 
> We can ask Andrew for some advices.
> 
> > > >
> > > > They are cleaned up in subsequent darns?
> > >
> > Yes, that is reported and fixed here:
> > http://git.kernel.org/cgit/linux/kernel/git/next/linux-next.git/commit/?id=fc7038a69cee6b817261f7cd805e9663fdc1075c
> > 
> > However, the above comment inconsistency still there.
> > I've added a fix patch:
> > 
> > From 404438ff1b0617cbf7434cba0c5a08f79ccb8a5d Mon Sep 17 00:00:00 2001
> > From: Ebru Akagunduz <ebru.akagunduz@gmail.com>
> > Date: Sat, 18 Jun 2016 21:07:22 +0300
> > Subject: [PATCH] mm, thp: fix comment inconsistency for swapin readahead
> >  functions
> >
> Fill in change log please.
> 
I filled it and sent as part of a series.
Cc'ed you in that patch.
> thanks
> Hillf
>  
> > Signed-off-by: Ebru Akagunduz <ebru.akagunduz@gmail.com>
> > ---
> >  mm/huge_memory.c | 9 +++++----
> >  1 file changed, 5 insertions(+), 4 deletions(-)
> > 
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index acd374e..f0d528e 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -2436,9 +2436,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
> >  		/* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
> >  		if (ret & VM_FAULT_RETRY) {
> >  			down_read(&mm->mmap_sem);
> > -			/* vma is no longer available, don't continue to swapin */
> > -			if (hugepage_vma_revalidate(mm, address))
> > +			if (hugepage_vma_revalidate(mm, address)) {
> > +				/* vma is no longer available, don't continue to swapin */
> >  				return false;
> > +			}
> >  		}
> >  		if (ret & VM_FAULT_ERROR) {
> >  			trace_mm_collapse_huge_page_swapin(mm, swapped_in, 0);
> > @@ -2513,8 +2514,8 @@ static void collapse_huge_page(struct mm_struct *mm,
> >  	if (allocstall == curr_allocstall && swap != 0) {
> >  		/*
> >  		 * __collapse_huge_page_swapin always returns with mmap_sem
> > -		 * locked.  If it fails, release mmap_sem and jump directly
> > -		 * out.  Continuing to collapse causes inconsistency.
> > +		 * locked. If it fails, we release mmap_sem and jump out_nolock.
> > +		 * Continuing to collapse causes inconsistency.
> >  		 */
> >  		if (!__collapse_huge_page_swapin(mm, vma, address, pmd)) {
> >  			mem_cgroup_cancel_charge(new_page, memcg, true);
> > --
> > 1.9.1
> 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCHv9-rebased2 01/37] mm, thp: make swapin readahead under down_read of mmap_sem
  2016-06-15 20:06 ` [PATCHv9-rebased2 00/37] " Kirill A. Shutemov
@ 2016-06-15 20:06   ` Kirill A. Shutemov
  0 siblings, 0 replies; 7+ messages in thread
From: Kirill A. Shutemov @ 2016-06-15 20:06 UTC (permalink / raw)
  To: Hugh Dickins, Andrea Arcangeli, Andrew Morton
  Cc: Dave Hansen, Vlastimil Babka, Christoph Lameter, Naoya Horiguchi,
	Jerome Marchand, Yang Shi, Sasha Levin, Andres Lagar-Cavilla,
	Ning Qu, linux-kernel, linux-mm, linux-fsdevel, Ebru Akagunduz,
	Rik van Riel, Kirill A. Shutemov, Joonsoo Kim, Cyrill Gorcunov,
	Mel Gorman, David Rientjes, Aneesh Kumar K . V, Johannes Weiner,
	Michal Hocko, Minchan Kim

From: Ebru Akagunduz <ebru.akagunduz@gmail.com>

Currently khugepaged makes swapin readahead under down_write.  This patch
supplies to make swapin readahead under down_read instead of down_write.

The patch was tested with a test program that allocates 800MB of memory,
writes to it, and then sleeps.  The system was forced to swap out all.
Afterwards, the test program touches the area by writing, it skips a page
in each 20 pages of the area.

Link: http://lkml.kernel.org/r/1464335964-6510-4-git-send-email-ebru.akagunduz@gmail.com
Signed-off-by: Ebru Akagunduz <ebru.akagunduz@gmail.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: David Rientjes <rientjes@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/huge_memory.c | 92 ++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 63 insertions(+), 29 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f2bc57c45d2f..96dfe3f09bf6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2378,6 +2378,35 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
 }
 
 /*
+ * If mmap_sem temporarily dropped, revalidate vma
+ * before taking mmap_sem.
+ * Return 0 if succeeds, otherwise return none-zero
+ * value (scan code).
+ */
+
+static int hugepage_vma_revalidate(struct mm_struct *mm,
+				   struct vm_area_struct *vma,
+				   unsigned long address)
+{
+	unsigned long hstart, hend;
+
+	if (unlikely(khugepaged_test_exit(mm)))
+		return SCAN_ANY_PROCESS;
+
+	vma = find_vma(mm, address);
+	if (!vma)
+		return SCAN_VMA_NULL;
+
+	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+	hend = vma->vm_end & HPAGE_PMD_MASK;
+	if (address < hstart || address + HPAGE_PMD_SIZE > hend)
+		return SCAN_ADDRESS_RANGE;
+	if (!hugepage_vma_check(vma))
+		return SCAN_VMA_CHECK;
+	return 0;
+}
+
+/*
  * Bring missing pages in from swap, to complete THP collapse.
  * Only done if khugepaged_scan_pmd believes it is worthwhile.
  *
@@ -2385,7 +2414,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
  * but with mmap_sem held to protect against vma changes.
  */
 
-static void __collapse_huge_page_swapin(struct mm_struct *mm,
+static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 					struct vm_area_struct *vma,
 					unsigned long address, pmd_t *pmd)
 {
@@ -2401,11 +2430,18 @@ static void __collapse_huge_page_swapin(struct mm_struct *mm,
 			continue;
 		swapped_in++;
 		ret = do_swap_page(mm, vma, _address, pte, pmd,
-				   FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT,
+				   FAULT_FLAG_ALLOW_RETRY,
 				   pteval);
+		/* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
+		if (ret & VM_FAULT_RETRY) {
+			down_read(&mm->mmap_sem);
+			/* vma is no longer available, don't continue to swapin */
+			if (hugepage_vma_revalidate(mm, vma, address))
+				return false;
+		}
 		if (ret & VM_FAULT_ERROR) {
 			trace_mm_collapse_huge_page_swapin(mm, swapped_in, 0);
-			return;
+			return false;
 		}
 		/* pte is unmapped now, we need to map it */
 		pte = pte_offset_map(pmd, _address);
@@ -2413,6 +2449,7 @@ static void __collapse_huge_page_swapin(struct mm_struct *mm,
 	pte--;
 	pte_unmap(pte);
 	trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1);
+	return true;
 }
 
 static void collapse_huge_page(struct mm_struct *mm,
@@ -2427,7 +2464,6 @@ static void collapse_huge_page(struct mm_struct *mm,
 	struct page *new_page;
 	spinlock_t *pmd_ptl, *pte_ptl;
 	int isolated = 0, result = 0;
-	unsigned long hstart, hend;
 	struct mem_cgroup *memcg;
 	unsigned long mmun_start;	/* For mmu_notifiers */
 	unsigned long mmun_end;		/* For mmu_notifiers */
@@ -2450,39 +2486,37 @@ static void collapse_huge_page(struct mm_struct *mm,
 		goto out_nolock;
 	}
 
-	/*
-	 * Prevent all access to pagetables with the exception of
-	 * gup_fast later hanlded by the ptep_clear_flush and the VM
-	 * handled by the anon_vma lock + PG_lock.
-	 */
-	down_write(&mm->mmap_sem);
-	if (unlikely(khugepaged_test_exit(mm))) {
-		result = SCAN_ANY_PROCESS;
+	down_read(&mm->mmap_sem);
+	result = hugepage_vma_revalidate(mm, vma, address);
+	if (result)
 		goto out;
-	}
 
-	vma = find_vma(mm, address);
-	if (!vma) {
-		result = SCAN_VMA_NULL;
-		goto out;
-	}
-	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
-	hend = vma->vm_end & HPAGE_PMD_MASK;
-	if (address < hstart || address + HPAGE_PMD_SIZE > hend) {
-		result = SCAN_ADDRESS_RANGE;
-		goto out;
-	}
-	if (!hugepage_vma_check(vma)) {
-		result = SCAN_VMA_CHECK;
-		goto out;
-	}
 	pmd = mm_find_pmd(mm, address);
 	if (!pmd) {
 		result = SCAN_PMD_NULL;
 		goto out;
 	}
 
-	__collapse_huge_page_swapin(mm, vma, address, pmd);
+	/*
+	 * __collapse_huge_page_swapin always returns with mmap_sem
+	 * locked. If it fails, release mmap_sem and jump directly
+	 * label out. Continuing to collapse causes inconsistency.
+	 */
+	if (!__collapse_huge_page_swapin(mm, vma, address, pmd)) {
+		up_read(&mm->mmap_sem);
+		goto out;
+	}
+
+	up_read(&mm->mmap_sem);
+	/*
+	 * Prevent all access to pagetables with the exception of
+	 * gup_fast later handled by the ptep_clear_flush and the VM
+	 * handled by the anon_vma lock + PG_lock.
+	 */
+	down_write(&mm->mmap_sem);
+	result = hugepage_vma_revalidate(mm, vma, address);
+	if (result)
+		goto out;
 
 	anon_vma_lock_write(vma->anon_vma);
 
-- 
2.8.1

^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2016-06-22 11:26 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <04f701d1c797$1ebe6b80$5c3b4280$@alibaba-inc.com>
2016-06-16  6:52 ` [PATCHv9-rebased2 01/37] mm, thp: make swapin readahead under down_read of mmap_sem Hillf Danton
2016-06-16 10:08   ` Kirill A. Shutemov
2016-06-18 19:09     ` Ebru Akagunduz
2016-06-20  2:51       ` Hillf Danton
2016-06-22 11:24         ` Ebru Akagunduz
2016-06-20 11:15       ` Michal Hocko
2016-06-06 14:06 [PATCHv9 00/32] THP-enabled tmpfs/shmem using compound pages Kirill A. Shutemov
2016-06-15 20:06 ` [PATCHv9-rebased2 00/37] " Kirill A. Shutemov
2016-06-15 20:06   ` [PATCHv9-rebased2 01/37] mm, thp: make swapin readahead under down_read of mmap_sem Kirill A. Shutemov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).