From: Hugh Dickins <hughd@google.com>
To: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Cc: linux-mm@kvack.org, akpm@linux-foundation.org, mpe@ellerman.id.au,
	linuxppc-dev@lists.ozlabs.org, kaleshsingh@google.com,
	npiggin@gmail.com, joel@joelfernandes.org,
	Christophe Leroy <christophe.leroy@csgroup.eu>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	"Kirill A . Shutemov" <kirill@shutemov.name>
Subject: Re: [PATCH v7 01/11] mm/mremap: Fix race between MOVE_PMD mremap and pageout
Date: Mon, 7 Jun 2021 17:06:28 -0700 (PDT)	[thread overview]
Message-ID: <f789af6-8924-3b83-6f82-c662175af126@google.com> (raw)
In-Reply-To: <20210607055131.156184-2-aneesh.kumar@linux.ibm.com>

On Mon, 7 Jun 2021, Aneesh Kumar K.V wrote:

> CPU 1				CPU 2					CPU 3
> 
> mremap(old_addr, new_addr)      page_shrinker/try_to_unmap_one
> 
> mmap_write_lock_killable()
> 
> 				addr = old_addr
> 				lock(pte_ptl)
> lock(pmd_ptl)
> pmd = *old_pmd
> pmd_clear(old_pmd)
> flush_tlb_range(old_addr)
> 
> *new_pmd = pmd
> 									*new_addr = 10; and fills
> 									TLB with new addr
> 									and old pfn
> 
> unlock(pmd_ptl)
> 				ptep_clear_flush()
> 				old pfn is free.
> 									Stale TLB entry
> 
> Fix this race by holding pmd lock in pageout. This still doesn't handle the race
> between MOVE_PUD and pageout.
> 
> Fixes: 2c91bd4a4e2e ("mm: speed up mremap by 20x on large regions")
> Link: https://lore.kernel.org/linux-mm/CAHk-=wgXVR04eBNtxQfevontWnP6FDm+oj5vauQXP3S-huwbPw@mail.gmail.com
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>

This seems very wrong to me, to require another level of locking in the
rmap lookup, just to fix some new pagetable games in mremap.

But Linus asked "Am I missing something?": neither of you has mentioned
mremap's take_rmap_locks(), so I hope that already meets your need.  And
if it needs to be called more often than before (see "need_rmap_locks"),
that's probably okay.

Hugh

> ---
>  include/linux/rmap.h |  9 ++++++---
>  mm/page_vma_mapped.c | 36 ++++++++++++++++++------------------
>  2 files changed, 24 insertions(+), 21 deletions(-)
> 
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index def5c62c93b3..272ab0c2b60b 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -207,7 +207,8 @@ struct page_vma_mapped_walk {
>  	unsigned long address;
>  	pmd_t *pmd;
>  	pte_t *pte;
> -	spinlock_t *ptl;
> +	spinlock_t *pte_ptl;
> +	spinlock_t *pmd_ptl;
>  	unsigned int flags;
>  };
>  
> @@ -216,8 +217,10 @@ static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
>  	/* HugeTLB pte is set to the relevant page table entry without pte_mapped. */
>  	if (pvmw->pte && !PageHuge(pvmw->page))
>  		pte_unmap(pvmw->pte);
> -	if (pvmw->ptl)
> -		spin_unlock(pvmw->ptl);
> +	if (pvmw->pte_ptl)
> +		spin_unlock(pvmw->pte_ptl);
> +	if (pvmw->pmd_ptl)
> +		spin_unlock(pvmw->pmd_ptl);
>  }
>  
>  bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> index 2cf01d933f13..87a2c94c7e27 100644
> --- a/mm/page_vma_mapped.c
> +++ b/mm/page_vma_mapped.c
> @@ -47,8 +47,10 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
>  				return false;
>  		}
>  	}
> -	pvmw->ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd);
> -	spin_lock(pvmw->ptl);
> +	if (USE_SPLIT_PTE_PTLOCKS) {
> +		pvmw->pte_ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd);
> +		spin_lock(pvmw->pte_ptl);
> +	}
>  	return true;
>  }
>  
> @@ -162,8 +164,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>  		if (!pvmw->pte)
>  			return false;
>  
> -		pvmw->ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte);
> -		spin_lock(pvmw->ptl);
> +		pvmw->pte_ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte);
> +		spin_lock(pvmw->pte_ptl);
>  		if (!check_pte(pvmw))
>  			return not_found(pvmw);
>  		return true;
> @@ -179,6 +181,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>  	if (!pud_present(*pud))
>  		return false;
>  	pvmw->pmd = pmd_offset(pud, pvmw->address);
> +	pvmw->pmd_ptl = pmd_lock(mm, pvmw->pmd);
>  	/*
>  	 * Make sure the pmd value isn't cached in a register by the
>  	 * compiler and used as a stale value after we've observed a
> @@ -186,7 +189,6 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>  	 */
>  	pmde = READ_ONCE(*pvmw->pmd);
>  	if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
> -		pvmw->ptl = pmd_lock(mm, pvmw->pmd);
>  		if (likely(pmd_trans_huge(*pvmw->pmd))) {
>  			if (pvmw->flags & PVMW_MIGRATION)
>  				return not_found(pvmw);
> @@ -206,14 +208,10 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>  				}
>  			}
>  			return not_found(pvmw);
> -		} else {
> -			/* THP pmd was split under us: handle on pte level */
> -			spin_unlock(pvmw->ptl);
> -			pvmw->ptl = NULL;
>  		}
> -	} else if (!pmd_present(pmde)) {
> -		return false;
> -	}
> +	} else if (!pmd_present(pmde))
> +		return not_found(pvmw);
> +
>  	if (!map_pte(pvmw))
>  		goto next_pte;
>  	while (1) {
> @@ -233,19 +231,21 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>  			/* Did we cross page table boundary? */
>  			if (pvmw->address % PMD_SIZE == 0) {
>  				pte_unmap(pvmw->pte);
> -				if (pvmw->ptl) {
> -					spin_unlock(pvmw->ptl);
> -					pvmw->ptl = NULL;
> +				if (pvmw->pte_ptl) {
> +					spin_unlock(pvmw->pte_ptl);
> +					pvmw->pte_ptl = NULL;
>  				}
> +				spin_unlock(pvmw->pmd_ptl);
> +				pvmw->pmd_ptl = NULL;
>  				goto restart;
>  			} else {
>  				pvmw->pte++;
>  			}
>  		} while (pte_none(*pvmw->pte));
>  
> -		if (!pvmw->ptl) {
> -			pvmw->ptl = pte_lockptr(mm, pvmw->pmd);
> -			spin_lock(pvmw->ptl);
> +		if (USE_SPLIT_PTE_PTLOCKS && !pvmw->pte_ptl) {
> +			pvmw->pte_ptl = pte_lockptr(mm, pvmw->pmd);
> +			spin_lock(pvmw->pte_ptl);
>  		}
>  	}
>  }
> -- 
> 2.31.1


Thread overview: 59+ messages

2021-06-07  5:51 [PATCH v7 00/11] Speedup mremap on ppc64 Aneesh Kumar K.V
2021-06-07  5:51 ` [PATCH v7 01/11] mm/mremap: Fix race between MOVE_PMD mremap and pageout Aneesh Kumar K.V
2021-06-08  0:06   ` Hugh Dickins [this message]
2021-06-08  7:52     ` Aneesh Kumar K.V
2021-06-08  9:42       ` Kirill A. Shutemov
2021-06-08 11:17         ` Aneesh Kumar K.V
2021-06-08 12:05           ` Kirill A. Shutemov
2021-06-08 20:39       ` Hugh Dickins
2021-06-07  5:51 ` [PATCH v7 02/11] mm/mremap: Fix race between MOVE_PUD mremap and pageout Aneesh Kumar K.V
2021-06-14 14:55   ` [mm/mremap] ecf8443e51: vm-scalability.throughput -29.4% regression kernel test robot
2021-06-14 14:58     ` Linus Torvalds
2021-06-14 16:08     ` Aneesh Kumar K.V
2021-06-17  2:38       ` [LKP] [mm/mremap] ecf8443e51: vm-scalability.throughput -29.4% regression Liu, Yujie
2021-06-07  5:51 ` [PATCH v7 03/11] selftest/mremap_test: Update the test to handle pagesize other than 4K Aneesh Kumar K.V
2021-06-07  5:51 ` [PATCH v7 04/11] selftest/mremap_test: Avoid crash with static build Aneesh Kumar K.V
2021-06-07  5:51 ` [PATCH v7 05/11] mm/mremap: Convert huge PUD move to separate helper Aneesh Kumar K.V
2021-06-07  5:51 ` [PATCH v7 06/11] mm/mremap: Don't enable optimized PUD move if page table levels is 2 Aneesh Kumar K.V
2021-06-07  5:51 ` [PATCH v7 07/11] mm/mremap: Use pmd/pud_poplulate to update page table entries Aneesh Kumar K.V
2021-06-07  5:51 ` [PATCH v7 08/11] powerpc/mm/book3s64: Fix possible build error Aneesh Kumar K.V
2021-06-07  5:51 ` [PATCH v7 09/11] mm/mremap: Allow arch runtime override Aneesh Kumar K.V
2021-06-07  5:51 ` [PATCH v7 10/11] powerpc/book3s64/mm: Update flush_tlb_range to flush page walk cache Aneesh Kumar K.V
2021-06-07  5:51 ` [PATCH v7 11/11] powerpc/mm: Enable HAVE_MOVE_PMD support Aneesh Kumar K.V
2021-06-07 10:10 ` [PATCH v7 00/11] Speedup mremap on ppc64 Nick Piggin
2021-06-08  4:39   ` Aneesh Kumar K.V
2021-06-08  5:03     ` Nicholas Piggin
2021-06-08 17:10   ` Linus Torvalds
2021-06-16  1:44     ` Nicholas Piggin
