* [PATCH v6 updated 9/11] mm/mremap: Fix race between mremap and pageout
       [not found] <20210524090114.63446-10-aneesh.kumar@linux.ibm.com>
@ 2021-05-24 13:38   ` Aneesh Kumar K.V
  0 siblings, 0 replies; 10+ messages in thread
From: Aneesh Kumar K.V @ 2021-05-24 13:38 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: mpe, linuxppc-dev, kaleshsingh, npiggin, joel, Christophe Leroy,
	Linus Torvalds, Aneesh Kumar K.V

CPU 1				CPU 2					CPU 3

mremap(old_addr, new_addr)      page_shrinker/try_to_unmap_one

				addr = old_addr
				lock(pte_ptl)
lock(pmd_ptl)
pmd = *old_pmd
pmd_clear(old_pmd)
flush_tlb_range(old_addr)

*new_pmd = pmd
									*new_addr = 10; and fills
									TLB with new addr
									and old pfn

unlock(pmd_ptl)
				ptep_get_and_clear()
				flush_tlb_range(old_addr)

				old pfn is free.
									Stale TLB entry

Avoid the above race with MOVE_PMD by holding the pte ptl in mremap and
waiting for the parallel page table walk to finish operating on the pte
before updating new_pmd.

With MOVE_PUD, enable MOVE_PUD only if USE_SPLIT_PTE_PTLOCKS is disabled.
In that case both the pte ptl and the pud ptl point to mm->page_table_lock.
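For reference, without split PTE ptlocks the lock-pointer helpers collapse to
the same lock; roughly (paraphrased from include/linux/mm.h of that era, not
part of this patch):

	/* !USE_SPLIT_PTE_PTLOCKS: one lock guards all page tables of the mm */
	static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
	{
		return &mm->page_table_lock;
	}

	/* the pud side always uses the mm-wide lock */
	static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud)
	{
		return &mm->page_table_lock;
	}

so holding the pud ptl in move_normal_pud() already excludes a parallel pte
walker in that configuration.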

Fixes: c49dd3401802 ("mm: speedup mremap on 1GB or larger regions")
Fixes: 2c91bd4a4e2e ("mm: speed up mremap by 20x on large regions")
Link: https://lore.kernel.org/linux-mm/CAHk-=wgXVR04eBNtxQfevontWnP6FDm+oj5vauQXP3S-huwbPw@mail.gmail.com
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
Change:
* Check for split PTL before taking the pte ptl.

 mm/mremap.c | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/mm/mremap.c b/mm/mremap.c
index 8967a3707332..2fa3e0cb6176 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -224,7 +224,7 @@ static inline void flush_pte_tlb_pwc_range(struct vm_area_struct *vma,
 static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
 {
-	spinlock_t *old_ptl, *new_ptl;
+	spinlock_t *pte_ptl, *old_ptl, *new_ptl;
 	struct mm_struct *mm = vma->vm_mm;
 	pmd_t pmd;
 
@@ -254,6 +254,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 	if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
 		return false;
 
+
 	/*
 	 * We don't have to worry about the ordering of src and dst
 	 * ptlocks because exclusive mmap_lock prevents deadlock.
@@ -263,6 +264,10 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 	if (new_ptl != old_ptl)
 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
 
+	if (pmd_none(*old_pmd))
+		goto unlock_out;
+
+	pte_ptl = pte_lockptr(mm, old_pmd);
 	/* Clear the pmd */
 	pmd = *old_pmd;
 	pmd_clear(old_pmd);
@@ -270,9 +275,20 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 	 * flush the TLB before we move the page table entries.
 	 */
 	flush_pte_tlb_pwc_range(vma, old_addr, old_addr + PMD_SIZE);
+
+	/*
+	 * Take the pte ptl here so that we wait for a parallel page table walk
+	 * and operations (e.g. pageout) using the old addr to finish.
+	 */
+	if (USE_SPLIT_PTE_PTLOCKS)
+		spin_lock(pte_ptl);
+
 	VM_BUG_ON(!pmd_none(*new_pmd));
 	pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
+	if (USE_SPLIT_PTE_PTLOCKS)
+		spin_unlock(pte_ptl);
 
+unlock_out:
 	if (new_ptl != old_ptl)
 		spin_unlock(new_ptl);
 	spin_unlock(old_ptl);
@@ -296,6 +312,14 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
 	struct mm_struct *mm = vma->vm_mm;
 	pud_t pud;
 
+	/*
+	 * Disable MOVE_PUD until pageout can be done with the higher level
+	 * page table locks held. Without SPLIT_PTE_PTLOCKS both the pte ptl
+	 * and the pud ptl are mm->page_table_lock.
+	 */
+	if (USE_SPLIT_PTE_PTLOCKS)
+		return false;
+
 	/*
 	 * The destination pud shouldn't be established, free_pgtables()
 	 * should have released it.
-- 
2.31.1



^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH v6 updated 9/11] mm/mremap: Fix race between mremap and pageout
  2021-05-24 13:38   ` Aneesh Kumar K.V
@ 2021-05-24 17:16     ` Linus Torvalds
  -1 siblings, 0 replies; 10+ messages in thread
From: Linus Torvalds @ 2021-05-24 17:16 UTC (permalink / raw)
  To: Aneesh Kumar K.V
  Cc: Linux-MM, Andrew Morton, Michael Ellerman, linuxppc-dev,
	Kalesh Singh, Nick Piggin, Joel Fernandes, Christophe Leroy

On Mon, May 24, 2021 at 3:38 AM Aneesh Kumar K.V
<aneesh.kumar@linux.ibm.com> wrote:
>
> Avoid the above race with MOVE_PMD by holding the pte ptl in mremap and
> waiting for the parallel page table walk to finish operating on the pte
> before updating new_pmd.

Ack on the concept.

However, not so much on the patch.

Odd whitespace change:

> @@ -254,6 +254,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
>         if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
>                 return false;
>
> +
>         /*
>          * We don't have to worry about the ordering of src and dst
>          * ptlocks because exclusive mmap_lock prevents deadlock.

And new optimization for empty pmd, which seems unrelated to the
change and should presumably be separate:

> @@ -263,6 +264,10 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
>         if (new_ptl != old_ptl)
>                 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
>
> +       if (pmd_none(*old_pmd))
> +               goto unlock_out;
> +
> +       pte_ptl = pte_lockptr(mm, old_pmd);
>         /* Clear the pmd */
>         pmd = *old_pmd;
>         pmd_clear(old_pmd);

And also, why does the above assign 'pte_ptl' without using it, when
the actual use is ten lines further down?

So I think this patch needs some cleanup.

              Linus


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v6 updated 9/11] mm/mremap: Fix race between mremap and pageout
  2021-05-24 17:16     ` Linus Torvalds
@ 2021-05-25  8:44       ` Aneesh Kumar K.V
  -1 siblings, 0 replies; 10+ messages in thread
From: Aneesh Kumar K.V @ 2021-05-25  8:44 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Linux-MM, Andrew Morton, Michael Ellerman, linuxppc-dev,
	Kalesh Singh, Nick Piggin, Joel Fernandes, Christophe Leroy

Linus Torvalds <torvalds@linux-foundation.org> writes:

> On Mon, May 24, 2021 at 3:38 AM Aneesh Kumar K.V
> <aneesh.kumar@linux.ibm.com> wrote:
>>
>> Avoid the above race with MOVE_PMD by holding the pte ptl in mremap and
>> waiting for the parallel page table walk to finish operating on the pte
>> before updating new_pmd.
>
> Ack on the concept.


Should we worry about the below race? The window would be small.

CPU 1				CPU 2					CPU 3

mremap(old_addr, new_addr)      page_shrinker/try_to_unmap_one

mmap_write_lock_killable()

				addr = old_addr

lock(pmd_ptl)
pmd = *old_pmd
pmd_clear(old_pmd)
flush_tlb_range(old_addr)

lock(pte_ptl)
*new_pmd = pmd
unlock(pte_ptl)

unlock(pmd_ptl)
				lock(pte_ptl)
									*new_addr = 10; and fills
									TLB with new addr
									and old pfn

				ptep_clear_flush(old_addr)
				old pfn is free.
									Stale TLB entry



>
> However, not so much on the patch.
>
> Odd whitespace change:
>
>> @@ -254,6 +254,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
>>         if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
>>                 return false;
>>
>> +
>>         /*
>>          * We don't have to worry about the ordering of src and dst
>>          * ptlocks because exclusive mmap_lock prevents deadlock.
>
> And new optimization for empty pmd, which seems unrelated to the
> change and should presumably be separate:

That was added so that we can safely do pte_lockptr() below.

>
>> @@ -263,6 +264,10 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
>>         if (new_ptl != old_ptl)
>>                 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
>>
>> +       if (pmd_none(*old_pmd))
>> +               goto unlock_out;
>> +
>> +       pte_ptl = pte_lockptr(mm, old_pmd);
>>         /* Clear the pmd */
>>         pmd = *old_pmd;
>>         pmd_clear(old_pmd);
>
> And also, why does the above assign 'pte_ptl' without using it, when
> the actual use is ten lines further down?

So that we fetch the pte_ptl before we clear old_pmd. 
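Condensed from the hunk above, the intended ordering is (a sketch of the same
code, nothing new):

	if (pmd_none(*old_pmd))
		goto unlock_out;

	/*
	 * Must run while *old_pmd is still valid: with split PTE ptlocks,
	 * pte_lockptr() looks the lock up through pmd_page(*old_pmd).
	 */
	pte_ptl = pte_lockptr(mm, old_pmd);
	pmd = *old_pmd;
	pmd_clear(old_pmd);	/* from here on *old_pmd no longer refers to the pte page */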


-aneesh


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v6 updated 9/11] mm/mremap: Fix race between mremap and pageout
  2021-05-25  8:44       ` Aneesh Kumar K.V
@ 2021-05-25 17:22         ` Linus Torvalds
  -1 siblings, 0 replies; 10+ messages in thread
From: Linus Torvalds @ 2021-05-25 17:22 UTC (permalink / raw)
  To: Aneesh Kumar K.V
  Cc: Linux-MM, Andrew Morton, Michael Ellerman, linuxppc-dev,
	Kalesh Singh, Nick Piggin, Joel Fernandes, Christophe Leroy

On Mon, May 24, 2021 at 10:44 PM Aneesh Kumar K.V
<aneesh.kumar@linux.ibm.com> wrote:
>
> Should we worry about the below race? The window would be small.
>
> CPU 1                           CPU 2                                   CPU 3
>
> mremap(old_addr, new_addr)      page_shrinker/try_to_unmap_one
>
> mmap_write_lock_killable()
>
>                                 addr = old_addr
>
> lock(pmd_ptl)
> pmd = *old_pmd
> pmd_clear(old_pmd)
> flush_tlb_range(old_addr)
>
> lock(pte_ptl)
> *new_pmd = pmd
> unlock(pte_ptl)
>
> unlock(pmd_ptl)
>                                 lock(pte_ptl)
>                                                                         *new_addr = 10; and fills
>                                                                         TLB with new addr
>                                                                         and old pfn
>
>                                 ptep_clear_flush(old_addr)
>                                 old pfn is free.
>                                                                         Stale TLB entry

Hmm. Do you need a third CPU there? What is done above on CPU3 looks
like it might just be CPU1 accessing the new range immediately.

Which doesn't actually sound at all unlikely - so maybe the window is
small, but it sounds like something that could happen.

This looks nasty. The page shrinker has always been problematic
because it basically avoids the normal full set of locks.

I wonder if we could just make the page shrinker try-lock the mmap_sem
and avoid all this that way. It _is_ allowed to fail, after all, and
the page shrinker is "not normal" and should be less of a performance
issue than all the actual normal VM paths.

Does anybody have any good ideas?
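A very rough sketch of that try-lock idea, with a hypothetical helper name and
no claim about where in the reclaim path it would actually hook in:

	/*
	 * Hypothetical sketch only: reclaim is allowed to fail, so it could
	 * back off instead of racing with an mremap() that holds the mmap
	 * lock for write.
	 */
	static bool reclaim_trylock_mm(struct mm_struct *mm)
	{
		if (!mmap_read_trylock(mm))
			return false;	/* someone holds it for write; skip this mm */
		return true;		/* caller does the unmap, then mmap_read_unlock(mm) */
	}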

> > And new optimization for empty pmd, which seems unrelated to the
> > change and should presumably be separate:
>
> That was added that we can safely do pte_lockptr() below

Oh, because pte_lockptr() doesn't actually use the "old_pmd" pointer
value - it actually *dereferences* the pointer.

That looks like a mis-design. Why does it do that? Why don't we pass
it the pmd value, if that's what it wants?
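
For comparison, the split-ptlock flavour is roughly:

	static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
	{
		return ptlock_ptr(pmd_page(*pmd));	/* dereferences the pmd pointer */
	}

and a value-taking variant along those lines (hypothetical name) would only
differ in its argument:

	static inline spinlock_t *pte_lockptr_val(struct mm_struct *mm, pmd_t pmd)
	{
		return ptlock_ptr(pmd_page(pmd));	/* *old_pmd need not still be populated */
	}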

               Linus


^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2021-05-25 17:23 UTC | newest]

Thread overview: 10+ messages
-- links below jump to the message on this page --
     [not found] <20210524090114.63446-10-aneesh.kumar@linux.ibm.com>
2021-05-24 13:38 ` [PATCH v6 updated 9/11] mm/mremap: Fix race between mremap and pageout Aneesh Kumar K.V
2021-05-24 13:38   ` Aneesh Kumar K.V
2021-05-24 17:16   ` Linus Torvalds
2021-05-24 17:16     ` Linus Torvalds
2021-05-25  8:44     ` Aneesh Kumar K.V
2021-05-25  8:44       ` Aneesh Kumar K.V
2021-05-25 17:22       ` Linus Torvalds
2021-05-25 17:22         ` Linus Torvalds
2021-05-24  9:01 [PATCH v6 09/11] " Aneesh Kumar K.V
2021-05-24 13:38 ` [PATCH v6 updated 9/11] " Aneesh Kumar K.V
2021-05-24 13:38   ` Aneesh Kumar K.V
