[RFC,v2,2/2] fs/task_mmu: acquire mmap_lock for write on soft-dirty cleanup
diff mbox series

Message ID 20201225092529.3228466-3-namit@vmware.com
State New, archived
Headers show
Series
  • mm: fix races due to deferred TLB flushes
Related show

Commit Message

Nadav Amit Dec. 25, 2020, 9:25 a.m. UTC
From: Nadav Amit <namit@vmware.com>

Clearing soft-dirty through /proc/[pid]/clear_refs can cause memory
corruption as it clears the dirty-bit without acquiring the mmap_lock
for write and defers TLB flushes.

As a result of this behavior, it is possible that one of the CPUs would
have the stale PTE cached in its TLB and keep updating the page while
another thread triggers a page-fault, and the page-fault handler would
copy the old page into a new one.

Since the copying is performed without holding the page-table lock, it
is possible that after the copying, and before the PTE is actually
flushed, the CPU that cached the stale PTE in the TLB would keep
changing the page. These changes would be lost and memory corruption
would occur.

As Yu Zhao pointed, this race became more apparent since commit
09854ba94c6a ("mm: do_wp_page() simplification") which made
wp_page_copy() more likely to take place, specifically if
page_count(page) > 1.

The following test produces the failure quite well on 5.10 and my
machine. Note that the test is tailored for recent kernels behavior in
which wp_page_copy() is called when page_count(page) != 1, but the fact
the test does not fail on older kernels does not mean they are not
affected.

  #define _GNU_SOURCE
  #include <sys/types.h>
  #include <sys/stat.h>
  #include <sys/mman.h>
  #include <unistd.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <assert.h>
  #include <fcntl.h>
  #include <string.h>
  #include <threads.h>
  #include <stdatomic.h>

  #define PAGE_SIZE	(4096)
  #define TLB_SIZE	(2000)
  #define N_PAGES	(300000)
  #define ITERATIONS	(2000)
  #define N_THREADS	(2)

  /*
   * 'stop' is read/written with the <stdatomic.h> generic functions
   * (atomic_load/atomic_store), which C11 only defines for objects of
   * atomic type — a plain 'int' here is a constraint violation that GCC
   * and Clang happen to accept.  Declare it atomic_int so the program is
   * conforming C11.
   */
  static atomic_int stop;	/* set to 1 to tell all worker threads to exit */
  static char *m;		/* base address of the test mapping */

  /*
   * Worker thread body.  Repeatedly increments the counter stored in the
   * first page of the mapping until 'stop' is set, and returns the number
   * of increments it performed so main() can compare the sum against the
   * counter's final value (a mismatch means a lost update).
   *
   * argp carries the thread index (0..N_THREADS-1) smuggled through the
   * void pointer.  Note the counter is accessed by casting the mapping to
   * atomic_int * — the page is page-aligned, so alignment is fine, but
   * this relies on atomic access to non-_Atomic storage (common test-code
   * practice, not strictly conforming C11).
   */
  static int writer(void *argp)
  {
  	unsigned long t_idx = (unsigned long)argp;
  	int i, cnt = 0;

  	while (!atomic_load(&stop)) {
  		cnt++;
  		atomic_fetch_add((atomic_int *)m, 1);

  		/*
  		 * First thread only accesses the page to have it cached in the
  		 * TLB.
  		 */
  		if (t_idx == 0)
  			continue;

  		/*
  		 * Other threads access enough entries to cause eviction from
  		 * the TLB and trigger #PF upon the next access (before the TLB
  		 * flush of clear_ref actually takes place).
  		 */
  		for (i = 1; i < TLB_SIZE; i++) {
  			/* Pages past the first are never written, so any
  			 * non-zero value read here indicates corruption. */
  			if (atomic_load((atomic_int *)(m + PAGE_SIZE * i))) {
  				fprintf(stderr, "unexpected error\n");
  				exit(1);
  			}
  		}
  	}
  	return cnt;
  }

  /*
   * Background thread: cycles mlock/munlock on the first page of the
   * mapping to raise the page-count of the page and force copying instead
   * of reusing the page.  Raising the page-count is possible in better
   * ways, e.g., registering io_uring buffers.  Returns 0 once 'stop' is
   * observed.
   */
  static int do_mlock(void *argp)
  {
  	(void)argp;	/* thread index is unused here */

  	for (;;) {
  		if (atomic_load(&stop))
  			return 0;
  		if (mlock(m, PAGE_SIZE) != 0 || munlock(m, PAGE_SIZE) != 0) {
  			perror("mlock/munlock");
  			exit(1);
  		}
  	}
  }

  /*
   * Reproducer driver: maps a large anonymous region, spawns the writer
   * threads and the mlock thread, then repeatedly clears soft-dirty bits
   * via /proc/self/clear_refs while the writers race against the deferred
   * TLB flush.  At the end, the counter in the first page must equal the
   * total number of increments reported by the writers; a mismatch means
   * an update was lost to the stale-TLB/wp_page_copy() race.
   */
  int main(void)
  {
  	int r, cnt, fd, total = 0;
  	long i;
  	thrd_t thr[N_THREADS];
  	thrd_t mlock_thr;

  	/* Writing "4" to clear_refs clears the soft-dirty bits (see the
  	 * commit message above).  fd is deliberately never closed; the
  	 * process exits immediately after the final check. */
  	fd = open("/proc/self/clear_refs", O_WRONLY, 0666);
  	if (fd < 0) {
  		perror("open");
  		exit(1);
  	}

  	/*
  	 * Have large memory for clear_ref, so there would be some time between
  	 * the unmap and the actual deferred flush.
  	 */
  	m = mmap(NULL, PAGE_SIZE * N_PAGES, PROT_READ|PROT_WRITE,
  			MAP_PRIVATE|MAP_ANONYMOUS|MAP_POPULATE, -1, 0);
  	if (m == MAP_FAILED) {
  		perror("mmap");
  		exit(1);
  	}

  	/* Thread index is passed through the void* argument. */
  	for (i = 0; i < N_THREADS; i++) {
  		r = thrd_create(&thr[i], writer, (void *)i);
  		assert(r == thrd_success);
  	}

  	/* i == N_THREADS here; do_mlock() ignores its argument. */
  	r = thrd_create(&mlock_thr, do_mlock, (void *)i);
  	assert(r == thrd_success);

  	/* Race window: each pwrite clears soft-dirty across the whole
  	 * (large) mapping while the writers keep updating page 0. */
  	for (i = 0; i < ITERATIONS; i++) {
  		r = pwrite(fd, "4", 1, 0);
  		if (r < 0) {
  			perror("pwrite");
  			exit(1);
  		}
  	}

  	atomic_store(&stop, 1);

  	r = thrd_join(mlock_thr, NULL);
  	assert(r == thrd_success);

  	/* Sum the per-thread increment counts returned by writer(). */
  	for (i = 0; i < N_THREADS; i++) {
  		r = thrd_join(thr[i], &cnt);
  		assert(r == thrd_success);
  		total += cnt;
  	}

  	/* If any increment was lost (page copied from a stale snapshot),
  	 * the counter will be smaller than the reported total. */
  	r = atomic_load((atomic_int *)(m));
  	if (r != total) {
  		fprintf(stderr, "failed: expected=%d actual=%d\n", total, r);
  		exit(-1);
  	}

  	fprintf(stderr, "ok\n");
  	return 0;
  }

Fix it by taking mmap_lock for write when clearing soft-dirty.

Note that the test keeps failing without the pending fix of the missing
TLB flushes in clear_refs_write() [1].

[1] https://lore.kernel.org/patchwork/patch/1351776/

Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Will Deacon <will@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Fixes: 0f8975ec4db2 ("mm: soft-dirty bits for user memory changes tracking")
Signed-off-by: Nadav Amit <namit@vmware.com>
---
 fs/proc/task_mmu.c | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

Comments

Will Deacon Jan. 5, 2021, 3:08 p.m. UTC | #1
On Fri, Dec 25, 2020 at 01:25:29AM -0800, Nadav Amit wrote:
> From: Nadav Amit <namit@vmware.com>
> 
> Clearing soft-dirty through /proc/[pid]/clear_refs can cause memory
> corruption as it clears the dirty-bit without acquiring the mmap_lock
> for write and defers TLB flushes.
> 
> As a result of this behavior, it is possible that one of the CPUs would
> have the stale PTE cached in its TLB and keep updating the page while
> another thread triggers a page-fault, and the page-fault handler would
> copy the old page into a new one.

[...]

> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 217aa2705d5d..39b2bd27af79 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -1189,6 +1189,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
>  	struct mm_struct *mm;
>  	struct vm_area_struct *vma;
>  	enum clear_refs_types type;
> +	bool write_lock = false;
>  	struct mmu_gather tlb;
>  	int itype;
>  	int rv;
> @@ -1236,21 +1237,16 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
>  		}
>  		tlb_gather_mmu(&tlb, mm, 0, -1);
>  		if (type == CLEAR_REFS_SOFT_DIRTY) {
> +			mmap_read_unlock(mm);
> +			if (mmap_write_lock_killable(mm)) {
> +				count = -EINTR;
> +				goto out_mm;
> +			}
>  			for (vma = mm->mmap; vma; vma = vma->vm_next) {
> -				if (!(vma->vm_flags & VM_SOFTDIRTY))
> -					continue;
> -				mmap_read_unlock(mm);
> -				if (mmap_write_lock_killable(mm)) {
> -					count = -EINTR;
> -					goto out_mm;
> -				}
> -				for (vma = mm->mmap; vma; vma = vma->vm_next) {
> -					vma->vm_flags &= ~VM_SOFTDIRTY;
> -					vma_set_page_prot(vma);
> -				}
> -				mmap_write_downgrade(mm);
> -				break;
> +				vma->vm_flags &= ~VM_SOFTDIRTY;
> +				vma_set_page_prot(vma);
>  			}
> +			write_lock = true;
>  
>  			mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
>  						0, NULL, mm, 0, -1UL);
> @@ -1261,7 +1257,10 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
>  		if (type == CLEAR_REFS_SOFT_DIRTY)
>  			mmu_notifier_invalidate_range_end(&range);
>  		tlb_finish_mmu(&tlb, 0, -1);
> -		mmap_read_unlock(mm);
> +		if (write_lock)
> +			mmap_write_unlock(mm);
> +		else
> +			mmap_read_unlock(mm);
>  out_mm:
>  		mmput(mm);

I probably wouldn't bother with the 'write_lock' variable, and just check
'type == CLEAR_REFS_SOFT_DIRTY' instead.

But that's trivial and I don't have strong opinions, so:

Acked-by: Will Deacon <will@kernel.org>

Are you intending to land this for 5.11? If so, I can just rebase my other
series on top of this.

Will
Andrea Arcangeli Jan. 5, 2021, 6:20 p.m. UTC | #2
On Fri, Dec 25, 2020 at 01:25:29AM -0800, Nadav Amit wrote:
> Fixes: 0f8975ec4db2 ("mm: soft-dirty bits for user memory changes tracking")

Targeting a backport down to 2013 when nothing could go wrong in practice
with page_mapcount sounds backwards and unnecessarily risky.

In theory it was already broken and in theory
09854ba94c6aad7886996bfbee2530b3d8a7f4f4 is absolutely perfect and the
previous code of 2013 is completely wrong, but in practice the code
from 2013 worked perfectly until Aug 21 2020.

Since nothing at all could go wrong in soft dirty and uffd-wp until
09854ba94c6aad7886996bfbee2530b3d8a7f4f4, the Fixes need to target
that, definitely not a patch from 2013.

This means the backports will apply clean, they don't need a simple
solution but one that doesn't regress the performance of open source
virtual machines and open source products using clear_refs and uffd-wp
in general.

Thanks,
Andrea
Nadav Amit Jan. 5, 2021, 7:26 p.m. UTC | #3
> On Jan 5, 2021, at 10:20 AM, Andrea Arcangeli <aarcange@redhat.com> wrote:
> 
> On Fri, Dec 25, 2020 at 01:25:29AM -0800, Nadav Amit wrote:
>> Fixes: 0f8975ec4db2 ("mm: soft-dirty bits for user memory changes tracking")
> 
> Targeting a backport down to 2013 when nothing could wrong in practice
> with page_mapcount sounds backwards and unnecessarily risky.
> 
> In theory it was already broken and in theory
> 09854ba94c6aad7886996bfbee2530b3d8a7f4f4 is absolutely perfect and the
> previous code of 2013 is completely wrong, but in practice the code
> from 2013 worked perfectly until Aug 21 2020.

Well… If you consider the bug that Will recently fixed [1], then soft-dirty
was broken (for a different, yet related reason) since 0758cd830494
("asm-generic/tlb: avoid potential double flush").

This is not to say that I argue that the patch should be backported to 2013,
just to say that memory corruption bugs can be unnoticed.

[1] https://patchwork.kernel.org/project/linux-mm/patch/20201210121110.10094-2-will@kernel.org/

> 
> Since nothing at all could go wrong in soft dirty and uffd-wp until
> 09854ba94c6aad7886996bfbee2530b3d8a7f4f4, the Fixes need to target
> that, definitely not a patch from 2013.
> 
> This means the backports will apply clean, they don't need a simple
> solution but one that doesn't regress the performance of open source
> virtual machines and open source products using clear_refs and uffd-wp
> in general.

To summarize my action items based your (and others) feedback on both
patches:

1. I will break the first patch into two different patches, one with the
“optimization” for write-unprotect, based on your feedback. It will not
be backported.

2. I will try to add a patch to avoid TLB flushes on
userfaultfd-writeunprotect. It will also not be backported.

3. Let me know if you want me to use your version of testing
mm_tlb_flush_pending() and conditionally flushing, wait for new version from
you or Peter or to go with taking mmap_lock for write.

Thanks again,
Nadav
Andrea Arcangeli Jan. 5, 2021, 8:39 p.m. UTC | #4
On Tue, Jan 05, 2021 at 07:26:43PM +0000, Nadav Amit wrote:
> > On Jan 5, 2021, at 10:20 AM, Andrea Arcangeli <aarcange@redhat.com> wrote:
> > 
> > On Fri, Dec 25, 2020 at 01:25:29AM -0800, Nadav Amit wrote:
> >> Fixes: 0f8975ec4db2 ("mm: soft-dirty bits for user memory changes tracking")
> > 
> > Targeting a backport down to 2013 when nothing could wrong in practice
> > with page_mapcount sounds backwards and unnecessarily risky.
> > 
> > In theory it was already broken and in theory
> > 09854ba94c6aad7886996bfbee2530b3d8a7f4f4 is absolutely perfect and the
> > previous code of 2013 is completely wrong, but in practice the code
> > from 2013 worked perfectly until Aug 21 2020.
> 
> Well… If you consider the bug that Will recently fixed [1], then soft-dirty
> was broken (for a different, yet related reason) since 0758cd830494
> ("asm-generic/tlb: avoid potential double flush”).
> 
> This is not to say that I argue that the patch should be backported to 2013,
> just to say that memory corruption bugs can be unnoticed.
> 
> [1] https://patchwork.kernel.org/project/linux-mm/patch/20201210121110.10094-2-will@kernel.org/

Is this a fix or a cleanup?

The above is precisely what I said earlier that tlb_gather had no
reason to stay in clear_refs and it had to use inc_tlb_flush_pending
as mprotect, but it's not a fix? Is it? I suggested it as a pure
cleanup. So again no backport required. The commit says fix this but
it means "clean this up".

Now there are plenty of bugs can go unnoticed for decades, including
dirtycow and the very bug that allowed the fork child to attack the
parent with vmsplice that ultimately motivated the
page_mapcount->page_count in do_wp_page in Aug 2020.

Now let's take another example:
7066f0f933a1fd707bb38781866657769cff7efc which also was found by
source review only and never happened in practice, and unlike dirtycow
and the vmsplice attack on parent was not reproducible even at will
after it was found (even then it wouldn't be reproducible
exploitable). So you can take 7066f0f933a1fd707bb38781866657769cff7efc
as the example of theoretical issue that might still crash the kernel
if unlucky. So before 7066f0f933a1fd707bb38781866657769cff7efc, things
worked by luck but not reliably so.

How are all those above relevant here?

In my view none of the above is relevant.

As I already stated this specific issue, for both uffd-wp and
clear_refs wasn't even a theoretical bug before 2020 Aug, it is not
like 7066f0f933a1fd707bb38781866657769cff7efc, and it's not like
https://patchwork.kernel.org/project/linux-mm/patch/20201210121110.10094-2-will@kernel.org/
which appears a pure cleanup and doesn't need backporting to any
tree.

The uffd-wp clear_refs corruption mathematically could not happen
before Aug 2020, it worked by luck too, but unlike
7066f0f933a1fd707bb38781866657769cff7efc reliably so.

Philosophically I obviously agree the bug originated in 2013, but the
stable trees don't feel research material that would care about such a
intellectual detail.

So setting a Fixes back to 2013 that would go mess with all stable
tree by actively backporting a performance regressions to clear_refs
that can break runtime performance to fix a philosophical issue that
isn't even a theoretical issue, doesn't sound ideal to me.

> To summarize my action items based your (and others) feedback on both
> patches:
> 
> 1. I will break the first patch into two different patches, one with the
> “optimization” for write-unprotect, based on your feedback. It will not
> be backported.
>
> 2. I will try to add a patch to avoid TLB flushes on
> userfaultfd-writeunprotect. It will also not be backported.

I think 1 and 2 above could be in the same patch. Mixing an uffd-wp optimization with the
actual fix of the memory corruption wasn't ideal, but doing the same
optimization to both wrprotect and un-wrprotect in the same patch
sounds ideal. The commit explanation would be identical and it can be
de-duplicated this way.

I'd suggest to coordinate with Peter on that, since I wasn't planning
to work on this if somebody else offered to do it.

> 3. Let me know if you want me to use your version of testing
> mm_tlb_flush_pending() and conditionally flushing, wait for new version fro
> you or Peter or to go with taking mmap_lock for write.

Yes, as you suggested, I'm trying to clean it up and send a new
version.

Ultimately my view is there are a huge number of cases where
mmap_write_lock or some other heavy lock that will require
occasionally to block on I/O is beyond impossible not to take. Even
speculative page faults only attack the low hanging anon memory and
there's still MADV_DONTNEED/FREE and other stuff that may have to run
in parallel with UFFDIO_WRITEPROTECT and clear_refs, not just page
faults.

As a reminder: the only case when modifying the vmas is allowed under
mmap_read_lock (I already tried once to make it safer by adding
READ_ONCE/WRITE_ONCE but wasn't merged see
https://www.spinics.net/lists/linux-mm/msg173420.html), is when
updating vm_end/vm_start in growsdown/up, where the vma is extended
down or up in the page fault under only mmap_read_lock.

I'm doing all I can to document and make it more explicit the
complexity we deal with in the code (as well as reducing the gcc
dependency in emitting atomic writes to update vm_end/vm_start, as we
should do in ptes as well in theory). As you may notice in the
feedback from the above submission not all even realized that we're
modifying vmas already under mmap_read_lock. So it'd be great to get
help to merge that READ_ONCE/WRITE_ONCE cleanup that is still valid
and pending for merge but it needs forward porting.

This one, for both soft dirty and uffd_wrprotect, is a walk in the
park to optimize in comparison to the vma modifications.

From my point of view in fact, doing the tlb flush or the wait on the
atomic to be released, does not increase kernel complexity compared to
what we had until now.

I think we already had this complexity before Aug 2020, but we didn't
realize it, and that's why thing then broke in clear_refs in Aug 2020
because of an unrelated change that finally exposed the complexity.

By handling the race so that we stop depending on an undocumented
page_mapcount dependency, we won't be increasing complexity, we'll be
merely documenting the complexity we already had to begin with, so
that it'll be less likely to bite us again in the future if it's
handled explicitly.

Thanks,
Andrea
Yu Zhao Jan. 5, 2021, 9:20 p.m. UTC | #5
On Tue, Jan 05, 2021 at 03:39:35PM -0500, Andrea Arcangeli wrote:
> On Tue, Jan 05, 2021 at 07:26:43PM +0000, Nadav Amit wrote:
> > > On Jan 5, 2021, at 10:20 AM, Andrea Arcangeli <aarcange@redhat.com> wrote:
> > > 
> > > On Fri, Dec 25, 2020 at 01:25:29AM -0800, Nadav Amit wrote:
> > >> Fixes: 0f8975ec4db2 ("mm: soft-dirty bits for user memory changes tracking")
> > > 
> > > Targeting a backport down to 2013 when nothing could wrong in practice
> > > with page_mapcount sounds backwards and unnecessarily risky.
> > > 
> > > In theory it was already broken and in theory
> > > 09854ba94c6aad7886996bfbee2530b3d8a7f4f4 is absolutely perfect and the
> > > previous code of 2013 is completely wrong, but in practice the code
> > > from 2013 worked perfectly until Aug 21 2020.
> > 
> > Well… If you consider the bug that Will recently fixed [1], then soft-dirty
> > was broken (for a different, yet related reason) since 0758cd830494
> > ("asm-generic/tlb: avoid potential double flush”).
> > 
> > This is not to say that I argue that the patch should be backported to 2013,
> > just to say that memory corruption bugs can be unnoticed.
> > 
> > [1] https://patchwork.kernel.org/project/linux-mm/patch/20201210121110.10094-2-will@kernel.org/
> 
> Is this a fix or a cleanup?
> 
> The above is precisely what I said earlier that tlb_gather had no
> reason to stay in clear_refs and it had to use inc_tlb_flush_pending
> as mprotect, but it's not a fix? Is it? I suggested it as a pure
> cleanup. So again no backport required. The commit says fix this but
> it means "clean this up".
> 
> Now there are plenty of bugs can go unnoticed for decades, including
> dirtycow and the very bug that allowed the fork child to attack the
> parent with vmsplice that ultimately motivated the
> page_mapcount->page_count in do_wp_page in Aug 2020.
> 
> Now let's take another example:
> 7066f0f933a1fd707bb38781866657769cff7efc which also was found by
> source review only and never happened in practice, and unlike dirtycow
> and the vmsplice attack on parent was not reproducible even at will
> after it was found (even then it wouldn't be reproducible
> exploitable). So you can take 7066f0f933a1fd707bb38781866657769cff7efc
> as the example of theoretical issue that might still crash the kernel
> if unlucky. So before 7066f0f933a1fd707bb38781866657769cff7efc, things
> worked by luck but not reliably so.
> 
> How are all those above relevant here?
> 
> In my view none of the above is relevant.
> 
> As I already stated this specific issue, for both uffd-wp and
> clear_refs wasn't even a theoretical bug before 2020 Aug, it is not
> like 7066f0f933a1fd707bb38781866657769cff7efc, and it's not like
> https://patchwork.kernel.org/project/linux-mm/patch/20201210121110.10094-2-will@kernel.org/
> which appears a pure cleanup and doesn't need backporting to any
> tree.
> 
> The uffd-wp clear_refs corruption mathematically could not happen
> before Aug 2020, it worked by luck too, but unlike
> 7066f0f933a1fd707bb38781866657769cff7efc reliably so.
> 
> Philosophically I obviously agree the bug originated in 2013, but the
> stable trees don't feel research material that would care about such a
> intellectual detail.
> 
> So setting a Fixes back to 2013 that would go mess with all stable
> tree by actively backporting a performance regressions to clear_refs
> that can break runtime performance to fix a philosophical issue that
> isn't even a theoretical issue, doesn't sound ideal to me.
> 
> > To summarize my action items based your (and others) feedback on both
> > patches:
> > 
> > 1. I will break the first patch into two different patches, one with the
> > “optimization” for write-unprotect, based on your feedback. It will not
> > be backported.
> >
> > 2. I will try to add a patch to avoid TLB flushes on
> > userfaultfd-writeunprotect. It will also not be backported.
> 
> I think 1 and 2 above could be in the same patch. Mixing an uffd-wp optimization with the
> actual fix the memory corruption wasn't ideal, but doing the same
> optimization to both wrprotect and un-wrprotect in the same patch
> sounds ideal. The commit explanation would be identical and it can be
> de-duplicated this way.
> 
> I'd suggest to coordinate with Peter on that, since I wasn't planning
> to work on this if somebody else offered to do it.

I probably could post something based on the local flush idea we
discussed, but it won't be in this month. It seems to me there is
much has to be done, e.g., auditing all clearing of the writable &
the dirty bits, document the exactly steps when clearing them to
prevent similar problems in the future. I'd be happy to review
your patches too if you could have them sooner.

Meanwhile, Nadav, my reviewed-by on your patch stands, since it's
straightforward and safe for backport.

> > 3. Let me know if you want me to use your version of testing
> > mm_tlb_flush_pending() and conditionally flushing, wait for new version fro
> > you or Peter or to go with taking mmap_lock for write.
> 
> Yes, as you suggested, I'm trying to clean it up and send a new
> version.
> 
> Ultimately my view is there are an huge number of cases where
> mmap_write_lock or some other heavy lock that will require
> occasionally to block on I/O is beyond impossible not to take. Even
> speculative page faults only attack the low hanging anon memory and
> there's still MADV_DONTNEED/FREE and other stuff that may have to run
> in parallel with UFFDIO_WRITEPROTECT and clear_refs, not just page
> faults.
> 
> As a reminder: the only case when modifying the vmas is allowed under
> mmap_read_lock (I already tried once to make it safer by adding
> READ_ONCE/WRITE_ONCE but wasn't merged see
> https://www.spinics.net/lists/linux-mm/msg173420.html), is when
> updating vm_end/vm_start in growsdown/up, where the vma is extended
> down or up in the page fault under only mmap_read_lock.
> 
> I'm doing all I can to document and make it more explicit the
> complexity we deal with in the code (as well as reducing the gcc
> dependency in emitting atomic writes to update vm_end/vm_start, as we
> should do in ptes as well in theory). As you may notice in the
> feedback from the above submission not all even realized that we're
> modifying vmas already under mmap_read_lock. So it'd be great to get
> help to merge that READ_ONCE/WRITE_ONCE cleanup that is still valid
> and pending for merge but it needs forward porting.
> 
> This one, for both soft dirty and uffd_wrprotect, is a walk in the
> park to optimize in comparison to the vma modifications.
> 
> From my point of view in fact, doing the tlb flush or the wait on the
> atomic to be released, does not increase kernel complexity compared to
> what we had until now.
> 
> I think we already had this complexity before Aug 2020, but we didn't
> realize it, and that's why thing then broke in clear_refs in Aug 2020
> because of an unrelated change that finally exposed the complexity.
> 
> By handling the race so that we stop depending on an undocumented
> page_mapcount dependency, we won't be increasing complexity, we'll be
> merely documenting the complexity we already had to begin with, so
> that it'll be less likely to bite us again in the future if it's
> handled explicitly.
> 
> Thanks,
> Andrea
>
Nadav Amit Jan. 5, 2021, 9:22 p.m. UTC | #6
> On Jan 5, 2021, at 12:39 PM, Andrea Arcangeli <aarcange@redhat.com> wrote:
> 
> On Tue, Jan 05, 2021 at 07:26:43PM +0000, Nadav Amit wrote:
>>> On Jan 5, 2021, at 10:20 AM, Andrea Arcangeli <aarcange@redhat.com> wrote:
>>> 
>>> On Fri, Dec 25, 2020 at 01:25:29AM -0800, Nadav Amit wrote:
>>>> Fixes: 0f8975ec4db2 ("mm: soft-dirty bits for user memory changes tracking")
>>> 
>>> Targeting a backport down to 2013 when nothing could wrong in practice
>>> with page_mapcount sounds backwards and unnecessarily risky.
>>> 
>>> In theory it was already broken and in theory
>>> 09854ba94c6aad7886996bfbee2530b3d8a7f4f4 is absolutely perfect and the
>>> previous code of 2013 is completely wrong, but in practice the code
>>> from 2013 worked perfectly until Aug 21 2020.
>> 
>> Well… If you consider the bug that Will recently fixed [1], then soft-dirty
>> was broken (for a different, yet related reason) since 0758cd830494
>> ("asm-generic/tlb: avoid potential double flush”).
>> 
>> This is not to say that I argue that the patch should be backported to 2013,
>> just to say that memory corruption bugs can be unnoticed.
>> 
>> [1] https://patchwork.kernel.org/project/linux-mm/patch/20201210121110.10094-2-will@kernel.org/
> 
> Is this a fix or a cleanup?
> 
> The above is precisely what I said earlier that tlb_gather had no
> reason to stay in clear_refs and it had to use inc_tlb_flush_pending
> as mprotect, but it's not a fix? Is it? I suggested it as a pure
> cleanup. So again no backport required. The commit says fix this but
> it means "clean this up".

It is actually a fix. I think the commit log is not entirely correct and
should include:

  Fixes: 0758cd830494 ("asm-generic/tlb: avoid potential double flush").

Since 0758cd830494, calling tlb_finish_mmu() without any previous call to
pte_free_tlb() and friends does not flush the TLB. The soft-dirty bug
producer that I sent fails without this patch of Will.

> So setting a Fixes back to 2013 that would go mess with all stable
> tree by actively backporting a performance regressions to clear_refs
> that can break runtime performance to fix a philosophical issue that
> isn't even a theoretical issue, doesn't sound ideal to me.

Point taken.

> 
>> To summarize my action items based your (and others) feedback on both
>> patches:
>> 
>> 1. I will break the first patch into two different patches, one with the
>> “optimization” for write-unprotect, based on your feedback. It will not
>> be backported.
>> 
>> 2. I will try to add a patch to avoid TLB flushes on
>> userfaultfd-writeunprotect. It will also not be backported.
> 
> I think 1 and 2 above could be in the same patch. Mixing an uffd-wp optimization with the
> actual fix the memory corruption wasn't ideal, but doing the same
> optimization to both wrprotect and un-wrprotect in the same patch
> sounds ideal. The commit explanation would be identical and it can be
> de-duplicated this way.
> 
> I'd suggest to coordinate with Peter on that, since I wasn't planning
> to work on this if somebody else offered to do it.
> 
>> 3. Let me know if you want me to use your version of testing
>> mm_tlb_flush_pending() and conditionally flushing, wait for new version fro
>> you or Peter or to go with taking mmap_lock for write.
> 
> Yes, as you suggested, I'm trying to clean it up and send a new
> version.
> 
> Ultimately my view is there are an huge number of cases where
> mmap_write_lock or some other heavy lock that will require
> occasionally to block on I/O is beyond impossible not to take. Even
> speculative page faults only attack the low hanging anon memory and
> there's still MADV_DONTNEED/FREE and other stuff that may have to run
> in parallel with UFFDIO_WRITEPROTECT and clear_refs, not just page
> faults.
> 
> As a reminder: the only case when modifying the vmas is allowed under
> mmap_read_lock (I already tried once to make it safer by adding
> READ_ONCE/WRITE_ONCE but wasn't merged see
> https://www.spinics.net/lists/linux-mm/msg173420.html), is when
> updating vm_end/vm_start in growsdown/up, where the vma is extended
> down or up in the page fault under only mmap_read_lock.
> 
> I'm doing all I can to document and make it more explicit the
> complexity we deal with in the code (as well as reducing the gcc
> dependency in emitting atomic writes to update vm_end/vm_start, as we
> should do in ptes as well in theory). As you may notice in the
> feedback from the above submission not all even realized that we're
> modifying vmas already under mmap_read_lock. So it'd be great to get
> help to merge that READ_ONCE/WRITE_ONCE cleanup that is still valid
> and pending for merge but it needs forward porting.
> 
> This one, for both soft dirty and uffd_wrprotect, is a walk in the
> park to optimize in comparison to the vma modifications.

I am sure you are right.

> 
> From my point of view in fact, doing the tlb flush or the wait on the
> atomic to be released, does not increase kernel complexity compared to
> what we had until now.

It is also about performance due to unwarranted TLB flushes.

I think avoiding them requires some finer granularity detection of pending
page-faults. But anyhow, I still owe some TLB optimization patches (and v2
for userfaultfd+iouring) before I can even look at that.

In addition, as I stated before, having some clean interfaces that tell
whether a TLB flush is needed or not would be helpful and simpler to follow.
For instance, we can have is_pte_prot_demotion(oldprot, newprot) to figure
out whether a TLB flush is needed in change_pte_range() and avoid
unnecessary flushes when unprotecting pages with either mprotect() or
userfaultfd.
Peter Xu Jan. 5, 2021, 9:55 p.m. UTC | #7
On Tue, Jan 05, 2021 at 03:39:35PM -0500, Andrea Arcangeli wrote:
> I'd suggest to coordinate with Peter on that, since I wasn't planning
> to work on this if somebody else offered to do it.

Thanks, Andrea.  Nadav, please go ahead with whatever patch(es) in your mind.
Please let me know if you prefer me to do it, or I'll wait for your new version.

Thanks,
Will Deacon Jan. 5, 2021, 10:16 p.m. UTC | #8
On Tue, Jan 05, 2021 at 09:22:51PM +0000, Nadav Amit wrote:
> > On Jan 5, 2021, at 12:39 PM, Andrea Arcangeli <aarcange@redhat.com> wrote:
> > 
> > On Tue, Jan 05, 2021 at 07:26:43PM +0000, Nadav Amit wrote:
> >>> On Jan 5, 2021, at 10:20 AM, Andrea Arcangeli <aarcange@redhat.com> wrote:
> >>> 
> >>> On Fri, Dec 25, 2020 at 01:25:29AM -0800, Nadav Amit wrote:
> >>>> Fixes: 0f8975ec4db2 ("mm: soft-dirty bits for user memory changes tracking")
> >>> 
> >>> Targeting a backport down to 2013 when nothing could wrong in practice
> >>> with page_mapcount sounds backwards and unnecessarily risky.
> >>> 
> >>> In theory it was already broken and in theory
> >>> 09854ba94c6aad7886996bfbee2530b3d8a7f4f4 is absolutely perfect and the
> >>> previous code of 2013 is completely wrong, but in practice the code
> >>> from 2013 worked perfectly until Aug 21 2020.
> >> 
> >> Well… If you consider the bug that Will recently fixed [1], then soft-dirty
> >> was broken (for a different, yet related reason) since 0758cd830494
> >> ("asm-generic/tlb: avoid potential double flush”).
> >> 
> >> This is not to say that I argue that the patch should be backported to 2013,
> >> just to say that memory corruption bugs can be unnoticed.
> >> 
> >> [1] https://patchwork.kernel.org/project/linux-mm/patch/20201210121110.10094-2-will@kernel.org/
> > 
> > Is this a fix or a cleanup?
> > 
> > The above is precisely what I said earlier that tlb_gather had no
> > reason to stay in clear_refs and it had to use inc_tlb_flush_pending
> > as mprotect, but it's not a fix? Is it? I suggested it as a pure
> > cleanup. So again no backport required. The commit says fix this but
> > it means "clean this up".
> 
> It is actually a fix. I think the commit log is not entirely correct and
> should include:
> 
>   Fixes: 0758cd830494 ("asm-generic/tlb: avoid potential double flush").
> 
> Since 0758cd830494, calling tlb_finish_mmu() without any previous call to
> pte_free_tlb() and friends does not flush the TLB. The soft-dirty bug
> producer that I sent fails without this patch of Will.

Yes, it's a fix, but I didn't rush it for 5.10 because I don't think rushing
this sort of thing does anybody any favours. I agree that the commit log
should be updated; I mentioned this report in the cover letter:

https://lore.kernel.org/linux-mm/CA+32v5zzFYJQ7eHfJP-2OHeR+6p5PZsX=RDJNU6vGF3hLO+j-g@mail.gmail.com/

demonstrating that somebody has independently stumbled over the missing TLB
invalidation in userspace, but it's not as bad as the other issues we've been
discussing in this thread.

Will
Andrea Arcangeli Jan. 6, 2021, 12:02 a.m. UTC | #9
On Tue, Jan 05, 2021 at 09:22:51PM +0000, Nadav Amit wrote:
> It is also about performance due to unwarranted TLB flushes.

If there will be a problem switching to the wait_flush_pending() model
suggested by Peter may not even require changes to the common code in
memory.c since I'm thinking it may not even need to take a failure
path if we plug it in the same place of the tlb flush.

So instead of the flush we could always block there until we read zero
in the atomic, then smp_rmb() and we're ready to start the copy.

So either we flush IPI if we didn't read zero, or we block until we
read zero, the difference is going to be hidden to do_wp_page. All
do_wp_page cares about is that by the time the abstract call returns,
there's no stale TLB left for such pte. If it is achieved by blocking
and waiting or flushing the TLB it doesn't matter too much.

So thinking of how bad the IPI will do, with the improved arm64 tlb
flushing code in production, we keep track of how many simultaneous mm
context there are, specifically to skip the SMP-unscalable TLBI
broadcast on arm64 like we already avoid IPIs on lazy tlbs on x86 (see
x86 tlb_is_not_lazy in native_flush_tlb_others). In other words the
IPI will materialize only if there's more than one thread running
while clear_refs run. All lazy tlbs won't get IPIs on both x86
upstream and arm64 enterprise.

This won't help multithreaded processes that compute from all CPUs at
all times but even multiple vcpu threads aren't always guaranteed to
be running at all times.

My main concern would be an IPI flood that slowdown clear_refs and
UFFDIO_WRITEPROTECT, but an incremental optimization (not required for
correctness) is to have UFFDIO_WRITEPROTECT and clear_refs switch to
lazy tlb mode before they call inc_tlb_flush_pending() and unlazy only
after dec_tlb_flush_pending. So it's possible to at least guarantee
the IPI won't slow them down.

> In addition, as I stated before, having some clean interfaces that tell
> whether a TLB flush is needed or not would be helpful and simpler to follow.
> For instance, we can have is_pte_prot_demotion(oldprot, newprot) to figure
> out whether a TLB flush is needed in change_pte_range() and avoid
> unnecessary flushes when unprotecting pages with either mprotect() or
> userfaultfd.

When you mentioned this earlier I was thinking what happens then with
flush_tlb_fix_spurious_fault(). The fact it's safe doesn't guarantee
it's a performance win if there's a stream of spurious faults as
result. So it'd need to be checked, especially as in the case of
mprotect where the flush can be deferred and coalesced in a single IPI
at the end so there's not so much to gain from it anyway.

If you can guarantee there won't be a flood of spurious wrprotect faults,
then it'll be a nice optimization.

Thanks,
Andrea
Andrea Arcangeli Jan. 6, 2021, 12:29 a.m. UTC | #10
On Tue, Jan 05, 2021 at 10:16:29PM +0000, Will Deacon wrote:
> On Tue, Jan 05, 2021 at 09:22:51PM +0000, Nadav Amit wrote:
> > > On Jan 5, 2021, at 12:39 PM, Andrea Arcangeli <aarcange@redhat.com> wrote:
> > > 
> > > On Tue, Jan 05, 2021 at 07:26:43PM +0000, Nadav Amit wrote:
> > >>> On Jan 5, 2021, at 10:20 AM, Andrea Arcangeli <aarcange@redhat.com> wrote:
> > >>> 
> > >>> On Fri, Dec 25, 2020 at 01:25:29AM -0800, Nadav Amit wrote:
> > >>>> Fixes: 0f8975ec4db2 ("mm: soft-dirty bits for user memory changes tracking")
> > >>> 
> > >>> Targeting a backport down to 2013 when nothing could go wrong in practice
> > >>> with page_mapcount sounds backwards and unnecessarily risky.
> > >>> 
> > >>> In theory it was already broken and in theory
> > >>> 09854ba94c6aad7886996bfbee2530b3d8a7f4f4 is absolutely perfect and the
> > >>> previous code of 2013 is completely wrong, but in practice the code
> > >>> from 2013 worked perfectly until Aug 21 2020.
> > >> 
> > >> Well… If you consider the bug that Will recently fixed [1], then soft-dirty
> > >> was broken (for a different, yet related reason) since 0758cd830494
> > >> ("asm-generic/tlb: avoid potential double flush").
> > >> 
> > >> This is not to say that I argue that the patch should be backported to 2013,
> > >> just to say that memory corruption bugs can be unnoticed.
> > >> 
> > >> [1] https://patchwork.kernel.org/project/linux-mm/patch/20201210121110.10094-2-will@kernel.org/
> > > 
> > > Is this a fix or a cleanup?
> > > 
> > > The above is precisely what I said earlier that tlb_gather had no
> > > reason to stay in clear_refs and it had to use inc_tlb_flush_pending
> > > as mprotect, but it's not a fix? Is it? I suggested it as a pure
> > > cleanup. So again no backport required. The commit says fix this but
> > > it means "clean this up".
> > 
> > It is actually a fix. I think the commit log is not entirely correct and
> > should include:
> > 
> >   Fixes: 0758cd830494 ("asm-generic/tlb: avoid potential double flush").

Agreed.

> > 
> > Since 0758cd830494, calling tlb_finish_mmu() without any previous call to
> > pte_free_tlb() and friends does not flush the TLB. The soft-dirty bug
> > producer that I sent fails without this patch of Will.
> 
> Yes, it's a fix, but I didn't rush it for 5.10 because I don't think rushing
> this sort of thing does anybody any favours. I agree that the commit log
> should be updated; I mentioned this report in the cover letter:
> 
> https://lore.kernel.org/linux-mm/CA+32v5zzFYJQ7eHfJP-2OHeR+6p5PZsX=RDJNU6vGF3hLO+j-g@mail.gmail.com/
> 
> demonstrating that somebody has independently stumbled over the missing TLB
> invalidation in userspace, but it's not as bad as the other issues we've been
> discussing in this thread.

Thanks for the explanation Nadav and Will.

The fact the code was a 100% match to the cleanup I independently
suggested a few weeks ago to reduce the confusion in clear_refs, made
me overlook the difference 0758cd830494 made. I didn't realize the
flush got optimized away if no gathering happened.

Backporting this sort of thing with Fixes I guess tends to give the
same kind of favors as rushing it for 5.10, but then in general the
Fixes is accurate here.

Overall it looks obviously safe cleanup and it is also a fix starting
in v5.6, so I don't think this can cause more issues than what it
sure fixes at least.

The cleanup was needed anyway, even before it became a fix, since if
it was mandatory to use tlb_gather when you purely need
inc_tlb_flush_pending, then mprotect couldn't get away with it.

I guess the optimization in 0758cd830494 just made it more
explicit that no code should use tlb_gather if it doesn't need to
gather any page. Maybe adding some commentary in the comment on top of
tlb_gather_mmu about the new behavior wouldn't hurt.

Patch
diff mbox series

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 217aa2705d5d..39b2bd27af79 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1189,6 +1189,7 @@  static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 	struct mm_struct *mm;
 	struct vm_area_struct *vma;
 	enum clear_refs_types type;
+	bool write_lock = false;
 	struct mmu_gather tlb;
 	int itype;
 	int rv;
@@ -1236,21 +1237,16 @@  static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 		}
 		tlb_gather_mmu(&tlb, mm, 0, -1);
 		if (type == CLEAR_REFS_SOFT_DIRTY) {
+			mmap_read_unlock(mm);
+			if (mmap_write_lock_killable(mm)) {
+				count = -EINTR;
+				goto out_mm;
+			}
 			for (vma = mm->mmap; vma; vma = vma->vm_next) {
-				if (!(vma->vm_flags & VM_SOFTDIRTY))
-					continue;
-				mmap_read_unlock(mm);
-				if (mmap_write_lock_killable(mm)) {
-					count = -EINTR;
-					goto out_mm;
-				}
-				for (vma = mm->mmap; vma; vma = vma->vm_next) {
-					vma->vm_flags &= ~VM_SOFTDIRTY;
-					vma_set_page_prot(vma);
-				}
-				mmap_write_downgrade(mm);
-				break;
+				vma->vm_flags &= ~VM_SOFTDIRTY;
+				vma_set_page_prot(vma);
 			}
+			write_lock = true;
 
 			mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
 						0, NULL, mm, 0, -1UL);
@@ -1261,7 +1257,10 @@  static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 		if (type == CLEAR_REFS_SOFT_DIRTY)
 			mmu_notifier_invalidate_range_end(&range);
 		tlb_finish_mmu(&tlb, 0, -1);
-		mmap_read_unlock(mm);
+		if (write_lock)
+			mmap_write_unlock(mm);
+		else
+			mmap_read_unlock(mm);
 out_mm:
 		mmput(mm);
 	}