linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] mm: drop mmap_sem before calling balance_dirty_pages() in write fault
@ 2019-09-24 17:15 Johannes Weiner
  2019-09-24 17:48 ` Matthew Wilcox
  0 siblings, 1 reply; 9+ messages in thread
From: Johannes Weiner @ 2019-09-24 17:15 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Josef Bacik, linux-mm, linux-kernel

From: Johannes Weiner <jweiner@fb.com>

One of our services is observing hanging ps/top/etc under heavy write
IO, and the task states show this is an mmap_sem priority inversion:

A write fault is holding the mmap_sem in read-mode and waiting for
(heavily cgroup-limited) IO in balance_dirty_pages():

[<0>] balance_dirty_pages+0x724/0x905
[<0>] balance_dirty_pages_ratelimited+0x254/0x390
[<0>] fault_dirty_shared_page.isra.96+0x4a/0x90
[<0>] do_wp_page+0x33e/0x400
[<0>] __handle_mm_fault+0x6f0/0xfa0
[<0>] handle_mm_fault+0xe4/0x200
[<0>] __do_page_fault+0x22b/0x4a0
[<0>] page_fault+0x45/0x50
[<0>] 0xffffffffffffffff

Somebody tries to change the address space, contending for the
mmap_sem in write-mode:

[<0>] call_rwsem_down_write_failed_killable+0x13/0x20
[<0>] do_mprotect_pkey+0xa8/0x330
[<0>] SyS_mprotect+0xf/0x20
[<0>] do_syscall_64+0x5b/0x100
[<0>] entry_SYSCALL_64_after_hwframe+0x3d/0xa2
[<0>] 0xffffffffffffffff

The waiting writer locks out all subsequent readers to avoid lock
starvation, and several threads can be seen hanging like this:

[<0>] call_rwsem_down_read_failed+0x14/0x30
[<0>] proc_pid_cmdline_read+0xa0/0x480
[<0>] __vfs_read+0x23/0x140
[<0>] vfs_read+0x87/0x130
[<0>] SyS_read+0x42/0x90
[<0>] do_syscall_64+0x5b/0x100
[<0>] entry_SYSCALL_64_after_hwframe+0x3d/0xa2
[<0>] 0xffffffffffffffff

To fix this, do what we do for cache read faults already: drop the
mmap_sem before calling into anything IO bound, in this case the
balance_dirty_pages() function, and return VM_FAULT_RETRY.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 mm/memory.c | 53 ++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 40 insertions(+), 13 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 2e796372927f..da5eb1d67447 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2221,12 +2221,14 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
  *
  * The function expects the page to be locked and unlocks it.
  */
-static void fault_dirty_shared_page(struct vm_area_struct *vma,
-				    struct page *page)
+static int fault_dirty_shared_page(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct address_space *mapping;
+	struct page *page = vmf->page;
 	bool dirtied;
 	bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
+	int ret = 0;
 
 	dirtied = set_page_dirty(page);
 	VM_BUG_ON_PAGE(PageAnon(page), page);
@@ -2239,16 +2241,36 @@ static void fault_dirty_shared_page(struct vm_area_struct *vma,
 	mapping = page_rmapping(page);
 	unlock_page(page);
 
+	if (!page_mkwrite)
+		file_update_time(vma->vm_file);
+
+	/*
+	 * Throttle page dirtying rate down to writeback speed.
+	 *
+	 * mapping may be NULL here because some device drivers do not
+	 * set page.mapping but still dirty their pages
+	 *
+	 * Drop the mmap_sem before waiting on IO, if we can. The file
+	 * is pinning the mapping, as per above.
+	 */
 	if ((dirtied || page_mkwrite) && mapping) {
-		/*
-		 * Some device drivers do not set page.mapping
-		 * but still dirty their pages
-		 */
+		struct file *fpin = NULL;
+
+		if ((vmf->flags &
+		     (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
+		    FAULT_FLAG_ALLOW_RETRY) {
+			fpin = get_file(vma->vm_file);
+			up_read(&vma->vm_mm->mmap_sem);
+			ret = VM_FAULT_RETRY;
+		}
+
 		balance_dirty_pages_ratelimited(mapping);
+
+		if (fpin)
+			fput(fpin);
 	}
 
-	if (!page_mkwrite)
-		file_update_time(vma->vm_file);
+	return ret;
 }
 
 /*
@@ -2491,6 +2513,7 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
 	__releases(vmf->ptl)
 {
 	struct vm_area_struct *vma = vmf->vma;
+	int ret = VM_FAULT_WRITE;
 
 	get_page(vmf->page);
 
@@ -2514,10 +2537,10 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
 		wp_page_reuse(vmf);
 		lock_page(vmf->page);
 	}
-	fault_dirty_shared_page(vma, vmf->page);
+	ret |= fault_dirty_shared_page(vmf);
 	put_page(vmf->page);
 
-	return VM_FAULT_WRITE;
+	return ret;
 }
 
 /*
@@ -3561,7 +3584,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
 		return ret;
 	}
 
-	fault_dirty_shared_page(vma, vmf->page);
+	ret |= fault_dirty_shared_page(vmf);
 	return ret;
 }
 
@@ -3576,7 +3599,6 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
 static vm_fault_t do_fault(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
-	struct mm_struct *vm_mm = vma->vm_mm;
 	vm_fault_t ret;
 
 	/*
@@ -3617,7 +3639,12 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
 
 	/* preallocated pagetable is unused: free it */
 	if (vmf->prealloc_pte) {
-		pte_free(vm_mm, vmf->prealloc_pte);
+		/*
+		 * XXX: Accessing vma->vm_mm now is not safe. The page
+		 * fault handler may have dropped the mmap_sem a long
+		 * time ago. Only s390 derefs that parameter.
+		 */
+		pte_free(vma->vm_mm, vmf->prealloc_pte);
 		vmf->prealloc_pte = NULL;
 	}
 	return ret;
-- 
2.23.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] mm: drop mmap_sem before calling balance_dirty_pages() in write fault
  2019-09-24 17:15 [PATCH] mm: drop mmap_sem before calling balance_dirty_pages() in write fault Johannes Weiner
@ 2019-09-24 17:48 ` Matthew Wilcox
  2019-09-24 19:42   ` Johannes Weiner
  0 siblings, 1 reply; 9+ messages in thread
From: Matthew Wilcox @ 2019-09-24 17:48 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: Andrew Morton, Josef Bacik, linux-mm, linux-kernel

On Tue, Sep 24, 2019 at 01:15:18PM -0400, Johannes Weiner wrote:
> +static int fault_dirty_shared_page(struct vm_fault *vmf)

vm_fault_t, shirley?

> @@ -2239,16 +2241,36 @@ static void fault_dirty_shared_page(struct vm_area_struct *vma,
>  	mapping = page_rmapping(page);
>  	unlock_page(page);
>  
> +	if (!page_mkwrite)
> +		file_update_time(vma->vm_file);
> +
> +	/*
> +	 * Throttle page dirtying rate down to writeback speed.
> +	 *
> +	 * mapping may be NULL here because some device drivers do not
> +	 * set page.mapping but still dirty their pages
> +	 *
> +	 * Drop the mmap_sem before waiting on IO, if we can. The file
> +	 * is pinning the mapping, as per above.
> +	 */
>  	if ((dirtied || page_mkwrite) && mapping) {
> -		/*
> -		 * Some device drivers do not set page.mapping
> -		 * but still dirty their pages
> -		 */
> +		struct file *fpin = NULL;
> +
> +		if ((vmf->flags &
> +		     (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
> +		    FAULT_FLAG_ALLOW_RETRY) {
> +			fpin = get_file(vma->vm_file);
> +			up_read(&vma->vm_mm->mmap_sem);
> +			ret = VM_FAULT_RETRY;
> +		}
> +
>  		balance_dirty_pages_ratelimited(mapping);
> +
> +		if (fpin)
> +			fput(fpin);
>  	}
>  
> -	if (!page_mkwrite)
> -		file_update_time(vma->vm_file);
> +	return ret;
>  }

I'm not a fan of moving file_update_time() to _before_ the
balance_dirty_pages call.  Also, this is now the third place that needs
maybe_unlock_mmap_for_io, see
https://lore.kernel.org/linux-mm/20190917120852.x6x3aypwvh573kfa@box/

How about:

+	struct file *fpin = NULL;

 	if ((dirtied || page_mkwrite) && mapping) {
		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
 		balance_dirty_pages_ratelimited(mapping);
	}
+
+	if (fpin) {
+		file_update_time(fpin);
+		fput(fpin);
+		return VM_FAULT_RETRY;
+	}

 	if (!page_mkwrite)
 		file_update_time(vma->vm_file);
+	return 0;
 }

>  /*
> @@ -2491,6 +2513,7 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
>  	__releases(vmf->ptl)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
> +	int ret = VM_FAULT_WRITE;

vm_fault_t again.

> @@ -3576,7 +3599,6 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
>  static vm_fault_t do_fault(struct vm_fault *vmf)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
> -	struct mm_struct *vm_mm = vma->vm_mm;
>  	vm_fault_t ret;
>  
>  	/*
> @@ -3617,7 +3639,12 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
>  
>  	/* preallocated pagetable is unused: free it */
>  	if (vmf->prealloc_pte) {
> -		pte_free(vm_mm, vmf->prealloc_pte);
> +		/*
> +		 * XXX: Accessing vma->vm_mm now is not safe. The page
> +		 * fault handler may have dropped the mmap_sem a long
> +		 * time ago. Only s390 derefs that parameter.
> +		 */
> +		pte_free(vma->vm_mm, vmf->prealloc_pte);

I'm confused.  This code looks like it was safe before (as it was caching
vma->vm_mm in a local variable), and now you've made it unsafe ... ?

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] mm: drop mmap_sem before calling balance_dirty_pages() in write fault
  2019-09-24 17:48 ` Matthew Wilcox
@ 2019-09-24 19:42   ` Johannes Weiner
  2019-09-24 20:46     ` Matthew Wilcox
  0 siblings, 1 reply; 9+ messages in thread
From: Johannes Weiner @ 2019-09-24 19:42 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: Andrew Morton, Josef Bacik, linux-mm, linux-kernel

On Tue, Sep 24, 2019 at 10:48:09AM -0700, Matthew Wilcox wrote:
> On Tue, Sep 24, 2019 at 01:15:18PM -0400, Johannes Weiner wrote:
> > +static int fault_dirty_shared_page(struct vm_fault *vmf)
> 
> vm_fault_t, shirley?

Ah yes, I changed it.

> > @@ -2239,16 +2241,36 @@ static void fault_dirty_shared_page(struct vm_area_struct *vma,
> >  	mapping = page_rmapping(page);
> >  	unlock_page(page);
> >  
> > +	if (!page_mkwrite)
> > +		file_update_time(vma->vm_file);
> > +
> > +	/*
> > +	 * Throttle page dirtying rate down to writeback speed.
> > +	 *
> > +	 * mapping may be NULL here because some device drivers do not
> > +	 * set page.mapping but still dirty their pages
> > +	 *
> > +	 * Drop the mmap_sem before waiting on IO, if we can. The file
> > +	 * is pinning the mapping, as per above.
> > +	 */
> >  	if ((dirtied || page_mkwrite) && mapping) {
> > -		/*
> > -		 * Some device drivers do not set page.mapping
> > -		 * but still dirty their pages
> > -		 */
> > +		struct file *fpin = NULL;
> > +
> > +		if ((vmf->flags &
> > +		     (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
> > +		    FAULT_FLAG_ALLOW_RETRY) {
> > +			fpin = get_file(vma->vm_file);
> > +			up_read(&vma->vm_mm->mmap_sem);
> > +			ret = VM_FAULT_RETRY;
> > +		}
> > +
> >  		balance_dirty_pages_ratelimited(mapping);
> > +
> > +		if (fpin)
> > +			fput(fpin);
> >  	}
> >  
> > -	if (!page_mkwrite)
> > -		file_update_time(vma->vm_file);
> > +	return ret;
> >  }
> 
> I'm not a fan of moving file_update_time() to _before_ the
> balance_dirty_pages call.

Can you elaborate why? If the filesystem has a page_mkwrite op, it
will have already called file_update_time() before this function is
entered. If anything, this change makes the sequence more consistent.

> Also, this is now the third place that needs
> maybe_unlock_mmap_for_io, see
> https://lore.kernel.org/linux-mm/20190917120852.x6x3aypwvh573kfa@box/

Good idea, I moved the helper to internal.h and converted to it.

I left the shmem site alone, though. It doesn't require the file
pinning, so it shouldn't pointlessly bump the file refcount and
suggest such a dependency - that could cost somebody later quite a bit
of time trying to understand the code.

> How about:
> 
> +	struct file *fpin = NULL;
> 
>  	if ((dirtied || page_mkwrite) && mapping) {
> 		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
>  		balance_dirty_pages_ratelimited(mapping);
> 	}
> +
> +	if (fpin) {
> +		file_update_time(fpin);

This isn't an equivalent change and could update the time twice if the
fs has a page_mkwrite op.

> +		fput(fpin);
> +		return VM_FAULT_RETRY;
> +	}
> 
>  	if (!page_mkwrite)
>  		file_update_time(vma->vm_file);
> +	return 0;
>  }
> 
> >  /*
> > @@ -2491,6 +2513,7 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
> >  	__releases(vmf->ptl)
> >  {
> >  	struct vm_area_struct *vma = vmf->vma;
> > +	int ret = VM_FAULT_WRITE;
> 
> vm_fault_t again.

Done.

> > @@ -3576,7 +3599,6 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
> >  static vm_fault_t do_fault(struct vm_fault *vmf)
> >  {
> >  	struct vm_area_struct *vma = vmf->vma;
> > -	struct mm_struct *vm_mm = vma->vm_mm;
> >  	vm_fault_t ret;
> >  
> >  	/*
> > @@ -3617,7 +3639,12 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
> >  
> >  	/* preallocated pagetable is unused: free it */
> >  	if (vmf->prealloc_pte) {
> > -		pte_free(vm_mm, vmf->prealloc_pte);
> > +		/*
> > +		 * XXX: Accessing vma->vm_mm now is not safe. The page
> > +		 * fault handler may have dropped the mmap_sem a long
> > +		 * time ago. Only s390 derefs that parameter.
> > +		 */
> > +		pte_free(vma->vm_mm, vmf->prealloc_pte);
> 
> I'm confused.  This code looks like it was safe before (as it was caching
> vma->vm_mm in a local variable), and now you've made it unsafe ... ?

You're right. After looking at it again, this was a total brainfart:
the mm itself is of course safe to access. I removed these two hunks.

Thanks!

Updated patch:

---
From 60c1493bd3b5e0800147165a1dc36b77cf3306e5 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <jweiner@fb.com>
Date: Wed, 8 May 2019 13:53:38 -0700
Subject: [PATCH v2] mm: drop mmap_sem before calling balance_dirty_pages()
 in write fault

One of our services is observing hanging ps/top/etc under heavy write
IO, and the task states show this is an mmap_sem priority inversion:

A write fault is holding the mmap_sem in read-mode and waiting for
(heavily cgroup-limited) IO in balance_dirty_pages():

[<0>] balance_dirty_pages+0x724/0x905
[<0>] balance_dirty_pages_ratelimited+0x254/0x390
[<0>] fault_dirty_shared_page.isra.96+0x4a/0x90
[<0>] do_wp_page+0x33e/0x400
[<0>] __handle_mm_fault+0x6f0/0xfa0
[<0>] handle_mm_fault+0xe4/0x200
[<0>] __do_page_fault+0x22b/0x4a0
[<0>] page_fault+0x45/0x50
[<0>] 0xffffffffffffffff

Somebody tries to change the address space, contending for the
mmap_sem in write-mode:

[<0>] call_rwsem_down_write_failed_killable+0x13/0x20
[<0>] do_mprotect_pkey+0xa8/0x330
[<0>] SyS_mprotect+0xf/0x20
[<0>] do_syscall_64+0x5b/0x100
[<0>] entry_SYSCALL_64_after_hwframe+0x3d/0xa2
[<0>] 0xffffffffffffffff

The waiting writer locks out all subsequent readers to avoid lock
starvation, and several threads can be seen hanging like this:

[<0>] call_rwsem_down_read_failed+0x14/0x30
[<0>] proc_pid_cmdline_read+0xa0/0x480
[<0>] __vfs_read+0x23/0x140
[<0>] vfs_read+0x87/0x130
[<0>] SyS_read+0x42/0x90
[<0>] do_syscall_64+0x5b/0x100
[<0>] entry_SYSCALL_64_after_hwframe+0x3d/0xa2
[<0>] 0xffffffffffffffff

To fix this, do what we do for cache read faults already: drop the
mmap_sem before calling into anything IO bound, in this case the
balance_dirty_pages() function, and return VM_FAULT_RETRY.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 mm/filemap.c  | 21 ---------------------
 mm/internal.h | 21 +++++++++++++++++++++
 mm/memory.c   | 38 +++++++++++++++++++++++++++-----------
 3 files changed, 48 insertions(+), 32 deletions(-)

version 2:
- use maybe_unlock_mmap_for_io(), as per Willy
- remove vma dereference bug, as per Willy
- use vm_fault_t, as per Willy

diff --git a/mm/filemap.c b/mm/filemap.c
index 7f99d53005bb..9072c3fbf837 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2362,27 +2362,6 @@ EXPORT_SYMBOL(generic_file_read_iter);
 
 #ifdef CONFIG_MMU
 #define MMAP_LOTSAMISS  (100)
-static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
-					     struct file *fpin)
-{
-	int flags = vmf->flags;
-
-	if (fpin)
-		return fpin;
-
-	/*
-	 * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
-	 * anything, so we only pin the file and drop the mmap_sem if only
-	 * FAULT_FLAG_ALLOW_RETRY is set.
-	 */
-	if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
-	    FAULT_FLAG_ALLOW_RETRY) {
-		fpin = get_file(vmf->vma->vm_file);
-		up_read(&vmf->vma->vm_mm->mmap_sem);
-	}
-	return fpin;
-}
-
 /*
  * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem
  * @vmf - the vm_fault for this fault.
diff --git a/mm/internal.h b/mm/internal.h
index 0d5f720c75ab..7dd7fbb577a9 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -362,6 +362,27 @@ vma_address(struct page *page, struct vm_area_struct *vma)
 	return max(start, vma->vm_start);
 }
 
+static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
+						    struct file *fpin)
+{
+	int flags = vmf->flags;
+
+	if (fpin)
+		return fpin;
+
+	/*
+	 * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
+	 * anything, so we only pin the file and drop the mmap_sem if only
+	 * FAULT_FLAG_ALLOW_RETRY is set.
+	 */
+	if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
+	    FAULT_FLAG_ALLOW_RETRY) {
+		fpin = get_file(vmf->vma->vm_file);
+		up_read(&vmf->vma->vm_mm->mmap_sem);
+	}
+	return fpin;
+}
+
 #else /* !CONFIG_MMU */
 static inline void clear_page_mlock(struct page *page) { }
 static inline void mlock_vma_page(struct page *page) { }
diff --git a/mm/memory.c b/mm/memory.c
index 2e796372927f..088b6d3b6f88 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2221,10 +2221,11 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
  *
  * The function expects the page to be locked and unlocks it.
  */
-static void fault_dirty_shared_page(struct vm_area_struct *vma,
-				    struct page *page)
+static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct address_space *mapping;
+	struct page *page = vmf->page;
 	bool dirtied;
 	bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
 
@@ -2239,16 +2240,30 @@ static void fault_dirty_shared_page(struct vm_area_struct *vma,
 	mapping = page_rmapping(page);
 	unlock_page(page);
 
+	if (!page_mkwrite)
+		file_update_time(vma->vm_file);
+
+	/*
+	 * Throttle page dirtying rate down to writeback speed.
+	 *
+	 * mapping may be NULL here because some device drivers do not
+	 * set page.mapping but still dirty their pages
+	 *
+	 * Drop the mmap_sem before waiting on IO, if we can. The file
+	 * is pinning the mapping, as per above.
+	 */
 	if ((dirtied || page_mkwrite) && mapping) {
-		/*
-		 * Some device drivers do not set page.mapping
-		 * but still dirty their pages
-		 */
+		struct file *fpin;
+
+		fpin = maybe_unlock_mmap_for_io(vmf, NULL);
 		balance_dirty_pages_ratelimited(mapping);
+		if (fpin) {
+			fput(fpin);
+			return VM_FAULT_RETRY;
+		}
 	}
 
-	if (!page_mkwrite)
-		file_update_time(vma->vm_file);
+	return 0;
 }
 
 /*
@@ -2491,6 +2506,7 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
 	__releases(vmf->ptl)
 {
 	struct vm_area_struct *vma = vmf->vma;
+	vm_fault_t ret = VM_FAULT_WRITE;
 
 	get_page(vmf->page);
 
@@ -2514,10 +2530,10 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
 		wp_page_reuse(vmf);
 		lock_page(vmf->page);
 	}
-	fault_dirty_shared_page(vma, vmf->page);
+	ret |= fault_dirty_shared_page(vmf);
 	put_page(vmf->page);
 
-	return VM_FAULT_WRITE;
+	return ret;
 }
 
 /*
@@ -3561,7 +3577,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
 		return ret;
 	}
 
-	fault_dirty_shared_page(vma, vmf->page);
+	ret |= fault_dirty_shared_page(vmf);
 	return ret;
 }
 
-- 
2.23.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] mm: drop mmap_sem before calling balance_dirty_pages() in write fault
  2019-09-24 19:42   ` Johannes Weiner
@ 2019-09-24 20:46     ` Matthew Wilcox
  2019-09-24 21:43       ` Johannes Weiner
  0 siblings, 1 reply; 9+ messages in thread
From: Matthew Wilcox @ 2019-09-24 20:46 UTC (permalink / raw)
  To: Johannes Weiner; +Cc: Andrew Morton, Josef Bacik, linux-mm, linux-kernel

On Tue, Sep 24, 2019 at 03:42:38PM -0400, Johannes Weiner wrote:
> > I'm not a fan of moving file_update_time() to _before_ the
> > balance_dirty_pages call.
> 
> Can you elaborate why? If the filesystem has a page_mkwrite op, it
> will have already called file_update_time() before this function is
> entered. If anything, this change makes the sequence more consistent.

Oh, that makes sense.  I thought it should be updated after all the data
was written, but it probably doesn't make much difference.

> > Also, this is now the third place that needs
> > maybe_unlock_mmap_for_io, see
> > https://lore.kernel.org/linux-mm/20190917120852.x6x3aypwvh573kfa@box/
> 
> Good idea, I moved the helper to internal.h and converted to it.
> 
> I left the shmem site alone, though. It doesn't require the file
> pinning, so it shouldn't pointlessly bump the file refcount and
> suggest such a dependency - that could cost somebody later quite a bit
> of time trying to understand the code.

The problem for shmem is this:

                        spin_unlock(&inode->i_lock);
                        schedule();

                        spin_lock(&inode->i_lock);
                        finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
                        spin_unlock(&inode->i_lock);

While scheduled, the VMA can go away and the inode be reclaimed, making
this a use-after-free.  The initial suggestion was an increment on
the inode refcount, but since we already have a pattern which involves
pinning the file, I thought that was a better way to go.

> From: Johannes Weiner <jweiner@fb.com>
> Date: Wed, 8 May 2019 13:53:38 -0700
> Subject: [PATCH v2] mm: drop mmap_sem before calling balance_dirty_pages()
>  in write fault
> 
> One of our services is observing hanging ps/top/etc under heavy write
> IO, and the task states show this is an mmap_sem priority inversion:
> 
> A write fault is holding the mmap_sem in read-mode and waiting for
> (heavily cgroup-limited) IO in balance_dirty_pages():
> 
> [<0>] balance_dirty_pages+0x724/0x905
> [<0>] balance_dirty_pages_ratelimited+0x254/0x390
> [<0>] fault_dirty_shared_page.isra.96+0x4a/0x90
> [<0>] do_wp_page+0x33e/0x400
> [<0>] __handle_mm_fault+0x6f0/0xfa0
> [<0>] handle_mm_fault+0xe4/0x200
> [<0>] __do_page_fault+0x22b/0x4a0
> [<0>] page_fault+0x45/0x50
> [<0>] 0xffffffffffffffff
> 
> Somebody tries to change the address space, contending for the
> mmap_sem in write-mode:
> 
> [<0>] call_rwsem_down_write_failed_killable+0x13/0x20
> [<0>] do_mprotect_pkey+0xa8/0x330
> [<0>] SyS_mprotect+0xf/0x20
> [<0>] do_syscall_64+0x5b/0x100
> [<0>] entry_SYSCALL_64_after_hwframe+0x3d/0xa2
> [<0>] 0xffffffffffffffff
> 
> The waiting writer locks out all subsequent readers to avoid lock
> starvation, and several threads can be seen hanging like this:
> 
> [<0>] call_rwsem_down_read_failed+0x14/0x30
> [<0>] proc_pid_cmdline_read+0xa0/0x480
> [<0>] __vfs_read+0x23/0x140
> [<0>] vfs_read+0x87/0x130
> [<0>] SyS_read+0x42/0x90
> [<0>] do_syscall_64+0x5b/0x100
> [<0>] entry_SYSCALL_64_after_hwframe+0x3d/0xa2
> [<0>] 0xffffffffffffffff
> 
> To fix this, do what we do for cache read faults already: drop the
> mmap_sem before calling into anything IO bound, in this case the
> balance_dirty_pages() function, and return VM_FAULT_RETRY.
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>

Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] mm: drop mmap_sem before calling balance_dirty_pages() in write fault
  2019-09-24 20:46     ` Matthew Wilcox
@ 2019-09-24 21:43       ` Johannes Weiner
  2019-09-26 13:49         ` Kirill A. Shutemov
  0 siblings, 1 reply; 9+ messages in thread
From: Johannes Weiner @ 2019-09-24 21:43 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: Andrew Morton, Josef Bacik, linux-mm, linux-kernel

On Tue, Sep 24, 2019 at 01:46:08PM -0700, Matthew Wilcox wrote:
> On Tue, Sep 24, 2019 at 03:42:38PM -0400, Johannes Weiner wrote:
> > > I'm not a fan of moving file_update_time() to _before_ the
> > > balance_dirty_pages call.
> > 
> > Can you elaborate why? If the filesystem has a page_mkwrite op, it
> > will have already called file_update_time() before this function is
> > entered. If anything, this change makes the sequence more consistent.
> 
> Oh, that makes sense.  I thought it should be updated after all the data
> was written, but it probably doesn't make much difference.
> 
> > > Also, this is now the third place that needs
> > > maybe_unlock_mmap_for_io, see
> > > https://lore.kernel.org/linux-mm/20190917120852.x6x3aypwvh573kfa@box/
> > 
> > Good idea, I moved the helper to internal.h and converted to it.
> > 
> > I left the shmem site alone, though. It doesn't require the file
> > pinning, so it shouldn't pointlessly bump the file refcount and
> > suggest such a dependency - that could cost somebody later quite a bit
> > of time trying to understand the code.
> 
> The problem for shmem is this:
> 
>                         spin_unlock(&inode->i_lock);
>                         schedule();
> 
>                         spin_lock(&inode->i_lock);
>                         finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
>                         spin_unlock(&inode->i_lock);
> 
> While scheduled, the VMA can go away and the inode be reclaimed, making
> this a use-after-free.  The initial suggestion was an increment on
> the inode refcount, but since we already have a pattern which involves
> pinning the file, I thought that was a better way to go.

I completely read over the context of that email you linked - that
there is a bug in the existing code - and looked at it as mere
refactoring patch. My apologies.

Switching that shmem site to maybe_unlock_mmap_for_io() to indirectly
pin the inode (in a separate bug fix patch) indeed makes sense to me.

> > Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> 
> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>

Thanks, Matthew.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] mm: drop mmap_sem before calling balance_dirty_pages() in write fault
  2019-09-24 21:43       ` Johannes Weiner
@ 2019-09-26 13:49         ` Kirill A. Shutemov
  2019-09-26 18:50           ` Matthew Wilcox
  2019-09-26 19:26           ` Johannes Weiner
  0 siblings, 2 replies; 9+ messages in thread
From: Kirill A. Shutemov @ 2019-09-26 13:49 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Matthew Wilcox, Andrew Morton, Josef Bacik, linux-mm, linux-kernel

On Tue, Sep 24, 2019 at 05:43:37PM -0400, Johannes Weiner wrote:
> On Tue, Sep 24, 2019 at 01:46:08PM -0700, Matthew Wilcox wrote:
> > On Tue, Sep 24, 2019 at 03:42:38PM -0400, Johannes Weiner wrote:
> > > > I'm not a fan of moving file_update_time() to _before_ the
> > > > balance_dirty_pages call.
> > > 
> > > Can you elaborate why? If the filesystem has a page_mkwrite op, it
> > > will have already called file_update_time() before this function is
> > > entered. If anything, this change makes the sequence more consistent.
> > 
> > Oh, that makes sense.  I thought it should be updated after all the data
> > was written, but it probably doesn't make much difference.
> > 
> > > > Also, this is now the third place that needs
> > > > maybe_unlock_mmap_for_io, see
> > > > https://lore.kernel.org/linux-mm/20190917120852.x6x3aypwvh573kfa@box/
> > > 
> > > Good idea, I moved the helper to internal.h and converted to it.
> > > 
> > > I left the shmem site alone, though. It doesn't require the file
> > > pinning, so it shouldn't pointlessly bump the file refcount and
> > > suggest such a dependency - that could cost somebody later quite a bit
> > > of time trying to understand the code.
> > 
> > The problem for shmem is this:
> > 
> >                         spin_unlock(&inode->i_lock);
> >                         schedule();
> > 
> >                         spin_lock(&inode->i_lock);
> >                         finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
> >                         spin_unlock(&inode->i_lock);
> > 
> > While scheduled, the VMA can go away and the inode be reclaimed, making
> > this a use-after-free.  The initial suggestion was an increment on
> > the inode refcount, but since we already have a pattern which involves
> > pinning the file, I thought that was a better way to go.
> 
> I completely read over the context of that email you linked - that
> there is a bug in the existing code - and looked at it as mere
> refactoring patch. My apologies.
> 
> Switching that shmem site to maybe_unlock_mmap_for_io() to indirectly
> pin the inode (in a separate bug fix patch) indeed makes sense to me.

The patch on top of this one is below. Please post them together if you are
going to resend yours.
> 
> > > Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> > 
> > Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>

Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>

From bdf96fe9e3c1a319e9fd131efbe0118ea41a41b1 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Thu, 26 Sep 2019 16:34:26 +0300
Subject: [PATCH] shmem: Pin the file in shmem_fault() if mmap_sem is dropped

syzbot found the following crash:

 BUG: KASAN: use-after-free in perf_trace_lock_acquire+0x401/0x530
 include/trace/events/lock.h:13
 Read of size 8 at addr ffff8880a5cf2c50 by task syz-executor.0/26173

 CPU: 0 PID: 26173 Comm: syz-executor.0 Not tainted 5.3.0-rc6 #146
 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
 Google 01/01/2011
 Call Trace:
   __dump_stack lib/dump_stack.c:77 [inline]
   dump_stack+0x172/0x1f0 lib/dump_stack.c:113
   print_address_description.cold+0xd4/0x306 mm/kasan/report.c:351
   __kasan_report.cold+0x1b/0x36 mm/kasan/report.c:482
   kasan_report+0x12/0x17 mm/kasan/common.c:618
   __asan_report_load8_noabort+0x14/0x20 mm/kasan/generic_report.c:132
   perf_trace_lock_acquire+0x401/0x530 include/trace/events/lock.h:13
   trace_lock_acquire include/trace/events/lock.h:13 [inline]
   lock_acquire+0x2de/0x410 kernel/locking/lockdep.c:4411
   __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline]
   _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:151
   spin_lock include/linux/spinlock.h:338 [inline]
   shmem_fault+0x5ec/0x7b0 mm/shmem.c:2034
   __do_fault+0x111/0x540 mm/memory.c:3083
   do_shared_fault mm/memory.c:3535 [inline]
   do_fault mm/memory.c:3613 [inline]
   handle_pte_fault mm/memory.c:3840 [inline]
   __handle_mm_fault+0x2adf/0x3f20 mm/memory.c:3964
   handle_mm_fault+0x1b5/0x6b0 mm/memory.c:4001
   do_user_addr_fault arch/x86/mm/fault.c:1441 [inline]
   __do_page_fault+0x536/0xdd0 arch/x86/mm/fault.c:1506
   do_page_fault+0x38/0x590 arch/x86/mm/fault.c:1530
   page_fault+0x39/0x40 arch/x86/entry/entry_64.S:1202

It happens if the VMA got unmapped under us while we dropped mmap_sem
and inode got freed.

Pinning the file if we drop mmap_sem fixes the issue.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: syzbot+03ee87124ee05af991bd@syzkaller.appspotmail.com
Cc: Hillf Danton <hdanton@sina.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
---
 mm/shmem.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 30ce722c23fa..f672e4145cfd 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2022,16 +2022,14 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
 		    shmem_falloc->waitq &&
 		    vmf->pgoff >= shmem_falloc->start &&
 		    vmf->pgoff < shmem_falloc->next) {
+			struct file *fpin = NULL;
 			wait_queue_head_t *shmem_falloc_waitq;
 			DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
 
 			ret = VM_FAULT_NOPAGE;
-			if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
-			   !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
-				/* It's polite to up mmap_sem if we can */
-				up_read(&vma->vm_mm->mmap_sem);
+			fpin = maybe_unlock_mmap_for_io(vmf, fpin);
+			if (fpin)
 				ret = VM_FAULT_RETRY;
-			}
 
 			shmem_falloc_waitq = shmem_falloc->waitq;
 			prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
@@ -2049,6 +2047,9 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
 			spin_lock(&inode->i_lock);
 			finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
 			spin_unlock(&inode->i_lock);
+
+			if (fpin)
+				fput(fpin);
 			return ret;
 		}
 		spin_unlock(&inode->i_lock);
-- 
2.21.0

-- 
 Kirill A. Shutemov

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] mm: drop mmap_sem before calling balance_dirty_pages() in write fault
  2019-09-26 13:49         ` Kirill A. Shutemov
@ 2019-09-26 18:50           ` Matthew Wilcox
  2019-09-26 19:26           ` Johannes Weiner
  1 sibling, 0 replies; 9+ messages in thread
From: Matthew Wilcox @ 2019-09-26 18:50 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Johannes Weiner, Andrew Morton, Josef Bacik, linux-mm, linux-kernel

On Thu, Sep 26, 2019 at 04:49:23PM +0300, Kirill A. Shutemov wrote:
> It happens if the VMA got unmapped under us while we dropped mmap_sem
> and inode got freed.
> 
> Pinning the file if we drop mmap_sem fixes the issue.
> 
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Reported-by: syzbot+03ee87124ee05af991bd@syzkaller.appspotmail.com
> Cc: Hillf Danton <hdanton@sina.com>
> Cc: Matthew Wilcox <willy@infradead.org>
> Cc: Hugh Dickins <hughd@google.com>

Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] mm: drop mmap_sem before calling balance_dirty_pages() in write fault
  2019-09-26 13:49         ` Kirill A. Shutemov
  2019-09-26 18:50           ` Matthew Wilcox
@ 2019-09-26 19:26           ` Johannes Weiner
  2019-09-27  8:39             ` Kirill A. Shutemov
  1 sibling, 1 reply; 9+ messages in thread
From: Johannes Weiner @ 2019-09-26 19:26 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Matthew Wilcox, Andrew Morton, Josef Bacik, linux-mm, linux-kernel

On Thu, Sep 26, 2019 at 04:49:23PM +0300, Kirill A. Shutemov wrote:
> On Tue, Sep 24, 2019 at 05:43:37PM -0400, Johannes Weiner wrote:
> > On Tue, Sep 24, 2019 at 01:46:08PM -0700, Matthew Wilcox wrote:
> > > On Tue, Sep 24, 2019 at 03:42:38PM -0400, Johannes Weiner wrote:
> > > > Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> > > 
> > > Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> 
> Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>

Thanks!

> From bdf96fe9e3c1a319e9fd131efbe0118ea41a41b1 Mon Sep 17 00:00:00 2001
> From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
> Date: Thu, 26 Sep 2019 16:34:26 +0300
> Subject: [PATCH] shmem: Pin the file in shmem_fault() if mmap_sem is dropped
> 
> syzbot found the following crash:
> 
>  BUG: KASAN: use-after-free in perf_trace_lock_acquire+0x401/0x530
>  include/trace/events/lock.h:13
>  Read of size 8 at addr ffff8880a5cf2c50 by task syz-executor.0/26173
> 
>  CPU: 0 PID: 26173 Comm: syz-executor.0 Not tainted 5.3.0-rc6 #146
>  Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
>  Google 01/01/2011
>  Call Trace:
>    __dump_stack lib/dump_stack.c:77 [inline]
>    dump_stack+0x172/0x1f0 lib/dump_stack.c:113
>    print_address_description.cold+0xd4/0x306 mm/kasan/report.c:351
>    __kasan_report.cold+0x1b/0x36 mm/kasan/report.c:482
>    kasan_report+0x12/0x17 mm/kasan/common.c:618
>    __asan_report_load8_noabort+0x14/0x20 mm/kasan/generic_report.c:132
>    perf_trace_lock_acquire+0x401/0x530 include/trace/events/lock.h:13
>    trace_lock_acquire include/trace/events/lock.h:13 [inline]
>    lock_acquire+0x2de/0x410 kernel/locking/lockdep.c:4411
>    __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline]
>    _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:151
>    spin_lock include/linux/spinlock.h:338 [inline]
>    shmem_fault+0x5ec/0x7b0 mm/shmem.c:2034
>    __do_fault+0x111/0x540 mm/memory.c:3083
>    do_shared_fault mm/memory.c:3535 [inline]
>    do_fault mm/memory.c:3613 [inline]
>    handle_pte_fault mm/memory.c:3840 [inline]
>    __handle_mm_fault+0x2adf/0x3f20 mm/memory.c:3964
>    handle_mm_fault+0x1b5/0x6b0 mm/memory.c:4001
>    do_user_addr_fault arch/x86/mm/fault.c:1441 [inline]
>    __do_page_fault+0x536/0xdd0 arch/x86/mm/fault.c:1506
>    do_page_fault+0x38/0x590 arch/x86/mm/fault.c:1530
>    page_fault+0x39/0x40 arch/x86/entry/entry_64.S:1202
> 
> It happens if the VMA got unmapped under us while we dropped mmap_sem
> and inode got freed.
> 
> Pinning the file if we drop mmap_sem fixes the issue.
> 
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Reported-by: syzbot+03ee87124ee05af991bd@syzkaller.appspotmail.com
> Cc: Hillf Danton <hdanton@sina.com>
> Cc: Matthew Wilcox <willy@infradead.org>
> Cc: Hugh Dickins <hughd@google.com>

Acked-by: Johannes Weiner <hannes@cmpxchg.org>

I have just one nitpick:

> @@ -2022,16 +2022,14 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
>  		    shmem_falloc->waitq &&
>  		    vmf->pgoff >= shmem_falloc->start &&
>  		    vmf->pgoff < shmem_falloc->next) {
> +			struct file *fpin = NULL;

That initialization seems unnecessary, as the fpin assignment below is
unconditional in the variable's scope.

The second argument to maybe_unlock_mmap_for_io() for tracking state
when the function is called multiple times in the filemap fault goto
maze, we shouldn't need that here for a simple invocation.

>  			wait_queue_head_t *shmem_falloc_waitq;
>  			DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
>  
>  			ret = VM_FAULT_NOPAGE;
> -			if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
> -			   !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
> -				/* It's polite to up mmap_sem if we can */
> -				up_read(&vma->vm_mm->mmap_sem);
> +			fpin = maybe_unlock_mmap_for_io(vmf, fpin);

I.e. this:

			fpin = maybe_unlock_mmap_for_io(vmf, NULL);

> +			if (fpin)
>  				ret = VM_FAULT_RETRY;
> -			}
>  
>  			shmem_falloc_waitq = shmem_falloc->waitq;
>  			prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
> @@ -2049,6 +2047,9 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
>  			spin_lock(&inode->i_lock);
>  			finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
>  			spin_unlock(&inode->i_lock);
> +
> +			if (fpin)
> +				fput(fpin);
>  			return ret;
>  		}
>  		spin_unlock(&inode->i_lock);

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] mm: drop mmap_sem before calling balance_dirty_pages() in write fault
  2019-09-26 19:26           ` Johannes Weiner
@ 2019-09-27  8:39             ` Kirill A. Shutemov
  0 siblings, 0 replies; 9+ messages in thread
From: Kirill A. Shutemov @ 2019-09-27  8:39 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Matthew Wilcox, Andrew Morton, Josef Bacik, linux-mm, linux-kernel

On Thu, Sep 26, 2019 at 03:26:24PM -0400, Johannes Weiner wrote:
> I have just one nitpick:

Here's addressed one:

From 22a9d58a79a3ebb727d9e909c8f833cd0a751c08 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Thu, 26 Sep 2019 16:34:26 +0300
Subject: [PATCH] shmem: Pin the file in shmem_fault() if mmap_sem is dropped

syzbot found the following crash:

 BUG: KASAN: use-after-free in perf_trace_lock_acquire+0x401/0x530
 include/trace/events/lock.h:13
 Read of size 8 at addr ffff8880a5cf2c50 by task syz-executor.0/26173

 CPU: 0 PID: 26173 Comm: syz-executor.0 Not tainted 5.3.0-rc6 #146
 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
 Google 01/01/2011
 Call Trace:
   __dump_stack lib/dump_stack.c:77 [inline]
   dump_stack+0x172/0x1f0 lib/dump_stack.c:113
   print_address_description.cold+0xd4/0x306 mm/kasan/report.c:351
   __kasan_report.cold+0x1b/0x36 mm/kasan/report.c:482
   kasan_report+0x12/0x17 mm/kasan/common.c:618
   __asan_report_load8_noabort+0x14/0x20 mm/kasan/generic_report.c:132
   perf_trace_lock_acquire+0x401/0x530 include/trace/events/lock.h:13
   trace_lock_acquire include/trace/events/lock.h:13 [inline]
   lock_acquire+0x2de/0x410 kernel/locking/lockdep.c:4411
   __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline]
   _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:151
   spin_lock include/linux/spinlock.h:338 [inline]
   shmem_fault+0x5ec/0x7b0 mm/shmem.c:2034
   __do_fault+0x111/0x540 mm/memory.c:3083
   do_shared_fault mm/memory.c:3535 [inline]
   do_fault mm/memory.c:3613 [inline]
   handle_pte_fault mm/memory.c:3840 [inline]
   __handle_mm_fault+0x2adf/0x3f20 mm/memory.c:3964
   handle_mm_fault+0x1b5/0x6b0 mm/memory.c:4001
   do_user_addr_fault arch/x86/mm/fault.c:1441 [inline]
   __do_page_fault+0x536/0xdd0 arch/x86/mm/fault.c:1506
   do_page_fault+0x38/0x590 arch/x86/mm/fault.c:1530
   page_fault+0x39/0x40 arch/x86/entry/entry_64.S:1202

It happens if the VMA got unmapped under us while we dropped mmap_sem
and inode got freed.

Pinning the file if we drop mmap_sem fixes the issue.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: syzbot+03ee87124ee05af991bd@syzkaller.appspotmail.com
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Hugh Dickins <hughd@google.com>
---
 mm/shmem.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 30ce722c23fa..0601ad615ccb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2022,16 +2022,14 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
 		    shmem_falloc->waitq &&
 		    vmf->pgoff >= shmem_falloc->start &&
 		    vmf->pgoff < shmem_falloc->next) {
+			struct file *fpin;
 			wait_queue_head_t *shmem_falloc_waitq;
 			DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
 
 			ret = VM_FAULT_NOPAGE;
-			if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
-			   !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
-				/* It's polite to up mmap_sem if we can */
-				up_read(&vma->vm_mm->mmap_sem);
+			fpin = maybe_unlock_mmap_for_io(vmf, NULL);
+			if (fpin)
 				ret = VM_FAULT_RETRY;
-			}
 
 			shmem_falloc_waitq = shmem_falloc->waitq;
 			prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
@@ -2049,6 +2047,9 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
 			spin_lock(&inode->i_lock);
 			finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
 			spin_unlock(&inode->i_lock);
+
+			if (fpin)
+				fput(fpin);
 			return ret;
 		}
 		spin_unlock(&inode->i_lock);
-- 
 Kirill A. Shutemov

^ permalink raw reply related	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2019-09-27  8:39 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-09-24 17:15 [PATCH] mm: drop mmap_sem before calling balance_dirty_pages() in write fault Johannes Weiner
2019-09-24 17:48 ` Matthew Wilcox
2019-09-24 19:42   ` Johannes Weiner
2019-09-24 20:46     ` Matthew Wilcox
2019-09-24 21:43       ` Johannes Weiner
2019-09-26 13:49         ` Kirill A. Shutemov
2019-09-26 18:50           ` Matthew Wilcox
2019-09-26 19:26           ` Johannes Weiner
2019-09-27  8:39             ` Kirill A. Shutemov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).