linux-kernel.vger.kernel.org archive mirror
* [PATCH] mm: Downgrade mmap_sem before locking or populating on mmap
@ 2012-12-14  5:49 Andy Lutomirski
  2012-12-14  7:27 ` Al Viro
  2012-12-15  2:17 ` [PATCH v2] " Andy Lutomirski
  0 siblings, 2 replies; 22+ messages in thread
From: Andy Lutomirski @ 2012-12-14  5:49 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, Andrew Morton, Al Viro, Ingo Molnar,
	Michel Lespinasse, Hugh Dickins, Jörn Engel,
	Andy Lutomirski

This is a serious cause of mmap_sem contention.  MAP_POPULATE
and MCL_FUTURE, in particular, are disastrous in multithreaded programs.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
---

Sensible people use anonymous mappings.  I write kernel patches :)

I'm not entirely thrilled by the aesthetics of this patch.  The MAP_POPULATE case
could also be improved by doing it without any lock at all.  This is still a big
improvement, though.

 arch/tile/mm/elf.c |   9 ++--
 fs/aio.c           |   8 ++--
 include/linux/mm.h |   8 ++--
 ipc/shm.c          |   6 ++-
 mm/fremap.c        |  10 ++--
 mm/mmap.c          | 133 +++++++++++++++++++++++++++++++++++++----------------
 mm/util.c          |   3 +-
 7 files changed, 117 insertions(+), 60 deletions(-)

diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c
index 3cfa98b..a0441f2 100644
--- a/arch/tile/mm/elf.c
+++ b/arch/tile/mm/elf.c
@@ -129,12 +129,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm,
 	 */
 	if (!retval) {
 		unsigned long addr = MEM_USER_INTRPT;
-		addr = mmap_region(NULL, addr, INTRPT_SIZE,
-				   MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE,
-				   VM_READ|VM_EXEC|
-				   VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0);
+		addr = mmap_region_unlock(NULL, addr, INTRPT_SIZE,
+					  MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE,
+					  VM_READ|VM_EXEC|
+					  VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0);
 		if (addr > (unsigned long) -PAGE_SIZE)
 			retval = (int) addr;
+		return retval;  /* We already unlocked mmap_sem. */
 	}
 #endif
 
diff --git a/fs/aio.c b/fs/aio.c
index 71f613c..8e2b162 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -127,11 +127,10 @@ static int aio_setup_ring(struct kioctx *ctx)
 	info->mmap_size = nr_pages * PAGE_SIZE;
 	dprintk("attempting mmap of %lu bytes\n", info->mmap_size);
 	down_write(&ctx->mm->mmap_sem);
-	info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, 
-					PROT_READ|PROT_WRITE,
-					MAP_ANONYMOUS|MAP_PRIVATE, 0);
+	info->mmap_base = do_mmap_pgoff_unlock(NULL, 0, info->mmap_size,
+					       PROT_READ|PROT_WRITE,
+					       MAP_ANONYMOUS|MAP_PRIVATE, 0);
 	if (IS_ERR((void *)info->mmap_base)) {
-		up_write(&ctx->mm->mmap_sem);
 		info->mmap_size = 0;
 		aio_free_ring(ctx);
 		return -EAGAIN;
@@ -141,7 +140,6 @@ static int aio_setup_ring(struct kioctx *ctx)
 	info->nr_pages = get_user_pages(current, ctx->mm,
 					info->mmap_base, nr_pages, 
 					1, 0, info->ring_pages, NULL);
-	up_write(&ctx->mm->mmap_sem);
 
 	if (unlikely(info->nr_pages != nr_pages)) {
 		aio_free_ring(ctx);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bcaab4e..bb13d11 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1441,12 +1441,12 @@ extern int install_special_mapping(struct mm_struct *mm,
 
 extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
 
-extern unsigned long mmap_region(struct file *file, unsigned long addr,
+extern unsigned long mmap_region_unlock(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long flags,
 	vm_flags_t vm_flags, unsigned long pgoff);
-extern unsigned long do_mmap_pgoff(struct file *, unsigned long,
-        unsigned long, unsigned long,
-        unsigned long, unsigned long);
+extern unsigned long do_mmap_pgoff_unlock(struct file *, unsigned long addr,
+	unsigned long len, unsigned long prot,
+	unsigned long flags, unsigned long pgoff);
 extern int do_munmap(struct mm_struct *, unsigned long, size_t);
 
 /* These take the mm semaphore themselves */
diff --git a/ipc/shm.c b/ipc/shm.c
index dff40c9..d0001c8 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -1068,12 +1068,14 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
 		    addr > current->mm->start_stack - size - PAGE_SIZE * 5)
 			goto invalid;
 	}
-		
-	user_addr = do_mmap_pgoff(file, addr, size, prot, flags, 0);
+
+	user_addr = do_mmap_pgoff_unlock(file, addr, size, prot, flags, 0);
 	*raddr = user_addr;
 	err = 0;
 	if (IS_ERR_VALUE(user_addr))
 		err = (long)user_addr;
+	goto out_fput;
+
 invalid:
 	up_write(&current->mm->mmap_sem);
 
diff --git a/mm/fremap.c b/mm/fremap.c
index a0aaf0e..232402c 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -200,8 +200,8 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 			struct file *file = get_file(vma->vm_file);
 
 			flags &= MAP_NONBLOCK;
-			addr = mmap_region(file, start, size,
-					flags, vma->vm_flags, pgoff);
+			addr = mmap_region_unlock(file, start, size,
+						  flags, vma->vm_flags, pgoff);
 			fput(file);
 			if (IS_ERR_VALUE(addr)) {
 				err = addr;
@@ -209,7 +209,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 				BUG_ON(addr != start);
 				err = 0;
 			}
-			goto out;
+			return err;  /* We just unlocked. */
 		}
 		mutex_lock(&mapping->i_mmap_mutex);
 		flush_dcache_mmap_lock(mapping);
@@ -237,6 +237,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 			/*
 			 * might be mapping previously unmapped range of file
 			 */
+			if (unlikely(has_write_lock)) {
+				downgrade_write(&mm->mmap_sem);
+				has_write_lock = 0;
+			}
 			mlock_vma_pages_range(vma, start, start + size);
 		} else {
 			if (unlikely(has_write_lock)) {
diff --git a/mm/mmap.c b/mm/mmap.c
index 9a796c4..d275e05 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -999,7 +999,7 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
  * The caller must hold down_write(&current->mm->mmap_sem).
  */
 
-unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
+unsigned long do_mmap_pgoff_unlock(struct file *file, unsigned long addr,
 			unsigned long len, unsigned long prot,
 			unsigned long flags, unsigned long pgoff)
 {
@@ -1017,31 +1017,39 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 		if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
 			prot |= PROT_EXEC;
 
-	if (!len)
-		return -EINVAL;
+	if (!len) {
+		addr = -EINVAL;
+		goto out_unlock;
+	}
 
 	if (!(flags & MAP_FIXED))
 		addr = round_hint_to_min(addr);
 
 	/* Careful about overflows.. */
 	len = PAGE_ALIGN(len);
-	if (!len)
-		return -ENOMEM;
+	if (!len) {
+		addr = -ENOMEM;
+		goto out_unlock;
+	}
 
 	/* offset overflow? */
-	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
-               return -EOVERFLOW;
+	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) {
+		addr = -EOVERFLOW;
+		goto out_unlock;
+	}
 
 	/* Too many mappings? */
-	if (mm->map_count > sysctl_max_map_count)
-		return -ENOMEM;
+	if (mm->map_count > sysctl_max_map_count) {
+		addr = -ENOMEM;
+		goto out_unlock;
+	}
 
 	/* Obtain the address to map to. we verify (or select) it and ensure
 	 * that it represents a valid section of the address space.
 	 */
 	addr = get_unmapped_area(file, addr, len, pgoff, flags);
 	if (addr & ~PAGE_MASK)
-		return addr;
+		goto out_unlock;
 
 	/* Do simple checking here so the lower-level routines won't have
 	 * to. we assume access permissions have been handled by the open
@@ -1050,9 +1058,12 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
 			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 
-	if (flags & MAP_LOCKED)
-		if (!can_do_mlock())
-			return -EPERM;
+	if (flags & MAP_LOCKED) {
+		if (!can_do_mlock()) {
+			addr = -EPERM;
+			goto out_unlock;
+		}
+	}
 
 	/* mlock MCL_FUTURE? */
 	if (vm_flags & VM_LOCKED) {
@@ -1061,8 +1072,10 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 		locked += mm->locked_vm;
 		lock_limit = rlimit(RLIMIT_MEMLOCK);
 		lock_limit >>= PAGE_SHIFT;
-		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-			return -EAGAIN;
+		if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+			addr = -EAGAIN;
+			goto out_unlock;
+		}
 	}
 
 	inode = file ? file->f_path.dentry->d_inode : NULL;
@@ -1070,21 +1083,27 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	if (file) {
 		switch (flags & MAP_TYPE) {
 		case MAP_SHARED:
-			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
-				return -EACCES;
+			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) {
+				addr = -EACCES;
+				goto out_unlock;
+			}
 
 			/*
 			 * Make sure we don't allow writing to an append-only
 			 * file..
 			 */
-			if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
-				return -EACCES;
+			if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) {
+				addr = -EACCES;
+				goto out_unlock;
+			}
 
 			/*
 			 * Make sure there are no mandatory locks on the file.
 			 */
-			if (locks_verify_locked(inode))
-				return -EAGAIN;
+			if (locks_verify_locked(inode)) {
+				addr = -EAGAIN;
+				goto out_unlock;
+			}
 
 			vm_flags |= VM_SHARED | VM_MAYSHARE;
 			if (!(file->f_mode & FMODE_WRITE))
@@ -1092,20 +1111,27 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 
 			/* fall through */
 		case MAP_PRIVATE:
-			if (!(file->f_mode & FMODE_READ))
-				return -EACCES;
+			if (!(file->f_mode & FMODE_READ)) {
+				addr = -EACCES;
+				goto out_unlock;
+			}
 			if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
-				if (vm_flags & VM_EXEC)
-					return -EPERM;
+				if (vm_flags & VM_EXEC) {
+					addr = -EPERM;
+					goto out_unlock;
+				}
 				vm_flags &= ~VM_MAYEXEC;
 			}
 
-			if (!file->f_op || !file->f_op->mmap)
-				return -ENODEV;
+			if (!file->f_op || !file->f_op->mmap) {
+				addr = -ENODEV;
+				goto out_unlock;
+			}
 			break;
 
 		default:
-			return -EINVAL;
+			addr = -EINVAL;
+			goto out_unlock;
 		}
 	} else {
 		switch (flags & MAP_TYPE) {
@@ -1123,11 +1149,16 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 			pgoff = addr >> PAGE_SHIFT;
 			break;
 		default:
-			return -EINVAL;
+			addr = -EINVAL;
+			goto out_unlock;
 		}
 	}
 
-	return mmap_region(file, addr, len, flags, vm_flags, pgoff);
+	return mmap_region_unlock(file, addr, len, flags, vm_flags, pgoff);
+
+out_unlock:
+	up_write(&mm->mmap_sem);
+	return addr;
 }
 
 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
@@ -1240,9 +1271,9 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
 	return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
 }
 
-unsigned long mmap_region(struct file *file, unsigned long addr,
-			  unsigned long len, unsigned long flags,
-			  vm_flags_t vm_flags, unsigned long pgoff)
+unsigned long mmap_region_unlock(struct file *file, unsigned long addr,
+				 unsigned long len, unsigned long flags,
+				 vm_flags_t vm_flags, unsigned long pgoff)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev;
@@ -1251,19 +1282,24 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 	struct rb_node **rb_link, *rb_parent;
 	unsigned long charged = 0;
 	struct inode *inode =  file ? file->f_path.dentry->d_inode : NULL;
+	bool downgraded = false;
 
 	/* Clear old maps */
 	error = -ENOMEM;
 munmap_back:
 	if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
-		if (do_munmap(mm, addr, len))
-			return -ENOMEM;
+		if (do_munmap(mm, addr, len)) {
+			error = -ENOMEM;
+			goto unacct_error;
+		}
 		goto munmap_back;
 	}
 
 	/* Check against address space limit. */
-	if (!may_expand_vm(mm, len >> PAGE_SHIFT))
-		return -ENOMEM;
+	if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
+		error = -ENOMEM;
+		goto unacct_error;
+	}
 
 	/*
 	 * Set 'VM_NORESERVE' if we should not account for the
@@ -1284,8 +1320,10 @@ munmap_back:
 	 */
 	if (accountable_mapping(file, vm_flags)) {
 		charged = len >> PAGE_SHIFT;
-		if (security_vm_enough_memory_mm(mm, charged))
-			return -ENOMEM;
+		if (security_vm_enough_memory_mm(mm, charged)) {
+			error = -ENOMEM;
+			goto unacct_error;
+		}
 		vm_flags |= VM_ACCOUNT;
 	}
 
@@ -1372,15 +1410,27 @@ out:
 	perf_event_mmap(vma);
 
 	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
+
+	downgraded = false;
 	if (vm_flags & VM_LOCKED) {
+		downgrade_write(&mm->mmap_sem);
+		downgraded = true;
 		if (!mlock_vma_pages_range(vma, addr, addr + len))
 			mm->locked_vm += (len >> PAGE_SHIFT);
-	} else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
+	} else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) {
+		downgrade_write(&mm->mmap_sem);
+		downgraded = true;
 		make_pages_present(addr, addr + len);
+	}
 
 	if (file)
 		uprobe_mmap(vma);
 
+	if (downgraded)
+		up_read(&mm->mmap_sem);
+	else
+		up_write(&mm->mmap_sem);
+
 	return addr;
 
 unmap_and_free_vma:
@@ -1397,6 +1447,9 @@ free_vma:
 unacct_error:
 	if (charged)
 		vm_unacct_memory(charged);
+
+	up_write(&mm->mmap_sem);
+
 	return error;
 }
 
diff --git a/mm/util.c b/mm/util.c
index dc3036c..fd54884 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -359,8 +359,7 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 	ret = security_mmap_file(file, prot, flag);
 	if (!ret) {
 		down_write(&mm->mmap_sem);
-		ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff);
-		up_write(&mm->mmap_sem);
+		ret = do_mmap_pgoff_unlock(file, addr, len, prot, flag, pgoff);
 	}
 	return ret;
 }
-- 
1.7.11.7



* Re: [PATCH] mm: Downgrade mmap_sem before locking or populating on mmap
  2012-12-14  5:49 [PATCH] mm: Downgrade mmap_sem before locking or populating on mmap Andy Lutomirski
@ 2012-12-14  7:27 ` Al Viro
  2012-12-14 11:14   ` Andy Lutomirski
  2012-12-15  2:17 ` [PATCH v2] " Andy Lutomirski
  1 sibling, 1 reply; 22+ messages in thread
From: Al Viro @ 2012-12-14  7:27 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-mm, linux-kernel, Andrew Morton, Ingo Molnar,
	Michel Lespinasse, Hugh Dickins, Jörn Engel

On Thu, Dec 13, 2012 at 09:49:43PM -0800, Andy Lutomirski wrote:
> This is a serious cause of mmap_sem contention.  MAP_POPULATE
> and MCL_FUTURE, in particular, are disastrous in multithreaded programs.
> 
> Signed-off-by: Andy Lutomirski <luto@amacapital.net>
> ---
> 
> Sensible people use anonymous mappings.  I write kernel patches :)
> 
> I'm not entirely thrilled by the aesthetics of this patch.  The MAP_POPULATE case
> could also be improved by doing it without any lock at all.  This is still a big
> improvement, though.

Wait a minute.  get_user_pages() relies on ->mmap_sem being held.  Unless
I'm seriously misreading your patch it removes that protection.  And yes,
I'm aware of execve-related exception; it's in special circumstances -
bprm->mm is guaranteed to be not shared (and we need to rearchitect that
area anyway, but that's a separate story).
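
For reference, the locking convention in question is the usual pattern around
get_user_pages() -- a minimal sketch, using the 3.7-era signature that appears
elsewhere in this thread; the variable names are illustrative only:

	down_read(&mm->mmap_sem);
	ret = get_user_pages(current, mm, start, nr_pages,
			     1 /* write */, 0 /* force */, pages, NULL);
	up_read(&mm->mmap_sem);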


* Re: [PATCH] mm: Downgrade mmap_sem before locking or populating on mmap
  2012-12-14  7:27 ` Al Viro
@ 2012-12-14 11:14   ` Andy Lutomirski
  2012-12-14 14:49     ` Al Viro
  0 siblings, 1 reply; 22+ messages in thread
From: Andy Lutomirski @ 2012-12-14 11:14 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-mm, linux-kernel, Andrew Morton, Ingo Molnar,
	Michel Lespinasse, Hugh Dickins, Jörn Engel

On Thu, Dec 13, 2012 at 11:27 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Thu, Dec 13, 2012 at 09:49:43PM -0800, Andy Lutomirski wrote:
>> This is a serious cause of mmap_sem contention.  MAP_POPULATE
>> and MCL_FUTURE, in particular, are disastrous in multithreaded programs.
>>
>> Signed-off-by: Andy Lutomirski <luto@amacapital.net>
>> ---
>>
>> Sensible people use anonymous mappings.  I write kernel patches :)
>>
>> I'm not entirely thrilled by the aesthetics of this patch.  The MAP_POPULATE case
>> could also be improved by doing it without any lock at all.  This is still a big
>> improvement, though.
>
> Wait a minute.  get_user_pages() relies on ->mmap_sem being held.  Unless
> I'm seriously misreading your patch it removes that protection.  And yes,
> I'm aware of execve-related exception; it's in special circumstances -
> bprm->mm is guaranteed to be not shared (and we need to rearchitect that
> area anyway, but that's a separate story).

Unless I completely screwed up the patch, ->mmap_sem is still held for
read (it's downgraded from write).  It's just not held for write
anymore.

--Andy


* Re: [PATCH] mm: Downgrade mmap_sem before locking or populating on mmap
  2012-12-14 11:14   ` Andy Lutomirski
@ 2012-12-14 14:49     ` Al Viro
  2012-12-14 16:12       ` Andy Lutomirski
  0 siblings, 1 reply; 22+ messages in thread
From: Al Viro @ 2012-12-14 14:49 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-mm, linux-kernel, Andrew Morton, Ingo Molnar,
	Michel Lespinasse, Hugh Dickins, Jörn Engel

On Fri, Dec 14, 2012 at 03:14:50AM -0800, Andy Lutomirski wrote:

> > Wait a minute.  get_user_pages() relies on ->mmap_sem being held.  Unless
> > I'm seriously misreading your patch it removes that protection.  And yes,
> > I'm aware of execve-related exception; it's in special circumstances -
> > bprm->mm is guaranteed to be not shared (and we need to rearchitect that
> > area anyway, but that's a separate story).
> 
> Unless I completely screwed up the patch, ->mmap_sem is still held for
> read (it's downgraded from write).  It's just not held for write
> anymore.

Huh?  I'm talking about the call of get_user_pages() in aio_setup_ring().
With your patch it's done completely outside of ->mmap_sem, isn't it?


* Re: [PATCH] mm: Downgrade mmap_sem before locking or populating on mmap
  2012-12-14 14:49     ` Al Viro
@ 2012-12-14 16:12       ` Andy Lutomirski
  2012-12-16  8:41         ` Ingo Molnar
  2012-12-16 17:04         ` Al Viro
  0 siblings, 2 replies; 22+ messages in thread
From: Andy Lutomirski @ 2012-12-14 16:12 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-mm, linux-kernel, Andrew Morton, Ingo Molnar,
	Michel Lespinasse, Hugh Dickins, Jörn Engel

On Fri, Dec 14, 2012 at 6:49 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Fri, Dec 14, 2012 at 03:14:50AM -0800, Andy Lutomirski wrote:
>
>> > Wait a minute.  get_user_pages() relies on ->mmap_sem being held.  Unless
>> > I'm seriously misreading your patch it removes that protection.  And yes,
>> > I'm aware of execve-related exception; it's in special circumstances -
>> > bprm->mm is guaranteed to be not shared (and we need to rearchitect that
>> > area anyway, but that's a separate story).
>>
>> Unless I completely screwed up the patch, ->mmap_sem is still held for
>> read (it's downgraded from write).  It's just not held for write
>> anymore.
>
> Huh?  I'm talking about the call of get_user_pages() in aio_setup_ring().
> With your patch it's done completely outside of ->mmap_sem, isn't it?

Oh, /that/ call to get_user_pages.  That would qualify as screwing up...

Since dropping and reacquiring mmap_sem there is probably a bad idea
there, I'll rework this and post a v2.

--Andy


* [PATCH v2] mm: Downgrade mmap_sem before locking or populating on mmap
  2012-12-14  5:49 [PATCH] mm: Downgrade mmap_sem before locking or populating on mmap Andy Lutomirski
  2012-12-14  7:27 ` Al Viro
@ 2012-12-15  2:17 ` Andy Lutomirski
  2012-12-16  9:00   ` Ingo Molnar
                     ` (2 more replies)
  1 sibling, 3 replies; 22+ messages in thread
From: Andy Lutomirski @ 2012-12-15  2:17 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, Andrew Morton, Al Viro, Ingo Molnar,
	Michel Lespinasse, Hugh Dickins, Jörn Engel,
	Andy Lutomirski

This is a serious cause of mmap_sem contention.  MAP_POPULATE
and MCL_FUTURE, in particular, are disastrous in multithreaded programs.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
---

Changes from v1:

The non-unlocking versions of do_mmap_pgoff and mmap_region are still
available for aio_setup_ring's benefit.  In theory, aio_setup_ring
would do better with a lock-downgrading version, but that would be
somewhat ugly and doesn't help my workload.

 arch/tile/mm/elf.c |  9 +++---
 fs/aio.c           |  4 +++
 include/linux/mm.h | 19 ++++++++++--
 ipc/shm.c          |  6 ++--
 mm/fremap.c        | 10 ++++--
 mm/mmap.c          | 89 ++++++++++++++++++++++++++++++++++++++++++++++++------
 mm/util.c          |  3 +-
 7 files changed, 117 insertions(+), 23 deletions(-)

diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c
index 3cfa98b..a0441f2 100644
--- a/arch/tile/mm/elf.c
+++ b/arch/tile/mm/elf.c
@@ -129,12 +129,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm,
 	 */
 	if (!retval) {
 		unsigned long addr = MEM_USER_INTRPT;
-		addr = mmap_region(NULL, addr, INTRPT_SIZE,
-				   MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE,
-				   VM_READ|VM_EXEC|
-				   VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0);
+		addr = mmap_region_unlock(NULL, addr, INTRPT_SIZE,
+					  MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE,
+					  VM_READ|VM_EXEC|
+					  VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0);
 		if (addr > (unsigned long) -PAGE_SIZE)
 			retval = (int) addr;
+		return retval;  /* We already unlocked mmap_sem. */
 	}
 #endif
 
diff --git a/fs/aio.c b/fs/aio.c
index 71f613c..253396c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -127,6 +127,10 @@ static int aio_setup_ring(struct kioctx *ctx)
 	info->mmap_size = nr_pages * PAGE_SIZE;
 	dprintk("attempting mmap of %lu bytes\n", info->mmap_size);
 	down_write(&ctx->mm->mmap_sem);
+	/*
+	 * XXX: If MCL_FUTURE is set, this will hold mmap_sem for write for
+	 *      longer than necessary.
+	 */
 	info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, 
 					PROT_READ|PROT_WRITE,
 					MAP_ANONYMOUS|MAP_PRIVATE, 0);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bcaab4e..139f636 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1441,14 +1441,27 @@ extern int install_special_mapping(struct mm_struct *mm,
 
 extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
 
+/* These must be called with mmap_sem held for write. */
 extern unsigned long mmap_region(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long flags,
 	vm_flags_t vm_flags, unsigned long pgoff);
-extern unsigned long do_mmap_pgoff(struct file *, unsigned long,
-        unsigned long, unsigned long,
-        unsigned long, unsigned long);
+extern unsigned long do_mmap_pgoff(struct file *, unsigned long addr,
+	unsigned long len, unsigned long prot,
+	unsigned long flags, unsigned long pgoff);
 extern int do_munmap(struct mm_struct *, unsigned long, size_t);
 
+/*
+ * These must be called with mmap_sem held for write, and they will release
+ * mmap_sem before they return.  They hold mmap_sem for a shorter time than
+ * the non-unlocking variants.
+ */
+extern unsigned long mmap_region_unlock(struct file *file, unsigned long addr,
+	unsigned long len, unsigned long flags,
+	vm_flags_t vm_flags, unsigned long pgoff);
+extern unsigned long do_mmap_pgoff_unlock(struct file *, unsigned long addr,
+	unsigned long len, unsigned long prot,
+	unsigned long flags, unsigned long pgoff);
+
 /* These take the mm semaphore themselves */
 extern unsigned long vm_brk(unsigned long, unsigned long);
 extern int vm_munmap(unsigned long, size_t);
diff --git a/ipc/shm.c b/ipc/shm.c
index dff40c9..d0001c8 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -1068,12 +1068,14 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
 		    addr > current->mm->start_stack - size - PAGE_SIZE * 5)
 			goto invalid;
 	}
-		
-	user_addr = do_mmap_pgoff(file, addr, size, prot, flags, 0);
+
+	user_addr = do_mmap_pgoff_unlock(file, addr, size, prot, flags, 0);
 	*raddr = user_addr;
 	err = 0;
 	if (IS_ERR_VALUE(user_addr))
 		err = (long)user_addr;
+	goto out_fput;
+
 invalid:
 	up_write(&current->mm->mmap_sem);
 
diff --git a/mm/fremap.c b/mm/fremap.c
index a0aaf0e..7ebe0a4 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -200,8 +200,8 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 			struct file *file = get_file(vma->vm_file);
 
 			flags &= MAP_NONBLOCK;
-			addr = mmap_region(file, start, size,
-					flags, vma->vm_flags, pgoff);
+			addr = mmap_region_unlock(file, start, size,
+						  flags, vma->vm_flags, pgoff);
 			fput(file);
 			if (IS_ERR_VALUE(addr)) {
 				err = addr;
@@ -209,7 +209,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 				BUG_ON(addr != start);
 				err = 0;
 			}
-			goto out;
+			return err;  /* We already unlocked. */
 		}
 		mutex_lock(&mapping->i_mmap_mutex);
 		flush_dcache_mmap_lock(mapping);
@@ -237,6 +237,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 			/*
 			 * might be mapping previously unmapped range of file
 			 */
+			if (unlikely(has_write_lock)) {
+				downgrade_write(&mm->mmap_sem);
+				has_write_lock = 0;
+			}
 			mlock_vma_pages_range(vma, start, start + size);
 		} else {
 			if (unlikely(has_write_lock)) {
diff --git a/mm/mmap.c b/mm/mmap.c
index 9a796c4..2dd6b2f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -995,13 +995,22 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
 	return hint;
 }
 
+static unsigned long mmap_region_helper(struct file *file, unsigned long addr,
+					unsigned long len, unsigned long flags,
+					vm_flags_t vm_flags,
+                                        unsigned long pgoff, int *downgraded);
+
 /*
  * The caller must hold down_write(&current->mm->mmap_sem).
+ *
+ * If downgraded is non-null, do_mmap_pgoff_helper may downgrade mmap_sem
+ * and write 1 to *downgraded.
  */
-
-unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
-			unsigned long len, unsigned long prot,
-			unsigned long flags, unsigned long pgoff)
+static unsigned long do_mmap_pgoff_helper(
+	struct file *file, unsigned long addr,
+	unsigned long len, unsigned long prot,
+	unsigned long flags, unsigned long pgoff,
+	int *downgraded)
 {
 	struct mm_struct * mm = current->mm;
 	struct inode *inode;
@@ -1127,7 +1136,32 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 		}
 	}
 
-	return mmap_region(file, addr, len, flags, vm_flags, pgoff);
+	return mmap_region_helper(file, addr, len, flags, vm_flags, pgoff,
+				  downgraded);
+}
+
+
+unsigned long do_mmap_pgoff_unlock(struct file *file, unsigned long addr,
+				   unsigned long len, unsigned long prot,
+				   unsigned long flags, unsigned long pgoff)
+{
+	int downgraded = 0;
+	unsigned long ret = do_mmap_pgoff_helper(file, addr, len,
+		prot, flags, pgoff, &downgraded);
+
+	if (downgraded)
+		up_read(&current->mm->mmap_sem);
+	else
+		up_write(&current->mm->mmap_sem);
+
+	return ret;
+}
+
+unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
+			    unsigned long len, unsigned long prot,
+			    unsigned long flags, unsigned long pgoff)
+{
+	return do_mmap_pgoff_helper(file, addr, len, prot, flags, pgoff, 0);
 }
 
 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
@@ -1240,9 +1274,14 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
 	return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
 }
 
-unsigned long mmap_region(struct file *file, unsigned long addr,
-			  unsigned long len, unsigned long flags,
-			  vm_flags_t vm_flags, unsigned long pgoff)
+/*
+ * If downgraded is null, then mmap_sem won't be touched.  Otherwise it
+ * may be downgraded, in which case *downgraded will be set to 1.
+ */
+static unsigned long mmap_region_helper(struct file *file, unsigned long addr,
+					unsigned long len, unsigned long flags,
+					vm_flags_t vm_flags,
+					unsigned long pgoff, int *downgraded)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev;
@@ -1373,10 +1412,19 @@ out:
 
 	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
 	if (vm_flags & VM_LOCKED) {
+		if (downgraded) {
+			downgrade_write(&mm->mmap_sem);
+			*downgraded = 1;
+		}
 		if (!mlock_vma_pages_range(vma, addr, addr + len))
 			mm->locked_vm += (len >> PAGE_SHIFT);
-	} else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
+	} else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) {
+		if (downgraded) {
+			downgrade_write(&mm->mmap_sem);
+			*downgraded = 1;
+		}
 		make_pages_present(addr, addr + len);
+	}
 
 	if (file)
 		uprobe_mmap(vma);
@@ -1400,6 +1448,29 @@ unacct_error:
 	return error;
 }
 
+unsigned long mmap_region(struct file *file, unsigned long addr,
+			  unsigned long len, unsigned long flags,
+			  vm_flags_t vm_flags, unsigned long pgoff)
+{
+	return mmap_region_helper(file, addr, len, flags, vm_flags, pgoff, 0);
+}
+
+unsigned long mmap_region_unlock(struct file *file, unsigned long addr,
+				 unsigned long len, unsigned long flags,
+				 vm_flags_t vm_flags, unsigned long pgoff)
+{
+	int downgraded = 0;
+	unsigned long ret = mmap_region_helper(file, addr, len,
+		flags, vm_flags, pgoff, &downgraded);
+
+	if (downgraded)
+		up_read(&current->mm->mmap_sem);
+	else
+		up_write(&current->mm->mmap_sem);
+
+	return ret;
+}
+
 /* Get an address range which is currently unmapped.
  * For shmat() with addr=0.
  *
diff --git a/mm/util.c b/mm/util.c
index dc3036c..fd54884 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -359,8 +359,7 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 	ret = security_mmap_file(file, prot, flag);
 	if (!ret) {
 		down_write(&mm->mmap_sem);
-		ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff);
-		up_write(&mm->mmap_sem);
+		ret = do_mmap_pgoff_unlock(file, addr, len, prot, flag, pgoff);
 	}
 	return ret;
 }
-- 
1.7.11.7



* Re: [PATCH] mm: Downgrade mmap_sem before locking or populating on mmap
  2012-12-14 16:12       ` Andy Lutomirski
@ 2012-12-16  8:41         ` Ingo Molnar
  2012-12-16 17:04         ` Al Viro
  1 sibling, 0 replies; 22+ messages in thread
From: Ingo Molnar @ 2012-12-16  8:41 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Al Viro, linux-mm, linux-kernel, Andrew Morton,
	Michel Lespinasse, Hugh Dickins, Jörn Engel


* Andy Lutomirski <luto@amacapital.net> wrote:

> On Fri, Dec 14, 2012 at 6:49 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> > On Fri, Dec 14, 2012 at 03:14:50AM -0800, Andy Lutomirski wrote:
> >
> >> > Wait a minute.  get_user_pages() relies on ->mmap_sem being held.  Unless
> >> > I'm seriously misreading your patch it removes that protection.  And yes,
> >> > I'm aware of execve-related exception; it's in special circumstances -
> >> > bprm->mm is guaranteed to be not shared (and we need to rearchitect that
> >> > area anyway, but that's a separate story).
> >>
> >> Unless I completely screwed up the patch, ->mmap_sem is still held for
> >> read (it's downgraded from write).  It's just not held for write
> >> anymore.
> >
> > Huh?  I'm talking about the call of get_user_pages() in aio_setup_ring().
> > With your patch it's done completely outside of ->mmap_sem, isn't it?
> 
> Oh, /that/ call to get_user_pages.  That would qualify as screwing up...
> 
> Since dropping and reacquiring mmap_sem there is probably a 
> bad idea there, I'll rework this and post a v2.

It probably does not matter much, as aio_setup() is an utter 
slowpath, but I suspect you could still use the downgrading 
variant of do_mmap_pgoff_unlock() here too:

	int downgraded = 0;

	...

        down_write(&ctx->mm->mmap_sem);
        /*
         * XXX: If MCL_FUTURE is set, this will hold mmap_sem for write for
         *      longer than necessary.
         */
        info->mmap_base = do_mmap_pgoff_helper(NULL, 0, info->mmap_size,
                                        PROT_READ|PROT_WRITE,
                                        MAP_ANONYMOUS|MAP_PRIVATE, 0, &downgraded);
        if (IS_ERR((void *)info->mmap_base)) {
		up_read_write(&ctx->mm->mmap_sem, downgraded);
                info->mmap_size = 0;
                aio_free_ring(ctx);
                return -EAGAIN;
        }

        dprintk("mmap address: 0x%08lx\n", info->mmap_base);
        info->nr_pages = get_user_pages(current, ctx->mm,
                                        info->mmap_base, nr_pages,
                                        1, 0, info->ring_pages, NULL);
	up_read_write(&ctx->mm->mmap_sem, downgraded);

Where up_read_write(lock, read) is a new primitive/wrapper that 
does the up_read()/up_write() depending on the value of 
'downgraded'.
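
A minimal sketch of such a wrapper (hypothetical; rwsem does not currently
provide this primitive) might be:

	/* Release mmap_sem whether or not it was downgraded to a read lock. */
	static inline void up_read_write(struct rw_semaphore *sem, int downgraded)
	{
		if (downgraded)
			up_read(sem);
		else
			up_write(sem);
	}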

Thanks,

	Ingo


* Re: [PATCH v2] mm: Downgrade mmap_sem before locking or populating on mmap
  2012-12-15  2:17 ` [PATCH v2] " Andy Lutomirski
@ 2012-12-16  9:00   ` Ingo Molnar
  2012-12-16 17:52     ` Andy Lutomirski
  2012-12-16 12:39   ` [PATCH v2] " Michel Lespinasse
  2012-12-16 19:58   ` Linus Torvalds
  2 siblings, 1 reply; 22+ messages in thread
From: Ingo Molnar @ 2012-12-16  9:00 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-mm, linux-kernel, Andrew Morton, Al Viro,
	Michel Lespinasse, Hugh Dickins, Jörn Engel, Linus Torvalds


* Andy Lutomirski <luto@amacapital.net> wrote:

> This is a serious cause of mmap_sem contention.  MAP_POPULATE
> and MCL_FUTURE, in particular, are disastrous in multithreaded programs.
> 
> Signed-off-by: Andy Lutomirski <luto@amacapital.net>
> ---
> 
> Changes from v1:
> 
> The non-unlocking versions of do_mmap_pgoff and mmap_region are still
> available for aio_setup_ring's benefit.  In theory, aio_setup_ring
> would do better with a lock-downgrading version, but that would be
> somewhat ugly and doesn't help my workload.
> 
>  arch/tile/mm/elf.c |  9 +++---
>  fs/aio.c           |  4 +++
>  include/linux/mm.h | 19 ++++++++++--
>  ipc/shm.c          |  6 ++--
>  mm/fremap.c        | 10 ++++--
>  mm/mmap.c          | 89 ++++++++++++++++++++++++++++++++++++++++++++++++------
>  mm/util.c          |  3 +-
>  7 files changed, 117 insertions(+), 23 deletions(-)

> +unsigned long mmap_region(struct file *file, unsigned long addr,
> +			  unsigned long len, unsigned long flags,
> +			  vm_flags_t vm_flags, unsigned long pgoff)
> +{
> +	return mmap_region_helper(file, addr, len, flags, vm_flags, pgoff, 0);
> +}
> +

That 0 really wants to be NULL ...

Also, with your patch applied there's no user of mmap_region() 
left anymore.

More fundamentally, while I agree with the optimization, 
couldn't we de-uglify it a bit more?

In particular, instead of this wrappery:

> +unsigned long mmap_region_unlock(struct file *file, unsigned long addr,
> +				 unsigned long len, unsigned long flags,
> +				 vm_flags_t vm_flags, unsigned long pgoff)
> +{
> +	int downgraded = 0;
> +	unsigned long ret = mmap_region_helper(file, addr, len,
> +		flags, vm_flags, pgoff, &downgraded);
> +
> +	if (downgraded)
> +		up_read(&current->mm->mmap_sem);
> +	else
> +		up_write(&current->mm->mmap_sem);
> +
> +	return ret;
> +}

1)

We could at minimum wrap up the conditional unlocking as:

	up_read_write(&mm->mmap_sem, read_locked);

With that I'd also suggest to rename 'downgraded' to 
'read_locked', which more clearly expresses the locking state.

2)

More aggressively, we could just make it the _rule_ that the mm 
lock gets downgraded to read in mmap_region_helper(), no matter 
what.

From a quick look I *think* all the usage sites (including 
sys_aio_setup()) are fine with that unlocking - but I could be 
wrong.

There's a couple of shorter codepaths that would now see an 
extra op of downgrading:

	down_write(&mm->mmap_sem);
	...
	downgrade_write(&mm->mmap_sem);
	...
	up_read(&mm->mmap_sem);

with not much work done with the lock read-locked - but I think 
they are all fine and mostly affect error paths. So there's no 
real value in keeping the conditional nature of the unlocking I 
think.

That way all the usage sites could do a *much* cleaner pattern 
of:

	down_write(&mm->mmap_sem);
	...
	do_mmap_pgoff_downgrade_write(...);
	...
	up_read(&mm->mmap_sem);

... and that kind of cleanliness would instantly halve the size 
of your patch, it would improve all use sites, and would turn 
your patch into something I'd want to see applied straight away.

Thanks,

	Ingo


* Re: [PATCH v2] mm: Downgrade mmap_sem before locking or populating on mmap
  2012-12-15  2:17 ` [PATCH v2] " Andy Lutomirski
  2012-12-16  9:00   ` Ingo Molnar
@ 2012-12-16 12:39   ` Michel Lespinasse
  2012-12-16 18:05     ` Andy Lutomirski
  2012-12-16 19:58   ` Linus Torvalds
  2 siblings, 1 reply; 22+ messages in thread
From: Michel Lespinasse @ 2012-12-16 12:39 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-mm, linux-kernel, Andrew Morton, Al Viro, Ingo Molnar,
	Hugh Dickins, Jörn Engel, Linus Torvalds

On Fri, Dec 14, 2012 at 6:17 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> This is a serious cause of mmap_sem contention.  MAP_POPULATE
> and MCL_FUTURE, in particular, are disastrous in multithreaded programs.
>
> Signed-off-by: Andy Lutomirski <luto@amacapital.net>
> ---
>
> Changes from v1:
>
> The non-unlocking versions of do_mmap_pgoff and mmap_region are still
> available for aio_setup_ring's benefit.  In theory, aio_setup_ring
> would do better with a lock-downgrading version, but that would be
> somewhat ugly and doesn't help my workload.

Hi Andy,

I agree that the long mmap_sem hold times when using MAP_POPULATE,
MAP_LOCKED or MCL_FUTURE are a problem. However, I'm not entirely
happy with your proposed solution.

My main concern is that just downgrading the mmap_sem only hides the
problem: as soon as a writer gets queued on that mmap_sem,
reader/writer fairness kicks in and blocks any new readers, which
makes the problem reappear. So in order to completely fix the issue,
we should look for a way that doesn't require holding the mmap_sem
(even in read mode) for the entire duration of the populate or mlock
operation.

I think this could be done by extending the mlock work I did as part
of v2.6.38-rc1. The commit message for
fed067da46ad3b9acedaf794a5f05d0bc153280b explains the idea; basically
mlock() was split into do_mlock() which just sets the VM_LOCKED flag
on vmas as needed, and do_mlock_pages() which goes through a range of
addresses and actually populates/mlocks each individual page that is
part of a VM_LOCKED vma.

This could be easily extended for mlocks that happen in mmap_region()
due to MAP_LOCKED or MCL_FUTURE: mmap_region() would just set the
VM_LOCKED flag and defer the work of actually populating/mlocking the
individual pages. I think the only constraint here is that the pages
must be locked before returning to userspace, so we may be able to use
the task_work mechanism to achieve that. Later on (but before
returning to userspace) we would notice we have some mlock work to do
and call do_mlock_pages() to achieve that.
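
A rough sketch of that deferral (assuming the ~3.7 task_work API, and assuming
do_mlock_pages() -- currently static in mm/mlock.c -- were made callable from
mmap code; struct mlock_work and defer_mlock() are made-up names here):

	#include <linux/slab.h>
	#include <linux/task_work.h>

	struct mlock_work {
		struct callback_head cb;
		unsigned long start;
		unsigned long len;
	};

	static void mlock_work_fn(struct callback_head *cb)
	{
		struct mlock_work *w = container_of(cb, struct mlock_work, cb);

		/* Runs on the way back to userspace, after mmap_sem was dropped. */
		do_mlock_pages(w->start, w->len, 0);
		kfree(w);
	}

	/* Called from mmap_region() instead of mlocking/populating inline. */
	static int defer_mlock(unsigned long start, unsigned long len)
	{
		struct mlock_work *w = kmalloc(sizeof(*w), GFP_KERNEL);

		if (!w)
			return -ENOMEM;
		w->start = start;
		w->len = len;
		init_task_work(&w->cb, mlock_work_fn);
		return task_work_add(current, &w->cb, true);
	}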

I think the benefits of this approach would be:
- no mmap_sem locking changes around mmap_region() - which also means
that all code paths to mmap_region() can instantly benefit
- do_mlock_pages() doesn't need to hold a read lock on mmap_sem for
the entire duration of the operation, so we can fully solve the
problem instead of just making it harder to trigger

Now for handling MAP_POPULATE, we would probably want to use a similar
mechanism as well, so that we don't need to hold the mmap_sem for the
entire duration of the populate. This is similar in principle to the
MAP_LOCKED case; however this may require the introduction of a new
VM_POPULATE vma flag in order to avoid the possibility of a race where
someone replaces our vma with another before we get a chance to
populate it.

I don't have an implementation for this idea yet; however I'm hoping
to come up with one before xmas. Before then, any comments on the idea
?

-- 
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.


* Re: [PATCH] mm: Downgrade mmap_sem before locking or populating on mmap
  2012-12-14 16:12       ` Andy Lutomirski
  2012-12-16  8:41         ` Ingo Molnar
@ 2012-12-16 17:04         ` Al Viro
  2012-12-16 17:48           ` Al Viro
                             ` (3 more replies)
  1 sibling, 4 replies; 22+ messages in thread
From: Al Viro @ 2012-12-16 17:04 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-mm, linux-kernel, Andrew Morton, Ingo Molnar,
	Michel Lespinasse, Hugh Dickins, Jörn Engel

On Fri, Dec 14, 2012 at 08:12:45AM -0800, Andy Lutomirski wrote:
> On Fri, Dec 14, 2012 at 6:49 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> > On Fri, Dec 14, 2012 at 03:14:50AM -0800, Andy Lutomirski wrote:
> >
> >> > Wait a minute.  get_user_pages() relies on ->mmap_sem being held.  Unless
> >> > I'm seriously misreading your patch it removes that protection.  And yes,
> >> > I'm aware of execve-related exception; it's in special circumstances -
> >> > bprm->mm is guaranteed to be not shared (and we need to rearchitect that
> >> > area anyway, but that's a separate story).
> >>
> >> Unless I completely screwed up the patch, ->mmap_sem is still held for
> >> read (it's downgraded from write).  It's just not held for write
> >> anymore.
> >
> > Huh?  I'm talking about the call of get_user_pages() in aio_setup_ring().
> > With your patch it's done completely outside of ->mmap_sem, isn't it?
> 
> Oh, /that/ call to get_user_pages.  That would qualify as screwing up...
> 
> Since dropping and reacquiring mmap_sem there is probably a bad idea
> there, I'll rework this and post a v2.

FWIW, I've done some checking of ->mmap_sem uses yesterday.  Got further than
the last time; catch so far, just from find_vma() audit:
* arm swp_emulate.c - missing ->mmap_sem around find_vma().  Fix sent to
rmk.
* blackfin ptrace - find_vma() without any protection, definitely broken
* m68k sys_cacheflush() - ditto
* mips process_fpemu_return() - ditto
* mips octeon_flush_cache_sigtramp() - ditto
* omap_vout_uservirt_to_phys() - ditto, patch sent
* vb2_get_contig_userptr() - probably a bug, unless I've misread the (very
twisty maze of) v4l2 code leading to it
* vb2_get_contig_userptr() - ditto
* gntdev_ioctl_get_offset_for_vaddr() - definitely broken
and there's a couple of dubious places in arch/* I hadn't finished with,
plus a lot in mm/* proper.

That's just from a couple of days of RTFS.  The locking in there is far too
convoluted as it is; worse, it's not localized code-wise, so rechecking
correctness is going to remain a big time-sink ;-/

Making it *more* complex doesn't look like a good idea, TBH...

BTW, the __get_user_pages()/find_extend_vma()/mlock_vma_pages_range() pile is
really asking for trouble; sure, the recursion there is limited, but it
deserves a comment.  Moreover, the damn thing is reachable from coredump
path and there we do *not* have ->mmap_sem held.  We don't reach the
VM_BUG_ON() in __mlock_vma_pages_range(), but the reason for that also
deserves a comment, IMO.

Moreover, I'm not quite convinced that huge_memory.c and ksm.c can't run
into all kinds of interesting races with ongoing coredump.  Looking into
it...


* Re: [PATCH] mm: Downgrade mmap_sem before locking or populating on mmap
  2012-12-16 17:04         ` Al Viro
@ 2012-12-16 17:48           ` Al Viro
  2012-12-16 18:49           ` Johannes Weiner
                             ` (2 subsequent siblings)
  3 siblings, 0 replies; 22+ messages in thread
From: Al Viro @ 2012-12-16 17:48 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-mm, linux-kernel, Andrew Morton, Ingo Molnar,
	Michel Lespinasse, Hugh Dickins, Jörn Engel

On Sun, Dec 16, 2012 at 05:04:03PM +0000, Al Viro wrote:

> Moreover, I'm not quite convinced that huge_memory.c and ksm.c can't run
> into all kinds of interesting races with ongoing coredump.  Looking into
> it...

Specifically, is collapse_huge_page() safe in parallel with ->core_dump()?
It *can* run while we are in the middle of e.g. elf_core_dump().


* Re: [PATCH v2] mm: Downgrade mmap_sem before locking or populating on mmap
  2012-12-16  9:00   ` Ingo Molnar
@ 2012-12-16 17:52     ` Andy Lutomirski
  2012-12-17  9:52       ` Ingo Molnar
  0 siblings, 1 reply; 22+ messages in thread
From: Andy Lutomirski @ 2012-12-16 17:52 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mm, linux-kernel, Andrew Morton, Al Viro,
	Michel Lespinasse, Hugh Dickins, Jörn Engel, Linus Torvalds

On Sun, Dec 16, 2012 at 1:00 AM, Ingo Molnar <mingo@kernel.org> wrote:
>
> * Andy Lutomirski <luto@amacapital.net> wrote:
>
>> This is a serious cause of mmap_sem contention.  MAP_POPULATE
>> and MCL_FUTURE, in particular, are disastrous in multithreaded programs.
>>
>> Signed-off-by: Andy Lutomirski <luto@amacapital.net>
>> ---
>>
>> Changes from v1:
>>
>> The non-unlocking versions of do_mmap_pgoff and mmap_region are still
>> available for aio_setup_ring's benefit.  In theory, aio_setup_ring
>> would do better with a lock-downgrading version, but that would be
>> somewhat ugly and doesn't help my workload.
>>
>>  arch/tile/mm/elf.c |  9 +++---
>>  fs/aio.c           |  4 +++
>>  include/linux/mm.h | 19 ++++++++++--
>>  ipc/shm.c          |  6 ++--
>>  mm/fremap.c        | 10 ++++--
>>  mm/mmap.c          | 89 ++++++++++++++++++++++++++++++++++++++++++++++++------
>>  mm/util.c          |  3 +-
>>  7 files changed, 117 insertions(+), 23 deletions(-)
>
>> +unsigned long mmap_region(struct file *file, unsigned long addr,
>> +                       unsigned long len, unsigned long flags,
>> +                       vm_flags_t vm_flags, unsigned long pgoff)
>> +{
>> +     return mmap_region_helper(file, addr, len, flags, vm_flags, pgoff, 0);
>> +}
>> +
>
> That 0 really wants to be NULL ...

Sigh.  I blame C++11 -- I wanted to type nullptr, but that's no good :)

>
> Also, with your patch applied there's no user of mmap_region()
> left anymore.
>
> More fundamentally, while I agree with the optimization,
> couldn't we de-uglify it a bit more?
>
> In particular, instead of this wrappery:
>
>> +unsigned long mmap_region_unlock(struct file *file, unsigned long addr,
>> +                              unsigned long len, unsigned long flags,
>> +                              vm_flags_t vm_flags, unsigned long pgoff)
>> +{
>> +     int downgraded = 0;
>> +     unsigned long ret = mmap_region_helper(file, addr, len,
>> +             flags, vm_flags, pgoff, &downgraded);
>> +
>> +     if (downgraded)
>> +             up_read(&current->mm->mmap_sem);
>> +     else
>> +             up_write(&current->mm->mmap_sem);
>> +
>> +     return ret;
>> +}
>
> 1)
>
> We could at minimum wrap up the conditional unlocking as:
>
>         up_read_write(&mm->mmap_sem, read_locked);
>
> With that I'd also suggest to rename 'downgraded' to
> 'read_locked', which more clearly expresses the locking state.
>
> 2)
>
> More aggressively, we could just make it the _rule_ that the mm
> lock gets downgraded to read in mmap_region_helper(), no matter
> what.
>
> From a quick look I *think* all the usage sites (including
> sys_aio_setup()) are fine with that unlocking - but I could be
> wrong.

They are.

>
> There's a couple of shorter codepaths that would now see an
> extra op of downgrading:
>
>         down_write(&mm->mmap_sem);
>         ...
>         downgrade_write(&mm->mmap_sem);
>         ...
>         up_read(&mm->mmap_sem);
>
> with not much work done with the lock read-locked - but I think
> they are all fine and mostly affect error paths. So there's no
> real value in keeping the conditional nature of the unlocking I
> think.

There's also the normal (i.e. neither lock nor populate) success path.
Does this matter?  Presumably downgrade_write + up_read isn't much
slower than up_write.

--Andy


* Re: [PATCH v2] mm: Downgrade mmap_sem before locking or populating on mmap
  2012-12-16 12:39   ` [PATCH v2] " Michel Lespinasse
@ 2012-12-16 18:05     ` Andy Lutomirski
  2012-12-17  3:29       ` Michel Lespinasse
  0 siblings, 1 reply; 22+ messages in thread
From: Andy Lutomirski @ 2012-12-16 18:05 UTC (permalink / raw)
  To: Michel Lespinasse
  Cc: linux-mm, linux-kernel, Andrew Morton, Al Viro, Ingo Molnar,
	Hugh Dickins, Jörn Engel, Linus Torvalds

On Sun, Dec 16, 2012 at 4:39 AM, Michel Lespinasse <walken@google.com> wrote:
> On Fri, Dec 14, 2012 at 6:17 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> This is a serious cause of mmap_sem contention.  MAP_POPULATE
>> and MCL_FUTURE, in particular, are disastrous in multithreaded programs.
>>
>> Signed-off-by: Andy Lutomirski <luto@amacapital.net>
>> ---
>>
>> Changes from v1:
>>
>> The non-unlocking versions of do_mmap_pgoff and mmap_region are still
>> available for aio_setup_ring's benefit.  In theory, aio_setup_ring
>> would do better with a lock-downgrading version, but that would be
>> somewhat ugly and doesn't help my workload.
>
> Hi Andy,
>
> I agree that the long mmap_sem hold times when using MAP_POPULATE,
> MAP_LOCKED or MCL_FUTURE are a problem. However, I'm not entirely
> happy with your proposed solution.
>
> My main concern is that just downgrading the mmap_sem only hides the
> problem: as soon as a writer gets queued on that mmap_sem,
> reader/writer fairness kicks in and blocks any new readers, which
> makes the problem reappear. So in order to completely fix the issue,
> we should look for a way that doesn't require holding the mmap_sem
> (even in read mode) for the entire duration of the populate or mlock
> operation.

Ugh.

At least with my patch, mmap in MCL_FUTURE mode is no worse than mmap
+ mlock.  I suspect I haven't hit this because all my mmaping is done
by one thread, so it never ends up waiting for itself, and the other
thread have very short mmap_sem hold times.

>
> I think this could be done by extending the mlock work I did as part
> of v2.6.38-rc1. The commit message for
> fed067da46ad3b9acedaf794a5f05d0bc153280b explains the idea; basically
> mlock() was split into do_mlock() which just sets the VM_LOCKED flag
> on vmas as needed, and do_mlock_pages() which goes through a range of
> addresses and actually populates/mlocks each individual page that is
> part of a VM_LOCKED vma.
>

Doesn't this have the same problem?  It holds mmap_sem for read for a
long time, and if another writer comes in then r/w starvation
prevention will kick in.

> This could be easily extended for mlocks that happen in mmap_region()
> due to MAP_LOCKED or MCL_FUTURE: mmap_region() would just set the
> VM_LOCKED flag and defer the work of actually populating/mlocking the
> individual pages. I think the only constraint here is that the pages
> must be locked before returning to userspace, so we may be able to use
> the task_work mechanism to achieve that. Later on (but before
> returning to userspace) we would notice we have some mlock work to do
> and call do_mlock_pages() to achieve that.
>
> I think the benefits of this approach would be:
> - no mmap_sem locking changes around mmap_region() - which also means
> that all code paths to mmap_region() can instantly benefit
> - do_mlock_pages() doesn't need to hold a read lock on mmap_sem for
> the entire duration of the operation, so we can fully solve the
> problem instead of just making it harder to trigger
>
> Now for handling MAP_POPULATE, we would probably want to use a similar
> mechanism as well, so that we don't need to hold the mmap_sem for the
> entire duration of the populate. This is similar in principle to the
> MAP_LOCKED case; however this may require the introduction of a new
> VM_POPULATE vma flag in order to avoid the possibility of a race where
> someone replaces our vma with another before we get a chance to
> populate it.
>
> I don't have an implementation for this idea yet; however I'm hoping
> to come up with one before xmas. Before then, any comments on the idea
> ?

IMO this all sucks.  What we want is some way to lock a single vma or
(better?) a range of virtual addresses.  mmap_region (or some
equivalent) ought to be able to return a read-locked vma, and the
caller can (while still holding that lock) do whatever they want and
take their time at it.

If mmap_sem were only held while actually modifying the tree of vmas,
hold times could be nice and short.

--Andy


* Re: [PATCH] mm: Downgrade mmap_sem before locking or populating on mmap
  2012-12-16 17:04         ` Al Viro
  2012-12-16 17:48           ` Al Viro
@ 2012-12-16 18:49           ` Johannes Weiner
  2012-12-16 19:53           ` Al Viro
  2012-12-16 20:16           ` Al Viro
  3 siblings, 0 replies; 22+ messages in thread
From: Johannes Weiner @ 2012-12-16 18:49 UTC (permalink / raw)
  To: Al Viro
  Cc: Andy Lutomirski, linux-mm, linux-kernel, Andrew Morton,
	Ingo Molnar, Michel Lespinasse, Hugh Dickins, Jörn Engel

On Sun, Dec 16, 2012 at 05:04:03PM +0000, Al Viro wrote:
> BTW, the __get_user_pages()/find_extend_vma()/mlock_vma_pages_range() pile is
> really asking for trouble; sure, the recursion there is limited, but it
> deserves a comment.

Agreed, how about this?

---
From: Johannes Weiner <hannes@cmpxchg.org>
Subject: [patch] mm: mlock: document scary-looking stack expansion mlock
 chain

The fact that mlock calls get_user_pages, and get_user_pages might
call mlock when expanding a stack looks like a potential recursion.

However, mlock makes sure the requested range is already contained
within a vma, so no stack expansion will actually happen from mlock.

Should this ever change: the stack expansion mlocks only the newly
expanded range and so will not result in recursive expansion.

Reported-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 mm/mlock.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/mm/mlock.c b/mm/mlock.c
index f0b9ce5..17cf905 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -186,6 +186,10 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
 	if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
 		gup_flags |= FOLL_FORCE;
 
+	/*
+	 * We made sure addr is within a VMA, so the following will
+	 * not result in a stack expansion that recurses back here.
+	 */
 	return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
 				NULL, NULL, nonblocking);
 }
-- 
1.7.11.7



* Re: [PATCH] mm: Downgrade mmap_sem before locking or populating on mmap
  2012-12-16 17:04         ` Al Viro
  2012-12-16 17:48           ` Al Viro
  2012-12-16 18:49           ` Johannes Weiner
@ 2012-12-16 19:53           ` Al Viro
  2012-12-16 20:16           ` Al Viro
  3 siblings, 0 replies; 22+ messages in thread
From: Al Viro @ 2012-12-16 19:53 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-mm, linux-kernel, Andrew Morton, Ingo Molnar,
	Michel Lespinasse, Hugh Dickins, Jörn Engel

On Sun, Dec 16, 2012 at 05:04:03PM +0000, Al Viro wrote:

> FWIW, I've done some checking of ->mmap_sem uses yesterday.  Got further than
> the last time; catch so far, just from find_vma() audit:
> * arm swp_emulate.c - missing ->mmap_sem around find_vma().  Fix sent to
> rmk.
> * blackfin ptrace - find_vma() without any protection, definitely broken
> * m68k sys_cacheflush() - ditto
> * mips process_fpemu_return() - ditto
> * mips octeon_flush_cache_sigtramp() - ditto
> * omap_vout_uservirt_to_phys() - ditto, patch sent
> * vb2_get_contig_userptr() - probably a bug, unless I've misread the (very
> twisty maze of) v4l2 code leading to it
> * vb2_get_contig_userptr() - ditto
> * gntdev_ioctl_get_offset_for_vaddr() - definitely broken
> and there's a couple of dubious places in arch/* I hadn't finished with,
> plus a lot in mm/* proper.
> 
> That's just from a couple of days of RTFS.  The locking in there is far too
> convoluted as it is; worse, it's not localized code-wise, so rechecking
> correctness is going to remain a big time-sink ;-/
> 
> Making it *more* complex doesn't look like a good idea, TBH...

While we are at it: fs/proc/task_nommu.c:m_stop() is fucked.  It assumes
that the process in question hadn't done execve() since m_start().  And
it doesn't hold anywhere near enough locks to guarantee that.  task_mmu.c
counterpart avoids that fun by using ->vm_mm to get to mm_struct in question.
Completely untested patch follows; if it works, that's -stable fodder.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 1ccfa53..dc26605 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -211,6 +211,14 @@ static int show_tid_map(struct seq_file *m, void *_p)
 	return show_map(m, _p, 0);
 }
 
+static void stop_it(void *_p)
+{
+	struct vm_area_struct *vma = rb_entry(_p, struct vm_area_struct, vm_rb);
+	struct mm_struct *mm = vma->vm_mm;
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+}
+
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
 	struct proc_maps_private *priv = m->private;
@@ -235,6 +243,8 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
 		if (n-- == 0)
 			return p;
+	up_read(&mm->mmap_sem);
+	mmput(mm);
 	return NULL;
 }
 
@@ -243,9 +253,8 @@ static void m_stop(struct seq_file *m, void *_vml)
 	struct proc_maps_private *priv = m->private;
 
 	if (priv->task) {
-		struct mm_struct *mm = priv->task->mm;
-		up_read(&mm->mmap_sem);
-		mmput(mm);
+		if (_vml)
+			stop_it(_vml);
 		put_task_struct(priv->task);
 	}
 }
@@ -255,7 +264,12 @@ static void *m_next(struct seq_file *m, void *_p, loff_t *pos)
 	struct rb_node *p = _p;
 
 	(*pos)++;
-	return p ? rb_next(p) : NULL;
+	if (!p)
+		return NULL;
+	p = rb_next(p);
+	if (!p)
+		stop_it(_p);
+	return p;
 }
 
 static const struct seq_operations proc_pid_maps_ops = {

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH v2] mm: Downgrade mmap_sem before locking or populating on mmap
  2012-12-15  2:17 ` [PATCH v2] " Andy Lutomirski
  2012-12-16  9:00   ` Ingo Molnar
  2012-12-16 12:39   ` [PATCH v2] " Michel Lespinasse
@ 2012-12-16 19:58   ` Linus Torvalds
  2 siblings, 0 replies; 22+ messages in thread
From: Linus Torvalds @ 2012-12-16 19:58 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-mm, Linux Kernel Mailing List, Andrew Morton, Al Viro,
	Ingo Molnar, Michel Lespinasse, Hugh Dickins, Jörn Engel

On Fri, Dec 14, 2012 at 6:17 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> This is a serious cause of mmap_sem contention.  MAP_POPULATE
> and MCL_FUTURE, in particular, are disastrous in multithreaded programs.
>
> Signed-off-by: Andy Lutomirski <luto@amacapital.net>

Ugh. This patch is just too ugly.

Conditional locking like this is just too disgusting for words. And
this v2 is worse, with that whole disgusting 'downgraded' pointer
thing.

I'm not applying disgusting hacks like this. I suspect you can clean
it up by moving the mlock/populate logic into the (few) callers
instead (first as a separate patch that doesn't do the downgrading)
and then a separate patch that does the downgrade in the callers,
possibly using a "finish_mmap" helper function that releases the lock.

No "if (write) up_write() else up_read()" crap. Instead, make the
finish_mmap helper do something like

  if (!populate_r_mlock) {
    up_write(mmap_sem);
    return;
  }
  downgrade(mmap_sem);
  .. populate and mlock ..
  up_read(mmap_sem);

and you never have any odd "now I'm holding it for writing" state
variable with conditional locking rules etc.

           Linus
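
A minimal sketch of the helper described above (the name, signature and the
populate_or_mlock flag are illustrative, not from any posted patch;
mlock_vma_pages_range() and make_pages_present() are this era's existing
helpers).  The caller enters with mmap_sem held for write and the helper
always releases it, holding it only for read while populating:

static void finish_mmap(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long addr, unsigned long len,
			bool populate_or_mlock)
{
	if (!populate_or_mlock) {
		up_write(&mm->mmap_sem);
		return;
	}

	downgrade_write(&mm->mmap_sem);

	/* Fault in / mlock the new range with the lock held shared. */
	if (vma->vm_flags & VM_LOCKED) {
		if (!mlock_vma_pages_range(vma, addr, addr + len))
			mm->locked_vm += len >> PAGE_SHIFT;
	} else {
		make_pages_present(addr, addr + len);
	}

	up_read(&mm->mmap_sem);
}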

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] mm: Downgrade mmap_sem before locking or populating on mmap
  2012-12-16 17:04         ` Al Viro
                             ` (2 preceding siblings ...)
  2012-12-16 19:53           ` Al Viro
@ 2012-12-16 20:16           ` Al Viro
  3 siblings, 0 replies; 22+ messages in thread
From: Al Viro @ 2012-12-16 20:16 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-mm, linux-kernel, Andrew Morton, Ingo Molnar,
	Michel Lespinasse, Hugh Dickins, Jörn Engel

On Sun, Dec 16, 2012 at 05:04:03PM +0000, Al Viro wrote:

> That's just from a couple of days of RTFS.  The locking in there is far too
> convoluted as it is; worse, it's not localized code-wise, so rechecking
> correctness is going to remain a big time-sink ;-/
> 
> Making it *more* complex doesn't look like a good idea, TBH...

... and another fun place: kvm_setup_async_pf() grabs a _passive_ reference
to current->mm (->mm_count, not ->mm_users), sticks it into work->mm and
schedules execution of async_pf_execute().  Which does use_mm() (still no
active refs acquired), grabs work->mm->mmap_sem shared and proceeds to call
get_user_pages().  What's going to happen if somebody does kill -9 to
the process that had started that?

get_user_pages() in parallel with exit_mmap() is a Bad Thing(tm) and I don't
see anything on the exit path that would've waited for that work to finish.
I might've missed something here, but...  Note that aio (another place
playing with use_mm(), also without an active ref) has an explicit hook
for mmput() to call before proceeding to exit_mmap(); I don't see anything
similar here.

Not that aio.c approach had been all that safe - get_task_mm() will refuse
to pick use_mm'ed one, but there are places open-coding it without the
check for PF_KTHREAD.  Few of them, fortunately, but...
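
One conventional way to close that window (a sketch with a made-up name,
not a proposed fix) is to take an *active* reference first, the same
atomic_inc_not_zero() on ->mm_users that get_task_mm() does, so that
exit_mmap() cannot run while get_user_pages() is in flight:

static int sketch_async_gup(struct mm_struct *mm, unsigned long addr,
			    struct page **page)
{
	int ret;

	/* Fails if the last user of the address space is already gone. */
	if (!atomic_inc_not_zero(&mm->mm_users))
		return -ESRCH;

	down_read(&mm->mmap_sem);
	ret = get_user_pages(NULL, mm, addr, 1, 1, 0, page, NULL);
	up_read(&mm->mmap_sem);

	mmput(mm);	/* drop the active ref; on ret == 1 the caller
			 * still owns a reference to the pinned page */
	return ret;
}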

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2] mm: Downgrade mmap_sem before locking or populating on mmap
  2012-12-16 18:05     ` Andy Lutomirski
@ 2012-12-17  3:29       ` Michel Lespinasse
  2012-12-17 22:01         ` Andy Lutomirski
  0 siblings, 1 reply; 22+ messages in thread
From: Michel Lespinasse @ 2012-12-17  3:29 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-mm, linux-kernel, Andrew Morton, Al Viro, Ingo Molnar,
	Hugh Dickins, Jörn Engel, Linus Torvalds

On Sun, Dec 16, 2012 at 10:05 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> On Sun, Dec 16, 2012 at 4:39 AM, Michel Lespinasse <walken@google.com> wrote:
>> My main concern is that just downgrading the mmap_sem only hides the
>> problem: as soon as a writer gets queued on that mmap_sem,
>> reader/writer fairness kicks in and blocks any new readers, which
>> makes the problem reappear. So in order to completely fix the issue,
>> we should look for a way that doesn't require holding the mmap_sem
>> (even in read mode) for the entire duration of the populate or mlock
>> operation.
>
> Ugh.
>
> At least with my patch, mmap in MCL_FUTURE mode is no worse than mmap
> + mlock.  I suspect I haven't hit this because all my mmaping is done
> by one thread, so it never ends up waiting for itself, and the other
> thread have very short mmap_sem hold times.

Yes, you won't hit the problems with long read-side mmap_sem hold
times if you don't have other threads blocking for the write side.

>> I think this could be done by extending the mlock work I did as part
>> of v2.6.38-rc1. The commit message for
>> c explains the idea; basically
>> mlock() was split into do_mlock() which just sets the VM_LOCKED flag
>> on vmas as needed, and do_mlock_pages() which goes through a range of
>> addresses and actually populates/mlocks each individual page that is
>> part of a VM_LOCKED vma.
>
> Doesn't this have the same problem?  It holds mmap_sem for read for a
> long time, and if another writer comes in then r/w starvation
> prevention will kick in.

Well, my point is that do_mlock_pages() doesn't need to hold the
mmap_sem read side for a long time. It currently releases it when
faulting a page requires a disk read, and could conceptually release
it more often if needed.

We can't easily release mmap_sem from within mmap_region() since
mmap_region's callers don't expect it; however we can defer the page
mlocking and we don't have to hold mmap_sem continuously until then.
The only constraints are the new VM_LOCKED region's pages must be
mlocked before we return to userspace, and that if a concurrent thread
modifies the mappings while we don't hold mmap_sem, and creates a new
non-mlocked region, we shouldn't mlock those pages in
do_mlock_pages().

-- 
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.
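
A condensed sketch of the two-phase scheme described above, modeled on the
v2.6.38 do_mlock()/do_mlock_pages() split (the function name is made up,
error handling and the VM_IO/VM_PFNMAP checks are elided).  VM_LOCKED is set
on the vmas under the write lock elsewhere; this loop then faults pages in
under the read lock, retaking it per vma so queued writers get through, and
skips anything a concurrent change has left without VM_LOCKED:

static void sketch_populate_locked_range(struct mm_struct *mm,
					 unsigned long start, unsigned long end)
{
	unsigned long nstart = start;

	while (nstart < end) {
		struct vm_area_struct *vma;
		unsigned long nend;

		down_read(&mm->mmap_sem);
		vma = find_vma(mm, nstart);
		if (!vma || vma->vm_start >= end) {
			up_read(&mm->mmap_sem);
			break;
		}
		nstart = max(nstart, vma->vm_start);
		nend = min(end, vma->vm_end);

		/* Only populate what is still VM_LOCKED right now. */
		if (vma->vm_flags & VM_LOCKED)
			__mlock_vma_pages_range(vma, nstart, nend, NULL);

		up_read(&mm->mmap_sem);	/* let queued writers in between vmas */
		nstart = nend;
	}
}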

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2] mm: Downgrade mmap_sem before locking or populating on mmap
  2012-12-16 17:52     ` Andy Lutomirski
@ 2012-12-17  9:52       ` Ingo Molnar
  2012-12-18  0:54         ` [PATCH v3] " Andy Lutomirski
  0 siblings, 1 reply; 22+ messages in thread
From: Ingo Molnar @ 2012-12-17  9:52 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-mm, linux-kernel, Andrew Morton, Al Viro,
	Michel Lespinasse, Hugh Dickins, Jörn Engel, Linus Torvalds


* Andy Lutomirski <luto@amacapital.net> wrote:

> > 2)
> >
> > More aggressively, we could just make it the _rule_ that the 
> > mm lock gets downgraded to read in mmap_region_helper(), no 
> > matter what.
> >
> > From a quick look I *think* all the usage sites (including 
> > sys_aio_setup()) are fine with that unlocking - but I could 
> > be wrong.
> 
> They are.

Let's try that then - the ugliness of the current patch is 
certainly a problem.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2] mm: Downgrade mmap_sem before locking or populating on mmap
  2012-12-17  3:29       ` Michel Lespinasse
@ 2012-12-17 22:01         ` Andy Lutomirski
  0 siblings, 0 replies; 22+ messages in thread
From: Andy Lutomirski @ 2012-12-17 22:01 UTC (permalink / raw)
  To: Michel Lespinasse
  Cc: linux-mm, linux-kernel, Andrew Morton, Al Viro, Ingo Molnar,
	Hugh Dickins, Jörn Engel, Linus Torvalds

On Sun, Dec 16, 2012 at 7:29 PM, Michel Lespinasse <walken@google.com> wrote:
> On Sun, Dec 16, 2012 at 10:05 AM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On Sun, Dec 16, 2012 at 4:39 AM, Michel Lespinasse <walken@google.com> wrote:
>>> I think this could be done by extending the mlock work I did as part
>>> of v2.6.38-rc1. The commit message for
>>> c explains the idea; basically
>>> mlock() was split into do_mlock() which just sets the VM_LOCKED flag
>>> on vmas as needed, and do_mlock_pages() which goes through a range of
>>> addresses and actually populates/mlocks each individual page that is
>>> part of a VM_LOCKED vma.
>>
>> Doesn't this have the same problem?  It holds mmap_sem for read for a
>> long time, and if another writer comes in then r/w starvation
>> prevention will kick in.
>
> Well, my point is that do_mlock_pages() doesn't need to hold the
> mmap_sem read side for a long time. It currently releases it when
> faulting a page requires a disk read, and could conceptually release
> it more often if needed.

I can't find this code.  It looks like do_mlock_pages calls
__mlock_vma_pages_range, which calls __get_user_pages, which makes its
way to __do_fault, which doesn't seem to drop mmap_sem.

--Andy

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH v3] mm: Downgrade mmap_sem before locking or populating on mmap
  2012-12-17  9:52       ` Ingo Molnar
@ 2012-12-18  0:54         ` Andy Lutomirski
  2012-12-20  2:22           ` Simon Jeons
  0 siblings, 1 reply; 22+ messages in thread
From: Andy Lutomirski @ 2012-12-18  0:54 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, Andrew Morton, Al Viro, Ingo Molnar,
	Michel Lespinasse, Hugh Dickins, Jörn Engel,
	Andy Lutomirski

This is a serious cause of mmap_sem contention.  MAP_POPULATE
and MCL_FUTURE, in particular, are disastrous in multithreaded programs.

This is not a complete solution due to reader/writer fairness.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
---

Changes from v2:

The mmap functions now unconditionally downgrade mmap_sem.  This is
slightly slower but a lot simpler.

Changes from v1:

The non-unlocking versions of do_mmap_pgoff and mmap_region are still
available for aio_setup_ring's benefit.  In theory, aio_setup_ring
would do better with a lock-downgrading version, but that would be
somewhat ugly and doesn't help my workload.

 arch/tile/mm/elf.c |  11 +++--
 fs/aio.c           |  11 ++---
 include/linux/mm.h |  15 +++++--
 ipc/shm.c          |   8 +++-
 mm/fremap.c        |   9 +++-
 mm/mmap.c          | 122 ++++++++++++++++++++++++++++++++++++-----------------
 mm/util.c          |   5 ++-
 7 files changed, 123 insertions(+), 58 deletions(-)

diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c
index 3cfa98b..313acb2 100644
--- a/arch/tile/mm/elf.c
+++ b/arch/tile/mm/elf.c
@@ -129,12 +129,15 @@ int arch_setup_additional_pages(struct linux_binprm *bprm,
 	 */
 	if (!retval) {
 		unsigned long addr = MEM_USER_INTRPT;
-		addr = mmap_region(NULL, addr, INTRPT_SIZE,
-				   MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE,
-				   VM_READ|VM_EXEC|
-				   VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0);
+		addr = mmap_region_downgrade_write(
+			NULL, addr, INTRPT_SIZE,
+			MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE,
+			VM_READ|VM_EXEC|
+			VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0);
+		up_read(&mm->mmap_sem);
 		if (addr > (unsigned long) -PAGE_SIZE)
 			retval = (int) addr;
+		return retval;  /* We already unlocked mmap_sem. */
 	}
 #endif
 
diff --git a/fs/aio.c b/fs/aio.c
index 71f613c..912d3d8 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -127,11 +127,12 @@ static int aio_setup_ring(struct kioctx *ctx)
 	info->mmap_size = nr_pages * PAGE_SIZE;
 	dprintk("attempting mmap of %lu bytes\n", info->mmap_size);
 	down_write(&ctx->mm->mmap_sem);
-	info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, 
-					PROT_READ|PROT_WRITE,
-					MAP_ANONYMOUS|MAP_PRIVATE, 0);
+	info->mmap_base = do_mmap_pgoff_downgrade_write(
+		NULL, 0, info->mmap_size,
+		PROT_READ|PROT_WRITE,
+		MAP_ANONYMOUS|MAP_PRIVATE, 0);
 	if (IS_ERR((void *)info->mmap_base)) {
-		up_write(&ctx->mm->mmap_sem);
+		up_read(&ctx->mm->mmap_sem);
 		info->mmap_size = 0;
 		aio_free_ring(ctx);
 		return -EAGAIN;
@@ -141,7 +142,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 	info->nr_pages = get_user_pages(current, ctx->mm,
 					info->mmap_base, nr_pages, 
 					1, 0, info->ring_pages, NULL);
-	up_write(&ctx->mm->mmap_sem);
+	up_read(&ctx->mm->mmap_sem);
 
 	if (unlikely(info->nr_pages != nr_pages)) {
 		aio_free_ring(ctx);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bcaab4e..a44aa00 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1441,12 +1441,19 @@ extern int install_special_mapping(struct mm_struct *mm,
 
 extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
 
-extern unsigned long mmap_region(struct file *file, unsigned long addr,
+/*
+ * These functions are called with mmap_sem held for write and they return
+ * with mmap_sem held for read.
+ */
+extern unsigned long mmap_region_downgrade_write(
+	struct file *file, unsigned long addr,
 	unsigned long len, unsigned long flags,
 	vm_flags_t vm_flags, unsigned long pgoff);
-extern unsigned long do_mmap_pgoff(struct file *, unsigned long,
-        unsigned long, unsigned long,
-        unsigned long, unsigned long);
+extern unsigned long do_mmap_pgoff_downgrade_write(
+	struct file *, unsigned long addr,
+	unsigned long len, unsigned long prot,
+	unsigned long flags, unsigned long pgoff);
+
 extern int do_munmap(struct mm_struct *, unsigned long, size_t);
 
 /* These take the mm semaphore themselves */
diff --git a/ipc/shm.c b/ipc/shm.c
index dff40c9..482f3d6 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -1068,12 +1068,16 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
 		    addr > current->mm->start_stack - size - PAGE_SIZE * 5)
 			goto invalid;
 	}
-		
-	user_addr = do_mmap_pgoff(file, addr, size, prot, flags, 0);
+
+	user_addr = do_mmap_pgoff_downgrade_write(file, addr, size,
+						  prot, flags, 0);
+	up_read(&current->mm->mmap_sem);
 	*raddr = user_addr;
 	err = 0;
 	if (IS_ERR_VALUE(user_addr))
 		err = (long)user_addr;
+	goto out_fput;
+
 invalid:
 	up_write(&current->mm->mmap_sem);
 
diff --git a/mm/fremap.c b/mm/fremap.c
index a0aaf0e..55c4a9b 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -200,8 +200,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 			struct file *file = get_file(vma->vm_file);
 
 			flags &= MAP_NONBLOCK;
-			addr = mmap_region(file, start, size,
-					flags, vma->vm_flags, pgoff);
+			addr = mmap_region_downgrade_write(
+				file, start, size, flags, vma->vm_flags, pgoff);
+			has_write_lock = 0;
 			fput(file);
 			if (IS_ERR_VALUE(addr)) {
 				err = addr;
@@ -237,6 +238,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 			/*
 			 * might be mapping previously unmapped range of file
 			 */
+			if (unlikely(has_write_lock)) {
+				downgrade_write(&mm->mmap_sem);
+				has_write_lock = 0;
+			}
 			mlock_vma_pages_range(vma, start, start + size);
 		} else {
 			if (unlikely(has_write_lock)) {
diff --git a/mm/mmap.c b/mm/mmap.c
index 9a796c4..3913262 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -999,9 +999,10 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
  * The caller must hold down_write(&current->mm->mmap_sem).
  */
 
-unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
-			unsigned long len, unsigned long prot,
-			unsigned long flags, unsigned long pgoff)
+unsigned long do_mmap_pgoff_downgrade_write(
+	struct file *file, unsigned long addr,
+	unsigned long len, unsigned long prot,
+	unsigned long flags, unsigned long pgoff)
 {
 	struct mm_struct * mm = current->mm;
 	struct inode *inode;
@@ -1017,31 +1018,39 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 		if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
 			prot |= PROT_EXEC;
 
-	if (!len)
-		return -EINVAL;
+	if (!len) {
+		addr = -EINVAL;
+		goto out_downgrade;
+	}
 
 	if (!(flags & MAP_FIXED))
 		addr = round_hint_to_min(addr);
 
 	/* Careful about overflows.. */
 	len = PAGE_ALIGN(len);
-	if (!len)
-		return -ENOMEM;
+	if (!len) {
+		addr = -ENOMEM;
+		goto out_downgrade;
+	}
 
 	/* offset overflow? */
-	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
-               return -EOVERFLOW;
+	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) {
+		addr = -EOVERFLOW;
+		goto out_downgrade;
+	}
 
 	/* Too many mappings? */
-	if (mm->map_count > sysctl_max_map_count)
-		return -ENOMEM;
+	if (mm->map_count > sysctl_max_map_count) {
+		addr = -ENOMEM;
+		goto out_downgrade;
+	}
 
 	/* Obtain the address to map to. we verify (or select) it and ensure
 	 * that it represents a valid section of the address space.
 	 */
 	addr = get_unmapped_area(file, addr, len, pgoff, flags);
 	if (addr & ~PAGE_MASK)
-		return addr;
+		goto out_downgrade;
 
 	/* Do simple checking here so the lower-level routines won't have
 	 * to. we assume access permissions have been handled by the open
@@ -1050,9 +1059,12 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
 			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 
-	if (flags & MAP_LOCKED)
-		if (!can_do_mlock())
-			return -EPERM;
+	if (flags & MAP_LOCKED) {
+		if (!can_do_mlock()) {
+			addr = -EPERM;
+			goto out_downgrade;
+		}
+	}
 
 	/* mlock MCL_FUTURE? */
 	if (vm_flags & VM_LOCKED) {
@@ -1061,8 +1073,10 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 		locked += mm->locked_vm;
 		lock_limit = rlimit(RLIMIT_MEMLOCK);
 		lock_limit >>= PAGE_SHIFT;
-		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-			return -EAGAIN;
+		if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+			addr = -EAGAIN;
+			goto out_downgrade;
+		}
 	}
 
 	inode = file ? file->f_path.dentry->d_inode : NULL;
@@ -1070,21 +1084,27 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	if (file) {
 		switch (flags & MAP_TYPE) {
 		case MAP_SHARED:
-			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
-				return -EACCES;
+			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) {
+				addr = -EACCES;
+				goto out_downgrade;
+			}
 
 			/*
 			 * Make sure we don't allow writing to an append-only
 			 * file..
 			 */
-			if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
-				return -EACCES;
+			if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) {
+				addr = -EACCES;
+				goto out_downgrade;
+			}
 
 			/*
 			 * Make sure there are no mandatory locks on the file.
 			 */
-			if (locks_verify_locked(inode))
-				return -EAGAIN;
+			if (locks_verify_locked(inode)) {
+				addr = -EAGAIN;
+				goto out_downgrade;
+			}
 
 			vm_flags |= VM_SHARED | VM_MAYSHARE;
 			if (!(file->f_mode & FMODE_WRITE))
@@ -1092,20 +1112,27 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 
 			/* fall through */
 		case MAP_PRIVATE:
-			if (!(file->f_mode & FMODE_READ))
-				return -EACCES;
+			if (!(file->f_mode & FMODE_READ)) {
+				addr = -EACCES;
+				goto out_downgrade;
+			}
 			if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
-				if (vm_flags & VM_EXEC)
-					return -EPERM;
+				if (vm_flags & VM_EXEC) {
+					addr = -EPERM;
+					goto out_downgrade;
+				}
 				vm_flags &= ~VM_MAYEXEC;
 			}
 
-			if (!file->f_op || !file->f_op->mmap)
-				return -ENODEV;
+			if (!file->f_op || !file->f_op->mmap) {
+				addr = -ENODEV;
+				goto out_downgrade;
+			}
 			break;
 
 		default:
-			return -EINVAL;
+			addr = -EINVAL;
+			goto out_downgrade;
 		}
 	} else {
 		switch (flags & MAP_TYPE) {
@@ -1123,11 +1150,17 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 			pgoff = addr >> PAGE_SHIFT;
 			break;
 		default:
-			return -EINVAL;
+			addr = -EINVAL;
+			goto out_downgrade;
 		}
 	}
 
-	return mmap_region(file, addr, len, flags, vm_flags, pgoff);
+	return mmap_region_downgrade_write(file, addr, len,
+		flags, vm_flags, pgoff);
+
+out_downgrade:
+	downgrade_write(&mm->mmap_sem);
+	return addr;
 }
 
 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
@@ -1240,9 +1273,10 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
 	return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
 }
 
-unsigned long mmap_region(struct file *file, unsigned long addr,
-			  unsigned long len, unsigned long flags,
-			  vm_flags_t vm_flags, unsigned long pgoff)
+unsigned long mmap_region_downgrade_write(
+	struct file *file, unsigned long addr,
+	unsigned long len, unsigned long flags,
+	vm_flags_t vm_flags, unsigned long pgoff)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev;
@@ -1262,8 +1296,10 @@ munmap_back:
 	}
 
 	/* Check against address space limit. */
-	if (!may_expand_vm(mm, len >> PAGE_SHIFT))
-		return -ENOMEM;
+	if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
+		error = -ENOMEM;
+		goto unacct_error;
+	}
 
 	/*
 	 * Set 'VM_NORESERVE' if we should not account for the
@@ -1284,8 +1320,10 @@ munmap_back:
 	 */
 	if (accountable_mapping(file, vm_flags)) {
 		charged = len >> PAGE_SHIFT;
-		if (security_vm_enough_memory_mm(mm, charged))
-			return -ENOMEM;
+		if (security_vm_enough_memory_mm(mm, charged)) {
+			error = -ENOMEM;
+			goto unacct_error;
+		}
 		vm_flags |= VM_ACCOUNT;
 	}
 
@@ -1369,9 +1407,12 @@ munmap_back:
 	if (correct_wcount)
 		atomic_inc(&inode->i_writecount);
 out:
+	downgrade_write(&mm->mmap_sem);
+
 	perf_event_mmap(vma);
 
 	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
+
 	if (vm_flags & VM_LOCKED) {
 		if (!mlock_vma_pages_range(vma, addr, addr + len))
 			mm->locked_vm += (len >> PAGE_SHIFT);
@@ -1397,6 +1438,9 @@ free_vma:
 unacct_error:
 	if (charged)
 		vm_unacct_memory(charged);
+
+	downgrade_write(&mm->mmap_sem);
+
 	return error;
 }
 
diff --git a/mm/util.c b/mm/util.c
index dc3036c..ab489a7 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -359,8 +359,9 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 	ret = security_mmap_file(file, prot, flag);
 	if (!ret) {
 		down_write(&mm->mmap_sem);
-		ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff);
-		up_write(&mm->mmap_sem);
+		ret = do_mmap_pgoff_downgrade_write(
+			file, addr, len, prot, flag, pgoff);
+		up_read(&mm->mmap_sem);
 	}
 	return ret;
 }
-- 
1.7.11.7


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH v3] mm: Downgrade mmap_sem before locking or populating on mmap
  2012-12-18  0:54         ` [PATCH v3] " Andy Lutomirski
@ 2012-12-20  2:22           ` Simon Jeons
  0 siblings, 0 replies; 22+ messages in thread
From: Simon Jeons @ 2012-12-20  2:22 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-mm, linux-kernel, Andrew Morton, Al Viro, Ingo Molnar,
	Michel Lespinasse, Hugh Dickins, Jörn Engel

On Mon, 2012-12-17 at 16:54 -0800, Andy Lutomirski wrote:
> This is a serious cause of mmap_sem contention.  MAP_POPULATE
> and MCL_FUTURE, in particular, are disastrous in multithreaded programs.
> 
> This is not a complete solution due to reader/writer fairness.
> 

Hi Andy, could you explain in more detail what issue you hit, which parts
of the kernel are affected by it, and how you resolve it?  It's hard for
other readers to get useful information from your patch changelog.

> [...]



^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2012-12-20  5:54 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-12-14  5:49 [PATCH] mm: Downgrade mmap_sem before locking or populating on mmap Andy Lutomirski
2012-12-14  7:27 ` Al Viro
2012-12-14 11:14   ` Andy Lutomirski
2012-12-14 14:49     ` Al Viro
2012-12-14 16:12       ` Andy Lutomirski
2012-12-16  8:41         ` Ingo Molnar
2012-12-16 17:04         ` Al Viro
2012-12-16 17:48           ` Al Viro
2012-12-16 18:49           ` Johannes Weiner
2012-12-16 19:53           ` Al Viro
2012-12-16 20:16           ` Al Viro
2012-12-15  2:17 ` [PATCH v2] " Andy Lutomirski
2012-12-16  9:00   ` Ingo Molnar
2012-12-16 17:52     ` Andy Lutomirski
2012-12-17  9:52       ` Ingo Molnar
2012-12-18  0:54         ` [PATCH v3] " Andy Lutomirski
2012-12-20  2:22           ` Simon Jeons
2012-12-16 12:39   ` [PATCH v2] " Michel Lespinasse
2012-12-16 18:05     ` Andy Lutomirski
2012-12-17  3:29       ` Michel Lespinasse
2012-12-17 22:01         ` Andy Lutomirski
2012-12-16 19:58   ` Linus Torvalds
