linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] mm: fix lockdep build in rcu-protected get_mm_exe_file()
@ 2015-03-20 14:47 Konstantin Khlebnikov
  2015-03-23 18:11 ` Davidlohr Bueso
  0 siblings, 1 reply; 14+ messages in thread
From: Konstantin Khlebnikov @ 2015-03-20 14:47 UTC (permalink / raw)
  To: linux-mm, Andrew Morton; +Cc: linux-kernel

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
---
 kernel/fork.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index a7c596517bd6..aa2ba1a34ce8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -696,7 +696,7 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
 {
 	struct file *old_exe_file = rcu_dereference_protected(mm->exe_file,
 			!atomic_read(&mm->mm_users) || current->in_execve ||
-			lock_is_held(&mm->mmap_sem));
+			lockdep_is_held(&mm->mmap_sem));
 
 	if (new_exe_file)
 		get_file(new_exe_file);


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: fix lockdep build in rcu-protected get_mm_exe_file()
  2015-03-20 14:47 [PATCH] mm: fix lockdep build in rcu-protected get_mm_exe_file() Konstantin Khlebnikov
@ 2015-03-23 18:11 ` Davidlohr Bueso
  2015-03-23 19:10   ` Oleg Nesterov
  0 siblings, 1 reply; 14+ messages in thread
From: Davidlohr Bueso @ 2015-03-23 18:11 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: linux-mm, Andrew Morton, linux-kernel, Oleg Nesterov

Actually, here's a patch that gets rid of that condition, per Oleg.

What do you think?

8<------------------------------------------------
Subject: [PATCH -next] prctl: avoid using mmap_sem for exe_file serialization

Oleg cleverly suggested using xchg() to set the new
mm->exe_file instead of calling set_mm_exe_file()
which requires some form of serialization -- mmap_sem
in this case. For archs that do not have atomic rmw
instructions we still fallback to a spinlock alternative,
so this should always be safe.  As such, we only need the
mmap_sem for looking up the backing vm_file, which can be
done sharing the lock. Naturally, this means we need to
manually deal with both the new and old file reference
counting, and we need not worry about the MMF_EXE_FILE_CHANGED
bits, which can probably be deleted in the future anyway.

Suggested-by: Oleg Nesterov <oleg@redhat.com>
Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
---
 kernel/fork.c | 11 ++++++-----
 kernel/sys.c  | 43 ++++++++++++++++++++++++++-----------------
 2 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index bea5f36..98858b5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -688,15 +688,16 @@ EXPORT_SYMBOL_GPL(mmput);
  *
  * This changes mm's executale file (shown as symlink /proc/[pid]/exe).
  *
- * Main users are mmput(), sys_execve() and sys_prctl(PR_SET_MM_MAP/EXE_FILE).
- * Callers prevent concurrent invocations: in mmput() nobody alive left,
- * in execve task is single-threaded, prctl holds mmap_sem exclusively.
+ * Main users are mmput() and sys_execve(). Callers prevent concurrent
+ * invocations: in mmput() nobody alive left, in execve task is single
+ * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the
+ * mm->exe_file, but does so without using set_mm_exe_file() in order
+ * to do avoid the need for any locks.
  */
 void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
 {
 	struct file *old_exe_file = rcu_dereference_protected(mm->exe_file,
-			!atomic_read(&mm->mm_users) || current->in_execve ||
-			lock_is_held(&mm->mmap_sem));
+			!atomic_read(&mm->mm_users) || current->in_execve);
 
 	if (new_exe_file)
 		get_file(new_exe_file);
diff --git a/kernel/sys.c b/kernel/sys.c
index 3be3449..1da6b17 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1649,14 +1649,13 @@ SYSCALL_DEFINE1(umask, int, mask)
 	return mask;
 }
 
-static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
+static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
 {
 	struct fd exe;
+	struct file *old_exe, *exe_file;
 	struct inode *inode;
 	int err;
 
-	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
-
 	exe = fdget(fd);
 	if (!exe.file)
 		return -EBADF;
@@ -1680,15 +1679,22 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
 	/*
 	 * Forbid mm->exe_file change if old file still mapped.
 	 */
+	exe_file = get_mm_exe_file(mm);
 	err = -EBUSY;
 	if (mm->exe_file) {
 		struct vm_area_struct *vma;
 
-		for (vma = mm->mmap; vma; vma = vma->vm_next)
-			if (vma->vm_file &&
-			    path_equal(&vma->vm_file->f_path,
+		down_read(&mm->mmap_sem);
+		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			if (!vma->vm_file)
+				continue;
+			if (path_equal(&vma->vm_file->f_path,
 				       &mm->exe_file->f_path))
-				goto exit;
+				goto exit_err;
+		}
+
+		up_read(&mm->mmap_sem);
+		fput(exe_file);
 	}
 
 	/*
@@ -1702,10 +1708,18 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
 		goto exit;
 
 	err = 0;
-	set_mm_exe_file(mm, exe.file);	/* this grabs a reference to exe.file */
+	/* set the new file, lockless */
+	get_file(exe.file);
+	old_exe = xchg(&mm->exe_file, exe.file);
+	if (old_exe)
+		fput(old_exe);
 exit:
 	fdput(exe);
 	return err;
+exit_err:
+	up_read(&mm->mmap_sem);
+	fput(exe_file);
+	goto exit;
 }
 
 #ifdef CONFIG_CHECKPOINT_RESTORE
@@ -1840,10 +1854,9 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
 		user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
 	}
 
-	down_write(&mm->mmap_sem);
 	if (prctl_map.exe_fd != (u32)-1)
-		error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd);
-	downgrade_write(&mm->mmap_sem);
+		error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
+	down_read(&mm->mmap_sem);
 	if (error)
 		goto out;
 
@@ -1909,12 +1922,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
 	if (!capable(CAP_SYS_RESOURCE))
 		return -EPERM;
 
-	if (opt == PR_SET_MM_EXE_FILE) {
-		down_write(&mm->mmap_sem);
-		error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr);
-		up_write(&mm->mmap_sem);
-		return error;
-	}
+	if (opt == PR_SET_MM_EXE_FILE)
+		return prctl_set_mm_exe_file(mm, (unsigned int)addr);
 
 	if (addr >= TASK_SIZE || addr < mmap_min_addr)
 		return -EINVAL;
-- 
2.1.4




^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: fix lockdep build in rcu-protected get_mm_exe_file()
  2015-03-23 18:11 ` Davidlohr Bueso
@ 2015-03-23 19:10   ` Oleg Nesterov
  2015-03-24 14:38     ` Davidlohr Bueso
  2015-03-24 17:13     ` Konstantin Khlebnikov
  0 siblings, 2 replies; 14+ messages in thread
From: Oleg Nesterov @ 2015-03-23 19:10 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Konstantin Khlebnikov, linux-mm, Andrew Morton, linux-kernel

On 03/23, Davidlohr Bueso wrote:
>
>  void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
>  {
>  	struct file *old_exe_file = rcu_dereference_protected(mm->exe_file,
> -			!atomic_read(&mm->mm_users) || current->in_execve ||
> -			lock_is_held(&mm->mmap_sem));
> +			!atomic_read(&mm->mm_users) || current->in_execve);

Thanks, looks correct at first glance...

But can't we remove the ->in_execve check above? and check

			atomic_read(&mm->mm_users) <= 1

instead. OK, this is subjective, I won't insist. Just current->in_execve
looks a bit confusing, it means "I swear, the caller is flush_old_exec()
and this mm is actualy bprm->mm".

"atomic_read(&mm->mm_users) <= 1" looks a bit more "safe". But again,
I won't insist.

Oleg.


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: fix lockdep build in rcu-protected get_mm_exe_file()
  2015-03-23 19:10   ` Oleg Nesterov
@ 2015-03-24 14:38     ` Davidlohr Bueso
  2015-03-24 17:13     ` Konstantin Khlebnikov
  1 sibling, 0 replies; 14+ messages in thread
From: Davidlohr Bueso @ 2015-03-24 14:38 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Konstantin Khlebnikov, linux-mm, Andrew Morton, linux-kernel

On Mon, 2015-03-23 at 20:10 +0100, Oleg Nesterov wrote:
> "atomic_read(&mm->mm_users) <= 1" looks a bit more "safe". But again,
> I won't insist.

Agreed, it is nicer to do that check, but I have no strong preference
either, perhaps Konstantin or akpm do. Anyway, here's the change that
can be folded in if you guys want to.

Thanks.

8<-------------------------------------------------
diff --git a/kernel/fork.c b/kernel/fork.c
index 98858b5..0c3de2b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -697,7 +697,7 @@ EXPORT_SYMBOL_GPL(mmput);
 void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
 {
 	struct file *old_exe_file = rcu_dereference_protected(mm->exe_file,
-			!atomic_read(&mm->mm_users) || current->in_execve);
+			atomic_read(&mm->mm_users) <= 1);
 
 	if (new_exe_file)
 		get_file(new_exe_file);



^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: fix lockdep build in rcu-protected get_mm_exe_file()
  2015-03-23 19:10   ` Oleg Nesterov
  2015-03-24 14:38     ` Davidlohr Bueso
@ 2015-03-24 17:13     ` Konstantin Khlebnikov
  2015-03-24 18:10       ` Oleg Nesterov
  1 sibling, 1 reply; 14+ messages in thread
From: Konstantin Khlebnikov @ 2015-03-24 17:13 UTC (permalink / raw)
  To: Oleg Nesterov, Davidlohr Bueso; +Cc: linux-mm, Andrew Morton, linux-kernel

On 23.03.2015 22:10, Oleg Nesterov wrote:
> On 03/23, Davidlohr Bueso wrote:
>>
>>   void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
>>   {
>>   	struct file *old_exe_file = rcu_dereference_protected(mm->exe_file,
>> -			!atomic_read(&mm->mm_users) || current->in_execve ||
>> -			lock_is_held(&mm->mmap_sem));
>> +			!atomic_read(&mm->mm_users) || current->in_execve);
>
> Thanks, looks correct at first glance...
>
> But can't we remove the ->in_execve check above? and check
>
> 			atomic_read(&mm->mm_users) <= 1
>
> instead. OK, this is subjective, I won't insist. Just current->in_execve
> looks a bit confusing, it means "I swear, the caller is flush_old_exec()
> and this mm is actualy bprm->mm".
>
> "atomic_read(&mm->mm_users) <= 1" looks a bit more "safe". But again,
> I won't insist.

Not so safe: this will race with get_task_mm().
A lot of proc files grab temporary reference to task mm.
But this just a debug -- we can place here "true".

>
> Oleg.
>

-- 
Konstantin

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: fix lockdep build in rcu-protected get_mm_exe_file()
  2015-03-24 17:13     ` Konstantin Khlebnikov
@ 2015-03-24 18:10       ` Oleg Nesterov
  2015-03-24 18:15         ` Konstantin Khlebnikov
  0 siblings, 1 reply; 14+ messages in thread
From: Oleg Nesterov @ 2015-03-24 18:10 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: Davidlohr Bueso, linux-mm, Andrew Morton, linux-kernel

On 03/24, Konstantin Khlebnikov wrote:
>
> On 23.03.2015 22:10, Oleg Nesterov wrote:
>> On 03/23, Davidlohr Bueso wrote:
>>>
>>>   void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
>>>   {
>>>   	struct file *old_exe_file = rcu_dereference_protected(mm->exe_file,
>>> -			!atomic_read(&mm->mm_users) || current->in_execve ||
>>> -			lock_is_held(&mm->mmap_sem));
>>> +			!atomic_read(&mm->mm_users) || current->in_execve);
>>
>> Thanks, looks correct at first glance...
>>
>> But can't we remove the ->in_execve check above? and check
>>
>> 			atomic_read(&mm->mm_users) <= 1
>>
>> instead. OK, this is subjective, I won't insist. Just current->in_execve
>> looks a bit confusing, it means "I swear, the caller is flush_old_exec()
>> and this mm is actualy bprm->mm".
>>
>> "atomic_read(&mm->mm_users) <= 1" looks a bit more "safe". But again,
>> I won't insist.
>
> Not so safe: this will race with get_task_mm().

How?

If set_mm_exe_file() can race with get_task_mm() then we have a bug.
And it will be reported ;)

> A lot of proc files grab temporary reference to task mm.
> But this just a debug -- we can place here "true".

Yeees, probably rcu_dereference_raw() would be even better. set_mm_exe_file()
must be called only if nobody but us can access this mm.

Oleg.


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: fix lockdep build in rcu-protected get_mm_exe_file()
  2015-03-24 18:10       ` Oleg Nesterov
@ 2015-03-24 18:15         ` Konstantin Khlebnikov
  2015-03-24 19:02           ` Oleg Nesterov
  0 siblings, 1 reply; 14+ messages in thread
From: Konstantin Khlebnikov @ 2015-03-24 18:15 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Konstantin Khlebnikov, Davidlohr Bueso, linux-mm, Andrew Morton,
	Linux Kernel Mailing List

On Tue, Mar 24, 2015 at 9:10 PM, Oleg Nesterov <oleg@redhat.com> wrote:
> On 03/24, Konstantin Khlebnikov wrote:
>>
>> On 23.03.2015 22:10, Oleg Nesterov wrote:
>>> On 03/23, Davidlohr Bueso wrote:
>>>>
>>>>   void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
>>>>   {
>>>>     struct file *old_exe_file = rcu_dereference_protected(mm->exe_file,
>>>> -                   !atomic_read(&mm->mm_users) || current->in_execve ||
>>>> -                   lock_is_held(&mm->mmap_sem));
>>>> +                   !atomic_read(&mm->mm_users) || current->in_execve);
>>>
>>> Thanks, looks correct at first glance...
>>>
>>> But can't we remove the ->in_execve check above? and check
>>>
>>>                      atomic_read(&mm->mm_users) <= 1
>>>
>>> instead. OK, this is subjective, I won't insist. Just current->in_execve
>>> looks a bit confusing, it means "I swear, the caller is flush_old_exec()
>>> and this mm is actualy bprm->mm".
>>>
>>> "atomic_read(&mm->mm_users) <= 1" looks a bit more "safe". But again,
>>> I won't insist.
>>
>> Not so safe: this will race with get_task_mm().
>
> How?

I mean rcu/lockdep debug migh race with get_task_mm() and generate
false-positive warning about non-protected rcu_dereference.

>
> If set_mm_exe_file() can race with get_task_mm() then we have a bug.
> And it will be reported ;)
>
>> A lot of proc files grab temporary reference to task mm.
>> But this just a debug -- we can place here "true".
>
> Yeees, probably rcu_dereference_raw() would be even better. set_mm_exe_file()
> must be called only if nobody but us can access this mm.

Yep.

>
> Oleg.
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: fix lockdep build in rcu-protected get_mm_exe_file()
  2015-03-24 18:15         ` Konstantin Khlebnikov
@ 2015-03-24 19:02           ` Oleg Nesterov
  2015-03-25  1:30             ` [PATCH v2] prctl: avoid using mmap_sem for exe_file serialization Davidlohr Bueso
  0 siblings, 1 reply; 14+ messages in thread
From: Oleg Nesterov @ 2015-03-24 19:02 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: Konstantin Khlebnikov, Davidlohr Bueso, linux-mm, Andrew Morton,
	Linux Kernel Mailing List

On 03/24, Konstantin Khlebnikov wrote:
>
> On Tue, Mar 24, 2015 at 9:10 PM, Oleg Nesterov <oleg@redhat.com> wrote:
> >>>
> >>> "atomic_read(&mm->mm_users) <= 1" looks a bit more "safe". But again,
> >>> I won't insist.
> >>
> >> Not so safe: this will race with get_task_mm().
> >
> > How?
>
> I mean rcu/lockdep debug migh race with get_task_mm() and generate
> false-positive warning about non-protected rcu_dereference.

Still can't understand, I think it can't... and if it could, then this
warning would not be false positive.

Anut this doesn't matter because we seem to agree this check should go away.

> > Yeees, probably rcu_dereference_raw() would be even better. set_mm_exe_file()
> > must be called only if nobody but us can access this mm.
>
> Yep.

Great. Davidlohr will you agree?

Oleg.


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v2] prctl: avoid using mmap_sem for exe_file serialization
  2015-03-24 19:02           ` Oleg Nesterov
@ 2015-03-25  1:30             ` Davidlohr Bueso
  2015-03-25  9:21               ` Konstantin Khlebnikov
  0 siblings, 1 reply; 14+ messages in thread
From: Davidlohr Bueso @ 2015-03-25  1:30 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Konstantin Khlebnikov, Konstantin Khlebnikov, linux-mm,
	Andrew Morton, Linux Kernel Mailing List

Oleg cleverly suggested using xchg() to set the new
mm->exe_file instead of calling set_mm_exe_file()
which requires some form of serialization -- mmap_sem
in this case. For archs that do not have atomic rmw
instructions we still fallback to a spinlock alternative,
so this should always be safe.  As such, we only need the
mmap_sem for looking up the backing vm_file, which can be
done sharing the lock. Naturally, this means we need to
manually deal with both the new and old file reference
counting, and we need not worry about the MMF_EXE_FILE_CHANGED
bits, which can probably be deleted in the future anyway.

Suggested-by: Oleg Nesterov <oleg@redhat.com>
Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---
Changes from v1:
	- use rcu_dereference_raw()
	- comment lockless use in flush_old_exec()

 fs/exec.c     |  6 ++++++
 kernel/fork.c | 19 +++++++++++++------
 kernel/sys.c  | 43 ++++++++++++++++++++++++++-----------------
 3 files changed, 45 insertions(+), 23 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 314e8d8..02bfd98 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1082,7 +1082,13 @@ int flush_old_exec(struct linux_binprm * bprm)
 	if (retval)
 		goto out;
 
+	/*
+	 * Must be called _before_ exec_mmap() as bprm->mm is
+	 * not visible until then. This also enables the update
+	 * to be lockless.
+	 */
 	set_mm_exe_file(bprm->mm, bprm->file);
+
 	/*
 	 * Release all of the old mmap stuff
 	 */
diff --git a/kernel/fork.c b/kernel/fork.c
index 3847f34..347f69c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -688,15 +688,22 @@ EXPORT_SYMBOL_GPL(mmput);
  *
  * This changes mm's executale file (shown as symlink /proc/[pid]/exe).
  *
- * Main users are mmput(), sys_execve() and sys_prctl(PR_SET_MM_MAP/EXE_FILE).
- * Callers prevent concurrent invocations: in mmput() nobody alive left,
- * in execve task is single-threaded, prctl holds mmap_sem exclusively.
+ * Main users are mmput() and sys_execve(). Callers prevent concurrent
+ * invocations: in mmput() nobody alive left, in execve task is single
+ * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the
+ * mm->exe_file, but does so without using set_mm_exe_file() in order
+ * to do avoid the need for any locks.
  */
 void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
 {
-	struct file *old_exe_file = rcu_dereference_protected(mm->exe_file,
-			!atomic_read(&mm->mm_users) || current->in_execve ||
-			lockdep_is_held(&mm->mmap_sem));
+	struct file *old_exe_file;
+
+	/*
+	 * It is safe to dereference the exe_file without RCU as
+	 * this function is only called if nobody else can access
+	 * this mm -- see comment above for justification.
+	 */
+	old_exe_file = rcu_dereference_raw(mm->exe_file);
 
 	if (new_exe_file)
 		get_file(new_exe_file);
diff --git a/kernel/sys.c b/kernel/sys.c
index 3be3449..1da6b17 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1649,14 +1649,13 @@ SYSCALL_DEFINE1(umask, int, mask)
 	return mask;
 }
 
-static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
+static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
 {
 	struct fd exe;
+	struct file *old_exe, *exe_file;
 	struct inode *inode;
 	int err;
 
-	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
-
 	exe = fdget(fd);
 	if (!exe.file)
 		return -EBADF;
@@ -1680,15 +1679,22 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
 	/*
 	 * Forbid mm->exe_file change if old file still mapped.
 	 */
+	exe_file = get_mm_exe_file(mm);
 	err = -EBUSY;
 	if (mm->exe_file) {
 		struct vm_area_struct *vma;
 
-		for (vma = mm->mmap; vma; vma = vma->vm_next)
-			if (vma->vm_file &&
-			    path_equal(&vma->vm_file->f_path,
+		down_read(&mm->mmap_sem);
+		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			if (!vma->vm_file)
+				continue;
+			if (path_equal(&vma->vm_file->f_path,
 				       &mm->exe_file->f_path))
-				goto exit;
+				goto exit_err;
+		}
+
+		up_read(&mm->mmap_sem);
+		fput(exe_file);
 	}
 
 	/*
@@ -1702,10 +1708,18 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
 		goto exit;
 
 	err = 0;
-	set_mm_exe_file(mm, exe.file);	/* this grabs a reference to exe.file */
+	/* set the new file, lockless */
+	get_file(exe.file);
+	old_exe = xchg(&mm->exe_file, exe.file);
+	if (old_exe)
+		fput(old_exe);
 exit:
 	fdput(exe);
 	return err;
+exit_err:
+	up_read(&mm->mmap_sem);
+	fput(exe_file);
+	goto exit;
 }
 
 #ifdef CONFIG_CHECKPOINT_RESTORE
@@ -1840,10 +1854,9 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
 		user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
 	}
 
-	down_write(&mm->mmap_sem);
 	if (prctl_map.exe_fd != (u32)-1)
-		error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd);
-	downgrade_write(&mm->mmap_sem);
+		error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
+	down_read(&mm->mmap_sem);
 	if (error)
 		goto out;
 
@@ -1909,12 +1922,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
 	if (!capable(CAP_SYS_RESOURCE))
 		return -EPERM;
 
-	if (opt == PR_SET_MM_EXE_FILE) {
-		down_write(&mm->mmap_sem);
-		error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr);
-		up_write(&mm->mmap_sem);
-		return error;
-	}
+	if (opt == PR_SET_MM_EXE_FILE)
+		return prctl_set_mm_exe_file(mm, (unsigned int)addr);
 
 	if (addr >= TASK_SIZE || addr < mmap_min_addr)
 		return -EINVAL;
-- 
2.1.4




^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] prctl: avoid using mmap_sem for exe_file serialization
  2015-03-25  1:30             ` [PATCH v2] prctl: avoid using mmap_sem for exe_file serialization Davidlohr Bueso
@ 2015-03-25  9:21               ` Konstantin Khlebnikov
  2015-03-25 10:42                 ` [PATCH v3] " Davidlohr Bueso
  0 siblings, 1 reply; 14+ messages in thread
From: Konstantin Khlebnikov @ 2015-03-25  9:21 UTC (permalink / raw)
  To: Davidlohr Bueso, Oleg Nesterov
  Cc: Konstantin Khlebnikov, linux-mm, Andrew Morton,
	Linux Kernel Mailing List

On 25.03.2015 04:30, Davidlohr Bueso wrote:
> Oleg cleverly suggested using xchg() to set the new
> mm->exe_file instead of calling set_mm_exe_file()
> which requires some form of serialization -- mmap_sem
> in this case. For archs that do not have atomic rmw
> instructions we still fallback to a spinlock alternative,
> so this should always be safe.  As such, we only need the
> mmap_sem for looking up the backing vm_file, which can be
> done sharing the lock. Naturally, this means we need to
> manually deal with both the new and old file reference
> counting, and we need not worry about the MMF_EXE_FILE_CHANGED
> bits, which can probably be deleted in the future anyway.
>
> Suggested-by: Oleg Nesterov <oleg@redhat.com>
> Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
> Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
> ---
> Changes from v1:
> 	- use rcu_dereference_raw()
> 	- comment lockless use in flush_old_exec()
>
>   fs/exec.c     |  6 ++++++
>   kernel/fork.c | 19 +++++++++++++------
>   kernel/sys.c  | 43 ++++++++++++++++++++++++++-----------------
>   3 files changed, 45 insertions(+), 23 deletions(-)
>
> diff --git a/fs/exec.c b/fs/exec.c
> index 314e8d8..02bfd98 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1082,7 +1082,13 @@ int flush_old_exec(struct linux_binprm * bprm)
>   	if (retval)
>   		goto out;
>
> +	/*
> +	 * Must be called _before_ exec_mmap() as bprm->mm is
> +	 * not visible until then. This also enables the update
> +	 * to be lockless.
> +	 */
>   	set_mm_exe_file(bprm->mm, bprm->file);
> +
>   	/*
>   	 * Release all of the old mmap stuff
>   	 */
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 3847f34..347f69c 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -688,15 +688,22 @@ EXPORT_SYMBOL_GPL(mmput);
>    *
>    * This changes mm's executale file (shown as symlink /proc/[pid]/exe).
>    *
> - * Main users are mmput(), sys_execve() and sys_prctl(PR_SET_MM_MAP/EXE_FILE).
> - * Callers prevent concurrent invocations: in mmput() nobody alive left,
> - * in execve task is single-threaded, prctl holds mmap_sem exclusively.
> + * Main users are mmput() and sys_execve(). Callers prevent concurrent
> + * invocations: in mmput() nobody alive left, in execve task is single
> + * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the
> + * mm->exe_file, but does so without using set_mm_exe_file() in order
> + * to do avoid the need for any locks.
>    */
>   void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
>   {
> -	struct file *old_exe_file = rcu_dereference_protected(mm->exe_file,
> -			!atomic_read(&mm->mm_users) || current->in_execve ||
> -			lockdep_is_held(&mm->mmap_sem));
> +	struct file *old_exe_file;
> +
> +	/*
> +	 * It is safe to dereference the exe_file without RCU as
> +	 * this function is only called if nobody else can access
> +	 * this mm -- see comment above for justification.
> +	 */
> +	old_exe_file = rcu_dereference_raw(mm->exe_file);
>
>   	if (new_exe_file)
>   		get_file(new_exe_file);
> diff --git a/kernel/sys.c b/kernel/sys.c
> index 3be3449..1da6b17 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -1649,14 +1649,13 @@ SYSCALL_DEFINE1(umask, int, mask)
>   	return mask;
>   }
>
> -static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
> +static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
>   {
>   	struct fd exe;
> +	struct file *old_exe, *exe_file;
>   	struct inode *inode;
>   	int err;
>
> -	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
> -
>   	exe = fdget(fd);
>   	if (!exe.file)
>   		return -EBADF;
> @@ -1680,15 +1679,22 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
>   	/*
>   	 * Forbid mm->exe_file change if old file still mapped.
>   	 */
> +	exe_file = get_mm_exe_file(mm);
>   	err = -EBUSY;
>   	if (mm->exe_file) {

exe_file not mm->exe_file

>   		struct vm_area_struct *vma;
>
> -		for (vma = mm->mmap; vma; vma = vma->vm_next)
> -			if (vma->vm_file &&
> -			    path_equal(&vma->vm_file->f_path,
> +		down_read(&mm->mmap_sem);
> +		for (vma = mm->mmap; vma; vma = vma->vm_next) {
> +			if (!vma->vm_file)
> +				continue;
> +			if (path_equal(&vma->vm_file->f_path,
>   				       &mm->exe_file->f_path))

same here

> -				goto exit;
> +				goto exit_err;
> +		}
> +
> +		up_read(&mm->mmap_sem);
> +		fput(exe_file);
>   	}
>
>   	/*
> @@ -1702,10 +1708,18 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
>   		goto exit;
>
>   	err = 0;
> -	set_mm_exe_file(mm, exe.file);	/* this grabs a reference to exe.file */
> +	/* set the new file, lockless */
> +	get_file(exe.file);
> +	old_exe = xchg(&mm->exe_file, exe.file);
> +	if (old_exe)
> +		fput(old_exe);
>   exit:
>   	fdput(exe);
>   	return err;
> +exit_err:
> +	up_read(&mm->mmap_sem);
> +	fput(exe_file);
> +	goto exit;
>   }
>
>   #ifdef CONFIG_CHECKPOINT_RESTORE
> @@ -1840,10 +1854,9 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
>   		user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
>   	}
>
> -	down_write(&mm->mmap_sem);
>   	if (prctl_map.exe_fd != (u32)-1)
> -		error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd);
> -	downgrade_write(&mm->mmap_sem);
> +		error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
> +	down_read(&mm->mmap_sem);
>   	if (error)
>   		goto out;
>
> @@ -1909,12 +1922,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
>   	if (!capable(CAP_SYS_RESOURCE))
>   		return -EPERM;
>
> -	if (opt == PR_SET_MM_EXE_FILE) {
> -		down_write(&mm->mmap_sem);
> -		error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr);
> -		up_write(&mm->mmap_sem);
> -		return error;
> -	}
> +	if (opt == PR_SET_MM_EXE_FILE)
> +		return prctl_set_mm_exe_file(mm, (unsigned int)addr);
>
>   	if (addr >= TASK_SIZE || addr < mmap_min_addr)
>   		return -EINVAL;
>


-- 
Konstantin

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v3] prctl: avoid using mmap_sem for exe_file serialization
  2015-03-25  9:21               ` Konstantin Khlebnikov
@ 2015-03-25 10:42                 ` Davidlohr Bueso
  2015-03-25 11:08                   ` Konstantin Khlebnikov
  2015-03-25 12:53                   ` Oleg Nesterov
  0 siblings, 2 replies; 14+ messages in thread
From: Davidlohr Bueso @ 2015-03-25 10:42 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: Oleg Nesterov, Konstantin Khlebnikov, linux-mm, Andrew Morton,
	Linux Kernel Mailing List

Oleg cleverly suggested using xchg() to set the new
mm->exe_file instead of calling set_mm_exe_file()
which requires some form of serialization -- mmap_sem
in this case. For archs that do not have atomic rmw
instructions we still fallback to a spinlock alternative,
so this should always be safe.  As such, we only need the
mmap_sem for looking up the backing vm_file, which can be
done sharing the lock. Naturally, this means we need to
manually deal with both the new and old file reference
counting, and we need not worry about the MMF_EXE_FILE_CHANGED
bits, which can probably be deleted in the future anyway.

Suggested-by: Oleg Nesterov <oleg@redhat.com>
Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>, 
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---

Changes from v2: use correct exe_file (sigh), per Konstantin.

 fs/exec.c     |  6 ++++++
 kernel/fork.c | 19 +++++++++++++------
 kernel/sys.c  | 47 ++++++++++++++++++++++++++++-------------------
 3 files changed, 47 insertions(+), 25 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 314e8d8..02bfd98 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1082,7 +1082,13 @@ int flush_old_exec(struct linux_binprm * bprm)
 	if (retval)
 		goto out;
 
+	/*
+	 * Must be called _before_ exec_mmap() as bprm->mm is
+	 * not visibile until then. This also enables the update
+	 * to be lockless.
+	 */
 	set_mm_exe_file(bprm->mm, bprm->file);
+
 	/*
 	 * Release all of the old mmap stuff
 	 */
diff --git a/kernel/fork.c b/kernel/fork.c
index 3847f34..347f69c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -688,15 +688,22 @@ EXPORT_SYMBOL_GPL(mmput);
  *
  * This changes mm's executale file (shown as symlink /proc/[pid]/exe).
  *
- * Main users are mmput(), sys_execve() and sys_prctl(PR_SET_MM_MAP/EXE_FILE).
- * Callers prevent concurrent invocations: in mmput() nobody alive left,
- * in execve task is single-threaded, prctl holds mmap_sem exclusively.
+ * Main users are mmput() and sys_execve(). Callers prevent concurrent
+ * invocations: in mmput() nobody alive left, in execve task is single
+ * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the
+ * mm->exe_file, but does so without using set_mm_exe_file() in order
+ * to do avoid the need for any locks.
  */
 void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
 {
-	struct file *old_exe_file = rcu_dereference_protected(mm->exe_file,
-			!atomic_read(&mm->mm_users) || current->in_execve ||
-			lockdep_is_held(&mm->mmap_sem));
+	struct file *old_exe_file;
+
+	/*
+	 * It is safe to dereference the exe_file without RCU as
+	 * this function is only called if nobody else can access
+	 * this mm -- see comment above for justification.
+	 */
+	old_exe_file = rcu_dereference_raw(mm->exe_file);
 
 	if (new_exe_file)
 		get_file(new_exe_file);
diff --git a/kernel/sys.c b/kernel/sys.c
index 3be3449..a4e372b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1649,14 +1649,13 @@ SYSCALL_DEFINE1(umask, int, mask)
 	return mask;
 }
 
-static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
+static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
 {
 	struct fd exe;
+	struct file *old_exe, *exe_file;
 	struct inode *inode;
 	int err;
 
-	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
-
 	exe = fdget(fd);
 	if (!exe.file)
 		return -EBADF;
@@ -1680,15 +1679,22 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
 	/*
 	 * Forbid mm->exe_file change if old file still mapped.
 	 */
+	exe_file = get_mm_exe_file(mm);
 	err = -EBUSY;
-	if (mm->exe_file) {
+	if (exe_file) {
 		struct vm_area_struct *vma;
 
-		for (vma = mm->mmap; vma; vma = vma->vm_next)
-			if (vma->vm_file &&
-			    path_equal(&vma->vm_file->f_path,
-				       &mm->exe_file->f_path))
-				goto exit;
+		down_read(&mm->mmap_sem);
+		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			if (!vma->vm_file)
+				continue;
+			if (path_equal(&vma->vm_file->f_path,
+				       &exe_file->f_path))
+				goto exit_err;
+		}
+
+		up_read(&mm->mmap_sem);
+		fput(exe_file);
 	}
 
 	/*
@@ -1702,10 +1708,18 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
 		goto exit;
 
 	err = 0;
-	set_mm_exe_file(mm, exe.file);	/* this grabs a reference to exe.file */
+	/* set the new file, lockless */
+	get_file(exe.file);
+	old_exe = xchg(&mm->exe_file, exe.file);
+	if (old_exe)
+		fput(old_exe);
 exit:
 	fdput(exe);
 	return err;
+exit_err:
+	up_read(&mm->mmap_sem);
+	fput(exe_file);
+	goto exit;
 }
 
 #ifdef CONFIG_CHECKPOINT_RESTORE
@@ -1840,10 +1854,9 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
 		user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
 	}
 
-	down_write(&mm->mmap_sem);
 	if (prctl_map.exe_fd != (u32)-1)
-		error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd);
-	downgrade_write(&mm->mmap_sem);
+		error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
+	down_read(&mm->mmap_sem);
 	if (error)
 		goto out;
 
@@ -1909,12 +1922,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
 	if (!capable(CAP_SYS_RESOURCE))
 		return -EPERM;
 
-	if (opt == PR_SET_MM_EXE_FILE) {
-		down_write(&mm->mmap_sem);
-		error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr);
-		up_write(&mm->mmap_sem);
-		return error;
-	}
+	if (opt == PR_SET_MM_EXE_FILE)
+		return prctl_set_mm_exe_file(mm, (unsigned int)addr);
 
 	if (addr >= TASK_SIZE || addr < mmap_min_addr)
 		return -EINVAL;
-- 
2.1.4




^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v3] prctl: avoid using mmap_sem for exe_file serialization
  2015-03-25 10:42                 ` [PATCH v3] " Davidlohr Bueso
@ 2015-03-25 11:08                   ` Konstantin Khlebnikov
  2015-03-25 12:50                     ` Davidlohr Bueso
  2015-03-25 12:53                   ` Oleg Nesterov
  1 sibling, 1 reply; 14+ messages in thread
From: Konstantin Khlebnikov @ 2015-03-25 11:08 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Oleg Nesterov, Konstantin Khlebnikov, linux-mm, Andrew Morton,
	Linux Kernel Mailing List

On 25.03.2015 13:42, Davidlohr Bueso wrote:
> Oleg cleverly suggested using xchg() to set the new
> mm->exe_file instead of calling set_mm_exe_file()
> which requires some form of serialization -- mmap_sem
> in this case. For archs that do not have atomic rmw
> instructions we still fallback to a spinlock alternative,
> so this should always be safe.  As such, we only need the
> mmap_sem for looking up the backing vm_file, which can be
> done sharing the lock. Naturally, this means we need to
> manually deal with both the new and old file reference
> counting, and we need not worry about the MMF_EXE_FILE_CHANGED
> bits, which can probably be deleted in the future anyway.
>
> Suggested-by: Oleg Nesterov <oleg@redhat.com>
> Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
> Signed-off-by: Davidlohr Bueso <dbueso@suse.de>

Reviewed-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>

If this is preparation for future rework of mmap_sem maybe we could
postpone committing this patch.

> ---
>
> Changes from v2: use correct exe_file (sigh), per Konstantin.
>
>   fs/exec.c     |  6 ++++++
>   kernel/fork.c | 19 +++++++++++++------
>   kernel/sys.c  | 47 ++++++++++++++++++++++++++++-------------------
>   3 files changed, 47 insertions(+), 25 deletions(-)
>
> diff --git a/fs/exec.c b/fs/exec.c
> index 314e8d8..02bfd98 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1082,7 +1082,13 @@ int flush_old_exec(struct linux_binprm * bprm)
>   	if (retval)
>   		goto out;
>
> +	/*
> +	 * Must be called _before_ exec_mmap() as bprm->mm is
> +	 * not visibile until then. This also enables the update
> +	 * to be lockless.
> +	 */
>   	set_mm_exe_file(bprm->mm, bprm->file);
> +
>   	/*
>   	 * Release all of the old mmap stuff
>   	 */
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 3847f34..347f69c 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -688,15 +688,22 @@ EXPORT_SYMBOL_GPL(mmput);
>    *
>    * This changes mm's executale file (shown as symlink /proc/[pid]/exe).
>    *
> - * Main users are mmput(), sys_execve() and sys_prctl(PR_SET_MM_MAP/EXE_FILE).
> - * Callers prevent concurrent invocations: in mmput() nobody alive left,
> - * in execve task is single-threaded, prctl holds mmap_sem exclusively.
> + * Main users are mmput() and sys_execve(). Callers prevent concurrent
> + * invocations: in mmput() nobody alive left, in execve task is single
> + * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the
> + * mm->exe_file, but does so without using set_mm_exe_file() in order
> + * to do avoid the need for any locks.
>    */
>   void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
>   {
> -	struct file *old_exe_file = rcu_dereference_protected(mm->exe_file,
> -			!atomic_read(&mm->mm_users) || current->in_execve ||
> -			lockdep_is_held(&mm->mmap_sem));
> +	struct file *old_exe_file;
> +
> +	/*
> +	 * It is safe to dereference the exe_file without RCU as
> +	 * this function is only called if nobody else can access
> +	 * this mm -- see comment above for justification.
> +	 */
> +	old_exe_file = rcu_dereference_raw(mm->exe_file);
>
>   	if (new_exe_file)
>   		get_file(new_exe_file);
> diff --git a/kernel/sys.c b/kernel/sys.c
> index 3be3449..a4e372b 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -1649,14 +1649,13 @@ SYSCALL_DEFINE1(umask, int, mask)
>   	return mask;
>   }
>
> -static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
> +static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
>   {
>   	struct fd exe;
> +	struct file *old_exe, *exe_file;
>   	struct inode *inode;
>   	int err;
>
> -	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
> -
>   	exe = fdget(fd);
>   	if (!exe.file)
>   		return -EBADF;
> @@ -1680,15 +1679,22 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
>   	/*
>   	 * Forbid mm->exe_file change if old file still mapped.
>   	 */
> +	exe_file = get_mm_exe_file(mm);
>   	err = -EBUSY;
> -	if (mm->exe_file) {
> +	if (exe_file) {
>   		struct vm_area_struct *vma;
>
> -		for (vma = mm->mmap; vma; vma = vma->vm_next)
> -			if (vma->vm_file &&
> -			    path_equal(&vma->vm_file->f_path,
> -				       &mm->exe_file->f_path))
> -				goto exit;
> +		down_read(&mm->mmap_sem);
> +		for (vma = mm->mmap; vma; vma = vma->vm_next) {
> +			if (!vma->vm_file)
> +				continue;
> +			if (path_equal(&vma->vm_file->f_path,
> +				       &exe_file->f_path))
> +				goto exit_err;
> +		}
> +
> +		up_read(&mm->mmap_sem);
> +		fput(exe_file);
>   	}
>
>   	/*
> @@ -1702,10 +1708,18 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
>   		goto exit;
>
>   	err = 0;
> -	set_mm_exe_file(mm, exe.file);	/* this grabs a reference to exe.file */
> +	/* set the new file, lockless */
> +	get_file(exe.file);
> +	old_exe = xchg(&mm->exe_file, exe.file);
> +	if (old_exe)
> +		fput(old_exe);
>   exit:
>   	fdput(exe);
>   	return err;
> +exit_err:
> +	up_read(&mm->mmap_sem);
> +	fput(exe_file);
> +	goto exit;
>   }
>
>   #ifdef CONFIG_CHECKPOINT_RESTORE
> @@ -1840,10 +1854,9 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
>   		user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
>   	}
>
> -	down_write(&mm->mmap_sem);
>   	if (prctl_map.exe_fd != (u32)-1)
> -		error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd);
> -	downgrade_write(&mm->mmap_sem);
> +		error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
> +	down_read(&mm->mmap_sem);
>   	if (error)
>   		goto out;
>
> @@ -1909,12 +1922,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
>   	if (!capable(CAP_SYS_RESOURCE))
>   		return -EPERM;
>
> -	if (opt == PR_SET_MM_EXE_FILE) {
> -		down_write(&mm->mmap_sem);
> -		error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr);
> -		up_write(&mm->mmap_sem);
> -		return error;
> -	}
> +	if (opt == PR_SET_MM_EXE_FILE)
> +		return prctl_set_mm_exe_file(mm, (unsigned int)addr);
>
>   	if (addr >= TASK_SIZE || addr < mmap_min_addr)
>   		return -EINVAL;
>


-- 
Konstantin

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v3] prctl: avoid using mmap_sem for exe_file serialization
  2015-03-25 11:08                   ` Konstantin Khlebnikov
@ 2015-03-25 12:50                     ` Davidlohr Bueso
  0 siblings, 0 replies; 14+ messages in thread
From: Davidlohr Bueso @ 2015-03-25 12:50 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: Oleg Nesterov, Konstantin Khlebnikov, linux-mm, Andrew Morton,
	Linux Kernel Mailing List

On Wed, 2015-03-25 at 14:08 +0300, Konstantin Khlebnikov wrote:
> Reviewed-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>

Thanks.

> If this is preparation for future rework of mmap_sem maybe we could
> postpone committing this patch.

Personally, I think this patch is fine to commit for v4.1, along with
your rcu one, regardless of my mmap_sem work (which is still a wip).
Furthermore we've already got all the exe_file users updated now in
next, so I'd like to wrap this exe_file thing up if nobody has any
objections.


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v3] prctl: avoid using mmap_sem for exe_file serialization
  2015-03-25 10:42                 ` [PATCH v3] " Davidlohr Bueso
  2015-03-25 11:08                   ` Konstantin Khlebnikov
@ 2015-03-25 12:53                   ` Oleg Nesterov
  1 sibling, 0 replies; 14+ messages in thread
From: Oleg Nesterov @ 2015-03-25 12:53 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Konstantin Khlebnikov, Konstantin Khlebnikov, linux-mm,
	Andrew Morton, Linux Kernel Mailing List

On 03/25, Davidlohr Bueso wrote:
>
> Changes from v2: use correct exe_file (sigh), per Konstantin.

Looks good, thanks!

Acked-by: Oleg Nesterov <oleg@redhat.com>


^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2015-03-25 12:55 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-03-20 14:47 [PATCH] mm: fix lockdep build in rcu-protected get_mm_exe_file() Konstantin Khlebnikov
2015-03-23 18:11 ` Davidlohr Bueso
2015-03-23 19:10   ` Oleg Nesterov
2015-03-24 14:38     ` Davidlohr Bueso
2015-03-24 17:13     ` Konstantin Khlebnikov
2015-03-24 18:10       ` Oleg Nesterov
2015-03-24 18:15         ` Konstantin Khlebnikov
2015-03-24 19:02           ` Oleg Nesterov
2015-03-25  1:30             ` [PATCH v2] prctl: avoid using mmap_sem for exe_file serialization Davidlohr Bueso
2015-03-25  9:21               ` Konstantin Khlebnikov
2015-03-25 10:42                 ` [PATCH v3] " Davidlohr Bueso
2015-03-25 11:08                   ` Konstantin Khlebnikov
2015-03-25 12:50                     ` Davidlohr Bueso
2015-03-25 12:53                   ` Oleg Nesterov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).