All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] procfs: Do not release pid_ns->proc_mnt too early
@ 2010-06-16 15:58 ` Louis Rilling
  0 siblings, 0 replies; 31+ messages in thread
From: Louis Rilling @ 2010-06-16 15:58 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux Containers, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	Oleg Nesterov, Pavel Emelyanov

Detached tasks are not seen by zap_pid_ns_processes()->sys_wait4(), so
that release_task()->proc_flush_task() of container init can be called
before it is for some detached tasks in the namespace.

Pin proc_mnt's in copy_process(), so that proc_flush_task() becomes safe
whatever the ordering of tasks.

Signed-off-by: Louis Rilling <louis.rilling-aw0BnHfMbSpBDgjK7y7TUQ@public.gmane.org>
---
 fs/proc/base.c          |   17 +++++++++++++++++
 include/linux/proc_fs.h |    4 ++++
 kernel/fork.c           |    1 +
 3 files changed, 22 insertions(+), 0 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index acb7ef8..4d7328f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2663,6 +2663,23 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
 	.setattr	= proc_setattr,
 };
 
+/*
+ * Pin all proc_mnt so that detached tasks can safely call proc_flush_task()
+ * after container init calls itself proc_flush_task().
+ */
+void proc_new_task(struct task_struct *task)
+{
+	struct pid *pid;
+	int i;
+
+	if (!task->pid)
+		return;
+
+	pid = task_pid(task);
+	for (i = 0; i <= pid->level; i++)
+		mntget(pid->numbers[i].ns->proc_mnt);
+}
+
 static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
 {
 	struct dentry *dentry, *leader, *dir;
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 379eaed..f24faa1 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -104,6 +104,7 @@ struct vmcore {
 
 extern void proc_root_init(void);
 
+void proc_new_task(struct task_struct *task);
 void proc_flush_task(struct task_struct *task);
 
 extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
@@ -184,6 +185,9 @@ extern void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm);
 #define proc_net_fops_create(net, name, mode, fops)  ({ (void)(mode), NULL; })
 static inline void proc_net_remove(struct net *net, const char *name) {}
 
+static inline void proc_new_task(struct task_struct *task)
+{
+}
 static inline void proc_flush_task(struct task_struct *task)
 {
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index b6cce14..c6c2874 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1281,6 +1281,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	total_forks++;
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
+	proc_new_task(p);
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
 	perf_event_fork(p);
-- 
1.5.6.5

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH] procfs: Do not release pid_ns->proc_mnt too early
@ 2010-06-16 15:58 ` Louis Rilling
  0 siblings, 0 replies; 31+ messages in thread
From: Louis Rilling @ 2010-06-16 15:58 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Oleg Nesterov, Pavel Emelyanov, Linux Containers, linux-kernel,
	Louis Rilling

Detached tasks are not seen by zap_pid_ns_processes()->sys_wait4(), so
that release_task()->proc_flush_task() of container init can be called
before it is for some detached tasks in the namespace.

Pin proc_mnt's in copy_process(), so that proc_flush_task() becomes safe
whatever the ordering of tasks.

Signed-off-by: Louis Rilling <louis.rilling@kerlabs.com>
---
 fs/proc/base.c          |   17 +++++++++++++++++
 include/linux/proc_fs.h |    4 ++++
 kernel/fork.c           |    1 +
 3 files changed, 22 insertions(+), 0 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index acb7ef8..4d7328f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2663,6 +2663,23 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
 	.setattr	= proc_setattr,
 };
 
+/*
+ * Pin all proc_mnt so that detached tasks can safely call proc_flush_task()
+ * after container init calls itself proc_flush_task().
+ */
+void proc_new_task(struct task_struct *task)
+{
+	struct pid *pid;
+	int i;
+
+	if (!task->pid)
+		return;
+
+	pid = task_pid(task);
+	for (i = 0; i <= pid->level; i++)
+		mntget(pid->numbers[i].ns->proc_mnt);
+}
+
 static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
 {
 	struct dentry *dentry, *leader, *dir;
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 379eaed..f24faa1 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -104,6 +104,7 @@ struct vmcore {
 
 extern void proc_root_init(void);
 
+void proc_new_task(struct task_struct *task);
 void proc_flush_task(struct task_struct *task);
 
 extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
@@ -184,6 +185,9 @@ extern void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm);
 #define proc_net_fops_create(net, name, mode, fops)  ({ (void)(mode), NULL; })
 static inline void proc_net_remove(struct net *net, const char *name) {}
 
+static inline void proc_new_task(struct task_struct *task)
+{
+}
 static inline void proc_flush_task(struct task_struct *task)
 {
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index b6cce14..c6c2874 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1281,6 +1281,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	total_forks++;
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
+	proc_new_task(p);
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
 	perf_event_fork(p);
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
  2010-06-16 15:58 ` Louis Rilling
  (?)
@ 2010-06-16 16:04 ` Pavel Emelyanov
  2010-06-16 16:15   ` Louis Rilling
  2010-06-16 16:16   ` Louis Rilling
  -1 siblings, 2 replies; 31+ messages in thread
From: Pavel Emelyanov @ 2010-06-16 16:04 UTC (permalink / raw)
  To: Louis Rilling
  Cc: Andrew Morton, Oleg Nesterov, Linux Containers, linux-kernel

> +void proc_new_task(struct task_struct *task)
> +{
> +	struct pid *pid;
> +	int i;
> +
> +	if (!task->pid)
> +		return;
> +
> +	pid = task_pid(task);
> +	for (i = 0; i <= pid->level; i++)
> +		mntget(pid->numbers[i].ns->proc_mnt);

I feel I'm missing something significant, but this patch breaks
the mntget/mntput balance. Doesn't it?

> +}

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
  2010-06-16 16:04 ` Pavel Emelyanov
@ 2010-06-16 16:15   ` Louis Rilling
  2010-06-16 16:16   ` Louis Rilling
  1 sibling, 0 replies; 31+ messages in thread
From: Louis Rilling @ 2010-06-16 16:15 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: Andrew Morton, Oleg Nesterov, Linux Containers, linux-kernel

On 16/06/10 20:04 +0400, Pavel Emelyanov wrote:
> > +void proc_new_task(struct task_struct *task)
> > +{
> > +	struct pid *pid;
> > +	int i;
> > +
> > +	if (!task->pid)
> > +		return;
> > +
> > +	pid = task_pid(task);
> > +	for (i = 0; i <= pid->level; i++)
> > +		mntget(pid->numbers[i].ns->proc_mnt);
> 
> I feel I'm missing something significant, but this patch breaks
> the mntget/mntput balance. Doesn't it?

Why?

all mntget() here have their mntput() in proc_flush_task(). At least, this is
the intent...

Thanks,

Louis

-- 
Dr Louis Rilling			Kerlabs
Skype: louis.rilling			Batiment Germanium
Phone: (+33|0) 6 80 89 08 23		80 avenue des Buttes de Coesmes
http://www.kerlabs.com/			35700 Rennes

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
  2010-06-16 16:04 ` Pavel Emelyanov
  2010-06-16 16:15   ` Louis Rilling
@ 2010-06-16 16:16   ` Louis Rilling
  1 sibling, 0 replies; 31+ messages in thread
From: Louis Rilling @ 2010-06-16 16:16 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: Andrew Morton, Oleg Nesterov, Linux Containers, linux-kernel

On 16/06/10 20:04 +0400, Pavel Emelyanov wrote:
> > +void proc_new_task(struct task_struct *task)
> > +{
> > +	struct pid *pid;
> > +	int i;
> > +
> > +	if (!task->pid)
> > +		return;
> > +
> > +	pid = task_pid(task);
> > +	for (i = 0; i <= pid->level; i++)
> > +		mntget(pid->numbers[i].ns->proc_mnt);
> 
> I feel I'm missing something significant, but this patch breaks
> the mntget/mntput balance. Doesn't it?

Gah. Sorry, the part in proc_flush_task() is missing. Will resend ASAP.

Thanks,

Louis

> 
> > +}
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

-- 
Dr Louis Rilling			Kerlabs
Skype: louis.rilling			Batiment Germanium
Phone: (+33|0) 6 80 89 08 23		80 avenue des Buttes de Coesmes
http://www.kerlabs.com/			35700 Rennes

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
  2010-06-21 11:15             ` Louis Rilling
@ 2010-06-21 14:38               ` Oleg Nesterov
  0 siblings, 0 replies; 31+ messages in thread
From: Oleg Nesterov @ 2010-06-21 14:38 UTC (permalink / raw)
  To: Andrew Morton, Pavel Emelyanov, Linux Containers, linux-kernel

On 06/21, Louis Rilling wrote:
>
> On 18/06/10 19:55 +0200, Oleg Nesterov wrote:
>
> > Yes, this should be fixed, I already realized this. work->func(ns) is
> > called when ns is already fixed.
> >
> > Otherwise, nobody should use ns->proc_mount when ns is already dead/freed.
>
> I meant the opposite. proc_mnt can be kept mounted somewhere,

I think, no. If it is kept mounted vfsmount should be different.

> and accesses to it
> will likely try to access (freed) pid_ns from it.

Not sure, there must be no tasks (and pids) in this ns.

By anyway I agree. This needs more thinking and we should do something
with sb->s_fs_info.

But given that nobody (including me) seem to like this approach - let's
forget this patch.

Thanks,

Oleg.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
  2010-06-21 14:15                 ` Louis Rilling
@ 2010-06-21 14:26                   ` Eric W. Biederman
  0 siblings, 0 replies; 31+ messages in thread
From: Eric W. Biederman @ 2010-06-21 14:26 UTC (permalink / raw)
  To: Linux Containers
  Cc: Andrew Morton, Pavel Emelyanov, linux-kernel, Pavel Emelyanov

Louis Rilling <Louis.Rilling@kerlabs.com> writes:

> On 21/06/10  5:58 -0700, Eric W. Biederman wrote:
>> Louis Rilling <Louis.Rilling@kerlabs.com> writes:
>> 
>> > On 18/06/10 18:27 +0200, Oleg Nesterov wrote:
>> >> On 06/18, Louis Rilling wrote:
>> >> >
>> >> > On 17/06/10 23:36 +0200, Oleg Nesterov wrote:
>> >> > > On 06/17, Eric W. Biederman wrote:
>> >> > > >
>> >> > > > The task->children isn't changed until __unhash_process() which runs
>> >> > > > after flush_proc_task().
>> >> > >
>> >> > > Yes. But this is only the current implementation detail.
>> >> > > It would be nice to cleanup the code so that EXIT_DEAD tasks are
>> >> > > never sit in ->children list.
>> >> > >
>> >> > > > So we should be able to come up with
>> >> > > > a variant of do_wait() that zap_pid_ns_processes can use that does
>> >> > > > what we need.
>> >> > >
>> >> > > See above...
>> >> > >
>> >> > > Even if we modify do_wait() or add the new variant, how the caller
>> >> > > can wait for EXIT_DEAD tasks? I don't think we want to modify
>> >> > > release_task() to do __wake_up_parent() or something similar.
>> >> >
>> >> > Indeed, I was thinking about calling __wake_up_parent() from release_task()
>> >> > once parent->children becomes empty.
>> >> >
>> >> > Not sure about the performance impact though. Maybe some WAIT_NO_CHILDREN flag
>> >> > in parent->signal could limit it. But if EXIT_DEAD children are removed from
>> >> > ->children before release_task(), I'm afraid that this becomes impossible.
>> >> 
>> >> Thinking more, even the current do_wait() from zap_pid_ns_processes()
>> >> is not really good. Suppose that some none-init thread is ptraced, then
>> >> zap_pid_ns_processes() will hange until the tracer does do_wait() or
>> >> exits.
>> >
>> > Is this really a bad thing? If somebody ptraces a task in a pid namespace, that
>> > sounds reasonable to have this namespace (and it's init task) pinned.
>> 
>> Louis.  Have you seen this problem hit without my setns patch?
>
> Yes. I hit it with Kerrighed patches.  I also have an ugly reproducer on
> 2.6.35-rc3 (see attachments). Ugly because I introduced artifical delays
> in release_task(). I couldn't trigger the bug without it, probably because the
> scheduler is too kind :)
>
> I'm using memory poisoining (SLAB and DEBUG_SLAB) to make it easy to observe the
> bug.
>
> Example:
> # ./proc_flush_task-bug-reproducer 1
>
>> 
>> I'm pretty certain that this hits because there are processes do_wait
>> does not wait for, in particular processes in a disjoint process tree.
>
> Indeed do_wait() misses EXIT_DEAD children.
>
>> 
>> So at this point I am really favoring killing the do_wait and making
>> this all asynchronous.
>
> Any idea about how to do it?

Some variant of the patches Oleg just recently posted.  I'm still not
comfortable with the extending the kernel mount to the entire lifetime
of the pid_namespace.  But it certainly is better than a lot of the
alternatives.


Eric

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
  2010-06-21 12:58               ` Eric W. Biederman
@ 2010-06-21 14:15                 ` Louis Rilling
  2010-06-21 14:26                   ` Eric W. Biederman
  0 siblings, 1 reply; 31+ messages in thread
From: Louis Rilling @ 2010-06-21 14:15 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Linux Containers, Andrew Morton, Pavel Emelyanov, linux-kernel,
	Pavel Emelyanov


[-- Attachment #1.1: Type: text/plain, Size: 2869 bytes --]

On 21/06/10  5:58 -0700, Eric W. Biederman wrote:
> Louis Rilling <Louis.Rilling@kerlabs.com> writes:
> 
> > On 18/06/10 18:27 +0200, Oleg Nesterov wrote:
> >> On 06/18, Louis Rilling wrote:
> >> >
> >> > On 17/06/10 23:36 +0200, Oleg Nesterov wrote:
> >> > > On 06/17, Eric W. Biederman wrote:
> >> > > >
> >> > > > The task->children isn't changed until __unhash_process() which runs
> >> > > > after flush_proc_task().
> >> > >
> >> > > Yes. But this is only the current implementation detail.
> >> > > It would be nice to cleanup the code so that EXIT_DEAD tasks are
> >> > > never sit in ->children list.
> >> > >
> >> > > > So we should be able to come up with
> >> > > > a variant of do_wait() that zap_pid_ns_processes can use that does
> >> > > > what we need.
> >> > >
> >> > > See above...
> >> > >
> >> > > Even if we modify do_wait() or add the new variant, how the caller
> >> > > can wait for EXIT_DEAD tasks? I don't think we want to modify
> >> > > release_task() to do __wake_up_parent() or something similar.
> >> >
> >> > Indeed, I was thinking about calling __wake_up_parent() from release_task()
> >> > once parent->children becomes empty.
> >> >
> >> > Not sure about the performance impact though. Maybe some WAIT_NO_CHILDREN flag
> >> > in parent->signal could limit it. But if EXIT_DEAD children are removed from
> >> > ->children before release_task(), I'm afraid that this becomes impossible.
> >> 
> >> Thinking more, even the current do_wait() from zap_pid_ns_processes()
> >> is not really good. Suppose that some none-init thread is ptraced, then
> >> zap_pid_ns_processes() will hange until the tracer does do_wait() or
> >> exits.
> >
> > Is this really a bad thing? If somebody ptraces a task in a pid namespace, that
> > sounds reasonable to have this namespace (and it's init task) pinned.
> 
> Louis.  Have you seen this problem hit without my setns patch?

Yes. I hit it with Kerrighed patches. I also have an ugly reproducer on
2.6.35-rc3 (see attachments). Ugly because I introduced artifical delays
in release_task(). I couldn't trigger the bug without it, probably because the
scheduler is too kind :)

I'm using memory poisoining (SLAB and DEBUG_SLAB) to make it easy to observe the
bug.

Example:
# ./proc_flush_task-bug-reproducer 1

> 
> I'm pretty certain that this hits because there are processes do_wait
> does not wait for, in particular processes in a disjoint process tree.

Indeed do_wait() misses EXIT_DEAD children.

> 
> So at this point I am really favoring killing the do_wait and making
> this all asynchronous.

Any idea about how to do it?

Thanks,

Louis

-- 
Dr Louis Rilling			Kerlabs
Skype: louis.rilling			Batiment Germanium
Phone: (+33|0) 6 80 89 08 23		80 avenue des Buttes de Coesmes
http://www.kerlabs.com/			35700 Rennes

[-- Attachment #1.2: proc_flush_task-bug-reproducer.c --]
[-- Type: text/x-csrc, Size: 1605 bytes --]

#define _GNU_SOURCE

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <signal.h>
#include <linux/sched.h>

int pipefd[2];

int init(void *arg)
{
	int nr, i, err;
	sighandler_t sigret;
	char c;

	close(pipefd[0]);

	err = setsid();
	if (err < 0) {
		perror("setsid");
		abort();
	}

	sigret = signal(SIGCHLD, SIG_IGN);
	if (sigret == SIG_ERR) {
		fprintf(stderr, "signal\n");
		abort();
	}
	
	nr = atoi(arg);
	for (i = 0; i < nr; i++) {
		err = fork();
		if (err < 0) {
			perror("fork");
			abort();
		} else if (err == 0) {
			printf("%d before\n", getpid());
			fflush(stdout);
			pause();
			printf("%d after\n", getpid());
			fflush(stdout);
			return 0;
		}
	}

	err = write(pipefd[1], &c, 1);
	if (err != 1) {
		perror("write");
		abort();
	}

	pause();

	return 0;
}

int main(int argc, char *argv[])
{
	long stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size) + stack_size;
	pid_t pid;
	char c;
	int ret;

	ret = pipe(pipefd);
	if (ret) {
		perror("pipe");
		abort();
	}

	ret = clone(init, stack, CLONE_NEWPID | SIGCHLD, argv[1]);
	if (ret < 0) {
		perror("clone");
		abort();
	}
	pid = ret;
	printf("%d\n", pid);
	fflush(stdout);

	close(pipefd[1]);
	ret = read(pipefd[0], &c, 1);
	if (ret != 1) {
		if (ret) {
			perror("read");
			abort();
		} else {
			sleep(5);
		}
	}

	printf("killing %d\n", pid);
	fflush(stdout);
	ret = kill(-pid, SIGKILL);
	if (ret) {
		perror("kill");
		abort();
	}

	return 0;
}

[-- Attachment #1.3: reproducer.patch --]
[-- Type: text/x-diff, Size: 715 bytes --]

commit 7b7cae6ae5c543b8e9cc84fc041d9bce36e7b674
Author: Louis Rilling <louis.rilling@kerlabs.com>
Date:   Wed Jun 16 16:20:02 2010 +0200

    proc_flush_task() debug

diff --git a/kernel/exit.c b/kernel/exit.c
index ceffc67..be8cdb0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -169,6 +169,14 @@ repeat:
 	atomic_dec(&__task_cred(p)->user->processes);
 	rcu_read_unlock();
 
+	if (task_pid(p)->level > 0) {
+		if (!thread_group_leader(p) || !is_container_init(p)) {
+			__set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_timeout(10 * HZ);
+		}
+		printk("release_task: %d/%d\n", p->pid, task_pid(p)->numbers[1].nr);
+	}
+
 	proc_flush_task(p);
 
 	write_lock_irq(&tasklist_lock);

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 197 bytes --]

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
  2010-06-21 11:11             ` Louis Rilling
@ 2010-06-21 12:58               ` Eric W. Biederman
  2010-06-21 14:15                 ` Louis Rilling
  0 siblings, 1 reply; 31+ messages in thread
From: Eric W. Biederman @ 2010-06-21 12:58 UTC (permalink / raw)
  To: Louis Rilling
  Cc: Pavel Emelyanov, Andrew Morton, Pavel Emelyanov,
	Linux Containers, linux-kernel

Louis Rilling <Louis.Rilling@kerlabs.com> writes:

> On 18/06/10 18:27 +0200, Oleg Nesterov wrote:
>> On 06/18, Louis Rilling wrote:
>> >
>> > On 17/06/10 23:36 +0200, Oleg Nesterov wrote:
>> > > On 06/17, Eric W. Biederman wrote:
>> > > >
>> > > > The task->children isn't changed until __unhash_process() which runs
>> > > > after flush_proc_task().
>> > >
>> > > Yes. But this is only the current implementation detail.
>> > > It would be nice to cleanup the code so that EXIT_DEAD tasks are
>> > > never sit in ->children list.
>> > >
>> > > > So we should be able to come up with
>> > > > a variant of do_wait() that zap_pid_ns_processes can use that does
>> > > > what we need.
>> > >
>> > > See above...
>> > >
>> > > Even if we modify do_wait() or add the new variant, how the caller
>> > > can wait for EXIT_DEAD tasks? I don't think we want to modify
>> > > release_task() to do __wake_up_parent() or something similar.
>> >
>> > Indeed, I was thinking about calling __wake_up_parent() from release_task()
>> > once parent->children becomes empty.
>> >
>> > Not sure about the performance impact though. Maybe some WAIT_NO_CHILDREN flag
>> > in parent->signal could limit it. But if EXIT_DEAD children are removed from
>> > ->children before release_task(), I'm afraid that this becomes impossible.
>> 
>> Thinking more, even the current do_wait() from zap_pid_ns_processes()
>> is not really good. Suppose that some none-init thread is ptraced, then
>> zap_pid_ns_processes() will hange until the tracer does do_wait() or
>> exits.
>
> Is this really a bad thing? If somebody ptraces a task in a pid namespace, that
> sounds reasonable to have this namespace (and it's init task) pinned.

Louis.  Have you seen this problem hit without my setns patch?

I'm pretty certain that this hits because there are processes do_wait
does not wait for, in particular processes in a disjoint process tree.

So at this point I am really favoring killing the do_wait and making
this all asynchronous.

Eric

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
  2010-06-18 17:55             ` Oleg Nesterov
                               ` (2 preceding siblings ...)
  (?)
@ 2010-06-21 11:15             ` Louis Rilling
  2010-06-21 14:38               ` Oleg Nesterov
  -1 siblings, 1 reply; 31+ messages in thread
From: Louis Rilling @ 2010-06-21 11:15 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, Pavel Emelyanov, Linux Containers, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1136 bytes --]

On 18/06/10 19:55 +0200, Oleg Nesterov wrote:
> On 06/18, Louis Rilling wrote:
> >
> > On 18/06/10 18:08 +0200, Oleg Nesterov wrote:
> > >
> > > Not sure I ever understood this code. Certainly I can't say I understand
> > > it now. Still, do we really need this circle? I am almost sure the patch
> > > below is not right (and it wasn't tested at all), but could you take a
> > > look?
> >
> > I won't pretend understanding better than you do. Still I can try to give my 2
> > cents.
> >
> > Overall, I don't feel comfortable at being able to have a living proc_mnt
> > with a dead pid_ns.
> 
> Yes, this should be fixed, I already realized this. work->func(ns) is
> called when ns is already fixed.
> 
> Otherwise, nobody should use ns->proc_mount when ns is already dead/freed.

I meant the opposite. proc_mnt can be kept mounted somewhere, and accesses to it
will likely try to access (freed) pid_ns from it.

Thanks,

Louis

-- 
Dr Louis Rilling			Kerlabs
Skype: louis.rilling			Batiment Germanium
Phone: (+33|0) 6 80 89 08 23		80 avenue des Buttes de Coesmes
http://www.kerlabs.com/			35700 Rennes

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 197 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
  2010-06-18 16:27           ` Oleg Nesterov
@ 2010-06-21 11:11             ` Louis Rilling
  2010-06-21 12:58               ` Eric W. Biederman
  0 siblings, 1 reply; 31+ messages in thread
From: Louis Rilling @ 2010-06-21 11:11 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Eric W. Biederman, Pavel Emelyanov, Andrew Morton,
	Pavel Emelyanov, Linux Containers, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1835 bytes --]

On 18/06/10 18:27 +0200, Oleg Nesterov wrote:
> On 06/18, Louis Rilling wrote:
> >
> > On 17/06/10 23:36 +0200, Oleg Nesterov wrote:
> > > On 06/17, Eric W. Biederman wrote:
> > > >
> > > > The task->children isn't changed until __unhash_process() which runs
> > > > after flush_proc_task().
> > >
> > > Yes. But this is only the current implementation detail.
> > > It would be nice to cleanup the code so that EXIT_DEAD tasks are
> > > never sit in ->children list.
> > >
> > > > So we should be able to come up with
> > > > a variant of do_wait() that zap_pid_ns_processes can use that does
> > > > what we need.
> > >
> > > See above...
> > >
> > > Even if we modify do_wait() or add the new variant, how the caller
> > > can wait for EXIT_DEAD tasks? I don't think we want to modify
> > > release_task() to do __wake_up_parent() or something similar.
> >
> > Indeed, I was thinking about calling __wake_up_parent() from release_task()
> > once parent->children becomes empty.
> >
> > Not sure about the performance impact though. Maybe some WAIT_NO_CHILDREN flag
> > in parent->signal could limit it. But if EXIT_DEAD children are removed from
> > ->children before release_task(), I'm afraid that this becomes impossible.
> 
> Thinking more, even the current do_wait() from zap_pid_ns_processes()
> is not really good. Suppose that some none-init thread is ptraced, then
> zap_pid_ns_processes() will hange until the tracer does do_wait() or
> exits.

Is this really a bad thing? If somebody ptraces a task in a pid namespace, that
sounds reasonable to have this namespace (and it's init task) pinned.

Louis

-- 
Dr Louis Rilling			Kerlabs
Skype: louis.rilling			Batiment Germanium
Phone: (+33|0) 6 80 89 08 23		80 avenue des Buttes de Coesmes
http://www.kerlabs.com/			35700 Rennes

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 197 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
  2010-06-18 17:55             ` Oleg Nesterov
  (?)
  (?)
@ 2010-06-21 11:09             ` Louis Rilling
  -1 siblings, 0 replies; 31+ messages in thread
From: Louis Rilling @ 2010-06-21 11:09 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, Pavel Emelyanov, Linux Containers, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1628 bytes --]

On 18/06/10 19:55 +0200, Oleg Nesterov wrote:
> On 06/18, Louis Rilling wrote:
> > > @@ -74,7 +74,7 @@ static int proc_get_sb(struct file_syste
> > >  		ei = PROC_I(sb->s_root->d_inode);
> > >  		if (!ei->pid) {
> > >  			rcu_read_lock();
> > > -			ei->pid = get_pid(find_pid_ns(1, ns));
> > > +			ei->pid = find_pid_ns(1, ns);
> >
> > I don't think that this is correct. IIUC, proc_delete_inode() calls put_pid() on
> > ei->pid.
> 
> Yes,
> 
> > So either a special case is added in proc_delete_inode(), or we try to
> > live with get_pid() here. I'm actually not sure that we can pretend that this
> > pid remains valid if we don't get_pid() here.
> 
> But please see another change below,
> 
> > > +static void proc_mntput(struct work_struct *work)
> > >  {
> > > +	struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_put);
> > > +
> > > +	PROC_I(ns->proc_mnt->mnt_sb->s_root->d_inode)->pid = NULL;
> > >  	mntput(ns->proc_mnt);
> > >  }
> 
> it clears ei->pid.
> 
> We are called from free_pid_ns() path, this ->pid must not have any reference.
> Any get_pid() implies get_pid_ns().
> 
> What do you think?

Hm, I didn't look close enough. Sorry about that. However, I'm still concerned
with this since this pid can have been freed right after container init's
release_task(), and I don't see how it is guaranteed that nobody still tries to
access this proc_mnt.

Thanks,

Louis

-- 
Dr Louis Rilling			Kerlabs
Skype: louis.rilling			Batiment Germanium
Phone: (+33|0) 6 80 89 08 23		80 avenue des Buttes de Coesmes
http://www.kerlabs.com/			35700 Rennes

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 197 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
  2010-06-18 17:55             ` Oleg Nesterov
@ 2010-06-18 21:23               ` Oleg Nesterov
  -1 siblings, 0 replies; 31+ messages in thread
From: Oleg Nesterov @ 2010-06-18 21:23 UTC (permalink / raw)
  To: Andrew Morton, Pavel Emelyanov, Linux Containers, linux-kernel,
	Louis Rilling

On 06/18, Oleg Nesterov wrote:
>
> work->func(ns) is
> called when ns is already fixed.
                            ^^^^^
(I meant freed)

We can move kmem_cache_free(ns) into work->func(), or we can optimize
the usage of schedule_work(), something like the patch below.

Once again, it is completely untested, I do not pretend I understand
this code today, and I am sure the patch is wrong. I only try to discuss
the idea to break the circular reference. In any case, I am not proud
of this patch ;)

Or we should do the more sophisticated change suggested by Pavel. But
I'd like to avoid the changes in do_wait/release_task, this doesn't
look right to me.

Oleg.

 include/linux/pid_namespace.h |    1 +
 kernel/pid_namespace.c        |   35 ++++++++++++++++++++++++++++++++++-
 fs/proc/base.c                |    4 ----
 fs/proc/root.c                |   10 ++++++----
 4 files changed, 41 insertions(+), 9 deletions(-)

--- 34-rc1/include/linux/pid_namespace.h~PID_NS	2010-06-18 17:48:56.000000000 +0200
+++ 34-rc1/include/linux/pid_namespace.h	2010-06-18 22:37:45.000000000 +0200
@@ -26,6 +26,7 @@ struct pid_namespace {
 	struct pid_namespace *parent;
 #ifdef CONFIG_PROC_FS
 	struct vfsmount *proc_mnt;
+	struct list_head dead_node;
 #endif
 #ifdef CONFIG_BSD_PROCESS_ACCT
 	struct bsd_acct_struct *bacct;
--- 34-rc1/kernel/pid_namespace.c~PID_NS	2010-06-18 17:48:56.000000000 +0200
+++ 34-rc1/kernel/pid_namespace.c	2010-06-18 22:52:41.000000000 +0200
@@ -10,6 +10,7 @@
 
 #include <linux/pid.h>
 #include <linux/pid_namespace.h>
+#include <linux/proc_fs.h>
 #include <linux/syscalls.h>
 #include <linux/err.h>
 #include <linux/acct.h>
@@ -105,15 +106,47 @@ out:
 	return ERR_PTR(-ENOMEM);
 }
 
-static void destroy_pid_namespace(struct pid_namespace *ns)
+static LIST_HEAD(dead_list);
+static DEFINE_SPINLOCK(dead_lock);
+
+static void do_destroy_pid_namespace(struct pid_namespace *ns)
 {
 	int i;
 
+	pid_ns_release_proc(ns);
+
 	for (i = 0; i < PIDMAP_ENTRIES; i++)
 		kfree(ns->pidmap[i].page);
 	kmem_cache_free(pid_ns_cachep, ns);
 }
 
+static void dead_work_func(struct work_struct *unused)
+{
+	LIST_HEAD(list);
+	unsigned long flags;
+	struct pid_namespace *ns, *tmp;
+
+	spin_lock_irqsave(&dead_lock, flags);
+	list_splice_init(&dead_list, &list);
+	spin_unlock_irqrestore(&dead_lock, flags);
+
+	list_for_each_entry_safe(ns, tmp, &list, dead_node)
+		do_destroy_pid_namespace(ns);
+}
+
+static DECLARE_WORK(dead_work, dead_work_func);
+
+static void destroy_pid_namespace(struct pid_namespace *ns)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&dead_lock, flags);
+	list_add(&ns->dead_node, &dead_list);
+	spin_unlock_irqrestore(&dead_lock, flags);
+
+	schedule_work(&dead_work);
+}
+
 struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
 {
 	if (!(flags & CLONE_NEWPID))
--- 34-rc1/fs/proc/base.c~PID_NS	2010-06-18 17:48:56.000000000 +0200
+++ 34-rc1/fs/proc/base.c	2010-06-18 17:49:45.000000000 +0200
@@ -2720,10 +2720,6 @@ void proc_flush_task(struct task_struct 
 		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
 					tgid->numbers[i].nr);
 	}
-
-	upid = &pid->numbers[pid->level];
-	if (upid->nr == 1)
-		pid_ns_release_proc(upid->ns);
 }
 
 static struct dentry *proc_pid_instantiate(struct inode *dir,
--- 34-rc1/fs/proc/root.c~PID_NS	2010-06-18 17:48:56.000000000 +0200
+++ 34-rc1/fs/proc/root.c	2010-06-18 22:54:03.000000000 +0200
@@ -31,7 +31,7 @@ static int proc_set_super(struct super_b
 	struct pid_namespace *ns;
 
 	ns = (struct pid_namespace *)data;
-	sb->s_fs_info = get_pid_ns(ns);
+	sb->s_fs_info = ns;
 	return set_anon_super(sb, NULL);
 }
 
@@ -74,7 +74,7 @@ static int proc_get_sb(struct file_syste
 		ei = PROC_I(sb->s_root->d_inode);
 		if (!ei->pid) {
 			rcu_read_lock();
-			ei->pid = get_pid(find_pid_ns(1, ns));
+			ei->pid = find_pid_ns(1, ns);
 			rcu_read_unlock();
 		}
 
@@ -92,7 +92,6 @@ static void proc_kill_sb(struct super_bl
 
 	ns = (struct pid_namespace *)sb->s_fs_info;
 	kill_anon_super(sb);
-	put_pid_ns(ns);
 }
 
 static struct file_system_type proc_fs_type = {
@@ -218,5 +217,8 @@ int pid_ns_prepare_proc(struct pid_names
 
 void pid_ns_release_proc(struct pid_namespace *ns)
 {
-	mntput(ns->proc_mnt);
+	if (ns->proc_mnt) {
+		PROC_I(ns->proc_mnt->mnt_sb->s_root->d_inode)->pid = NULL;
+		mntput(ns->proc_mnt);
+	}
 }

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
@ 2010-06-18 21:23               ` Oleg Nesterov
  0 siblings, 0 replies; 31+ messages in thread
From: Oleg Nesterov @ 2010-06-18 21:23 UTC (permalink / raw)
  To: Andrew Morton, Pavel Emelyanov, Linux Containers, linux-kernel,
	Louis Rilling, Eric W. Biederman

On 06/18, Oleg Nesterov wrote:
>
> work->func(ns) is
> called when ns is already fixed.
                            ^^^^^
(I meant freed)

We can move kmem_cache_free(ns) into work->func(), or we can optimize
the usage of schedule_work(), something like the patch below.

Once again, it is completely untested, I do not pretend I understand
this code today, and I am sure the patch is wrong. I only try to discuss
the idea to break the circular reference. In any case, I am not proud
of this patch ;)

Or we should do the more sophisticated change suggested by Pavel. But
I'd like to avoid the changes in do_wait/release_task, this doesn't
look right to me.

Oleg.

 include/linux/pid_namespace.h |    1 +
 kernel/pid_namespace.c        |   35 ++++++++++++++++++++++++++++++++++-
 fs/proc/base.c                |    4 ----
 fs/proc/root.c                |   10 ++++++----
 4 files changed, 41 insertions(+), 9 deletions(-)

--- 34-rc1/include/linux/pid_namespace.h~PID_NS	2010-06-18 17:48:56.000000000 +0200
+++ 34-rc1/include/linux/pid_namespace.h	2010-06-18 22:37:45.000000000 +0200
@@ -26,6 +26,7 @@ struct pid_namespace {
 	struct pid_namespace *parent;
 #ifdef CONFIG_PROC_FS
 	struct vfsmount *proc_mnt;
+	struct list_head dead_node;
 #endif
 #ifdef CONFIG_BSD_PROCESS_ACCT
 	struct bsd_acct_struct *bacct;
--- 34-rc1/kernel/pid_namespace.c~PID_NS	2010-06-18 17:48:56.000000000 +0200
+++ 34-rc1/kernel/pid_namespace.c	2010-06-18 22:52:41.000000000 +0200
@@ -10,6 +10,7 @@
 
 #include <linux/pid.h>
 #include <linux/pid_namespace.h>
+#include <linux/proc_fs.h>
 #include <linux/syscalls.h>
 #include <linux/err.h>
 #include <linux/acct.h>
@@ -105,15 +106,47 @@ out:
 	return ERR_PTR(-ENOMEM);
 }
 
-static void destroy_pid_namespace(struct pid_namespace *ns)
+static LIST_HEAD(dead_list);
+static DEFINE_SPINLOCK(dead_lock);
+
+static void do_destroy_pid_namespace(struct pid_namespace *ns)
 {
 	int i;
 
+	pid_ns_release_proc(ns);
+
 	for (i = 0; i < PIDMAP_ENTRIES; i++)
 		kfree(ns->pidmap[i].page);
 	kmem_cache_free(pid_ns_cachep, ns);
 }
 
+static void dead_work_func(struct work_struct *unused)
+{
+	LIST_HEAD(list);
+	unsigned long flags;
+	struct pid_namespace *ns, *tmp;
+
+	spin_lock_irqsave(&dead_lock, flags);
+	list_splice_init(&dead_list, &list);
+	spin_unlock_irqrestore(&dead_lock, flags);
+
+	list_for_each_entry_safe(ns, tmp, &list, dead_node)
+		do_destroy_pid_namespace(ns);
+}
+
+static DECLARE_WORK(dead_work, dead_work_func);
+
+static void destroy_pid_namespace(struct pid_namespace *ns)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&dead_lock, flags);
+	list_add(&ns->dead_node, &dead_list);
+	spin_unlock_irqrestore(&dead_lock, flags);
+
+	schedule_work(&dead_work);
+}
+
 struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
 {
 	if (!(flags & CLONE_NEWPID))
--- 34-rc1/fs/proc/base.c~PID_NS	2010-06-18 17:48:56.000000000 +0200
+++ 34-rc1/fs/proc/base.c	2010-06-18 17:49:45.000000000 +0200
@@ -2720,10 +2720,6 @@ void proc_flush_task(struct task_struct 
 		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
 					tgid->numbers[i].nr);
 	}
-
-	upid = &pid->numbers[pid->level];
-	if (upid->nr == 1)
-		pid_ns_release_proc(upid->ns);
 }
 
 static struct dentry *proc_pid_instantiate(struct inode *dir,
--- 34-rc1/fs/proc/root.c~PID_NS	2010-06-18 17:48:56.000000000 +0200
+++ 34-rc1/fs/proc/root.c	2010-06-18 22:54:03.000000000 +0200
@@ -31,7 +31,7 @@ static int proc_set_super(struct super_b
 	struct pid_namespace *ns;
 
 	ns = (struct pid_namespace *)data;
-	sb->s_fs_info = get_pid_ns(ns);
+	sb->s_fs_info = ns;
 	return set_anon_super(sb, NULL);
 }
 
@@ -74,7 +74,7 @@ static int proc_get_sb(struct file_syste
 		ei = PROC_I(sb->s_root->d_inode);
 		if (!ei->pid) {
 			rcu_read_lock();
-			ei->pid = get_pid(find_pid_ns(1, ns));
+			ei->pid = find_pid_ns(1, ns);
 			rcu_read_unlock();
 		}
 
@@ -92,7 +92,6 @@ static void proc_kill_sb(struct super_bl
 
 	ns = (struct pid_namespace *)sb->s_fs_info;
 	kill_anon_super(sb);
-	put_pid_ns(ns);
 }
 
 static struct file_system_type proc_fs_type = {
@@ -218,5 +217,8 @@ int pid_ns_prepare_proc(struct pid_names
 
 void pid_ns_release_proc(struct pid_namespace *ns)
 {
-	mntput(ns->proc_mnt);
+	if (ns->proc_mnt) {
+		PROC_I(ns->proc_mnt->mnt_sb->s_root->d_inode)->pid = NULL;
+		mntput(ns->proc_mnt);
+	}
 }


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
  2010-06-18 17:33         ` Louis Rilling
@ 2010-06-18 17:55             ` Oleg Nesterov
  0 siblings, 0 replies; 31+ messages in thread
From: Oleg Nesterov @ 2010-06-18 17:55 UTC (permalink / raw)
  To: Andrew Morton, Pavel Emelyanov, Linux Containers, linux-kernel,
	Louis Rilling

On 06/18, Louis Rilling wrote:
>
> On 18/06/10 18:08 +0200, Oleg Nesterov wrote:
> >
> > Not sure I ever understood this code. Certainly I can't say I understand
> > it now. Still, do we really need this circle? I am almost sure the patch
> > below is not right (and it wasn't tested at all), but could you take a
> > look?
>
> I won't pretend understanding better than you do. Still I can try to give my 2
> cents.
>
> Overall, I don't feel comfortable at being able to have a living proc_mnt
> with a dead pid_ns.

Yes, this should be fixed, I already realized this. work->func(ns) is
called when ns is already fixed.

Otherwise, nobody should use ns->proc_mount when ns is already dead/freed.

> > Note: afaics we have another problem. What if copy_process(CLONE_NEWPID)
> > fails after pid_ns_prepare_proc() ? Who will do mntput() ? This patch
> > should fix this too (again, ___if___ it correct).
>
> Sounds true.

Good.

> > @@ -74,7 +74,7 @@ static int proc_get_sb(struct file_syste
> >  		ei = PROC_I(sb->s_root->d_inode);
> >  		if (!ei->pid) {
> >  			rcu_read_lock();
> > -			ei->pid = get_pid(find_pid_ns(1, ns));
> > +			ei->pid = find_pid_ns(1, ns);
>
> I don't think that this is correct. IIUC, proc_delete_inode() calls put_pid() on
> ei->pid.

Yes,

> So either a special case is added in proc_delete_inode(), or we try to
> live with get_pid() here. I'm actually not sure that we can pretend that this
> pid remains valid if we don't get_pid() here.

But please see another change below,

> > +static void proc_mntput(struct work_struct *work)
> >  {
> > +	struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_put);
> > +
> > +	PROC_I(ns->proc_mnt->mnt_sb->s_root->d_inode)->pid = NULL;
> >  	mntput(ns->proc_mnt);
> >  }

it clears ei->pid.

We are called from free_pid_ns() path, this ->pid must not have any reference.
Any get_pid() implies get_pid_ns().

What do you think?

Oleg.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
@ 2010-06-18 17:55             ` Oleg Nesterov
  0 siblings, 0 replies; 31+ messages in thread
From: Oleg Nesterov @ 2010-06-18 17:55 UTC (permalink / raw)
  To: Andrew Morton, Pavel Emelyanov, Linux Containers, linux-kernel,
	Louis Rilling

On 06/18, Louis Rilling wrote:
>
> On 18/06/10 18:08 +0200, Oleg Nesterov wrote:
> >
> > Not sure I ever understood this code. Certainly I can't say I understand
> > it now. Still, do we really need this circle? I am almost sure the patch
> > below is not right (and it wasn't tested at all), but could you take a
> > look?
>
> I won't pretend understanding better than you do. Still I can try to give my 2
> cents.
>
> Overall, I don't feel comfortable at being able to have a living proc_mnt
> with a dead pid_ns.

Yes, this should be fixed, I already realized this. work->func(ns) is
called when ns is already fixed.

Otherwise, nobody should use ns->proc_mount when ns is already dead/freed.

> > Note: afaics we have another problem. What if copy_process(CLONE_NEWPID)
> > fails after pid_ns_prepare_proc() ? Who will do mntput() ? This patch
> > should fix this too (again, ___if___ it correct).
>
> Sounds true.

Good.

> > @@ -74,7 +74,7 @@ static int proc_get_sb(struct file_syste
> >  		ei = PROC_I(sb->s_root->d_inode);
> >  		if (!ei->pid) {
> >  			rcu_read_lock();
> > -			ei->pid = get_pid(find_pid_ns(1, ns));
> > +			ei->pid = find_pid_ns(1, ns);
>
> I don't think that this is correct. IIUC, proc_delete_inode() calls put_pid() on
> ei->pid.

Yes,

> So either a special case is added in proc_delete_inode(), or we try to
> live with get_pid() here. I'm actually not sure that we can pretend that this
> pid remains valid if we don't get_pid() here.

But please see another change below,

> > +static void proc_mntput(struct work_struct *work)
> >  {
> > +	struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_put);
> > +
> > +	PROC_I(ns->proc_mnt->mnt_sb->s_root->d_inode)->pid = NULL;
> >  	mntput(ns->proc_mnt);
> >  }

it clears ei->pid.

We are called from free_pid_ns() path, this ->pid must not have any reference.
Any get_pid() implies get_pid_ns().

What do you think?

Oleg.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
  2010-06-18 16:08         ` Oleg Nesterov
  (?)
@ 2010-06-18 17:33         ` Louis Rilling
  2010-06-18 17:55             ` Oleg Nesterov
  -1 siblings, 1 reply; 31+ messages in thread
From: Louis Rilling @ 2010-06-18 17:33 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, Pavel Emelyanov, Linux Containers, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 5608 bytes --]

On 18/06/10 18:08 +0200, Oleg Nesterov wrote:
> On 06/18, Oleg Nesterov wrote:
> >
> > On 06/18, Louis Rilling wrote:
> > >
> > > On 17/06/10 23:20 +0200, Oleg Nesterov wrote:
> > > > On 06/16, Louis Rilling wrote:
> > > > >
> > > > > Detached tasks are not seen by zap_pid_ns_processes()->sys_wait4(), so
> > > > > that release_task()->proc_flush_task() of container init can be called
> > > > > before it is for some detached tasks in the namespace.
> > > > >
> > > > > Pin proc_mnt's in copy_process(), so that proc_flush_task() becomes safe
> > > > > whatever the ordering of tasks.
> > > >
> > > > I must have missed something, but can't we just move mntput() ?
> > >
> > > See the log of the commit introducing pid_ns_release_proc() (6f4e6433):
> > >
> > >     Sice the namespace holds the vfsmnt, vfsmnt holds the superblock and the
> > >     superblock holds the namespace we must explicitly break this circle to destroy
> > >     all the stuff.  This is done after the init of the namespace dies.
> >
> > I see thanks.
> 
> Not sure I ever understood this code. Certainly I can't say I understand
> it now. Still, do we really need this circle? I am almost sure the patch
> below is not right (and it wasn't tested at all), but could you take a
> look?

I won't pretend understanding better than you do. Still I can try to give my 2
cents.

Overall, I don't feel comfortable at being able to have a living proc_mnt
with a dead pid_ns. On the contrary, proc_mnt is almost never used from pid_ns.
proc_flush_task() excepted, the only users I saw use
current->nsproxy->pid_ns->proc_mnt, which makes them safe.

> 
> Note: afaics we have another problem. What if copy_process(CLONE_NEWPID)
> fails after pid_ns_prepare_proc() ? Who will do mntput() ? This patch
> should fix this too (again, ___if___ it correct).

Sounds true.

> 
> Oleg.
> 
>  include/linux/pid_namespace.h |    1 +
>  kernel/pid_namespace.c        |    3 +++
>  fs/proc/base.c                |    4 ----
>  fs/proc/root.c                |   18 ++++++++++++++----
>  4 files changed, 18 insertions(+), 8 deletions(-)
> 
> --- 34-rc1/include/linux/pid_namespace.h~PID_NS	2010-06-18 17:48:56.000000000 +0200
> +++ 34-rc1/include/linux/pid_namespace.h	2010-06-18 17:53:11.000000000 +0200
> @@ -26,6 +26,7 @@ struct pid_namespace {
>  	struct pid_namespace *parent;
>  #ifdef CONFIG_PROC_FS
>  	struct vfsmount *proc_mnt;
> +	struct work_struct proc_put;
>  #endif
>  #ifdef CONFIG_BSD_PROCESS_ACCT
>  	struct bsd_acct_struct *bacct;
> --- 34-rc1/kernel/pid_namespace.c~PID_NS	2010-06-18 17:48:56.000000000 +0200
> +++ 34-rc1/kernel/pid_namespace.c	2010-06-18 17:53:44.000000000 +0200
> @@ -10,6 +10,7 @@
>  
>  #include <linux/pid.h>
>  #include <linux/pid_namespace.h>
> +#include <linux/proc_fs.h>
>  #include <linux/syscalls.h>
>  #include <linux/err.h>
>  #include <linux/acct.h>
> @@ -109,6 +110,8 @@ static void destroy_pid_namespace(struct
>  {
>  	int i;
>  
> +	pid_ns_release_proc(ns);
> +
>  	for (i = 0; i < PIDMAP_ENTRIES; i++)
>  		kfree(ns->pidmap[i].page);
>  	kmem_cache_free(pid_ns_cachep, ns);
> --- 34-rc1/fs/proc/base.c~PID_NS	2010-06-18 17:48:56.000000000 +0200
> +++ 34-rc1/fs/proc/base.c	2010-06-18 17:49:45.000000000 +0200
> @@ -2720,10 +2720,6 @@ void proc_flush_task(struct task_struct 
>  		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
>  					tgid->numbers[i].nr);
>  	}
> -
> -	upid = &pid->numbers[pid->level];
> -	if (upid->nr == 1)
> -		pid_ns_release_proc(upid->ns);
>  }
>  
>  static struct dentry *proc_pid_instantiate(struct inode *dir,
> --- 34-rc1/fs/proc/root.c~PID_NS	2010-06-18 17:48:56.000000000 +0200
> +++ 34-rc1/fs/proc/root.c	2010-06-18 17:56:57.000000000 +0200
> @@ -31,7 +31,7 @@ static int proc_set_super(struct super_b
>  	struct pid_namespace *ns;
>  
>  	ns = (struct pid_namespace *)data;
> -	sb->s_fs_info = get_pid_ns(ns);
> +	sb->s_fs_info = ns;
>  	return set_anon_super(sb, NULL);
>  }
>  
> @@ -74,7 +74,7 @@ static int proc_get_sb(struct file_syste
>  		ei = PROC_I(sb->s_root->d_inode);
>  		if (!ei->pid) {
>  			rcu_read_lock();
> -			ei->pid = get_pid(find_pid_ns(1, ns));
> +			ei->pid = find_pid_ns(1, ns);

I don't think that this is correct. IIUC, proc_delete_inode() calls put_pid() on
ei->pid. So either a special case is added in proc_delete_inode(), or we try to
live with get_pid() here. I'm actually not sure that we can pretend that this
pid remains valid if we don't get_pid() here.

Thanks,

Louis

>  			rcu_read_unlock();
>  		}
>  
> @@ -92,7 +92,6 @@ static void proc_kill_sb(struct super_bl
>  
>  	ns = (struct pid_namespace *)sb->s_fs_info;
>  	kill_anon_super(sb);
> -	put_pid_ns(ns);
>  }
>  
>  static struct file_system_type proc_fs_type = {
> @@ -216,7 +215,18 @@ int pid_ns_prepare_proc(struct pid_names
>  	return 0;
>  }
>  
> -void pid_ns_release_proc(struct pid_namespace *ns)
> +static void proc_mntput(struct work_struct *work)
>  {
> +	struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_put);
> +
> +	PROC_I(ns->proc_mnt->mnt_sb->s_root->d_inode)->pid = NULL;
>  	mntput(ns->proc_mnt);
>  }
> +
> +void pid_ns_release_proc(struct pid_namespace *ns)
> +{
> +	if (ns->proc_mnt) {
> +		INIT_WORK(&ns->proc_put, proc_mntput);
> +		schedule_work(&ns->proc_put);
> +	}
> +}
> 
> 

-- 
Dr Louis Rilling			Kerlabs
Skype: louis.rilling			Batiment Germanium
Phone: (+33|0) 6 80 89 08 23		80 avenue des Buttes de Coesmes
http://www.kerlabs.com/			35700 Rennes

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 197 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
       [not found]           ` <20100618082738.GE16877-Hu8+6S1rdjywhHL9vcZdMVaTQe2KTcn/@public.gmane.org>
@ 2010-06-18 16:27             ` Oleg Nesterov
  0 siblings, 0 replies; 31+ messages in thread
From: Oleg Nesterov @ 2010-06-18 16:27 UTC (permalink / raw)
  To: Eric W. Biederman, Pavel Emelyanov, Andrew Morton, Pavel Emelyanov

On 06/18, Louis Rilling wrote:
>
> On 17/06/10 23:36 +0200, Oleg Nesterov wrote:
> > On 06/17, Eric W. Biederman wrote:
> > >
> > > The task->children isn't changed until __unhash_process() which runs
> > > after flush_proc_task().
> >
> > Yes. But this is only the current implementation detail.
> > It would be nice to cleanup the code so that EXIT_DEAD tasks are
> > never sit in ->children list.
> >
> > > So we should be able to come up with
> > > a variant of do_wait() that zap_pid_ns_processes can use that does
> > > what we need.
> >
> > See above...
> >
> > Even if we modify do_wait() or add the new variant, how the caller
> > can wait for EXIT_DEAD tasks? I don't think we want to modify
> > release_task() to do __wake_up_parent() or something similar.
>
> Indeed, I was thinking about calling __wake_up_parent() from release_task()
> once parent->children becomes empty.
>
> Not sure about the performance impact though. Maybe some WAIT_NO_CHILDREN flag
> in parent->signal could limit it. But if EXIT_DEAD children are removed from
> ->children before release_task(), I'm afraid that this becomes impossible.

Thinking more, even the current do_wait() from zap_pid_ns_processes()
is not really good. Suppose that some none-init thread is ptraced, then
zap_pid_ns_processes() will hange until the tracer does do_wait() or
exits.

Oleg.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
  2010-06-18  8:27         ` Louis Rilling
@ 2010-06-18 16:27           ` Oleg Nesterov
  2010-06-21 11:11             ` Louis Rilling
       [not found]           ` <20100618082738.GE16877-Hu8+6S1rdjywhHL9vcZdMVaTQe2KTcn/@public.gmane.org>
  1 sibling, 1 reply; 31+ messages in thread
From: Oleg Nesterov @ 2010-06-18 16:27 UTC (permalink / raw)
  To: Eric W. Biederman, Pavel Emelyanov, Andrew Morton,
	Pavel Emelyanov, Linux Containers, linux-kernel, Louis Rilling

On 06/18, Louis Rilling wrote:
>
> On 17/06/10 23:36 +0200, Oleg Nesterov wrote:
> > On 06/17, Eric W. Biederman wrote:
> > >
> > > The task->children isn't changed until __unhash_process() which runs
> > > after flush_proc_task().
> >
> > Yes. But this is only the current implementation detail.
> > It would be nice to cleanup the code so that EXIT_DEAD tasks are
> > never sit in ->children list.
> >
> > > So we should be able to come up with
> > > a variant of do_wait() that zap_pid_ns_processes can use that does
> > > what we need.
> >
> > See above...
> >
> > Even if we modify do_wait() or add the new variant, how the caller
> > can wait for EXIT_DEAD tasks? I don't think we want to modify
> > release_task() to do __wake_up_parent() or something similar.
>
> Indeed, I was thinking about calling __wake_up_parent() from release_task()
> once parent->children becomes empty.
>
> Not sure about the performance impact though. Maybe some WAIT_NO_CHILDREN flag
> in parent->signal could limit it. But if EXIT_DEAD children are removed from
> ->children before release_task(), I'm afraid that this becomes impossible.

Thinking more, even the current do_wait() from zap_pid_ns_processes()
is not really good. Suppose that some none-init thread is ptraced, then
zap_pid_ns_processes() will hange until the tracer does do_wait() or
exits.

Oleg.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
  2010-06-18 11:15     ` Oleg Nesterov
@ 2010-06-18 16:08         ` Oleg Nesterov
  0 siblings, 0 replies; 31+ messages in thread
From: Oleg Nesterov @ 2010-06-18 16:08 UTC (permalink / raw)
  To: Andrew Morton, Pavel Emelyanov, Linux Containers, linux-kernel,
	Louis Rilling

On 06/18, Oleg Nesterov wrote:
>
> On 06/18, Louis Rilling wrote:
> >
> > On 17/06/10 23:20 +0200, Oleg Nesterov wrote:
> > > On 06/16, Louis Rilling wrote:
> > > >
> > > > Detached tasks are not seen by zap_pid_ns_processes()->sys_wait4(), so
> > > > that release_task()->proc_flush_task() of container init can be called
> > > > before it is for some detached tasks in the namespace.
> > > >
> > > > Pin proc_mnt's in copy_process(), so that proc_flush_task() becomes safe
> > > > whatever the ordering of tasks.
> > >
> > > I must have missed something, but can't we just move mntput() ?
> >
> > See the log of the commit introducing pid_ns_release_proc() (6f4e6433):
> >
> >     Sice the namespace holds the vfsmnt, vfsmnt holds the superblock and the
> >     superblock holds the namespace we must explicitly break this circle to destroy
> >     all the stuff.  This is done after the init of the namespace dies.
>
> I see thanks.

Not sure I ever understood this code. Certainly I can't say I understand
it now. Still, do we really need this circle? I am almost sure the patch
below is not right (and it wasn't tested at all), but could you take a
look?

Note: afaics we have another problem. What if copy_process(CLONE_NEWPID)
fails after pid_ns_prepare_proc() ? Who will do mntput() ? This patch
should fix this too (again, ___if___ it correct).

Oleg.

 include/linux/pid_namespace.h |    1 +
 kernel/pid_namespace.c        |    3 +++
 fs/proc/base.c                |    4 ----
 fs/proc/root.c                |   18 ++++++++++++++----
 4 files changed, 18 insertions(+), 8 deletions(-)

--- 34-rc1/include/linux/pid_namespace.h~PID_NS	2010-06-18 17:48:56.000000000 +0200
+++ 34-rc1/include/linux/pid_namespace.h	2010-06-18 17:53:11.000000000 +0200
@@ -26,6 +26,7 @@ struct pid_namespace {
 	struct pid_namespace *parent;
 #ifdef CONFIG_PROC_FS
 	struct vfsmount *proc_mnt;
+	struct work_struct proc_put;
 #endif
 #ifdef CONFIG_BSD_PROCESS_ACCT
 	struct bsd_acct_struct *bacct;
--- 34-rc1/kernel/pid_namespace.c~PID_NS	2010-06-18 17:48:56.000000000 +0200
+++ 34-rc1/kernel/pid_namespace.c	2010-06-18 17:53:44.000000000 +0200
@@ -10,6 +10,7 @@
 
 #include <linux/pid.h>
 #include <linux/pid_namespace.h>
+#include <linux/proc_fs.h>
 #include <linux/syscalls.h>
 #include <linux/err.h>
 #include <linux/acct.h>
@@ -109,6 +110,8 @@ static void destroy_pid_namespace(struct
 {
 	int i;
 
+	pid_ns_release_proc(ns);
+
 	for (i = 0; i < PIDMAP_ENTRIES; i++)
 		kfree(ns->pidmap[i].page);
 	kmem_cache_free(pid_ns_cachep, ns);
--- 34-rc1/fs/proc/base.c~PID_NS	2010-06-18 17:48:56.000000000 +0200
+++ 34-rc1/fs/proc/base.c	2010-06-18 17:49:45.000000000 +0200
@@ -2720,10 +2720,6 @@ void proc_flush_task(struct task_struct 
 		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
 					tgid->numbers[i].nr);
 	}
-
-	upid = &pid->numbers[pid->level];
-	if (upid->nr == 1)
-		pid_ns_release_proc(upid->ns);
 }
 
 static struct dentry *proc_pid_instantiate(struct inode *dir,
--- 34-rc1/fs/proc/root.c~PID_NS	2010-06-18 17:48:56.000000000 +0200
+++ 34-rc1/fs/proc/root.c	2010-06-18 17:56:57.000000000 +0200
@@ -31,7 +31,7 @@ static int proc_set_super(struct super_b
 	struct pid_namespace *ns;
 
 	ns = (struct pid_namespace *)data;
-	sb->s_fs_info = get_pid_ns(ns);
+	sb->s_fs_info = ns;
 	return set_anon_super(sb, NULL);
 }
 
@@ -74,7 +74,7 @@ static int proc_get_sb(struct file_syste
 		ei = PROC_I(sb->s_root->d_inode);
 		if (!ei->pid) {
 			rcu_read_lock();
-			ei->pid = get_pid(find_pid_ns(1, ns));
+			ei->pid = find_pid_ns(1, ns);
 			rcu_read_unlock();
 		}
 
@@ -92,7 +92,6 @@ static void proc_kill_sb(struct super_bl
 
 	ns = (struct pid_namespace *)sb->s_fs_info;
 	kill_anon_super(sb);
-	put_pid_ns(ns);
 }
 
 static struct file_system_type proc_fs_type = {
@@ -216,7 +215,18 @@ int pid_ns_prepare_proc(struct pid_names
 	return 0;
 }
 
-void pid_ns_release_proc(struct pid_namespace *ns)
+static void proc_mntput(struct work_struct *work)
 {
+	struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_put);
+
+	PROC_I(ns->proc_mnt->mnt_sb->s_root->d_inode)->pid = NULL;
 	mntput(ns->proc_mnt);
 }
+
+void pid_ns_release_proc(struct pid_namespace *ns)
+{
+	if (ns->proc_mnt) {
+		INIT_WORK(&ns->proc_put, proc_mntput);
+		schedule_work(&ns->proc_put);
+	}
+}

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
@ 2010-06-18 16:08         ` Oleg Nesterov
  0 siblings, 0 replies; 31+ messages in thread
From: Oleg Nesterov @ 2010-06-18 16:08 UTC (permalink / raw)
  To: Andrew Morton, Pavel Emelyanov, Linux Containers, linux-kernel,
	Louis Rilling

On 06/18, Oleg Nesterov wrote:
>
> On 06/18, Louis Rilling wrote:
> >
> > On 17/06/10 23:20 +0200, Oleg Nesterov wrote:
> > > On 06/16, Louis Rilling wrote:
> > > >
> > > > Detached tasks are not seen by zap_pid_ns_processes()->sys_wait4(), so
> > > > that release_task()->proc_flush_task() of container init can be called
> > > > before it is for some detached tasks in the namespace.
> > > >
> > > > Pin proc_mnt's in copy_process(), so that proc_flush_task() becomes safe
> > > > whatever the ordering of tasks.
> > >
> > > I must have missed something, but can't we just move mntput() ?
> >
> > See the log of the commit introducing pid_ns_release_proc() (6f4e6433):
> >
> >     Sice the namespace holds the vfsmnt, vfsmnt holds the superblock and the
> >     superblock holds the namespace we must explicitly break this circle to destroy
> >     all the stuff.  This is done after the init of the namespace dies.
>
> I see thanks.

Not sure I ever understood this code. Certainly I can't say I understand
it now. Still, do we really need this circle? I am almost sure the patch
below is not right (and it wasn't tested at all), but could you take a
look?

Note: afaics we have another problem. What if copy_process(CLONE_NEWPID)
fails after pid_ns_prepare_proc() ? Who will do mntput() ? This patch
should fix this too (again, ___if___ it correct).

Oleg.

 include/linux/pid_namespace.h |    1 +
 kernel/pid_namespace.c        |    3 +++
 fs/proc/base.c                |    4 ----
 fs/proc/root.c                |   18 ++++++++++++++----
 4 files changed, 18 insertions(+), 8 deletions(-)

--- 34-rc1/include/linux/pid_namespace.h~PID_NS	2010-06-18 17:48:56.000000000 +0200
+++ 34-rc1/include/linux/pid_namespace.h	2010-06-18 17:53:11.000000000 +0200
@@ -26,6 +26,7 @@ struct pid_namespace {
 	struct pid_namespace *parent;
 #ifdef CONFIG_PROC_FS
 	struct vfsmount *proc_mnt;
+	struct work_struct proc_put;
 #endif
 #ifdef CONFIG_BSD_PROCESS_ACCT
 	struct bsd_acct_struct *bacct;
--- 34-rc1/kernel/pid_namespace.c~PID_NS	2010-06-18 17:48:56.000000000 +0200
+++ 34-rc1/kernel/pid_namespace.c	2010-06-18 17:53:44.000000000 +0200
@@ -10,6 +10,7 @@
 
 #include <linux/pid.h>
 #include <linux/pid_namespace.h>
+#include <linux/proc_fs.h>
 #include <linux/syscalls.h>
 #include <linux/err.h>
 #include <linux/acct.h>
@@ -109,6 +110,8 @@ static void destroy_pid_namespace(struct
 {
 	int i;
 
+	pid_ns_release_proc(ns);
+
 	for (i = 0; i < PIDMAP_ENTRIES; i++)
 		kfree(ns->pidmap[i].page);
 	kmem_cache_free(pid_ns_cachep, ns);
--- 34-rc1/fs/proc/base.c~PID_NS	2010-06-18 17:48:56.000000000 +0200
+++ 34-rc1/fs/proc/base.c	2010-06-18 17:49:45.000000000 +0200
@@ -2720,10 +2720,6 @@ void proc_flush_task(struct task_struct 
 		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
 					tgid->numbers[i].nr);
 	}
-
-	upid = &pid->numbers[pid->level];
-	if (upid->nr == 1)
-		pid_ns_release_proc(upid->ns);
 }
 
 static struct dentry *proc_pid_instantiate(struct inode *dir,
--- 34-rc1/fs/proc/root.c~PID_NS	2010-06-18 17:48:56.000000000 +0200
+++ 34-rc1/fs/proc/root.c	2010-06-18 17:56:57.000000000 +0200
@@ -31,7 +31,7 @@ static int proc_set_super(struct super_b
 	struct pid_namespace *ns;
 
 	ns = (struct pid_namespace *)data;
-	sb->s_fs_info = get_pid_ns(ns);
+	sb->s_fs_info = ns;
 	return set_anon_super(sb, NULL);
 }
 
@@ -74,7 +74,7 @@ static int proc_get_sb(struct file_syste
 		ei = PROC_I(sb->s_root->d_inode);
 		if (!ei->pid) {
 			rcu_read_lock();
-			ei->pid = get_pid(find_pid_ns(1, ns));
+			ei->pid = find_pid_ns(1, ns);
 			rcu_read_unlock();
 		}
 
@@ -92,7 +92,6 @@ static void proc_kill_sb(struct super_bl
 
 	ns = (struct pid_namespace *)sb->s_fs_info;
 	kill_anon_super(sb);
-	put_pid_ns(ns);
 }
 
 static struct file_system_type proc_fs_type = {
@@ -216,7 +215,18 @@ int pid_ns_prepare_proc(struct pid_names
 	return 0;
 }
 
-void pid_ns_release_proc(struct pid_namespace *ns)
+static void proc_mntput(struct work_struct *work)
 {
+	struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_put);
+
+	PROC_I(ns->proc_mnt->mnt_sb->s_root->d_inode)->pid = NULL;
 	mntput(ns->proc_mnt);
 }
+
+void pid_ns_release_proc(struct pid_namespace *ns)
+{
+	if (ns->proc_mnt) {
+		INIT_WORK(&ns->proc_put, proc_mntput);
+		schedule_work(&ns->proc_put);
+	}
+}



^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
  2010-06-18  8:20   ` Louis Rilling
@ 2010-06-18 11:15     ` Oleg Nesterov
  2010-06-18 16:08         ` Oleg Nesterov
  0 siblings, 1 reply; 31+ messages in thread
From: Oleg Nesterov @ 2010-06-18 11:15 UTC (permalink / raw)
  To: Andrew Morton, Pavel Emelyanov, Linux Containers, linux-kernel

On 06/18, Louis Rilling wrote:
>
> On 17/06/10 23:20 +0200, Oleg Nesterov wrote:
> > On 06/16, Louis Rilling wrote:
> > >
> > > Detached tasks are not seen by zap_pid_ns_processes()->sys_wait4(), so
> > > that release_task()->proc_flush_task() of container init can be called
> > > before it is for some detached tasks in the namespace.
> > >
> > > Pin proc_mnt's in copy_process(), so that proc_flush_task() becomes safe
> > > whatever the ordering of tasks.
> >
> > I must have missed something, but can't we just move mntput() ?
>
> See the log of the commit introducing pid_ns_release_proc() (6f4e6433):
>
>     Sice the namespace holds the vfsmnt, vfsmnt holds the superblock and the
>     superblock holds the namespace we must explicitly break this circle to destroy
>     all the stuff.  This is done after the init of the namespace dies.

I see thanks.

Besides, put_pid_ns() can be called in any context...

Oleg.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
  2010-06-17 21:36       ` Oleg Nesterov
@ 2010-06-18  8:27         ` Louis Rilling
  2010-06-18 16:27           ` Oleg Nesterov
       [not found]           ` <20100618082738.GE16877-Hu8+6S1rdjywhHL9vcZdMVaTQe2KTcn/@public.gmane.org>
  0 siblings, 2 replies; 31+ messages in thread
From: Louis Rilling @ 2010-06-18  8:27 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Eric W. Biederman, Pavel Emelyanov, Andrew Morton,
	Pavel Emelyanov, Linux Containers, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1274 bytes --]

On 17/06/10 23:36 +0200, Oleg Nesterov wrote:
> On 06/17, Eric W. Biederman wrote:
> >
> > The task->children isn't changed until __unhash_process() which runs
> > after flush_proc_task().
> 
> Yes. But this is only the current implementation detail.
> It would be nice to cleanup the code so that EXIT_DEAD tasks are
> never sit in ->children list.
> 
> > So we should be able to come up with
> > a variant of do_wait() that zap_pid_ns_processes can use that does
> > what we need.
> 
> See above...
> 
> Even if we modify do_wait() or add the new variant, how the caller
> can wait for EXIT_DEAD tasks? I don't think we want to modify
> release_task() to do __wake_up_parent() or something similar.

Indeed, I was thinking about calling __wake_up_parent() from release_task()
once parent->children becomes empty.

Not sure about the performance impact though. Maybe some WAIT_NO_CHILDREN flag
in parent->signal could limit it. But if EXIT_DEAD children are removed from
->children before release_task(), I'm afraid that this becomes impossible.

Thanks,

Louis

-- 
Dr Louis Rilling			Kerlabs
Skype: louis.rilling			Batiment Germanium
Phone: (+33|0) 6 80 89 08 23		80 avenue des Buttes de Coesmes
http://www.kerlabs.com/			35700 Rennes

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 197 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
  2010-06-17 21:20 ` Oleg Nesterov
@ 2010-06-18  8:20   ` Louis Rilling
  2010-06-18 11:15     ` Oleg Nesterov
  0 siblings, 1 reply; 31+ messages in thread
From: Louis Rilling @ 2010-06-18  8:20 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, Pavel Emelyanov, Linux Containers, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1877 bytes --]

On 17/06/10 23:20 +0200, Oleg Nesterov wrote:
> On 06/16, Louis Rilling wrote:
> >
> > Detached tasks are not seen by zap_pid_ns_processes()->sys_wait4(), so
> > that release_task()->proc_flush_task() of container init can be called
> > before it is for some detached tasks in the namespace.
> >
> > Pin proc_mnt's in copy_process(), so that proc_flush_task() becomes safe
> > whatever the ordering of tasks.
> 
> I must have missed something, but can't we just move mntput() ?

See the log of the commit introducing pid_ns_release_proc() (6f4e6433):

    Sice the namespace holds the vfsmnt, vfsmnt holds the superblock and the
    superblock holds the namespace we must explicitly break this circle to destroy
    all the stuff.  This is done after the init of the namespace dies.  Running a
    few steps forward - when init exits it will kill all its children, so no
    proc_mnt will be needed after its death.

Thanks,

Louis

> 
> Oleg.
> 
> --- x/kernel/pid_namespace.c
> +++ x/kernel/pid_namespace.c
> @@ -110,6 +110,9 @@ static void destroy_pid_namespace(struct
>  {
>  	int i;
>  
> +	if (ns->proc_mount)
> +		mntput(ns->proc_mount);
> +
>  	for (i = 0; i < PIDMAP_ENTRIES; i++)
>  		kfree(ns->pidmap[i].page);
>  	kmem_cache_free(pid_ns_cachep, ns);
> --- x/fs/proc/base.c
> +++ x/fs/proc/base.c
> @@ -2745,10 +2745,6 @@ void proc_flush_task(struct task_struct 
>  		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
>  					tgid->numbers[i].nr);
>  	}
> -
> -	upid = &pid->numbers[pid->level];
> -	if (upid->nr == 1)
> -		pid_ns_release_proc(upid->ns);
>  }
>  
>  static struct dentry *proc_pid_instantiate(struct inode *dir,
> 

-- 
Dr Louis Rilling			Kerlabs
Skype: louis.rilling			Batiment Germanium
Phone: (+33|0) 6 80 89 08 23		80 avenue des Buttes de Coesmes
http://www.kerlabs.com/			35700 Rennes

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 197 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
  2010-06-17 13:41     ` Eric W. Biederman
  2010-06-17 14:20       ` Louis Rilling
@ 2010-06-17 21:36       ` Oleg Nesterov
  2010-06-18  8:27         ` Louis Rilling
  1 sibling, 1 reply; 31+ messages in thread
From: Oleg Nesterov @ 2010-06-17 21:36 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Pavel Emelyanov, Louis Rilling, Andrew Morton, Pavel Emelyanov,
	Linux Containers, linux-kernel

On 06/17, Eric W. Biederman wrote:
>
> The task->children isn't changed until __unhash_process() which runs
> after flush_proc_task().

Yes. But this is only the current implementation detail.
It would be nice to cleanup the code so that EXIT_DEAD tasks are
never sit in ->children list.

> So we should be able to come up with
> a variant of do_wait() that zap_pid_ns_processes can use that does
> what we need.

See above...

Even if we modify do_wait() or add the new variant, how the caller
can wait for EXIT_DEAD tasks? I don't think we want to modify
release_task() to do __wake_up_parent() or something similar.

Oleg.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
  2010-06-16 16:34 Louis Rilling
       [not found] ` <1276706068-18567-1-git-send-email-louis.rilling-aw0BnHfMbSpBDgjK7y7TUQ@public.gmane.org>
@ 2010-06-17 21:20 ` Oleg Nesterov
  2010-06-18  8:20   ` Louis Rilling
  1 sibling, 1 reply; 31+ messages in thread
From: Oleg Nesterov @ 2010-06-17 21:20 UTC (permalink / raw)
  To: Louis Rilling
  Cc: Andrew Morton, Pavel Emelyanov, Linux Containers, linux-kernel

On 06/16, Louis Rilling wrote:
>
> Detached tasks are not seen by zap_pid_ns_processes()->sys_wait4(), so
> that release_task()->proc_flush_task() of container init can be called
> before it is for some detached tasks in the namespace.
>
> Pin proc_mnt's in copy_process(), so that proc_flush_task() becomes safe
> whatever the ordering of tasks.

I must have missed something, but can't we just move mntput() ?

Oleg.

--- x/kernel/pid_namespace.c
+++ x/kernel/pid_namespace.c
@@ -110,6 +110,9 @@ static void destroy_pid_namespace(struct
 {
 	int i;
 
+	if (ns->proc_mount)
+		mntput(ns->proc_mount);
+
 	for (i = 0; i < PIDMAP_ENTRIES; i++)
 		kfree(ns->pidmap[i].page);
 	kmem_cache_free(pid_ns_cachep, ns);
--- x/fs/proc/base.c
+++ x/fs/proc/base.c
@@ -2745,10 +2745,6 @@ void proc_flush_task(struct task_struct 
 		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
 					tgid->numbers[i].nr);
 	}
-
-	upid = &pid->numbers[pid->level];
-	if (upid->nr == 1)
-		pid_ns_release_proc(upid->ns);
 }
 
 static struct dentry *proc_pid_instantiate(struct inode *dir,

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
  2010-06-17 13:41     ` Eric W. Biederman
@ 2010-06-17 14:20       ` Louis Rilling
  2010-06-17 21:36       ` Oleg Nesterov
  1 sibling, 0 replies; 31+ messages in thread
From: Louis Rilling @ 2010-06-17 14:20 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Pavel Emelyanov, Andrew Morton, Oleg Nesterov, Pavel Emelyanov,
	Linux Containers, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 3538 bytes --]

On Thu, Jun 17, 2010 at 06:41:49AM -0700, Eric W. Biederman wrote:
> Pavel Emelyanov <xemul@parallels.com> writes:
> 
> > On 06/16/2010 08:34 PM, Louis Rilling wrote:
> >> [ Resending, hopefully with all pieces ]
> >> 
> >> Detached tasks are not seen by zap_pid_ns_processes()->sys_wait4(), so
> >> that release_task()->proc_flush_task() of container init can be called
> >> before it is for some detached tasks in the namespace.
> 
> There are two ways we can go from here.
> 
> - Completely asynchronous children exiting.
> - Waiting for all of our children to exit.

Agreed.

> 
> Your patch appears to be a weird middle ground, that is hard to
> analyze, abusing the mount count as a thread count.
> 
> I have been weighing the options between them, and to me properly
> waiting for all the processes to exit in zap_pid_ns_processes as we
> currently try to do is in our advantage.  It is simple and it means
> one less cache line bounce for a write to global variable in the
> much more common fork/exit path that we have to deal with.
> 
> The task->children isn't changed until __unhash_process() which runs
> after flush_proc_task().  So we should be able to come up with
> a variant of do_wait() that zap_pid_ns_processes can use that does
> what we need.

Sounds correct.

> 
> Louis do you want to look at that?

Give me a few days to look at that. IMHO my patch fixes the bug (see the
comment below), which was an emergency at work. Coding an improved fix is
lower priority :)

> 
> >> Pin proc_mnt's in copy_process(), so that proc_flush_task() becomes safe
> >> whatever the ordering of tasks.
> >
> > See one comment below.
> >
> >> Signed-off-by: Louis Rilling <louis.rilling@kerlabs.com>
> >> ---
> >>  fs/proc/base.c          |   18 ++++++++++++++++++
> >>  include/linux/proc_fs.h |    4 ++++
> >>  kernel/fork.c           |    1 +
> >>  3 files changed, 23 insertions(+), 0 deletions(-)
> >> 
> >> diff --git a/fs/proc/base.c b/fs/proc/base.c
> >> index acb7ef8..d6cdd91 100644
> >> --- a/fs/proc/base.c
> >> +++ b/fs/proc/base.c
> >> @@ -2663,6 +2663,23 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
> >>  	.setattr	= proc_setattr,
> >>  };
> >>  
> >> +/*
> >> + * Pin all proc_mnt so that detached tasks can safely call proc_flush_task()
> >> + * after container init calls itself proc_flush_task().
> >> + */
> >> +void proc_new_task(struct task_struct *task)
> >> +{
> >> +	struct pid *pid;
> >> +	int i;
> >> +
> >> +	if (!task->pid)
> >> +		return;
> >> +
> >> +	pid = task_pid(task);
> >> +	for (i = 0; i <= pid->level; i++)
> >> +		mntget(pid->numbers[i].ns->proc_mnt);
> >
> > NAK. Pids live their own lives - task can change one, pid will
> > become orphan and will be destroyed, so you'll leak.
> 
> There is that nasty case in exec isn't there.  Why we ever made it
> part of the ABI that calling exec on a thread changes the pid of
> that thread to the pid of the thread group is beyond me.

You're right that I forgot about de_thread(). However, de_thread() does not
replace a task pid with an arbitrary other pid. The new pid lives in the same
pid namespaces, so that proc_flush_task() will call mntput() on the same
proc_mnts as the ones on which proc_new_task() called mntget().

Thanks,

Louis

-- 
Dr Louis Rilling			Kerlabs
Skype: louis.rilling			Batiment Germanium
Phone: (+33|0) 6 80 89 08 23		80 avenue des Buttes de Coesmes
http://www.kerlabs.com/			35700 Rennes

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 197 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
  2010-06-17  9:53     ` Pavel Emelyanov
  (?)
@ 2010-06-17 13:41     ` Eric W. Biederman
  2010-06-17 14:20       ` Louis Rilling
  2010-06-17 21:36       ` Oleg Nesterov
  -1 siblings, 2 replies; 31+ messages in thread
From: Eric W. Biederman @ 2010-06-17 13:41 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: Louis Rilling, Andrew Morton, Oleg Nesterov, Pavel Emelyanov,
	Linux Containers, linux-kernel

Pavel Emelyanov <xemul@parallels.com> writes:

> On 06/16/2010 08:34 PM, Louis Rilling wrote:
>> [ Resending, hopefully with all pieces ]
>> 
>> Detached tasks are not seen by zap_pid_ns_processes()->sys_wait4(), so
>> that release_task()->proc_flush_task() of container init can be called
>> before it is for some detached tasks in the namespace.

There are two ways we can go from here.

- Completely asynchronous children exiting.
- Waiting for all of our children to exit.

Your patch appears to be a weird middle ground, that is hard to
analyze, abusing the mount count as a thread count.

I have been weighing the options between them, and to me properly
waiting for all the processes to exit in zap_pid_ns_processes as we
currently try to do is in our advantage.  It is simple and it means
one less cache line bounce for a write to global variable in the
much more common fork/exit path that we have to deal with.

The task->children isn't changed until __unhash_process() which runs
after flush_proc_task().  So we should be able to come up with
a variant of do_wait() that zap_pid_ns_processes can use that does
what we need.

Louis do you want to look at that?

>> Pin proc_mnt's in copy_process(), so that proc_flush_task() becomes safe
>> whatever the ordering of tasks.
>
> See one comment below.
>
>> Signed-off-by: Louis Rilling <louis.rilling@kerlabs.com>
>> ---
>>  fs/proc/base.c          |   18 ++++++++++++++++++
>>  include/linux/proc_fs.h |    4 ++++
>>  kernel/fork.c           |    1 +
>>  3 files changed, 23 insertions(+), 0 deletions(-)
>> 
>> diff --git a/fs/proc/base.c b/fs/proc/base.c
>> index acb7ef8..d6cdd91 100644
>> --- a/fs/proc/base.c
>> +++ b/fs/proc/base.c
>> @@ -2663,6 +2663,23 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
>>  	.setattr	= proc_setattr,
>>  };
>>  
>> +/*
>> + * Pin all proc_mnt so that detached tasks can safely call proc_flush_task()
>> + * after container init calls itself proc_flush_task().
>> + */
>> +void proc_new_task(struct task_struct *task)
>> +{
>> +	struct pid *pid;
>> +	int i;
>> +
>> +	if (!task->pid)
>> +		return;
>> +
>> +	pid = task_pid(task);
>> +	for (i = 0; i <= pid->level; i++)
>> +		mntget(pid->numbers[i].ns->proc_mnt);
>
> NAK. Pids live their own lives - task can change one, pid will
> become orphan and will be destroyed, so you'll leak.

There is that nasty case in exec isn't there.  Why we ever made it
part of the ABI that calling exec on a thread changes the pid of
that thread to the pid of the thread group is beyond me.

> There's another big problem with proc mount - you can umount proc
> and the namespace will have a stale pointer.

Not true. pid_ns->proc_mnt is an internal kernel mount. See
pid_ns_prepare_proc.

> I think we should have a kern_mount-ed proc at the namespace createi

I agree, and we mostly do.  In my queue for the unsharing of the pid
namespace I have a patch that makes that more explicit.

Eric

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
  2010-06-16 16:34 Louis Rilling
@ 2010-06-17  9:53     ` Pavel Emelyanov
  2010-06-17 21:20 ` Oleg Nesterov
  1 sibling, 0 replies; 31+ messages in thread
From: Pavel Emelyanov @ 2010-06-17  9:53 UTC (permalink / raw)
  To: Louis Rilling, Eric W. Biederman
  Cc: Linux Containers, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	Andrew Morton, Oleg Nesterov, Pavel Emelyanov

On 06/16/2010 08:34 PM, Louis Rilling wrote:
> [ Resending, hopefully with all pieces ]
> 
> Detached tasks are not seen by zap_pid_ns_processes()->sys_wait4(), so
> that release_task()->proc_flush_task() of container init can be called
> before it is for some detached tasks in the namespace.
> 
> Pin proc_mnt's in copy_process(), so that proc_flush_task() becomes safe
> whatever the ordering of tasks.

See one comment below.

> Signed-off-by: Louis Rilling <louis.rilling-aw0BnHfMbSpBDgjK7y7TUQ@public.gmane.org>
> ---
>  fs/proc/base.c          |   18 ++++++++++++++++++
>  include/linux/proc_fs.h |    4 ++++
>  kernel/fork.c           |    1 +
>  3 files changed, 23 insertions(+), 0 deletions(-)
> 
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index acb7ef8..d6cdd91 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -2663,6 +2663,23 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
>  	.setattr	= proc_setattr,
>  };
>  
> +/*
> + * Pin all proc_mnt so that detached tasks can safely call proc_flush_task()
> + * after container init calls itself proc_flush_task().
> + */
> +void proc_new_task(struct task_struct *task)
> +{
> +	struct pid *pid;
> +	int i;
> +
> +	if (!task->pid)
> +		return;
> +
> +	pid = task_pid(task);
> +	for (i = 0; i <= pid->level; i++)
> +		mntget(pid->numbers[i].ns->proc_mnt);

NAK. Pids live their own lives - task can change one, pid will
become orphan and will be destroyed, so you'll leak.

There's another big problem with proc mount - you can umount proc
and the namespace will have a stale pointer.

I think we should have a kern_mount-ed proc at the namespace createi

> +}
> +
>  static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
>  {
>  	struct dentry *dentry, *leader, *dir;
> @@ -2744,6 +2761,7 @@ void proc_flush_task(struct task_struct *task)
>  		upid = &pid->numbers[i];
>  		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
>  					tgid->numbers[i].nr);
> +		mntput(upid->ns->proc_mnt);
>  	}
>  
>  	upid = &pid->numbers[pid->level];
> diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
> index 379eaed..f24faa1 100644
> --- a/include/linux/proc_fs.h
> +++ b/include/linux/proc_fs.h
> @@ -104,6 +104,7 @@ struct vmcore {
>  
>  extern void proc_root_init(void);
>  
> +void proc_new_task(struct task_struct *task);
>  void proc_flush_task(struct task_struct *task);
>  
>  extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
> @@ -184,6 +185,9 @@ extern void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm);
>  #define proc_net_fops_create(net, name, mode, fops)  ({ (void)(mode), NULL; })
>  static inline void proc_net_remove(struct net *net, const char *name) {}
>  
> +static inline void proc_new_task(struct task_struct *task)
> +{
> +}
>  static inline void proc_flush_task(struct task_struct *task)
>  {
>  }
> diff --git a/kernel/fork.c b/kernel/fork.c
> index b6cce14..c6c2874 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1281,6 +1281,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>  	total_forks++;
>  	spin_unlock(&current->sighand->siglock);
>  	write_unlock_irq(&tasklist_lock);
> +	proc_new_task(p);
>  	proc_fork_connector(p);
>  	cgroup_post_fork(p);
>  	perf_event_fork(p);

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early
@ 2010-06-17  9:53     ` Pavel Emelyanov
  0 siblings, 0 replies; 31+ messages in thread
From: Pavel Emelyanov @ 2010-06-17  9:53 UTC (permalink / raw)
  To: Louis Rilling, Eric W. Biederman
  Cc: Andrew Morton, Oleg Nesterov, Pavel Emelyanov, Linux Containers,
	linux-kernel

On 06/16/2010 08:34 PM, Louis Rilling wrote:
> [ Resending, hopefully with all pieces ]
> 
> Detached tasks are not seen by zap_pid_ns_processes()->sys_wait4(), so
> that release_task()->proc_flush_task() of container init can be called
> before it is for some detached tasks in the namespace.
> 
> Pin proc_mnt's in copy_process(), so that proc_flush_task() becomes safe
> whatever the ordering of tasks.

See one comment below.

> Signed-off-by: Louis Rilling <louis.rilling@kerlabs.com>
> ---
>  fs/proc/base.c          |   18 ++++++++++++++++++
>  include/linux/proc_fs.h |    4 ++++
>  kernel/fork.c           |    1 +
>  3 files changed, 23 insertions(+), 0 deletions(-)
> 
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index acb7ef8..d6cdd91 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -2663,6 +2663,23 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
>  	.setattr	= proc_setattr,
>  };
>  
> +/*
> + * Pin all proc_mnt so that detached tasks can safely call proc_flush_task()
> + * after container init calls itself proc_flush_task().
> + */
> +void proc_new_task(struct task_struct *task)
> +{
> +	struct pid *pid;
> +	int i;
> +
> +	if (!task->pid)
> +		return;
> +
> +	pid = task_pid(task);
> +	for (i = 0; i <= pid->level; i++)
> +		mntget(pid->numbers[i].ns->proc_mnt);

NAK. Pids live their own lives - task can change one, pid will
become orphan and will be destroyed, so you'll leak.

There's another big problem with proc mount - you can umount proc
and the namespace will have a stale pointer.

I think we should have a kern_mount-ed proc at the namespace createi

> +}
> +
>  static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
>  {
>  	struct dentry *dentry, *leader, *dir;
> @@ -2744,6 +2761,7 @@ void proc_flush_task(struct task_struct *task)
>  		upid = &pid->numbers[i];
>  		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
>  					tgid->numbers[i].nr);
> +		mntput(upid->ns->proc_mnt);
>  	}
>  
>  	upid = &pid->numbers[pid->level];
> diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
> index 379eaed..f24faa1 100644
> --- a/include/linux/proc_fs.h
> +++ b/include/linux/proc_fs.h
> @@ -104,6 +104,7 @@ struct vmcore {
>  
>  extern void proc_root_init(void);
>  
> +void proc_new_task(struct task_struct *task);
>  void proc_flush_task(struct task_struct *task);
>  
>  extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
> @@ -184,6 +185,9 @@ extern void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm);
>  #define proc_net_fops_create(net, name, mode, fops)  ({ (void)(mode), NULL; })
>  static inline void proc_net_remove(struct net *net, const char *name) {}
>  
> +static inline void proc_new_task(struct task_struct *task)
> +{
> +}
>  static inline void proc_flush_task(struct task_struct *task)
>  {
>  }
> diff --git a/kernel/fork.c b/kernel/fork.c
> index b6cce14..c6c2874 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1281,6 +1281,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>  	total_forks++;
>  	spin_unlock(&current->sighand->siglock);
>  	write_unlock_irq(&tasklist_lock);
> +	proc_new_task(p);
>  	proc_fork_connector(p);
>  	cgroup_post_fork(p);
>  	perf_event_fork(p);


^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH] procfs: Do not release pid_ns->proc_mnt too early
@ 2010-06-16 16:34 Louis Rilling
       [not found] ` <1276706068-18567-1-git-send-email-louis.rilling-aw0BnHfMbSpBDgjK7y7TUQ@public.gmane.org>
  2010-06-17 21:20 ` Oleg Nesterov
  0 siblings, 2 replies; 31+ messages in thread
From: Louis Rilling @ 2010-06-16 16:34 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Oleg Nesterov, Pavel Emelyanov, Linux Containers, linux-kernel,
	Louis Rilling

[ Resending, hopefully with all pieces ]

Detached tasks are not seen by zap_pid_ns_processes()->sys_wait4(), so
that release_task()->proc_flush_task() of container init can be called
before it is for some detached tasks in the namespace.

Pin proc_mnt's in copy_process(), so that proc_flush_task() becomes safe
whatever the ordering of tasks.

Signed-off-by: Louis Rilling <louis.rilling@kerlabs.com>
---
 fs/proc/base.c          |   18 ++++++++++++++++++
 include/linux/proc_fs.h |    4 ++++
 kernel/fork.c           |    1 +
 3 files changed, 23 insertions(+), 0 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index acb7ef8..d6cdd91 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2663,6 +2663,23 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
 	.setattr	= proc_setattr,
 };
 
+/*
+ * Pin all proc_mnt so that detached tasks can safely call proc_flush_task()
+ * after container init calls itself proc_flush_task().
+ */
+void proc_new_task(struct task_struct *task)
+{
+	struct pid *pid;
+	int i;
+
+	if (!task->pid)
+		return;
+
+	pid = task_pid(task);
+	for (i = 0; i <= pid->level; i++)
+		mntget(pid->numbers[i].ns->proc_mnt);
+}
+
 static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
 {
 	struct dentry *dentry, *leader, *dir;
@@ -2744,6 +2761,7 @@ void proc_flush_task(struct task_struct *task)
 		upid = &pid->numbers[i];
 		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
 					tgid->numbers[i].nr);
+		mntput(upid->ns->proc_mnt);
 	}
 
 	upid = &pid->numbers[pid->level];
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 379eaed..f24faa1 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -104,6 +104,7 @@ struct vmcore {
 
 extern void proc_root_init(void);
 
+void proc_new_task(struct task_struct *task);
 void proc_flush_task(struct task_struct *task);
 
 extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
@@ -184,6 +185,9 @@ extern void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm);
 #define proc_net_fops_create(net, name, mode, fops)  ({ (void)(mode), NULL; })
 static inline void proc_net_remove(struct net *net, const char *name) {}
 
+static inline void proc_new_task(struct task_struct *task)
+{
+}
 static inline void proc_flush_task(struct task_struct *task)
 {
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index b6cce14..c6c2874 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1281,6 +1281,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	total_forks++;
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
+	proc_new_task(p);
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
 	perf_event_fork(p);
-- 
1.5.6.5

^ permalink raw reply related	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2010-06-21 14:38 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-06-16 15:58 [PATCH] procfs: Do not release pid_ns->proc_mnt too early Louis Rilling
2010-06-16 15:58 ` Louis Rilling
2010-06-16 16:04 ` Pavel Emelyanov
2010-06-16 16:15   ` Louis Rilling
2010-06-16 16:16   ` Louis Rilling
2010-06-16 16:34 Louis Rilling
     [not found] ` <1276706068-18567-1-git-send-email-louis.rilling-aw0BnHfMbSpBDgjK7y7TUQ@public.gmane.org>
2010-06-17  9:53   ` Pavel Emelyanov
2010-06-17  9:53     ` Pavel Emelyanov
2010-06-17 13:41     ` Eric W. Biederman
2010-06-17 14:20       ` Louis Rilling
2010-06-17 21:36       ` Oleg Nesterov
2010-06-18  8:27         ` Louis Rilling
2010-06-18 16:27           ` Oleg Nesterov
2010-06-21 11:11             ` Louis Rilling
2010-06-21 12:58               ` Eric W. Biederman
2010-06-21 14:15                 ` Louis Rilling
2010-06-21 14:26                   ` Eric W. Biederman
     [not found]           ` <20100618082738.GE16877-Hu8+6S1rdjywhHL9vcZdMVaTQe2KTcn/@public.gmane.org>
2010-06-18 16:27             ` Oleg Nesterov
2010-06-17 21:20 ` Oleg Nesterov
2010-06-18  8:20   ` Louis Rilling
2010-06-18 11:15     ` Oleg Nesterov
2010-06-18 16:08       ` Oleg Nesterov
2010-06-18 16:08         ` Oleg Nesterov
2010-06-18 17:33         ` Louis Rilling
2010-06-18 17:55           ` Oleg Nesterov
2010-06-18 17:55             ` Oleg Nesterov
2010-06-18 21:23             ` Oleg Nesterov
2010-06-18 21:23               ` Oleg Nesterov
2010-06-21 11:09             ` Louis Rilling
2010-06-21 11:15             ` Louis Rilling
2010-06-21 14:38               ` Oleg Nesterov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.