[PATCH] pid_ns: Fix race between setns'ed fork() and zap_pid_ns_processes()

* [PATCH] pid_ns: Fix race between setns'ed fork() and zap_pid_ns_processes()
@ 2017-05-12 13:07 Kirill Tkhai
  2017-05-12 14:26 ` Eric W. Biederman
  2017-05-12 15:40 ` Oleg Nesterov
  0 siblings, 2 replies; 6+ messages in thread
From: Kirill Tkhai @ 2017-05-12 13:07 UTC (permalink / raw)
  To: mhocko, avagin, peterz, oleg, linux-kernel, rppt, ebiederm, luto,
	gorcunov, akpm, mingo, ktkhai, serge

Imagine we have a pid namespace and a task from its parent's pid_ns,
which made setns() to the pid namespace. The task is doing fork(),
while the pid namespace's child reaper is dying. We have the race
between them:

Task from parent pid_ns             Child reaper
copy_process()                      ..
  alloc_pid()                       ..
  ..                                zap_pid_ns_processes()
  ..                                  disable_pid_allocation()
  ..                                  read_lock(&tasklist_lock)
  ..                                  iterate over pids in pid_ns
  ..                                    kill tasks linked to pids
  ..                                  read_unlock(&tasklist_lock)
  write_lock_irq(&tasklist_lock);   ..
  attach_pid(p, PIDTYPE_PID);       ..
  ..                                ..

So, just created task p won't receive SIGKILL signal,
and the pid namespace will be in contradictory state.
Only manual kill will help there, but does the userspace
care about this? I suppose, the most users just inject
a task into a pid namespace and wait a SIGCHLD from it.

The patch fixes the problem. It moves disable_pid_allocation()
into find_child_reaper() where tasklist_lock is held,
and this allows to simply check for (pid_ns->nr_hashed & PIDNS_HASH_ADDING)
in copy_process(). If allocation is disabled, we just
return -ENOMEM like it's made for such cases in alloc_pid().

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Ingo Molnar <mingo@kernel.org>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Oleg Nesterov <oleg@redhat.com>
CC: Mike Rapoport <rppt@linux.vnet.ibm.com>
CC: Michal Hocko <mhocko@suse.com>
CC: Andy Lutomirski <luto@kernel.org>
CC: "Eric W. Biederman" <ebiederm@xmission.com>
CC: Andrei Vagin <avagin@openvz.org>
CC: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Serge Hallyn <serge@hallyn.com>
---
 kernel/exit.c          |    2 ++
 kernel/fork.c          |   15 ++++++++++-----
 kernel/pid_namespace.c |    3 ---
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index 516acdb0e0ec..9310e69fbc5f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -586,6 +586,8 @@ static struct task_struct *find_child_reaper(struct task_struct *father)
 		return reaper;
 	}
 
+	/* Don't allow any more processes into the pid namespace */
+	disable_pid_allocation(pid_ns);
 	write_unlock_irq(&tasklist_lock);
 	if (unlikely(pid_ns == &init_pid_ns)) {
 		panic("Attempted to kill init! exitcode=0x%08x\n",
diff --git a/kernel/fork.c b/kernel/fork.c
index bfd91b180778..dbafabf6c7b1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1523,6 +1523,7 @@ static __latent_entropy struct task_struct *copy_process(
 					unsigned long tls,
 					int node)
 {
+	struct pid_namespace *pid_ns;
 	int retval;
 	struct task_struct *p;
 
@@ -1735,8 +1736,9 @@ static __latent_entropy struct task_struct *copy_process(
 	if (retval)
 		goto bad_fork_cleanup_io;
 
+	pid_ns = p->nsproxy->pid_ns_for_children;
 	if (pid != &init_struct_pid) {
-		pid = alloc_pid(p->nsproxy->pid_ns_for_children);
+		pid = alloc_pid(pid_ns);
 		if (IS_ERR(pid)) {
 			retval = PTR_ERR(pid);
 			goto bad_fork_cleanup_thread;
@@ -1845,10 +1847,11 @@ static __latent_entropy struct task_struct *copy_process(
 	*/
 	recalc_sigpending();
 	if (signal_pending(current)) {
-		spin_unlock(&current->sighand->siglock);
-		write_unlock_irq(&tasklist_lock);
 		retval = -ERESTARTNOINTR;
-		goto bad_fork_cancel_cgroup;
+		goto bad_fork_unlock_siglock;
+	} else if (unlikely(!(pid_ns->nr_hashed & PIDNS_HASH_ADDING))) {
+		retval = -ENOMEM;
+		goto bad_fork_unlock_siglock;
 	}
 
 	if (likely(p->pid)) {
@@ -1906,7 +1909,9 @@ static __latent_entropy struct task_struct *copy_process(
 
 	return p;
 
-bad_fork_cancel_cgroup:
+bad_fork_unlock_siglock:
+	spin_unlock(&current->sighand->siglock);
+	write_unlock_irq(&tasklist_lock);
 	cgroup_cancel_fork(p);
 bad_fork_free_pid:
 	cgroup_threadgroup_change_end(current);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index d1f3e9f558b8..aedf86a8017e 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -210,9 +210,6 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	struct task_struct *task, *me = current;
 	int init_pids = thread_group_leader(me) ? 1 : 2;
 
-	/* Don't allow any more processes into the pid namespace */
-	disable_pid_allocation(pid_ns);
-
 	/*
 	 * Ignore SIGCHLD causing any terminated children to autoreap.
 	 * This speeds up the namespace shutdown, plus see the comment

^ permalink raw reply related	[flat|nested] 6+ messages in thread