linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [path][rfc] add PR_DETACH prctl command
@ 2011-02-23 13:50 Stas Sergeev
  2011-02-23 19:14 ` Oleg Nesterov
  2011-04-08 18:13 ` [path][rfc] add PR_DETACH prctl command Bryan Donlan
  0 siblings, 2 replies; 57+ messages in thread
From: Stas Sergeev @ 2011-02-23 13:50 UTC (permalink / raw)
  To: Linux kernel; +Cc: Oleg Nesterov

[-- Attachment #1: Type: text/plain, Size: 451 bytes --]

Hi.

The attched patch adds the PR_DETACH prctl command.
It is needed for those rare but unfortunate cases, where
you can't daemonize your process before creating a thread.
The effect of this command is similar to the fork() and then
exit() on parent, except that:
1. PID does not change
2. Threads are not destroyed

It would be nice to know what people think about such an
approach.

Signed-off-by: stsp@aknet.ru
CC: Oleg Nesterov <oleg@redhat.com>

[-- Attachment #2: pr_detach.diff --]
[-- Type: text/plain, Size: 5514 bytes --]

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 7c99c1c..ccccfa8 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -462,7 +462,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	/* convert nsec -> ticks */
 	start_time = nsec_to_clock_t(start_time);
 
-	seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \
+	seq_printf(m, "%d (%s) %c %d %d %d %d %d %llu %lu \
 %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
 %lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n",
 		pid_nr_ns(pid, ns),
diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
index 942d30b..1da9c20 100644
--- a/include/asm-generic/siginfo.h
+++ b/include/asm-generic/siginfo.h
@@ -218,7 +218,8 @@ typedef struct siginfo {
 #define CLD_TRAPPED	(__SI_CHLD|4)	/* traced child has trapped */
 #define CLD_STOPPED	(__SI_CHLD|5)	/* child has stopped */
 #define CLD_CONTINUED	(__SI_CHLD|6)	/* stopped child has continued */
-#define NSIGCHLD	6
+#define CLD_DETACHED	(__SI_CHLD|7)	/* child has detached */
+#define NSIGCHLD	7
 
 /*
  * SIGPOLL si_codes
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index a3baeb2..fbd2451 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -102,4 +102,6 @@
 
 #define PR_MCE_KILL_GET 34
 
+#define PR_DETACH 35
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 777d8a5..75c977e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1194,7 +1194,7 @@ struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	void *stack;
 	atomic_t usage;
-	unsigned int flags;	/* per process flags, defined below */
+	u64 flags;	/* per process flags, defined below */
 	unsigned int ptrace;
 
 	int lock_depth;		/* BKL lock depth */
@@ -1746,6 +1746,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
 #define PF_FREEZER_SKIP	0x40000000	/* Freezer should not count it as freezable */
 #define PF_FREEZER_NOSIG 0x80000000	/* Freezer won't send signals to it */
+#define PF_DETACH	 0x100000000ULL	/* Detach from parent */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 156cc55..f11c1ca 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -181,7 +181,7 @@ static inline void check_for_tasks(int cpu)
 		    (!cputime_eq(p->utime, cputime_zero) ||
 		     !cputime_eq(p->stime, cputime_zero)))
 			printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
-				"(state = %ld, flags = %x)\n",
+				"(state = %ld, flags = %llx)\n",
 				p->comm, task_pid_nr(p), cpu,
 				p->state, p->flags);
 	}
diff --git a/kernel/exit.c b/kernel/exit.c
index f9a45eb..2c8f050 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1507,6 +1507,38 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
 	return retval;
 }
 
+static int wait_task_detached(struct wait_opts *wo, struct task_struct *p)
+{
+	int retval = 0;
+	pid_t pid = task_pid_vnr(p);
+	uid_t uid = __task_cred(p)->uid;
+
+	get_task_struct(p);
+	if (unlikely(wo->wo_flags & WNOWAIT)) {
+		read_unlock(&tasklist_lock);
+		return wait_noreap_copyout(wo, p, pid, uid, CLD_DETACHED,
+			p->exit_code >> 8);
+	}
+
+	p->flags &= ~PF_DETACH;
+	if (!ptrace_reparented(p))
+		p->parent = init_pid_ns.child_reaper;
+	p->real_parent = init_pid_ns.child_reaper;
+	p->exit_signal = SIGCHLD;
+	list_move_tail(&p->sibling, &p->real_parent->children);
+
+	read_unlock(&tasklist_lock);
+	if (wo->wo_stat)
+		retval = put_user(p->exit_code, wo->wo_stat);
+
+	if (!retval)
+		retval = wait_noreap_copyout(wo, p, pid, uid, CLD_DETACHED,
+			p->exit_code >> 8);
+	else
+		put_task_struct(p);
+	return retval;
+}
+
 /*
  * Consider @p for a wait by @parent.
  *
@@ -1549,6 +1581,9 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 	if (p->exit_state == EXIT_DEAD)
 		return 0;
 
+	if (p->flags & PF_DETACH)
+		return wait_task_detached(wo, p);
+
 	/*
 	 * We don't reap group leaders with subthreads.
 	 */
diff --git a/kernel/signal.c b/kernel/signal.c
index 4e3cff1..2cd495a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1450,10 +1450,10 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 
 	BUG_ON(sig == -1);
 
- 	/* do_notify_parent_cldstop should have been called instead.  */
- 	BUG_ON(task_is_stopped_or_traced(tsk));
+	/* do_notify_parent_cldstop should have been called instead.  */
+	BUG_ON(task_is_stopped_or_traced(tsk));
 
-	BUG_ON(!task_ptrace(tsk) &&
+	BUG_ON(!task_ptrace(tsk) && (tsk->flags & PF_EXITING) &&
 	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
 
 	info.si_signo = sig;
diff --git a/kernel/sys.c b/kernel/sys.c
index 18da702..c09205f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1736,6 +1736,22 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			else
 				error = PR_MCE_KILL_DEFAULT;
 			break;
+		case PR_DETACH:
+			error = -EPERM;
+			/* if parent is init, or not a group leader - bail */
+			if (me->real_parent == init_pid_ns.child_reaper)
+				break;
+			if (me->group_leader != me)
+				break;
+			if (arg2 & ~0xff)
+				break;
+			write_lock_irq(&tasklist_lock);
+			me->exit_code = arg2 << 8;
+			me->flags |= PF_DETACH;
+			do_notify_parent(me, me->exit_signal);
+			write_unlock_irq(&tasklist_lock);
+			error = 0;
+			break;
 		default:
 			error = -EINVAL;
 			break;

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-02-23 13:50 [path][rfc] add PR_DETACH prctl command Stas Sergeev
@ 2011-02-23 19:14 ` Oleg Nesterov
  2011-02-23 20:35   ` Stas Sergeev
  2011-04-08 18:13 ` [path][rfc] add PR_DETACH prctl command Bryan Donlan
  1 sibling, 1 reply; 57+ messages in thread
From: Oleg Nesterov @ 2011-02-23 19:14 UTC (permalink / raw)
  To: Stas Sergeev; +Cc: Linux kernel

On 02/23, Stas Sergeev wrote:
>
> Hi.
>
> The attched patch adds the PR_DETACH prctl command.

Hi. The patch doesn't look right at first glance, but to me
this is not the main problem.

> It is needed for those rare but unfortunate cases, where
> you can't daemonize your process before creating a thread.
> The effect of this command is similar to the fork() and then
> exit() on parent, except that:
> 1. PID does not change
> 2. Threads are not destroyed
>
> It would be nice to know what people think about such an
> approach.

Well. You should somehow convince people we need this ;) This is
the main problem.

I am not going to discuss this, I never know when it comes to the
new feautures. And you need the authoritative ack, probably you
can ask Linus + Roland directly.

As for the patch itself,

> +static int wait_task_detached(struct wait_opts *wo, struct task_struct *p)
> +{
> +	int retval = 0;
> +	pid_t pid = task_pid_vnr(p);
> +	uid_t uid = __task_cred(p)->uid;
> +
> +	get_task_struct(p);
> +	if (unlikely(wo->wo_flags & WNOWAIT)) {
> +		read_unlock(&tasklist_lock);
> +		return wait_noreap_copyout(wo, p, pid, uid, CLD_DETACHED,
> +			p->exit_code >> 8);
> +	}
> +
> +	p->flags &= ~PF_DETACH;

Only current can change its ->flags, this is racy

> +	if (!ptrace_reparented(p))
> +		p->parent = init_pid_ns.child_reaper;
> +	p->real_parent = init_pid_ns.child_reaper;
> +	p->exit_signal = SIGCHLD;
> +	list_move_tail(&p->sibling, &p->real_parent->children);

No, we can't do this under read_lock(tasklist). And you forgot about
threads, they also have ->real_parent == old_parent.

The usage of ->exit_code doesn't look right, espeicaily if it is traced.

And other problems afaics....

> @@ -1549,6 +1581,9 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
>  	if (p->exit_state == EXIT_DEAD)
>  		return 0;
>
> +	if (p->flags & PF_DETACH)
> +		return wait_task_detached(wo, p);

What if it is already dead? We are goint to reparent it, but init
won't notice the new zombie.

And what if do_wait() was called without WEXITED? say, the old parent
does waitpid(WSTOPPED).

> @@ -1450,10 +1450,10 @@ int do_notify_parent(struct task_struct *tsk, int sig)
>  
>  	BUG_ON(sig == -1);
>  
> - 	/* do_notify_parent_cldstop should have been called instead.  */
> - 	BUG_ON(task_is_stopped_or_traced(tsk));
> +	/* do_notify_parent_cldstop should have been called instead.  */
> +	BUG_ON(task_is_stopped_or_traced(tsk));
>  
> -	BUG_ON(!task_ptrace(tsk) &&
> +	BUG_ON(!task_ptrace(tsk) && (tsk->flags & PF_EXITING) &&
>  	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));

Afaics, you are trying to hide the problem.... The code below can make
tsk detached if real_parent ignores SIGCHLD.

> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -1736,6 +1736,22 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>  			else
>  				error = PR_MCE_KILL_DEFAULT;
>  			break;
> +		case PR_DETACH:
> +			error = -EPERM;
> +			/* if parent is init, or not a group leader - bail */
> +			if (me->real_parent == init_pid_ns.child_reaper)

This is not exactly right. What if the child of init's sub-thread
does PR_DETACH?

Oleg.


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-02-23 19:14 ` Oleg Nesterov
@ 2011-02-23 20:35   ` Stas Sergeev
  2011-02-24 13:29     ` Oleg Nesterov
  0 siblings, 1 reply; 57+ messages in thread
From: Stas Sergeev @ 2011-02-23 20:35 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Linux kernel

23.02.2011 22:14, Oleg Nesterov wrote:
> The attched patch adds the PR_DETACH prctl command.
> Hi. The patch doesn't look right at first glance,
I know, thanks for the review. That's basically an RFC material.

> Well. You should somehow convince people we need this ;)
I don't have to: I need this patch, and that's the main motivation
for me. :) Though of course I'll offer it for inclusion when its ready,
but I am developing it mostly for my project.
google reveals that many people were confused by the fact that
daemon() silently drops all the threads, and the man page says
nothing about that nasty habit. And I really think there is no other
way to daemonize the process with threads, than to use something
like this patch, or is there?

> Only current can change its ->flags, this is racy
Oh my, add a new lock only for that? :((
Add another thread_struct member only for that?
Abuse ->exit_state only for that?
Nothing looks good...

>> +	if (!ptrace_reparented(p))
>> +		p->parent = init_pid_ns.child_reaper;
>> +	p->real_parent = init_pid_ns.child_reaper;
>> +	p->exit_signal = SIGCHLD;
>> +	list_move_tail(&p->sibling,&p->real_parent->children);
> No, we can't do this under read_lock(tasklist). And you forgot about
> threads, they also have ->real_parent == old_parent.
Thanks, will fix.

> The usage of ->exit_code doesn't look right, espeicaily if it is traced.
Could you please elaborate on that? I am using the
->exit_code to pass the (fake) exit code to the parent.
The argument of my PR_DETACH is an exit code to pass.
What is a problem with that?

> What if it is already dead? We are goint to reparent it, but init
> won't notice the new zombie.
>
> And what if do_wait() was called without WEXITED? say, the old parent
> does waitpid(WSTOPPED).
Will fix.

>> @@ -1450,10 +1450,10 @@ int do_notify_parent(struct task_struct *tsk, int sig)
>>
>>   	BUG_ON(sig == -1);
>>
>> - 	/* do_notify_parent_cldstop should have been called instead.  */
>> - 	BUG_ON(task_is_stopped_or_traced(tsk));
>> +	/* do_notify_parent_cldstop should have been called instead.  */
>> +	BUG_ON(task_is_stopped_or_traced(tsk));
>>
>> -	BUG_ON(!task_ptrace(tsk)&&
>> +	BUG_ON(!task_ptrace(tsk)&&  (tsk->flags&  PF_EXITING)&&
>>   	(tsk->group_leader != tsk || !thread_group_empty(tsk)));
> Afaics, you are trying to hide the problem.... The code below can make
> tsk detached if real_parent ignores SIGCHLD.
Will fix the problem with parent ignoring SIGCHLD, thanks.
Though could you please clarify whether or not you see the
above hunk wrong? It is there just because the group is not
empty when the leader does PR_DETACH, so I adjusted the
sanity check.

>> --- a/kernel/sys.c
>> +++ b/kernel/sys.c
>> @@ -1736,6 +1736,22 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>>   			else
>>   				error = PR_MCE_KILL_DEFAULT;
>>   			break;
>> +		case PR_DETACH:
>> +			error = -EPERM;
>> +			/* if parent is init, or not a group leader - bail */
>> +			if (me->real_parent == init_pid_ns.child_reaper)
> This is not exactly right. What if the child of init's sub-thread
> does PR_DETACH?
Will fix.

Thanks for your review! I'll update the patch.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-02-23 20:35   ` Stas Sergeev
@ 2011-02-24 13:29     ` Oleg Nesterov
  2011-02-24 15:13       ` Stas Sergeev
  0 siblings, 1 reply; 57+ messages in thread
From: Oleg Nesterov @ 2011-02-24 13:29 UTC (permalink / raw)
  To: Stas Sergeev; +Cc: Linux kernel

On 02/23, Stas Sergeev wrote:
>
> 23.02.2011 22:14, Oleg Nesterov wrote:
>
>> Well. You should somehow convince people we need this ;)
> I don't have to: I need this patch, and that's the main motivation
> for me. :)

OK,

> Though of course I'll offer it for inclusion when its ready,
> but I am developing it mostly for my project.
> google reveals that many people were confused by the fact that
> daemon() silently drops all the threads, and the man page says
> nothing about that nasty habit.

I think it does... from man daemon

	This function forks,  and  if  the  fork(2)  succeeds,
	the  parent  calls _exit(2),

and of course fork() doesn't clone the sub-threads.

> And I really think there is no other
> way to daemonize the process with threads, than to use something
> like this patch, or is there?

Depends on what "daemonize" means. Even with this patch, setsid()
after PR_DETACH can fail because we do not change the pids and
the caller can still be pgrp leader.

And. What if the parent of PR_DETACH caller blocks or ignores
SIGCHLD or simply doesn't call do_wait()? The caller can run with
PR_DETACH set without any effect "forever".

There are other problems. For example, if the caller is ptraced
it can silently disappear from parent's radar. If you fix this, then
the actual reparenting can be delayed unpredictably whatever the
real parent does.

So, to be honest, I do not think this idea will be accepted, and I don't
really understand your motivation. But once again, I never argue with the
"we need this feature" requests, no need to convince me.

>> Only current can change its ->flags, this is racy
> Oh my, add a new lock only for that? :((
> Add another thread_struct member only for that?
> Abuse ->exit_state only for that?
> Nothing looks good...

Yep. But this is minor, if nothing else you can use signal->flags.
In fact, I do not understand why only the group leader can do
prctl(PR_DETACH) and why this flag is per-thread.

>> The usage of ->exit_code doesn't look right, espeicaily if it is traced.
> Could you please elaborate on that? I am using the
> ->exit_code to pass the (fake) exit code to the parent.
> The argument of my PR_DETACH is an exit code to pass.
> What is a problem with that?

The problem is that ptrace uses this ->exit_code member as well.
Suppose that the (ptraced) task calls PR_DETACH and, say, recieves
a signal after that. See ptrace_signal().

>>> @@ -1450,10 +1450,10 @@ int do_notify_parent(struct task_struct *tsk, int sig)
>>>
>>>   	BUG_ON(sig == -1);
>>>
>>> - 	/* do_notify_parent_cldstop should have been called instead.  */
>>> - 	BUG_ON(task_is_stopped_or_traced(tsk));
>>> +	/* do_notify_parent_cldstop should have been called instead.  */
>>> +	BUG_ON(task_is_stopped_or_traced(tsk));
>>>
>>> -	BUG_ON(!task_ptrace(tsk)&&
>>> +	BUG_ON(!task_ptrace(tsk)&&  (tsk->flags&  PF_EXITING)&&
>>>   	(tsk->group_leader != tsk || !thread_group_empty(tsk)));
>> Afaics, you are trying to hide the problem.... The code below can make
>> tsk detached if real_parent ignores SIGCHLD.
> Will fix the problem with parent ignoring SIGCHLD, thanks.
> Though could you please clarify whether or not you see the
> above hunk wrong? It is there just because the group is not
> empty when the leader does PR_DETACH, so I adjusted the
> sanity check.

I understand why you added PF_EXITING. And, once again, this is not
right afaics. The current condition is more or less "random" and mostly
historical, although correct. If we want to take PF_EXITING into account,
we should just add BUG_ON(!(tsk->flags & PF_EXITING)). IOW, it is just
wrong to call this function unless this tsk exits.

Oleg.


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-02-24 13:29     ` Oleg Nesterov
@ 2011-02-24 15:13       ` Stas Sergeev
  2011-02-24 15:32         ` Oleg Nesterov
  0 siblings, 1 reply; 57+ messages in thread
From: Stas Sergeev @ 2011-02-24 15:13 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Linux kernel

24.02.2011 16:29, Oleg Nesterov wrote:
>> And I really think there is no other
>> way to daemonize the process with threads, than to use something
>> like this patch, or is there?
> Depends on what "daemonize" means. Even with this patch, setsid()
> after PR_DETACH can fail because we do not change the pids and
> the caller can still be pgrp leader.
Yes, I am using TIOCNOTTY ioctl instead.
It doesn't detach the entire group from tty though, but the plan
is to implement also TIOCNOTTY_GRP, in case PR_DETACH is done.

> And. What if the parent of PR_DETACH caller blocks or ignores
> SIGCHLD or simply doesn't call do_wait()? The caller can run with
> PR_DETACH set without any effect "forever".
I am currently rewriting the patch to solve this all.
What I am trying to do now, is to reparent directly in prctl(),
but delay the list_move_tail(&p->sibling, &p->real_parent->children);
to the wait() call. If this is a feasible solution, I'll post the new patch.

> So, to be honest, I do not think this idea will be accepted, and I don't
> really understand your motivation. But once again, I never argue with the
> "we need this feature" requests, no need to convince me.
Lets see if the clean implementation is possible first. :)

> The problem is that ptrace uses this ->exit_code member as well.
> Suppose that the (ptraced) task calls PR_DETACH and, say, recieves
> a signal after that. See ptrace_signal().
Also do_signal_stop() seems to alter it.
Do you mean right now it can't happen that multiple events
alter the exit_code, and the parent notices only the last one?
In this case I need to add a separate variable.

> I understand why you added PF_EXITING. And, once again, this is not
> right afaics. The current condition is more or less "random" and mostly
> historical, although correct. If we want to take PF_EXITING into account,
> we should just add BUG_ON(!(tsk->flags&  PF_EXITING)). IOW, it is just
> wrong to call this function unless this tsk exits.
OK, I'll address also this.

Thanks for your time, I am hoping to post the patch that addresses
the pointed problems.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-02-24 15:13       ` Stas Sergeev
@ 2011-02-24 15:32         ` Oleg Nesterov
  2011-03-31 16:10           ` Stas Sergeev
  0 siblings, 1 reply; 57+ messages in thread
From: Oleg Nesterov @ 2011-02-24 15:32 UTC (permalink / raw)
  To: Stas Sergeev; +Cc: Linux kernel

On 02/24, Stas Sergeev wrote:
>
> I am currently rewriting the patch to solve this all.
> What I am trying to do now, is to reparent directly in prctl(),
> but delay the list_move_tail(&p->sibling, &p->real_parent->children);
> to the wait() call.

Ooh... good luck ;) but I expect this won't be simple. In fact,
I do not understand how it is possible to do this correctly.

>> The problem is that ptrace uses this ->exit_code member as well.
>> Suppose that the (ptraced) task calls PR_DETACH and, say, recieves
>> a signal after that. See ptrace_signal().
> Also do_signal_stop() seems to alter it.

Yes. It is not immediately obvious but this is in fact for ptrace.
Even if this thread is not traced, the tracer can attach later.

> Do you mean right now it can't happen that multiple events
> alter the exit_code, and the parent notices only the last one?

Yes.

> In this case I need to add a separate variable.

I'd suggest you to always report 0 as the exit status. Much simpler.
Then you can do another patch if you really want to report arg2.

Oleg.


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-02-24 15:32         ` Oleg Nesterov
@ 2011-03-31 16:10           ` Stas Sergeev
  2011-03-31 17:02             ` Oleg Nesterov
  0 siblings, 1 reply; 57+ messages in thread
From: Stas Sergeev @ 2011-03-31 16:10 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Linux kernel

[-- Attachment #1: Type: text/plain, Size: 188 bytes --]

Hi Oleg.

I found some time to get back to that patch and
to address all of the problems you pointed.
What do you think about the attached patch?
I didn't expect it would became that big.

[-- Attachment #2: pr_detach.diff --]
[-- Type: text/plain, Size: 19287 bytes --]

commit 1a19a1ed5f1ab86e3fb029f201383627a6b2bbd5
Author: Stas <stas@stas.(none)>
Date:   Thu Mar 31 19:58:17 2011 +0400

    implement PR_DETACH

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 7c99c1c..77df70d 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -139,9 +139,10 @@ static const char *task_state_array[] = {
 	"t (tracing stop)",	/*   8 */
 	"Z (zombie)",		/*  16 */
 	"X (dead)",		/*  32 */
-	"x (dead)",		/*  64 */
-	"K (wakekill)",		/* 128 */
-	"W (waking)",		/* 256 */
+	"d (detached)",		/*  64 */
+	"x (dead)",		/* 128 */
+	"K (wakekill)",		/* 256 */
+	"W (waking)",		/* 512 */
 };
 
 static inline const char *get_task_state(struct task_struct *tsk)
diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
index 942d30b..1da9c20 100644
--- a/include/asm-generic/siginfo.h
+++ b/include/asm-generic/siginfo.h
@@ -218,7 +218,8 @@ typedef struct siginfo {
 #define CLD_TRAPPED	(__SI_CHLD|4)	/* traced child has trapped */
 #define CLD_STOPPED	(__SI_CHLD|5)	/* child has stopped */
 #define CLD_CONTINUED	(__SI_CHLD|6)	/* stopped child has continued */
-#define NSIGCHLD	6
+#define CLD_DETACHED	(__SI_CHLD|7)	/* child has detached */
+#define NSIGCHLD	7
 
 /*
  * SIGPOLL si_codes
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index caa151f..fdf71a9 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -158,6 +158,8 @@ extern struct cred init_cred;
 	.parent		= &tsk,						\
 	.children	= LIST_HEAD_INIT(tsk.children),			\
 	.sibling	= LIST_HEAD_INIT(tsk.sibling),			\
+	.detached_children	= LIST_HEAD_INIT(tsk.detached_children),\
+	.detached_sibling	= LIST_HEAD_INIT(tsk.detached_sibling),	\
 	.group_leader	= &tsk,						\
 	RCU_INIT_POINTER(.real_cred, &init_cred),			\
 	RCU_INIT_POINTER(.cred, &init_cred),				\
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index a3baeb2..fbd2451 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -102,4 +102,6 @@
 
 #define PR_MCE_KILL_GET 34
 
+#define PR_DETACH 35
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 777d8a5..eb99afb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -186,13 +186,14 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 /* in tsk->exit_state */
 #define EXIT_ZOMBIE		16
 #define EXIT_DEAD		32
+#define EXIT_DETACHED		64
 /* in tsk->state again */
-#define TASK_DEAD		64
-#define TASK_WAKEKILL		128
-#define TASK_WAKING		256
-#define TASK_STATE_MAX		512
+#define TASK_DEAD		128
+#define TASK_WAKEKILL		256
+#define TASK_WAKING		512
+#define TASK_STATE_MAX		1024
 
-#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW"
+#define TASK_STATE_TO_CHAR_STR "RSDTtZXdxKW"
 
 extern char ___assert_task_state[1 - 2*!!(
 		sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
@@ -1260,6 +1261,8 @@ struct task_struct {
 /* task state */
 	int exit_state;
 	int exit_code, exit_signal;
+	int exit_flags;
+	int detach_code;
 	int pdeath_signal;  /*  The signal sent when the parent dies  */
 	/* ??? */
 	unsigned int personality;
@@ -1292,7 +1295,10 @@ struct task_struct {
 	 */
 	struct list_head children;	/* list of my children */
 	struct list_head sibling;	/* linkage in my parent's children list */
+	struct list_head detached_children;	/* list of my detached children */
+	struct list_head detached_sibling;	/* linkage in my parent's detached children list */
 	struct task_struct *group_leader;	/* threadgroup leader */
+	int num_waiters;			/* detached task may have 2 */
 
 	/*
 	 * ptraced is the list of tasks this task is using ptrace on.
@@ -1747,6 +1753,10 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #define PF_FREEZER_SKIP	0x40000000	/* Freezer should not count it as freezable */
 #define PF_FREEZER_NOSIG 0x80000000	/* Freezer won't send signals to it */
 
+/* exit flags */
+#define EF_RETCODE_READ	0x00000001	/* parent read(ed) exit code */
+#define EF_DCODE_READ	0x00000002	/* parent read(ed) detach code */
+
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
  * tasks can access tsk->flags in readonly mode for example
@@ -2096,6 +2106,7 @@ extern int kill_pgrp(struct pid *pid, int sig, int priv);
 extern int kill_pid(struct pid *pid, int sig, int priv);
 extern int kill_proc_info(int, struct siginfo *, pid_t);
 extern int do_notify_parent(struct task_struct *, int);
+extern int do_signal_parent(struct task_struct *, int, int, int);
 extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
 extern void force_sig(int, struct task_struct *);
 extern int send_sig(int, struct task_struct *, int);
diff --git a/kernel/exit.c b/kernel/exit.c
index f9a45eb..26d162e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -69,6 +69,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 
 		list_del_rcu(&p->tasks);
 		list_del_init(&p->sibling);
+		list_del_init(&p->detached_sibling);
 		__this_cpu_dec(process_counts);
 	}
 	list_del_rcu(&p->thread_group);
@@ -804,12 +805,28 @@ static void forget_original_parent(struct task_struct *father)
 		} while_each_thread(p, t);
 		reparent_leader(father, p, &dead_children);
 	}
+	list_for_each_entry(p, &father->detached_children, detached_sibling) {
+		BUG_ON(p->num_waiters == 0);
+		/* see if original parent didn't care to read detach code */
+		if (!(p->exit_flags & EF_DCODE_READ))
+			p->num_waiters--;
+		if (p->exit_state == EXIT_DETACHED) {
+			BUG_ON(p->num_waiters != 1);
+			/* continue as normal task */
+			p->exit_state = 0;
+		} else if (p->exit_state == EXIT_ZOMBIE && !p->num_waiters) {
+			BUG_ON(!(p->exit_flags & EF_RETCODE_READ));
+			p->exit_state = EXIT_DEAD;
+			list_move_tail(&p->sibling, &dead_children);
+		}
+	}
 	write_unlock_irq(&tasklist_lock);
 
 	BUG_ON(!list_empty(&father->children));
 
 	list_for_each_entry_safe(p, n, &dead_children, sibling) {
 		list_del_init(&p->sibling);
+		list_del_init(&p->detached_sibling);
 		release_task(p);
 	}
 }
@@ -861,7 +878,11 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	if (signal >= 0)
 		signal = do_notify_parent(tsk, signal);
 
-	tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
+	/* EXIT_DETACHED case means that the previous parent still alive */
+	if (tsk->exit_state == EXIT_DETACHED || signal != DEATH_REAP)
+		tsk->exit_state = EXIT_ZOMBIE;
+	else
+		tsk->exit_state = EXIT_DEAD;
 
 	/* mt-exec, de_thread() is waiting for group leader */
 	if (unlikely(tsk->signal->notify_count < 0))
@@ -1195,14 +1216,25 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
  * the lock and this task is uninteresting.  If we return nonzero, we have
  * released the lock and the system call should return.
  */
-static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
+static int _wait_task_zombie(struct wait_opts *wo, struct task_struct *p,
+	int dcode)
 {
 	unsigned long state;
-	int retval, status, traced;
+	int retval, status, traced, keep_task;
 	pid_t pid = task_pid_vnr(p);
 	uid_t uid = __task_cred(p)->uid;
 	struct siginfo __user *infop;
 
+	/* see if already waited */
+	if (p->exit_flags & (dcode ? EF_DCODE_READ : EF_RETCODE_READ))
+		return 0;
+
+	/*
+	 * We don't reap group leaders with subthreads.
+	 */
+	if (delay_group_leader(p))
+		return 0;
+
 	if (!likely(wo->wo_flags & WEXITED))
 		return 0;
 
@@ -1309,8 +1341,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 
 	retval = wo->wo_rusage
 		? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
-	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
-		? p->signal->group_exit_code : p->exit_code;
+	if (!dcode)
+		status = (p->signal->flags & SIGNAL_GROUP_EXIT)
+			? p->signal->group_exit_code : p->exit_code;
+	else
+		status = p->detach_code;
 	if (!retval && wo->wo_stat)
 		retval = put_user(status, wo->wo_stat);
 
@@ -1340,8 +1375,18 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 	if (!retval)
 		retval = pid;
 
+	keep_task = 0;
+	write_lock_irq(&tasklist_lock);
+	p->exit_flags |= (dcode ? EF_DCODE_READ : EF_RETCODE_READ);
+	p->num_waiters--;
+
+	if (p->num_waiters > 0) {
+		/* not all waiters are satisfied yet */
+		p->exit_state = EXIT_ZOMBIE;
+		keep_task = 1;
+	}
+
 	if (traced) {
-		write_lock_irq(&tasklist_lock);
 		/* We dropped tasklist, ptracer could die and untrace */
 		ptrace_unlink(p);
 		/*
@@ -1353,17 +1398,23 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 			do_notify_parent(p, p->exit_signal);
 			if (!task_detached(p)) {
 				p->exit_state = EXIT_ZOMBIE;
-				p = NULL;
+				keep_task = 1;
 			}
 		}
-		write_unlock_irq(&tasklist_lock);
 	}
-	if (p != NULL)
+	write_unlock_irq(&tasklist_lock);
+
+	if (!keep_task)
 		release_task(p);
 
 	return retval;
 }
 
+static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
+{
+	return _wait_task_zombie(wo, p, 0);
+}
+
 static int *task_stopped_code(struct task_struct *p, bool ptrace)
 {
 	if (ptrace) {
@@ -1507,21 +1558,61 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
 	return retval;
 }
 
-/*
- * Consider @p for a wait by @parent.
- *
- * -ECHILD should be in ->notask_error before the first call.
- * Returns nonzero for a final return, when we have unlocked tasklist_lock.
- * Returns zero if the search for a child should continue;
- * then ->notask_error is 0 if @p is an eligible child,
- * or another error from security_task_wait(), or still -ECHILD.
- */
-static int wait_consider_task(struct wait_opts *wo, int ptrace,
-				struct task_struct *p)
+static int wait_task_detached(struct wait_opts *wo, struct task_struct *p)
+{
+	int retval = 0;
+	unsigned long state;
+	pid_t pid;
+	uid_t uid;
+
+	if (p->exit_flags & EF_DCODE_READ)
+		return 0;
+
+	if (!likely(wo->wo_flags & WEXITED))
+		return 0;
+
+	if (unlikely(wo->wo_flags & WNOWAIT)) {
+		get_task_struct(p);
+		pid = task_pid_vnr(p);
+		uid = __task_cred(p)->uid;
+		read_unlock(&tasklist_lock);
+		return wait_noreap_copyout(wo, p, pid, uid, CLD_DETACHED,
+			p->detach_code >> 8);
+	}
+
+	state = xchg(&p->exit_state, 0);
+	/* check for race because of read_lock(&tasklist_lock) */
+	if (state != EXIT_DETACHED) {
+		BUG_ON(state != 0);
+		return 0;
+	}
+	get_task_struct(p);
+	read_unlock(&tasklist_lock);
+	if (wo->wo_stat)
+		retval = put_user(p->detach_code, wo->wo_stat);
+
+	if (!retval) {
+		pid = task_pid_vnr(p);
+		uid = __task_cred(p)->uid;
+		retval = wait_noreap_copyout(wo, p, pid, uid, CLD_DETACHED,
+			p->detach_code >> 8);
+	} else {
+		put_task_struct(p);
+	}
+
+	write_lock_irq(&tasklist_lock);
+	p->num_waiters--;
+	p->exit_flags |= EF_DCODE_READ;
+	write_unlock_irq(&tasklist_lock);
+
+	return retval;
+}
+
+static int can_wait_task_common(struct wait_opts *wo, struct task_struct *p)
 {
 	int ret = eligible_child(wo, p);
 	if (!ret)
-		return ret;
+		return 0;
 
 	ret = security_task_wait(p);
 	if (unlikely(ret < 0)) {
@@ -1537,7 +1628,25 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 		return 0;
 	}
 
-	if (likely(!ptrace) && unlikely(task_ptrace(p))) {
+	if (p->exit_state == EXIT_DEAD)
+		return 0;
+
+	return 1;
+}
+
+static int can_wait_task_ptrace(struct wait_opts *wo, struct task_struct *p)
+{
+	/* don't worry, gcc will optimize away this function :) */
+	return can_wait_task_common(wo, p);
+}
+
+static int can_wait_task(struct wait_opts *wo, struct task_struct *p)
+{
+	int ret = can_wait_task_common(wo, p);
+	if (!ret)
+		return 0;
+
+	if (unlikely(task_ptrace(p))) {
 		/*
 		 * This child is hidden by ptrace.
 		 * We aren't allowed to see it now, but eventually we will.
@@ -1546,13 +1655,22 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 		return 0;
 	}
 
-	if (p->exit_state == EXIT_DEAD)
-		return 0;
+	return 1;
+}
 
-	/*
-	 * We don't reap group leaders with subthreads.
-	 */
-	if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
+/*
+ * Consider @p for a wait by @parent.
+ *
+ * -ECHILD should be in ->notask_error before the first call.
+ * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns zero if the search for a child should continue;
+ * then ->notask_error is 0 if @p is an eligible child,
+ * or another error from security_task_wait(), or still -ECHILD.
+ */
+static int wait_consider_task(struct wait_opts *wo, int ptrace,
+				struct task_struct *p)
+{
+	if (p->exit_state == EXIT_ZOMBIE)
 		return wait_task_zombie(wo, p);
 
 	/*
@@ -1578,10 +1696,29 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
  */
 static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
 {
+	int ret;
 	struct task_struct *p;
 
 	list_for_each_entry(p, &tsk->children, sibling) {
-		int ret = wait_consider_task(wo, 0, p);
+		ret = can_wait_task(wo, p);
+		if (!ret)
+			continue;
+		ret = wait_consider_task(wo, 0, p);
+		if (ret)
+			return ret;
+	}
+
+	list_for_each_entry(p, &tsk->detached_children, detached_sibling) {
+		if (p->exit_state != EXIT_DETACHED &&
+				p->exit_state != EXIT_ZOMBIE)
+			continue;
+		ret = can_wait_task(wo, p);
+		if (!ret)
+			continue;
+		if (p->exit_state == EXIT_ZOMBIE)
+			ret = _wait_task_zombie(wo, p, 1);
+		else
+			ret = wait_task_detached(wo, p);
 		if (ret)
 			return ret;
 	}
@@ -1594,7 +1731,10 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
 	struct task_struct *p;
 
 	list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
-		int ret = wait_consider_task(wo, 1, p);
+		int ret = can_wait_task_ptrace(wo, p);
+		if (!ret)
+			continue;
+		ret = wait_consider_task(wo, 1, p);
 		if (ret)
 			return ret;
 	}
diff --git a/kernel/fork.c b/kernel/fork.c
index 25e4291..60166dc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1070,6 +1070,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	copy_flags(clone_flags, p);
 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
+	INIT_LIST_HEAD(&p->detached_children);
+	INIT_LIST_HEAD(&p->detached_sibling);
 	rcu_copy_process(p);
 	p->vfork_done = NULL;
 	spin_lock_init(&p->alloc_lock);
@@ -1233,6 +1235,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
 	p->pdeath_signal = 0;
 	p->exit_state = 0;
+	p->exit_flags = 0;
+	p->num_waiters = 1;
 
 	/*
 	 * Ok, make it visible to the rest of the system.
diff --git a/kernel/signal.c b/kernel/signal.c
index 4e3cff1..54b93c7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1434,14 +1434,8 @@ ret:
 	return ret;
 }
 
-/*
- * Let a parent know about the death of a child.
- * For a stopped/continued status change, use do_notify_parent_cldstop instead.
- *
- * Returns -1 if our parent ignored us and so we've switched to
- * self-reaping, or else @sig.
- */
-int do_notify_parent(struct task_struct *tsk, int sig)
+int do_signal_parent(struct task_struct *tsk, int sig, int sicode,
+	int sistatus)
 {
 	struct siginfo info;
 	unsigned long flags;
@@ -1450,11 +1444,8 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 
 	BUG_ON(sig == -1);
 
- 	/* do_notify_parent_cldstop should have been called instead.  */
- 	BUG_ON(task_is_stopped_or_traced(tsk));
-
-	BUG_ON(!task_ptrace(tsk) &&
-	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
+	/* do_notify_parent_cldstop should have been called instead.  */
+	BUG_ON(task_is_stopped_or_traced(tsk));
 
 	info.si_signo = sig;
 	info.si_errno = 0;
@@ -1480,15 +1471,8 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 	info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
 				tsk->signal->stime));
 
-	info.si_status = tsk->exit_code & 0x7f;
-	if (tsk->exit_code & 0x80)
-		info.si_code = CLD_DUMPED;
-	else if (tsk->exit_code & 0x7f)
-		info.si_code = CLD_KILLED;
-	else {
-		info.si_code = CLD_EXITED;
-		info.si_status = tsk->exit_code >> 8;
-	}
+	info.si_code = sicode;
+	info.si_status = sistatus;
 
 	psig = tsk->parent->sighand;
 	spin_lock_irqsave(&psig->siglock, flags);
@@ -1510,9 +1494,11 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 		 * is implementation-defined: we do (if you don't want
 		 * it, just use SIG_IGN instead).
 		 */
-		ret = tsk->exit_signal = -1;
+		tsk->exit_signal = -1;
 		if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
 			sig = -1;
+		/* reap process now, rather than promoting to zombie */
+		ret = DEATH_REAP;
 	}
 	if (valid_signal(sig) && sig > 0)
 		__group_send_sig_info(sig, &info, tsk->parent);
@@ -1522,6 +1508,33 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 	return ret;
 }
 
+/*
+ * Let a parent know about the death of a child.
+ * For a stopped/continued status change, use do_notify_parent_cldstop instead.
+ *
+ * Returns -1 if our parent ignored us and so we've switched to
+ * self-reaping, or else @sig.
+ */
+int do_notify_parent(struct task_struct *tsk, int sig)
+{
+	int sicode, sistatus;
+
+	BUG_ON(!task_ptrace(tsk) &&
+	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
+
+	sistatus = tsk->exit_code & 0x7f;
+	if (tsk->exit_code & 0x80)
+		sicode = CLD_DUMPED;
+	else if (tsk->exit_code & 0x7f)
+		sicode = CLD_KILLED;
+	else {
+		sicode = CLD_EXITED;
+		sistatus = tsk->exit_code >> 8;
+	}
+
+	return do_signal_parent(tsk, sig, sicode, sistatus);
+}
+
 static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
 {
 	struct siginfo info;
diff --git a/kernel/sys.c b/kernel/sys.c
index 18da702..e5d6332 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -28,6 +28,7 @@
 #include <linux/suspend.h>
 #include <linux/tty.h>
 #include <linux/signal.h>
+#include <linux/tracehook.h>
 #include <linux/cn_proc.h>
 #include <linux/getcpu.h>
 #include <linux/task_io_accounting_ops.h>
@@ -1736,6 +1737,50 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			else
 				error = PR_MCE_KILL_DEFAULT;
 			break;
+		case PR_DETACH: {
+			struct task_struct *p, *old_parent;
+			int notif = DEATH_REAP;
+			error = -EPERM;
+			/* not detaching from init */
+			if (me->real_parent == init_pid_ns.child_reaper)
+				break;
+			if (arg2 & ~0x7f)
+				break;
+			write_lock_irq(&tasklist_lock);
+			old_parent = me->real_parent;
+			me->detach_code = arg2 << 8;
+			if (!task_detached(me))
+				notif = do_signal_parent(me, me->exit_signal,
+					CLD_DETACHED, arg2);
+			if (notif != DEATH_REAP) {
+				list_add_tail(&me->detached_sibling,
+					&me->real_parent->detached_children);
+				me->exit_state = EXIT_DETACHED;
+				me->num_waiters++;
+			} else {
+				me->exit_state = 0;
+			}
+			if (!ptrace_reparented(me))
+				me->parent = init_pid_ns.child_reaper;
+			me->real_parent = init_pid_ns.child_reaper;
+			list_move_tail(&me->sibling,
+					&me->real_parent->children);
+			/* detaching makes us a group leader */
+			me->group_leader = me;
+			/* reparent threads */
+			p = me;
+			while_each_thread(me, p) {
+				if (p->real_parent != old_parent)
+					continue;
+				if (!ptrace_reparented(p))
+					p->parent = init_pid_ns.child_reaper;
+				p->real_parent = init_pid_ns.child_reaper;
+			}
+			me->exit_signal = SIGCHLD;
+			write_unlock_irq(&tasklist_lock);
+			error = 0;
+			break;
+		}
 		default:
 			error = -EINVAL;
 			break;

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-03-31 16:10           ` Stas Sergeev
@ 2011-03-31 17:02             ` Oleg Nesterov
  2011-03-31 17:47               ` Stas Sergeev
                                 ` (2 more replies)
  0 siblings, 3 replies; 57+ messages in thread
From: Oleg Nesterov @ 2011-03-31 17:02 UTC (permalink / raw)
  To: Stas Sergeev; +Cc: Linux kernel

Hi Stas,

On 03/31, Stas Sergeev wrote:
>
> I found some time to get back to that patch and
> to address all of the problems you pointed.
> What do you think about the attached patch?
> I didn't expect it would became that big.

	 fs/proc/array.c               |    7 -
	 include/asm-generic/siginfo.h |    3 
	 include/linux/init_task.h     |    2 
	 include/linux/prctl.h         |    2 
	 include/linux/sched.h         |   21 +++-
	 kernel/exit.c                 |  200 +++++++++++++++++++++++++++++++++++-------
	 kernel/fork.c                 |    4 
	 kernel/signal.c               |   59 +++++++-----
	 kernel/sys.c                  |   45 +++++++++
	 9 files changed, 281 insertions(+), 62 deletions(-)

Eek! Not only it is big. It is complex and changes a lot of core
kernel code.

Sorry Stas, I am not going to try to review it carefully. As I said,
you need to convince lkml we need this feature first. And iirc you
are not going to suggest this change for everyone.

I guess, the main complication is that you are trying to ensure the
old parent can do wait() without -ECHLD... This complicates everything
soooooooooo much. I _feel_ this can be simplified.... but in any case
we need the nasty complications. And for what?


I only looked at sys_prctl() code, and almost every line looks wrong.
Hmm... in fact, the changes in exit.c look wrong too, but I didn't really
try to understand them.

> @@ -1736,6 +1737,50 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>  			else
>  				error = PR_MCE_KILL_DEFAULT;
>  			break;
> +		case PR_DETACH: {
> +			struct task_struct *p, *old_parent;
> +			int notif = DEATH_REAP;
> +			error = -EPERM;
> +			/* not detaching from init */
> +			if (me->real_parent == init_pid_ns.child_reaper)

2 problems. You shouldn't use init_pid_ns, you need the task's namespace.
Also, the task can be the child of /sbin/init's sub-thread.

> +			write_lock_irq(&tasklist_lock);
> +			old_parent = me->real_parent;
> +			me->detach_code = arg2 << 8;
> +			if (!task_detached(me))
> +				notif = do_signal_parent(me, me->exit_signal,
> +					CLD_DETACHED, arg2);

This is simply wrong. We reparent the whole thread group, we should
always notify the old parent. Or never. but this shouldn't depend on
the thread.

> +			if (notif != DEATH_REAP) {
> +				list_add_tail(&me->detached_sibling,
> +					&me->real_parent->detached_children);
> +				me->exit_state = EXIT_DETACHED;

No, no, we can't set ->exit_state != 0. This means the task is dead.

> +			if (!ptrace_reparented(me))
> +				me->parent = init_pid_ns.child_reaper;

Again, this shouldn't use init_pid_ns.child_reaper. But the main problem,
you can't trust ptrace_reparented(). What if the old parent ptraces this
task?

> +			/* detaching makes us a group leader */
> +			me->group_leader = me;

How? Now, we can't change ->group_leader, this is simply not possible
and very wrong. If nothing else, think about tid/tgid, but there are
a lot more problems.

> +			while_each_thread(me, p) {
> +				if (p->real_parent != old_parent)
> +					continue;
> +				if (!ptrace_reparented(p))
> +					p->parent = init_pid_ns.child_reaper;
> +				p->real_parent = init_pid_ns.child_reaper;

The same problems as above, pluse "p->real_parent != old_parent" looks
bogus.


Well. Once again, I never argue with new features, but you need to
convince lkml. Probably it is simple to implement PR_DETACH so that
the task just "disappears" from the old_parent's radar. Otherwise
we need more complications, but I'd rather add the fake TASK_ZOMBIE
task_struct for that. This will be much, much simply although not
pretty anyway.

Oleg.


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-03-31 17:02             ` Oleg Nesterov
@ 2011-03-31 17:47               ` Stas Sergeev
  2011-03-31 18:18                 ` Oleg Nesterov
  2011-04-01 17:02               ` Stas Sergeev
  2011-04-04 14:34               ` Stas Sergeev
  2 siblings, 1 reply; 57+ messages in thread
From: Stas Sergeev @ 2011-03-31 17:47 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Linux kernel

31.03.2011 21:02, Oleg Nesterov wrote:
> I only looked at sys_prctl() code, and almost every line looks wrong.
Well, the lines you pointed, were also in the
previous patch, and, as you didn't complained
back then, I thought they were fine. :)

>> +			if (notif != DEATH_REAP) {
>> +				list_add_tail(&me->detached_sibling,
>> +					&me->real_parent->detached_children);
>> +				me->exit_state = EXIT_DETACHED;
> No, no, we can't set ->exit_state != 0. This means the task is dead.
Does this really break things? I mean, there are
probably no checks like "if (task->exit_state)", every
time the exact value is checked. And EXIT_DETACHED
is only set for the short period: the wait() from parent
(either new or old) will remove it.

>> +			/* detaching makes us a group leader */
>> +			me->group_leader = me;
> How? Now, we can't change ->group_leader, this is simply not possible
> and very wrong. If nothing else, think about tid/tgid, but there are
> a lot more problems.
OK, thats why in the previous patch I was allowing only
the group leader to detach, but you complained. Should
I return that back?

>> +			while_each_thread(me, p) {
>> +				if (p->real_parent != old_parent)
>> +					continue;
>> +				if (!ptrace_reparented(p))
>> +					p->parent = init_pid_ns.child_reaper;
>> +				p->real_parent = init_pid_ns.child_reaper;
> The same problems as above, pluse "p->real_parent != old_parent" looks
> bogus.
Just an extra care. Should I just remove that check?

> Well. Once again, I never argue with new features, but you need to
> convince lkml. Probably it is simple to implement PR_DETACH so that
> the task just "disappears" from the old_parent's radar.
Yes, that worked for me too:
https://lkml.org/lkml/2010/12/25/37
Yes, I know there are bugs too. :) But, at least the patch
is just few lines.

> Otherwise
> we need more complications, but I'd rather add the fake TASK_ZOMBIE
> task_struct for that. This will be much, much simply although not
> pretty anyway.
Well, maybe the patch looks more complex than it actually is.
How it works:
- num_waiters is set to 1 by fork(). Then PR_DETACH may increment
it if the old parent does not ignore the SIGCHLD. Both the
wait_task_zombie() and wait_task_detached() do decrement that
counter, and when it is zero, the task is reaped. Also, if the old
parent terminates without wait(), it decrements that counter, and,
if needed, reaps the task.
- exit_state is set to EXIT_DETACHED if the parent doesn't ignore
SIGCHLD. Then, if old parent wait()s, exit_state gets reset to 0. But
if the process exits, exit_state gets set to EXIT_ZOMBIE, and then,
by the use of the num_waiters counter, I make sure both parents
waited, before releasing the task.
There were some rearrangements in the exit.c code, that are
not directly related to the new feature. I can split them to the
separate patches, if that will help.

As for convincing LKML... Well, when the code is right, maybe. :))

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-03-31 17:47               ` Stas Sergeev
@ 2011-03-31 18:18                 ` Oleg Nesterov
  2011-03-31 20:58                   ` Stas Sergeev
  0 siblings, 1 reply; 57+ messages in thread
From: Oleg Nesterov @ 2011-03-31 18:18 UTC (permalink / raw)
  To: Stas Sergeev; +Cc: Linux kernel

On 03/31, Stas Sergeev wrote:
>
> 31.03.2011 21:02, Oleg Nesterov wrote:
>> I only looked at sys_prctl() code, and almost every line looks wrong.
> Well, the lines you pointed, were also in the
> previous patch, and, as you didn't complained
> back then, I thought they were fine. :)

Of course, I forgot completely what the previous patch did ;)
Perhaps I missed them before, or perhaps I stopped looking after
I noticed other problems.

>>> +			if (notif != DEATH_REAP) {
>>> +				list_add_tail(&me->detached_sibling,
>>> +					&me->real_parent->detached_children);
>>> +				me->exit_state = EXIT_DETACHED;
>> No, no, we can't set ->exit_state != 0. This means the task is dead.
> Does this really break things? I mean, there are
> probably no checks like "if (task->exit_state)",

There are. zap_other_threads() for example. More importantly, we
can have more of them. exit_state != 0 means: this task has passed
exit_notify(), we shouldn't break this.

>>> +			/* detaching makes us a group leader */
>>> +			me->group_leader = me;
>> How? Now, we can't change ->group_leader, this is simply not possible
>> and very wrong. If nothing else, think about tid/tgid, but there are
>> a lot more problems.
> OK, thats why in the previous patch I was allowing only
> the group leader to detach, but you complained. Should
> I return that back?

Ah, I _seem_ to recall... Yes, it seems strange that only group leader
can do PR_DETACH. If we implement this, I think any thread should be
allowed to call prctl(DETACH). But why do we need to change the leader?

>> Well. Once again, I never argue with new features, but you need to
>> convince lkml. Probably it is simple to implement PR_DETACH so that
>> the task just "disappears" from the old_parent's radar.
> Yes, that worked for me too:
> https://lkml.org/lkml/2010/12/25/37
> Yes, I know there are bugs too. :) But, at least the patch
> is just few lines.

I didn't look at the patch above, but probably it makes more sense.
At least for the start.

>> Otherwise
>> we need more complications, but I'd rather add the fake TASK_ZOMBIE
>> task_struct for that. This will be much, much simply although not
>> pretty anyway.
> Well, maybe the patch looks more complex than it actually is.
> How it works:

I didn't mean it is very hard to understand the intent. What is
really hard (at least to me) to see if it is correct or not.

And in any case it adds a lot of the code, and complicates the things.

So, just in case, I have to admit: yes, personally I dislike this
new feature ;) But, again, I am not going to argue.

> There were some rearrangements in the exit.c code, that are
> not directly related to the new feature. I can split them to the
> separate patches, if that will help.

Yes, this always help.

> As for convincing LKML... Well, when the code is right, maybe. :))

Maybe ;)

But, otoh. It is quite possible you will get the quick nack...

Oleg.


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-03-31 18:18                 ` Oleg Nesterov
@ 2011-03-31 20:58                   ` Stas Sergeev
  2011-04-02 13:55                     ` Oleg Nesterov
  0 siblings, 1 reply; 57+ messages in thread
From: Stas Sergeev @ 2011-03-31 20:58 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Linux kernel

31.03.2011 22:18, Oleg Nesterov wrote:
> >  +			if (me->real_parent == init_pid_ns.child_reaper)
>  Also, the task can be the child of /sbin/init's sub-thread.
Hmm, how to check then? Should I add the "exact_parent" just
for that? Or traverse the sibling list? How bad. :(


> There are. zap_other_threads() for example. More importantly, we
> can have more of them. exit_state != 0 means: this task has passed
> exit_notify(), we shouldn't break this.
OK.

> Ah, I _seem_ to recall... Yes, it seems strange that only group leader
> can do PR_DETACH. If we implement this, I think any thread should be
> allowed to call prctl(DETACH). But why do we need to change the leader?
OK, I didn't know it is safe to leave it unchanged.

 > I didn't look at the patch above, but probably it makes more sense.
 > At least for the start.
IIRC strace didn't like the fact that wait() fails, and that's
why I started to add the complexity.

> I didn't mean it is very hard to understand the intent. What is
> really hard (at least to me) to see if it is correct or not.
OK, I'll try to break it into a small chunks then.
But the fact that such a seemingly simple functionality
requires so many changes, is IMHO bad by itself. And it
is also non-trivial to implement mostly because of the
ptrace hooks that are everywhere. Some cleanups are
certainly needed in that area. :)

> So, just in case, I have to admit: yes, personally I dislike this
> new feature ;)
Do you dislike a feature by itself, or by its implementation?
The feature by itself does nothing but to allow the daemon()
to work with threads, which can't be bad IMO, and, in some
cases, the inability to do so is difficult to work around. Of course,
these cases have nothing to do with the good design, but rather
with the vendor-provided poorly-coded drivers and libs. In all
the better cases you have the trivial workarounds, but why to
search for the workarounds at all, in the first place? Why not
to have the daemon() to "just work" with threads? :)

> >  As for convincing LKML... Well, when the code is right, maybe. :))
> Maybe ;)
>
> But, otoh. It is quite possible you will get the quick nack...
Well, with such a rate of additions into a thread_struct, I'll
probably nack it myself soon, but again, I am surprised that
such a simple task requires so many untrivial changes.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-03-31 17:02             ` Oleg Nesterov
  2011-03-31 17:47               ` Stas Sergeev
@ 2011-04-01 17:02               ` Stas Sergeev
  2011-04-02 14:06                 ` Oleg Nesterov
  2011-04-04 14:34               ` Stas Sergeev
  2 siblings, 1 reply; 57+ messages in thread
From: Stas Sergeev @ 2011-04-01 17:02 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Linux kernel

[-- Attachment #1: Type: text/plain, Size: 713 bytes --]

Hi Oleg.

Here are the splitted patches.
What do you think?
I admit that I haven't found yet the solutions to all
the problems you pointed yesterday, namely to the
check of "real_parent == init" and ptrace_reparented,
so ignore these 2 for now.
But probably now you can have a look into the exit.c part?
The first 2 patches are just the rearrangements and
should not incur any functional changes. The third one
is an implementation of pr_detach.
This time, the child is allowed to disappear from parent's
radar in case the old parent was slow to wait(), and the
process have exited and the new parent have wait()ed.
This is probably fine and not worth the complications,
what do you think?

Thanks for your time!

[-- Attachment #2: 01_sigpar.diff --]
[-- Type: text/plain, Size: 3645 bytes --]

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 777d8a5..e74882f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2096,6 +2096,7 @@ extern int kill_pgrp(struct pid *pid, int sig, int priv);
 extern int kill_pid(struct pid *pid, int sig, int priv);
 extern int kill_proc_info(int, struct siginfo *, pid_t);
 extern int do_notify_parent(struct task_struct *, int);
+extern int do_signal_parent(struct task_struct *, int, int, int);
 extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
 extern void force_sig(int, struct task_struct *);
 extern int send_sig(int, struct task_struct *, int);
diff --git a/kernel/signal.c b/kernel/signal.c
index 4e3cff1..54b93c7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1434,14 +1434,8 @@ ret:
 	return ret;
 }
 
-/*
- * Let a parent know about the death of a child.
- * For a stopped/continued status change, use do_notify_parent_cldstop instead.
- *
- * Returns -1 if our parent ignored us and so we've switched to
- * self-reaping, or else @sig.
- */
-int do_notify_parent(struct task_struct *tsk, int sig)
+int do_signal_parent(struct task_struct *tsk, int sig, int sicode,
+	int sistatus)
 {
 	struct siginfo info;
 	unsigned long flags;
@@ -1450,11 +1444,8 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 
 	BUG_ON(sig == -1);
 
- 	/* do_notify_parent_cldstop should have been called instead.  */
- 	BUG_ON(task_is_stopped_or_traced(tsk));
-
-	BUG_ON(!task_ptrace(tsk) &&
-	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
+	/* do_notify_parent_cldstop should have been called instead.  */
+	BUG_ON(task_is_stopped_or_traced(tsk));
 
 	info.si_signo = sig;
 	info.si_errno = 0;
@@ -1480,15 +1471,8 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 	info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
 				tsk->signal->stime));
 
-	info.si_status = tsk->exit_code & 0x7f;
-	if (tsk->exit_code & 0x80)
-		info.si_code = CLD_DUMPED;
-	else if (tsk->exit_code & 0x7f)
-		info.si_code = CLD_KILLED;
-	else {
-		info.si_code = CLD_EXITED;
-		info.si_status = tsk->exit_code >> 8;
-	}
+	info.si_code = sicode;
+	info.si_status = sistatus;
 
 	psig = tsk->parent->sighand;
 	spin_lock_irqsave(&psig->siglock, flags);
@@ -1510,9 +1494,11 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 		 * is implementation-defined: we do (if you don't want
 		 * it, just use SIG_IGN instead).
 		 */
-		ret = tsk->exit_signal = -1;
+		tsk->exit_signal = -1;
 		if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
 			sig = -1;
+		/* reap process now, rather than promoting to zombie */
+		ret = DEATH_REAP;
 	}
 	if (valid_signal(sig) && sig > 0)
 		__group_send_sig_info(sig, &info, tsk->parent);
@@ -1522,6 +1508,33 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 	return ret;
 }
 
+/*
+ * Let a parent know about the death of a child.
+ * For a stopped/continued status change, use do_notify_parent_cldstop instead.
+ *
+ * Returns -1 if our parent ignored us and so we've switched to
+ * self-reaping, or else @sig.
+ */
+int do_notify_parent(struct task_struct *tsk, int sig)
+{
+	int sicode, sistatus;
+
+	BUG_ON(!task_ptrace(tsk) &&
+	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
+
+	sistatus = tsk->exit_code & 0x7f;
+	if (tsk->exit_code & 0x80)
+		sicode = CLD_DUMPED;
+	else if (tsk->exit_code & 0x7f)
+		sicode = CLD_KILLED;
+	else {
+		sicode = CLD_EXITED;
+		sistatus = tsk->exit_code >> 8;
+	}
+
+	return do_signal_parent(tsk, sig, sicode, sistatus);
+}
+
 static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
 {
 	struct siginfo info;

[-- Attachment #3: 02_cwaittsk.diff --]
[-- Type: text/plain, Size: 3108 bytes --]

diff --git a/kernel/exit.c b/kernel/exit.c
index f9a45eb..2aa64e8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1507,21 +1507,11 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
 	return retval;
 }
 
-/*
- * Consider @p for a wait by @parent.
- *
- * -ECHILD should be in ->notask_error before the first call.
- * Returns nonzero for a final return, when we have unlocked tasklist_lock.
- * Returns zero if the search for a child should continue;
- * then ->notask_error is 0 if @p is an eligible child,
- * or another error from security_task_wait(), or still -ECHILD.
- */
-static int wait_consider_task(struct wait_opts *wo, int ptrace,
-				struct task_struct *p)
+static int can_wait_task_common(struct wait_opts *wo, struct task_struct *p)
 {
 	int ret = eligible_child(wo, p);
 	if (!ret)
-		return ret;
+		return 0;
 
 	ret = security_task_wait(p);
 	if (unlikely(ret < 0)) {
@@ -1537,7 +1527,25 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 		return 0;
 	}
 
-	if (likely(!ptrace) && unlikely(task_ptrace(p))) {
+	if (p->exit_state == EXIT_DEAD)
+		return 0;
+
+	return 1;
+}
+
+static int can_wait_task_ptrace(struct wait_opts *wo, struct task_struct *p)
+{
+	/* don't worry, gcc will optimize away this function :) */
+	return can_wait_task_common(wo, p);
+}
+
+static int can_wait_task(struct wait_opts *wo, struct task_struct *p)
+{
+	int ret = can_wait_task_common(wo, p);
+	if (!ret)
+		return 0;
+
+	if (unlikely(task_ptrace(p))) {
 		/*
 		 * This child is hidden by ptrace.
 		 * We aren't allowed to see it now, but eventually we will.
@@ -1546,9 +1554,21 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 		return 0;
 	}
 
-	if (p->exit_state == EXIT_DEAD)
-		return 0;
+	return 1;
+}
 
+/*
+ * Consider @p for a wait by @parent.
+ *
+ * -ECHILD should be in ->notask_error before the first call.
+ * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns zero if the search for a child should continue;
+ * then ->notask_error is 0 if @p is an eligible child,
+ * or another error from security_task_wait(), or still -ECHILD.
+ */
+static int wait_consider_task(struct wait_opts *wo, int ptrace,
+				struct task_struct *p)
+{
 	/*
 	 * We don't reap group leaders with subthreads.
 	 */
@@ -1578,10 +1598,14 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
  */
 static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
 {
+	int ret;
 	struct task_struct *p;
 
 	list_for_each_entry(p, &tsk->children, sibling) {
-		int ret = wait_consider_task(wo, 0, p);
+		ret = can_wait_task(wo, p);
+		if (!ret)
+			continue;
+		ret = wait_consider_task(wo, 0, p);
 		if (ret)
 			return ret;
 	}
@@ -1594,7 +1618,10 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
 	struct task_struct *p;
 
 	list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
-		int ret = wait_consider_task(wo, 1, p);
+		int ret = can_wait_task_ptrace(wo, p);
+		if (!ret)
+			continue;
+		ret = wait_consider_task(wo, 1, p);
 		if (ret)
 			return ret;
 	}

[-- Attachment #4: 03_prdetach.diff --]
[-- Type: text/plain, Size: 6539 bytes --]

diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
index 942d30b..1da9c20 100644
--- a/include/asm-generic/siginfo.h
+++ b/include/asm-generic/siginfo.h
@@ -218,7 +218,8 @@ typedef struct siginfo {
 #define CLD_TRAPPED	(__SI_CHLD|4)	/* traced child has trapped */
 #define CLD_STOPPED	(__SI_CHLD|5)	/* child has stopped */
 #define CLD_CONTINUED	(__SI_CHLD|6)	/* stopped child has continued */
-#define NSIGCHLD	6
+#define CLD_DETACHED	(__SI_CHLD|7)	/* child has detached */
+#define NSIGCHLD	7
 
 /*
  * SIGPOLL si_codes
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index caa151f..fdf71a9 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -158,6 +158,8 @@ extern struct cred init_cred;
 	.parent		= &tsk,						\
 	.children	= LIST_HEAD_INIT(tsk.children),			\
 	.sibling	= LIST_HEAD_INIT(tsk.sibling),			\
+	.detached_children	= LIST_HEAD_INIT(tsk.detached_children),\
+	.detached_sibling	= LIST_HEAD_INIT(tsk.detached_sibling),	\
 	.group_leader	= &tsk,						\
 	RCU_INIT_POINTER(.real_cred, &init_cred),			\
 	RCU_INIT_POINTER(.cred, &init_cred),				\
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index a3baeb2..fbd2451 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -102,4 +102,6 @@
 
 #define PR_MCE_KILL_GET 34
 
+#define PR_DETACH 35
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e74882f..0c4f070 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1260,6 +1260,8 @@ struct task_struct {
 /* task state */
 	int exit_state;
 	int exit_code, exit_signal;
+	int detach_code;
+	int detaching;
 	int pdeath_signal;  /*  The signal sent when the parent dies  */
 	/* ??? */
 	unsigned int personality;
@@ -1292,6 +1294,8 @@ struct task_struct {
 	 */
 	struct list_head children;	/* list of my children */
 	struct list_head sibling;	/* linkage in my parent's children list */
+	struct list_head detached_children;	/* list of my detached children */
+	struct list_head detached_sibling;	/* linkage in my parent's detached children list */
 	struct task_struct *group_leader;	/* threadgroup leader */
 
 	/*
diff --git a/kernel/exit.c b/kernel/exit.c
index 2aa64e8..e725933 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -69,6 +69,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 
 		list_del_rcu(&p->tasks);
 		list_del_init(&p->sibling);
+		list_del_init(&p->detached_sibling);
 		__this_cpu_dec(process_counts);
 	}
 	list_del_rcu(&p->thread_group);
@@ -810,6 +811,7 @@ static void forget_original_parent(struct task_struct *father)
 
 	list_for_each_entry_safe(p, n, &dead_children, sibling) {
 		list_del_init(&p->sibling);
+		list_del_init(&p->detached_sibling);
 		release_task(p);
 	}
 }
@@ -1507,6 +1509,45 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
 	return retval;
 }
 
+static int wait_task_detached(struct wait_opts *wo, struct task_struct *p)
+{
+	int dt, retval = 0;
+	pid_t pid;
+	uid_t uid;
+
+	if (!likely(wo->wo_flags & WEXITED))
+		return 0;
+
+	if (unlikely(wo->wo_flags & WNOWAIT)) {
+		get_task_struct(p);
+		read_unlock(&tasklist_lock);
+		pid = task_pid_vnr(p);
+		uid = __task_cred(p)->uid;
+		return wait_noreap_copyout(wo, p, pid, uid, CLD_DETACHED,
+			p->detach_code >> 8);
+	}
+
+	dt = xchg(&p->detaching, 0);
+	if (dt != 1)
+		return 0;
+	get_task_struct(p);
+	read_unlock(&tasklist_lock);
+
+	if (wo->wo_stat)
+		retval = put_user(p->detach_code, wo->wo_stat);
+
+	if (!retval) {
+		pid = task_pid_vnr(p);
+		uid = __task_cred(p)->uid;
+		retval = wait_noreap_copyout(wo, p, pid, uid, CLD_DETACHED,
+			p->detach_code >> 8);
+	} else {
+		put_task_struct(p);
+	}
+
+	return retval;
+}
+
 static int can_wait_task_common(struct wait_opts *wo, struct task_struct *p)
 {
 	int ret = eligible_child(wo, p);
@@ -1610,6 +1651,15 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
 			return ret;
 	}
 
+	list_for_each_entry(p, &tsk->detached_children, detached_sibling) {
+		ret = can_wait_task(wo, p);
+		if (!ret)
+			continue;
+		ret = wait_task_detached(wo, p);
+		if (ret)
+			return ret;
+	}
+
 	return 0;
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 25e4291..aa8c1e7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1070,6 +1070,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	copy_flags(clone_flags, p);
 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
+	INIT_LIST_HEAD(&p->detached_children);
+	INIT_LIST_HEAD(&p->detached_sibling);
 	rcu_copy_process(p);
 	p->vfork_done = NULL;
 	spin_lock_init(&p->alloc_lock);
@@ -1233,6 +1235,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
 	p->pdeath_signal = 0;
 	p->exit_state = 0;
+	p->detaching = 0;
 
 	/*
 	 * Ok, make it visible to the rest of the system.
diff --git a/kernel/sys.c b/kernel/sys.c
index 18da702..a3fa15e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -28,6 +28,7 @@
 #include <linux/suspend.h>
 #include <linux/tty.h>
 #include <linux/signal.h>
+#include <linux/tracehook.h>
 #include <linux/cn_proc.h>
 #include <linux/getcpu.h>
 #include <linux/task_io_accounting_ops.h>
@@ -1736,6 +1737,42 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			else
 				error = PR_MCE_KILL_DEFAULT;
 			break;
+		case PR_DETACH: {
+			struct task_struct *p;
+			struct pid_namespace *pid_ns = task_active_pid_ns(me);
+			int notif = DEATH_REAP;
+			error = -EPERM;
+			/* not detaching from init */
+			if (me->real_parent == pid_ns->child_reaper)
+				break;
+			if (arg2 & ~0x7f)
+				break;
+			write_lock_irq(&tasklist_lock);
+			me->detach_code = arg2 << 8;
+			notif = do_signal_parent(me, me->exit_signal,
+					CLD_DETACHED, arg2);
+			if (notif != DEATH_REAP) {
+				list_add_tail(&me->detached_sibling,
+					&me->real_parent->detached_children);
+				me->detaching = 1;
+			}
+			if (!ptrace_reparented(me))
+				me->parent = pid_ns->child_reaper;
+			me->real_parent = pid_ns->child_reaper;
+			list_move_tail(&me->sibling,
+					&me->real_parent->children);
+			/* reparent threads */
+			p = me;
+			while_each_thread(me, p) {
+				if (!ptrace_reparented(p))
+					p->parent = pid_ns->child_reaper;
+				p->real_parent = pid_ns->child_reaper;
+			}
+			me->exit_signal = SIGCHLD;
+			write_unlock_irq(&tasklist_lock);
+			error = 0;
+			break;
+		}
 		default:
 			error = -EINVAL;
 			break;

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-03-31 20:58                   ` Stas Sergeev
@ 2011-04-02 13:55                     ` Oleg Nesterov
  2011-04-02 18:20                       ` Stas Sergeev
  2011-04-02 22:00                       ` Stas Sergeev
  0 siblings, 2 replies; 57+ messages in thread
From: Oleg Nesterov @ 2011-04-02 13:55 UTC (permalink / raw)
  To: Stas Sergeev; +Cc: Linux kernel

On 04/01, Stas Sergeev wrote:
>
> 31.03.2011 22:18, Oleg Nesterov wrote:
>> >  +			if (me->real_parent == init_pid_ns.child_reaper)
>>  Also, the task can be the child of /sbin/init's sub-thread.
> Hmm, how to check then? Should I add the "exact_parent" just
> for that? Or traverse the sibling list? How bad. :(

You can check same_thread_group(real_parent, ns->child_reaper)

> > I didn't look at the patch above, but probably it makes more sense.
> > At least for the start.
> IIRC strace didn't like the fact that wait() fails, and that's
> why I started to add the complexity.

Of course, the task should never escape ptrace, but this is
another issue?

And yes, it is not good if the child simply disappears, but the
semantics is "strange" in any case.

> OK, I'll try to break it into a small chunks then.
> But the fact that such a seemingly simple functionality
> requires so many changes, is IMHO bad by itself.

Indeed. You are trying to do something unnatural ;)

> And it
> is also non-trivial to implement mostly because of the
> ptrace hooks that are everywhere. Some cleanups are
> certainly needed in that area. :)

Not sure ptrace is the main problem... To me, it is the problem,
yes, but mostly from the semantics pov.

>> So, just in case, I have to admit: yes, personally I dislike this
>> new feature ;)
> Do you dislike a feature by itself, or by its implementation?

Both ;)

> The feature by itself does nothing but to allow the daemon()
> to work with threads, which can't be bad IMO,

Well. It is bad even if the patch was correct. This complicates
and slows down the code. The code should be maintained. And for
what?

> Of course,
> these cases have nothing to do with the good design,

Indeed. You suggest the exotic and non-portable feature, almost
nobody will use it. But every user should pay for that. This is
not fair.

> but why to
> search for the workarounds at all, in the first place? Why not
> to have the daemon() to "just work" with threads? :)

The kernel is already huge. I simply can't understand why do
you think this is good idea to add more bloat for this feature.

And this is not enough to daemonize anyway. setsid() won't work.



Stas, please do not try to convince me, you can't. OTOH, let me
repeat, you do not need to convince me. I'd suggest you to discuss
this with Linus. If he agrees - then we can look at your patch
seriously and discuss the implementation details. Everything can
be implemented.

Oleg.


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-04-01 17:02               ` Stas Sergeev
@ 2011-04-02 14:06                 ` Oleg Nesterov
  0 siblings, 0 replies; 57+ messages in thread
From: Oleg Nesterov @ 2011-04-02 14:06 UTC (permalink / raw)
  To: Stas Sergeev; +Cc: Linux kernel

On 04/01, Stas Sergeev wrote:
> Hi Oleg.
>
> This time, the child is allowed to disappear from parent's
> radar in case the old parent was slow to wait(),

Hmm... Again, I didn't really read the patch... But iiuc, the
detached child can exit and init can reap it, after that the
old parent gets ECHLD?

You know, even if the patch was correct and this feature was
acked, I'd try to nack this ;) do_wait() from parent should
always work or it should always return ECHLD, but it should
not depend on /dev/random. This is really weird, imho.

OK, even ignoring prctl() code he patch is buggy. __unhash_process()
does list_del_init(&p->detached_sibling). This is too late. If the
old parent has already called wait_task_detached() and cleared
->detaching, this child will add the unnecessary cost to every
subsequent do_wait() call. If the old parent exits, list_del_init()
can crash the kernel.

I didn't read the patch further.

Oleg.


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-04-02 13:55                     ` Oleg Nesterov
@ 2011-04-02 18:20                       ` Stas Sergeev
  2011-04-02 22:00                       ` Stas Sergeev
  1 sibling, 0 replies; 57+ messages in thread
From: Stas Sergeev @ 2011-04-02 18:20 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Linux kernel

02.04.2011 17:55, Oleg Nesterov wrote:
>> OK, I'll try to break it into a small chunks then.
>> But the fact that such a seemingly simple functionality
>> requires so many changes, is IMHO bad by itself.
> Indeed. You are trying to do something unnatural ;)
Well, the fact that fork/exit was always used to detach
from parent, doesn't mean that it was natural: a single
syscall looks more natural. :)

> Indeed. You suggest the exotic and non-portable feature, almost
> nobody will use it. But every user should pay for that. This is
> not fair.
If that is true, then of course this should not be ever applied.
But why do you think so? What gets slowed down? wait()?
But if the detached_sibling list is empty, then why?

> The kernel is already huge. I simply can't understand why do
> you think this is good idea to add more bloat for this feature.
That depends on the final code (if it will ever be produced).
Maybe it can be very simple? :) I don't want to add bloat, I
want a small and simple patch.

> And this is not enough to daemonize anyway. setsid() won't work.
But TIOCNOTTY will.

> Stas, please do not try to convince me, you can't. OTOH, let me
> repeat, you do not need to convince me. I'd suggest you to discuss
> this with Linus. If he agrees - then we can look at your patch
Well, if the patch is bloated or buggy, there is no reason to
discuss it with Linus. :) If the patch is small and simple, that
alone will give it lots of credits.

 > Hmm... Again, I didn't really read the patch... But iiuc, the
 > detached child can exit and init can reap it, after that the
 > old parent gets ECHLD?
Yes. Because making sure that both parents wait()ed, probably
cannot be reliably implemented without the write_lock_irq(&tasklist_lock),
while the current code uses only read lock. Or you really need
a fake task_struct...

>  do_wait() from parent should
>  always work or it should always return ECHLD, but it should
>  not depend on /dev/random. This is really weird, imho.
OK, in this case, I guess, the fake task_struct is the last chance.

>  does list_del_init(&p->detached_sibling). This is too late. If the
>  old parent has already called wait_task_detached() and cleared
>  ->detaching, this child will add the unnecessary cost to every
>  subsequent do_wait() call.
OK, will add the forgotten "if (->detaching)" check before calling
wait_task_detached(), thanks.

>  If the old parent exits, list_del_init()
>  can crash the kernel.
Why?

Yes, I can read Russian, as you asked in another e-mail. :)
Let me clarify once again: I don't want to add neither bloat, nor
the slowdown on the common path. But I haven't yet convinced myself
that this is unavoidable. If you haven't noticed, I shrunk the
patch 3 times, and fixed most of the bugs already. :)) And I am not
even trying to convince you (or Linus) in anything, before the pach
is made simple. If it can't be made simple, then I'll leave it to
my project only. That was the intention anyway. :)


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-04-02 13:55                     ` Oleg Nesterov
  2011-04-02 18:20                       ` Stas Sergeev
@ 2011-04-02 22:00                       ` Stas Sergeev
  1 sibling, 0 replies; 57+ messages in thread
From: Stas Sergeev @ 2011-04-02 22:00 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Linux kernel

02.04.2011 17:55, Oleg Nesterov wrote:
>
>>>>   +			if (me->real_parent == init_pid_ns.child_reaper)
>>>   Also, the task can be the child of /sbin/init's sub-thread.
>> Hmm, how to check then? Should I add the "exact_parent" just
>> for that? Or traverse the sibling list? How bad. :(
> You can check same_thread_group(real_parent, ns->child_reaper)
But real_parent==ns->child_reaper in our case, so what does
this check give?

>  acked, I'd try to nack this;)  do_wait() from parent should
>  always work or it should always return ECHLD, but it should
>  not depend on /dev/random. This is really weird, imho.
OK, this can be fixed by delaying the wait() from init till
the old parent wait()s or die. Will fix.


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-03-31 17:02             ` Oleg Nesterov
  2011-03-31 17:47               ` Stas Sergeev
  2011-04-01 17:02               ` Stas Sergeev
@ 2011-04-04 14:34               ` Stas Sergeev
  2011-04-04 16:03                 ` Oleg Nesterov
  2 siblings, 1 reply; 57+ messages in thread
From: Stas Sergeev @ 2011-04-04 14:34 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Linux kernel

[-- Attachment #1: Type: text/plain, Size: 328 bytes --]

Hi Oleg.

Here's the patch that addresses your concerns
about the late deleting from list.
Also, the patch is shrunk twice.
I think it is about to be trivial this time.
I still haven't solved the problems with checking
parent and checking ptrace, so ignore them for
now (or give me the hints:)
Do we still have other bugs here?

[-- Attachment #2: 01_sigpar.diff --]
[-- Type: text/plain, Size: 3645 bytes --]

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 777d8a5..e74882f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2096,6 +2096,7 @@ extern int kill_pgrp(struct pid *pid, int sig, int priv);
 extern int kill_pid(struct pid *pid, int sig, int priv);
 extern int kill_proc_info(int, struct siginfo *, pid_t);
 extern int do_notify_parent(struct task_struct *, int);
+extern int do_signal_parent(struct task_struct *, int, int, int);
 extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
 extern void force_sig(int, struct task_struct *);
 extern int send_sig(int, struct task_struct *, int);
diff --git a/kernel/signal.c b/kernel/signal.c
index 4e3cff1..54b93c7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1434,14 +1434,8 @@ ret:
 	return ret;
 }
 
-/*
- * Let a parent know about the death of a child.
- * For a stopped/continued status change, use do_notify_parent_cldstop instead.
- *
- * Returns -1 if our parent ignored us and so we've switched to
- * self-reaping, or else @sig.
- */
-int do_notify_parent(struct task_struct *tsk, int sig)
+int do_signal_parent(struct task_struct *tsk, int sig, int sicode,
+	int sistatus)
 {
 	struct siginfo info;
 	unsigned long flags;
@@ -1450,11 +1444,8 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 
 	BUG_ON(sig == -1);
 
- 	/* do_notify_parent_cldstop should have been called instead.  */
- 	BUG_ON(task_is_stopped_or_traced(tsk));
-
-	BUG_ON(!task_ptrace(tsk) &&
-	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
+	/* do_notify_parent_cldstop should have been called instead.  */
+	BUG_ON(task_is_stopped_or_traced(tsk));
 
 	info.si_signo = sig;
 	info.si_errno = 0;
@@ -1480,15 +1471,8 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 	info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
 				tsk->signal->stime));
 
-	info.si_status = tsk->exit_code & 0x7f;
-	if (tsk->exit_code & 0x80)
-		info.si_code = CLD_DUMPED;
-	else if (tsk->exit_code & 0x7f)
-		info.si_code = CLD_KILLED;
-	else {
-		info.si_code = CLD_EXITED;
-		info.si_status = tsk->exit_code >> 8;
-	}
+	info.si_code = sicode;
+	info.si_status = sistatus;
 
 	psig = tsk->parent->sighand;
 	spin_lock_irqsave(&psig->siglock, flags);
@@ -1510,9 +1494,11 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 		 * is implementation-defined: we do (if you don't want
 		 * it, just use SIG_IGN instead).
 		 */
-		ret = tsk->exit_signal = -1;
+		tsk->exit_signal = -1;
 		if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
 			sig = -1;
+		/* reap process now, rather than promoting to zombie */
+		ret = DEATH_REAP;
 	}
 	if (valid_signal(sig) && sig > 0)
 		__group_send_sig_info(sig, &info, tsk->parent);
@@ -1522,6 +1508,33 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 	return ret;
 }
 
+/*
+ * Let a parent know about the death of a child.
+ * For a stopped/continued status change, use do_notify_parent_cldstop instead.
+ *
+ * Returns -1 if our parent ignored us and so we've switched to
+ * self-reaping, or else @sig.
+ */
+int do_notify_parent(struct task_struct *tsk, int sig)
+{
+	int sicode, sistatus;
+
+	BUG_ON(!task_ptrace(tsk) &&
+	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
+
+	sistatus = tsk->exit_code & 0x7f;
+	if (tsk->exit_code & 0x80)
+		sicode = CLD_DUMPED;
+	else if (tsk->exit_code & 0x7f)
+		sicode = CLD_KILLED;
+	else {
+		sicode = CLD_EXITED;
+		sistatus = tsk->exit_code >> 8;
+	}
+
+	return do_signal_parent(tsk, sig, sicode, sistatus);
+}
+
 static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
 {
 	struct siginfo info;

[-- Attachment #3: pr_detach2.diff --]
[-- Type: text/plain, Size: 5084 bytes --]

diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
index 942d30b..1da9c20 100644
--- a/include/asm-generic/siginfo.h
+++ b/include/asm-generic/siginfo.h
@@ -218,7 +218,8 @@ typedef struct siginfo {
 #define CLD_TRAPPED	(__SI_CHLD|4)	/* traced child has trapped */
 #define CLD_STOPPED	(__SI_CHLD|5)	/* child has stopped */
 #define CLD_CONTINUED	(__SI_CHLD|6)	/* stopped child has continued */
-#define NSIGCHLD	6
+#define CLD_DETACHED	(__SI_CHLD|7)	/* child has detached */
+#define NSIGCHLD	7
 
 /*
  * SIGPOLL si_codes
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index a3baeb2..fbd2451 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -102,4 +102,6 @@
 
 #define PR_MCE_KILL_GET 34
 
+#define PR_DETACH 35
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e74882f..2e2acba 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1260,6 +1260,8 @@ struct task_struct {
 /* task state */
 	int exit_state;
 	int exit_code, exit_signal;
+	int detach_code;
+	int detaching;
 	int pdeath_signal;  /*  The signal sent when the parent dies  */
 	/* ??? */
 	unsigned int personality;
diff --git a/kernel/exit.c b/kernel/exit.c
index f9a45eb..276b39f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -791,7 +791,14 @@ static void forget_original_parent(struct task_struct *father)
 	reaper = find_new_reaper(father);
 
 	list_for_each_entry_safe(p, n, &father->children, sibling) {
-		struct task_struct *t = p;
+		struct task_struct *t;
+		if (p->detaching) {
+			list_move_tail(&p->sibling,
+					&p->real_parent->children);
+			p->detaching = 0;
+			continue;
+		}
+		t = p;
 		do {
 			t->real_parent = reaper;
 			if (t->parent == father) {
@@ -1507,6 +1514,50 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
 	return retval;
 }
 
+static int wait_task_detached(struct wait_opts *wo, struct task_struct *p)
+{
+	int dt, retval = 0;
+	pid_t pid;
+	uid_t uid;
+
+	if (!likely(wo->wo_flags & WEXITED))
+		return 0;
+
+	if (unlikely(wo->wo_flags & WNOWAIT)) {
+		get_task_struct(p);
+		read_unlock(&tasklist_lock);
+		pid = task_pid_vnr(p);
+		uid = __task_cred(p)->uid;
+		return wait_noreap_copyout(wo, p, pid, uid, CLD_DETACHED,
+			p->detach_code >> 8);
+	}
+
+	dt = xchg(&p->detaching, 0);
+	if (dt != 1)
+		return 0;
+	get_task_struct(p);
+	read_unlock(&tasklist_lock);
+
+	/* hand it over to init */
+	write_lock_irq(&tasklist_lock);
+	list_move_tail(&p->sibling, &p->real_parent->children);
+	write_unlock_irq(&tasklist_lock);
+
+	if (wo->wo_stat)
+		retval = put_user(p->detach_code, wo->wo_stat);
+
+	if (!retval) {
+		pid = task_pid_vnr(p);
+		uid = __task_cred(p)->uid;
+		retval = wait_noreap_copyout(wo, p, pid, uid, CLD_DETACHED,
+			p->detach_code >> 8);
+	} else {
+		put_task_struct(p);
+	}
+
+	return retval;
+}
+
 /*
  * Consider @p for a wait by @parent.
  *
@@ -1549,6 +1600,9 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 	if (p->exit_state == EXIT_DEAD)
 		return 0;
 
+	if (p->detaching)
+		return wait_task_detached(wo, p);
+
 	/*
 	 * We don't reap group leaders with subthreads.
 	 */
diff --git a/kernel/fork.c b/kernel/fork.c
index 25e4291..dd28aff 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1233,6 +1233,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
 	p->pdeath_signal = 0;
 	p->exit_state = 0;
+	p->detaching = 0;
 
 	/*
 	 * Ok, make it visible to the rest of the system.
diff --git a/kernel/sys.c b/kernel/sys.c
index 18da702..e4dadd6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -28,6 +28,7 @@
 #include <linux/suspend.h>
 #include <linux/tty.h>
 #include <linux/signal.h>
+#include <linux/tracehook.h>
 #include <linux/cn_proc.h>
 #include <linux/getcpu.h>
 #include <linux/task_io_accounting_ops.h>
@@ -1736,6 +1737,40 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			else
 				error = PR_MCE_KILL_DEFAULT;
 			break;
+		case PR_DETACH: {
+			struct task_struct *p;
+			struct pid_namespace *pid_ns = task_active_pid_ns(me);
+			int notif = DEATH_REAP;
+			error = -EPERM;
+			/* not detaching from init */
+			if (me->real_parent == pid_ns->child_reaper)
+				break;
+			if (arg2 & ~0x7f)
+				break;
+			write_lock_irq(&tasklist_lock);
+			me->detach_code = arg2 << 8;
+			notif = do_signal_parent(me, me->exit_signal,
+					CLD_DETACHED, arg2);
+			if (notif != DEATH_REAP)
+				me->detaching = 1;
+			else
+				list_move_tail(&me->sibling,
+					&me->real_parent->children);
+			if (!ptrace_reparented(me))
+				me->parent = pid_ns->child_reaper;
+			me->real_parent = pid_ns->child_reaper;
+			/* reparent threads */
+			p = me;
+			while_each_thread(me, p) {
+				if (!ptrace_reparented(p))
+					p->parent = pid_ns->child_reaper;
+				p->real_parent = pid_ns->child_reaper;
+			}
+			me->exit_signal = SIGCHLD;
+			write_unlock_irq(&tasklist_lock);
+			error = 0;
+			break;
+		}
 		default:
 			error = -EINVAL;
 			break;

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-04-04 14:34               ` Stas Sergeev
@ 2011-04-04 16:03                 ` Oleg Nesterov
  2011-04-04 20:05                   ` Stas Sergeev
  0 siblings, 1 reply; 57+ messages in thread
From: Oleg Nesterov @ 2011-04-04 16:03 UTC (permalink / raw)
  To: Stas Sergeev; +Cc: Linux kernel

Hi Stas,

On 04/04, Stas Sergeev wrote:
>
> Here's the patch that addresses your concerns
> about the late deleting from list.
> Also, the patch is shrunk twice.
> I think it is about to be trivial this time.

But it is very wrong at first glance...

> I still haven't solved the problems with checking
> parent and checking ptrace, so ignore them for
> now (or give me the hints:)

Not sure I understand your question...

But, once again. You should not use ptrace_reparented(). Reparanted or
not, the ptraced thread must not change its ->parent.

Sorry Stas, I have no time to read the patch, just one thing...

> +			me->detach_code = arg2 << 8;
> ...
> +			if (notif != DEATH_REAP)
> +				me->detaching = 1;
> +			else
> +				list_move_tail(&me->sibling,
> +					&me->real_parent->children);

This is simply wrong unless the caller is the group_leader. Probably
there was some confusion, I thought we already discussed this. But this
is minor...

> +			while_each_thread(me, p) {
> +				if (!ptrace_reparented(p))
> +					p->parent = pid_ns->child_reaper;
> +				p->real_parent = pid_ns->child_reaper;

Eek. Even ignoring ptrace, this is weird. We change parent/real_parent,
but we do not do list_move_tail(sibling) until wait_task_detached() !
No, I think we should not do this even if this was correct. I'll try
to nack this in any case, even if there were no immediate problems ;)
IMHO, this is insane.

But this is wrong. Well. Suppose that the caller of PR_DETACH exits
before the old parent does do_wait(). What /sbin/init (who is the new
parent) can do after it gets SIGCHLD? If can't see this zombie. Nor
the old parent can release this task due to ->detaching. Eventually
/sbin/init can reap it if it does, say, waitpid(-1), but still this
is wrong.

Or. Suppose that the old parent exits after its child does PR_DETACH.
You changed forget_original_parent() but this is not enough, note that
find_new_reaper() can pick the live sub-thread. In this case the child
will be moved to init's ->children list, and after that we are changing
->real_parent back.

Oleg.


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-04-04 16:03                 ` Oleg Nesterov
@ 2011-04-04 20:05                   ` Stas Sergeev
  2011-04-05 15:15                     ` Oleg Nesterov
  0 siblings, 1 reply; 57+ messages in thread
From: Stas Sergeev @ 2011-04-04 20:05 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Linux kernel

04.04.2011 20:03, Oleg Nesterov wrote:
>> I still haven't solved the problems with checking
>> parent and checking ptrace, so ignore them for
>> now (or give me the hints:)
> Not sure I understand your question...
I wonder how to check whether the child was
reparented to init, and not is a child on init's
subthread. You suggested to check
same_thread_group(real_parent, ns->child_reaper),
but I don't understand that suggestion because in
our case real_parent==ns->child_reaper, yet it is not
enough, as it could be of CLONE_PARENT (I suppose).

>> +			while_each_thread(me, p) {
>> +				if (!ptrace_reparented(p))
>> +					p->parent = pid_ns->child_reaper;
>> +				p->real_parent = pid_ns->child_reaper;
> Eek. Even ignoring ptrace, this is weird. We change parent/real_parent,
> but we do not do list_move_tail(sibling) until wait_task_detached() !
> No, I think we should not do this even if this was correct. I'll try
> to nack this in any case, even if there were no immediate problems ;)
> IMHO, this is insane.
>
> But this is wrong. Well. Suppose that the caller of PR_DETACH exits
> before the old parent does do_wait(). What /sbin/init (who is the new
> parent) can do after it gets SIGCHLD? If can't see this zombie. Nor
> the old parent can release this task due to ->detaching. Eventually
> /sbin/init can reap it if it does, say, waitpid(-1), but still this
> is wrong.
No, the idea was like that: the old parent either wait()s or
exits, then init became a "real" parent of that process, and
reaps it immediately. I think that's natural: the very same will happen
if the child didn't do pr_detach. If it exits, its current parent
have to either wait(), or exit. If it doesn't do so - zombie.
I decided that if I still allow wait() for the old parent, I'd better
leave that logic untouched. So: until the old parent wait()s
or exits, we have zombie, and that's what I supposed to implement.

> Or. Suppose that the old parent exits after its child does PR_DETACH.
> You changed forget_original_parent() but this is not enough, note that
> find_new_reaper() can pick the live sub-thread. In this case the child
> will be moved to init's ->children list, and after that we are changing
> ->real_parent back.
How? I think I prevented that with this:
---

+			p->detaching = 0;
+			continue;
---
So it should not go down to get reparented back.
Or am I missing something?

Thanks for your time!


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-04-04 20:05                   ` Stas Sergeev
@ 2011-04-05 15:15                     ` Oleg Nesterov
  2011-04-05 16:25                       ` Stas Sergeev
  0 siblings, 1 reply; 57+ messages in thread
From: Oleg Nesterov @ 2011-04-05 15:15 UTC (permalink / raw)
  To: Stas Sergeev; +Cc: Linux kernel

On 04/05, Stas Sergeev wrote:
>
> 04.04.2011 20:03, Oleg Nesterov wrote:
>>> I still haven't solved the problems with checking
>>> parent and checking ptrace, so ignore them for
>>> now (or give me the hints:)
>> Not sure I understand your question...
> I wonder how to check whether the child was
> reparented to init, and not is a child on init's
> subthread.

I don't understand the "and not is a child on init's subthread".
If the child was created by init's sub-thread, it is the child
of the whole thread group.

->real_parent points to thread, yes. But the parent is the whole
process, not thread. The only reason for this oddity is __WNOTHREAD.

> You suggested to check
> same_thread_group(real_parent, ns->child_reaper),

Yes. This means: our parent is the init process.

>>> +			while_each_thread(me, p) {
>>> +				if (!ptrace_reparented(p))
>>> +					p->parent = pid_ns->child_reaper;
>>> +				p->real_parent = pid_ns->child_reaper;
>> Eek. Even ignoring ptrace, this is weird. We change parent/real_parent,
>> but we do not do list_move_tail(sibling) until wait_task_detached() !
>> No, I think we should not do this even if this was correct. I'll try
>> to nack this in any case, even if there were no immediate problems ;)
>> IMHO, this is insane.
>>
>> But this is wrong. Well. Suppose that the caller of PR_DETACH exits
>> before the old parent does do_wait(). What /sbin/init (who is the new
>> parent) can do after it gets SIGCHLD? If can't see this zombie. Nor
>> the old parent can release this task due to ->detaching. Eventually
>> /sbin/init can reap it if it does, say, waitpid(-1), but still this
>> is wrong.
> No, the idea was like that: the old parent either wait()s or
> exits, then init became a "real" parent of that process, and
> reaps it immediately.

I understand this, but

> I think that's natural:

I strongly disagree, this is not natural. I mean, ->real_parent
and the head of ->sibling list should match each other.

For example. Ignoring other problems, you are doing list_move() in
do_wait() pathes. This happens to work now because everything is
protected by the global tasklist. But we are going to change the
locking, it should be per-process.

> If it exits,

If it exits, it notifies the new ->real_parent == init. init receives
SIGCHLD, but do_wait() can't see this process on ->children lists.

> its current parent
> have to either wait(), or exit. If it doesn't do so - zombie.

Indeed. And, if the old parent does wait(), this simply does
list_move_tail(p->sibling), a zombie won't be reaped. And nobody will
reap it, until init does waitpid(-1). This was my point, this is wrong.

>> Or. Suppose that the old parent exits after its child does PR_DETACH.
>> You changed forget_original_parent() but this is not enough, note that
>> find_new_reaper() can pick the live sub-thread. In this case the child
>> will be moved to init's ->children list, and after that we are changing
>> ->real_parent back.
> How? I think I prevented that with this:
> ---
>
> +			p->detaching = 0;
> +			continue;

Yes, thanks, I didn't notice "continue". But then this is wrong again.
This can race with wait_task_detached() called by our sub-thread, it
can clear ->detaching before we check it.

Oleg.


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-04-05 15:15                     ` Oleg Nesterov
@ 2011-04-05 16:25                       ` Stas Sergeev
  2011-04-05 16:45                         ` Oleg Nesterov
  0 siblings, 1 reply; 57+ messages in thread
From: Stas Sergeev @ 2011-04-05 16:25 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Linux kernel

05.04.2011 19:15, Oleg Nesterov wrote:
> I don't understand the "and not is a child on init's subthread".
> If the child was created by init's sub-thread, it is the child
> of the whole thread group.
>
> ->real_parent points to thread, yes.
OK, sorry, I thought you mean the CLONE_PARENT case, where
the real_parent points _not_ to thread. That's why the confusion.

>   But the parent is the whole
> process, not thread. The only reason for this oddity is __WNOTHREAD.
OK, now I see what problem you are pointing to.
Will fix.

>> How? I think I prevented that with this:
>> ---
>>
>> +			p->detaching = 0;
>> +			continue;
> Yes, thanks, I didn't notice "continue". But then this is wrong again.
> This can race with wait_task_detached() called by our sub-thread, it
> can clear ->detaching before we check it.
But the above code is under write_lock_irq(&tasklist_lock), so why
would it race?

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-04-05 16:25                       ` Stas Sergeev
@ 2011-04-05 16:45                         ` Oleg Nesterov
  2011-04-05 17:51                           ` Stas Sergeev
                                             ` (5 more replies)
  0 siblings, 6 replies; 57+ messages in thread
From: Oleg Nesterov @ 2011-04-05 16:45 UTC (permalink / raw)
  To: Stas Sergeev; +Cc: Linux kernel

On 04/05, Stas Sergeev wrote:
>
> 05.04.2011 19:15, Oleg Nesterov wrote:
>>>
>>> +			p->detaching = 0;
>>> +			continue;
>> Yes, thanks, I didn't notice "continue". But then this is wrong again.
>> This can race with wait_task_detached() called by our sub-thread, it
>> can clear ->detaching before we check it.
> But the above code is under write_lock_irq(&tasklist_lock), so why
> would it race?

wait_task_detached() clears detaching, then it drops tasklist and takes
it for writing. forget_original_parent() can come in between.

Oleg.


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-04-05 16:45                         ` Oleg Nesterov
@ 2011-04-05 17:51                           ` Stas Sergeev
  2011-04-08 10:51                           ` Stas Sergeev
                                             ` (4 subsequent siblings)
  5 siblings, 0 replies; 57+ messages in thread
From: Stas Sergeev @ 2011-04-05 17:51 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Linux kernel

[-- Attachment #1: Type: text/plain, Size: 166 bytes --]

Hi Oleg, here's the patch that should address
the mentioned problems. Or does it add more? :)
I try to delay the notification of init till the detaching
is complete.

[-- Attachment #2: a.diff --]
[-- Type: text/plain, Size: 7888 bytes --]

diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
index 942d30b..1da9c20 100644
--- a/include/asm-generic/siginfo.h
+++ b/include/asm-generic/siginfo.h
@@ -218,7 +218,8 @@ typedef struct siginfo {
 #define CLD_TRAPPED	(__SI_CHLD|4)	/* traced child has trapped */
 #define CLD_STOPPED	(__SI_CHLD|5)	/* child has stopped */
 #define CLD_CONTINUED	(__SI_CHLD|6)	/* stopped child has continued */
-#define NSIGCHLD	6
+#define CLD_DETACHED	(__SI_CHLD|7)	/* child has detached */
+#define NSIGCHLD	7
 
 /*
  * SIGPOLL si_codes
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index caa151f..fdf71a9 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -158,6 +158,8 @@ extern struct cred init_cred;
 	.parent		= &tsk,						\
 	.children	= LIST_HEAD_INIT(tsk.children),			\
 	.sibling	= LIST_HEAD_INIT(tsk.sibling),			\
+	.detached_children	= LIST_HEAD_INIT(tsk.detached_children),\
+	.detached_sibling	= LIST_HEAD_INIT(tsk.detached_sibling),	\
 	.group_leader	= &tsk,						\
 	RCU_INIT_POINTER(.real_cred, &init_cred),			\
 	RCU_INIT_POINTER(.cred, &init_cred),				\
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index a3baeb2..fbd2451 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -102,4 +102,6 @@
 
 #define PR_MCE_KILL_GET 34
 
+#define PR_DETACH 35
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e74882f..0c4f070 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1260,6 +1260,8 @@ struct task_struct {
 /* task state */
 	int exit_state;
 	int exit_code, exit_signal;
+	int detach_code;
+	int detaching;
 	int pdeath_signal;  /*  The signal sent when the parent dies  */
 	/* ??? */
 	unsigned int personality;
@@ -1292,6 +1294,8 @@ struct task_struct {
 	 */
 	struct list_head children;	/* list of my children */
 	struct list_head sibling;	/* linkage in my parent's children list */
+	struct list_head detached_children;	/* list of my detached children */
+	struct list_head detached_sibling;	/* linkage in my parent's detached children list */
 	struct task_struct *group_leader;	/* threadgroup leader */
 
 	/*
diff --git a/kernel/exit.c b/kernel/exit.c
index 2aa64e8..289baf3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -69,6 +69,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 
 		list_del_rcu(&p->tasks);
 		list_del_init(&p->sibling);
+		list_del_init(&p->detached_sibling);
 		__this_cpu_dec(process_counts);
 	}
 	list_del_rcu(&p->thread_group);
@@ -804,6 +805,16 @@ static void forget_original_parent(struct task_struct *father)
 		} while_each_thread(p, t);
 		reparent_leader(father, p, &dead_children);
 	}
+	list_for_each_entry_safe(p, n, &father->detached_children,
+			detached_sibling) {
+		int signal;
+		p->detaching = 0;
+		list_del_init(&p->detached_sibling);
+		if (p->exit_state == EXIT_ZOMBIE) {
+			signal = do_notify_parent(p, SIGCHLD);
+			BUG_ON(signal == DEATH_REAP);
+		}
+	}
 	write_unlock_irq(&tasklist_lock);
 
 	BUG_ON(!list_empty(&father->children));
@@ -858,7 +869,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 		tsk->exit_signal = SIGCHLD;
 
 	signal = tracehook_notify_death(tsk, &cookie, group_dead);
-	if (signal >= 0)
+	/* delay parent notification for detaching tasks */
+	if (signal >= 0 && !tsk->detaching)
 		signal = do_notify_parent(tsk, signal);
 
 	tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
@@ -1507,6 +1519,53 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
 	return retval;
 }
 
+static int wait_task_detached(struct wait_opts *wo, struct task_struct *p)
+{
+	int dt, signal, retval = 0;
+	pid_t pid;
+	uid_t uid;
+
+	if (!likely(wo->wo_flags & WEXITED))
+		return 0;
+
+	if (unlikely(wo->wo_flags & WNOWAIT)) {
+		get_task_struct(p);
+		read_unlock(&tasklist_lock);
+		pid = task_pid_vnr(p);
+		uid = __task_cred(p)->uid;
+		return wait_noreap_copyout(wo, p, pid, uid, CLD_DETACHED,
+			p->detach_code >> 8);
+	}
+
+	dt = xchg(&p->detaching, 0);
+	if (dt != 1)
+		return 0;
+	get_task_struct(p);
+	read_unlock(&tasklist_lock);
+
+	write_lock_irq(&tasklist_lock);
+	list_del_init(&p->detached_sibling);
+	if (p->exit_state == EXIT_ZOMBIE) {
+		signal = do_notify_parent(p, SIGCHLD);
+		BUG_ON(signal == DEATH_REAP);
+	}
+	write_unlock_irq(&tasklist_lock);
+
+	if (wo->wo_stat)
+		retval = put_user(p->detach_code, wo->wo_stat);
+
+	if (!retval) {
+		pid = task_pid_vnr(p);
+		uid = __task_cred(p)->uid;
+		retval = wait_noreap_copyout(wo, p, pid, uid, CLD_DETACHED,
+			p->detach_code >> 8);
+	} else {
+		put_task_struct(p);
+	}
+
+	return retval;
+}
+
 static int can_wait_task_common(struct wait_opts *wo, struct task_struct *p)
 {
 	int ret = eligible_child(wo, p);
@@ -1572,7 +1631,8 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 	/*
 	 * We don't reap group leaders with subthreads.
 	 */
-	if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
+	if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p) &&
+			!p->detaching)
 		return wait_task_zombie(wo, p);
 
 	/*
@@ -1610,6 +1670,15 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
 			return ret;
 	}
 
+	list_for_each_entry(p, &tsk->detached_children, detached_sibling) {
+		ret = can_wait_task(wo, p);
+		if (!ret)
+			continue;
+		ret = wait_task_detached(wo, p);
+		if (ret)
+			return ret;
+	}
+
 	return 0;
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 25e4291..aa8c1e7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1070,6 +1070,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	copy_flags(clone_flags, p);
 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
+	INIT_LIST_HEAD(&p->detached_children);
+	INIT_LIST_HEAD(&p->detached_sibling);
 	rcu_copy_process(p);
 	p->vfork_done = NULL;
 	spin_lock_init(&p->alloc_lock);
@@ -1233,6 +1235,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
 	p->pdeath_signal = 0;
 	p->exit_state = 0;
+	p->detaching = 0;
 
 	/*
 	 * Ok, make it visible to the rest of the system.
diff --git a/kernel/sys.c b/kernel/sys.c
index 18da702..6074b02 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -28,6 +28,7 @@
 #include <linux/suspend.h>
 #include <linux/tty.h>
 #include <linux/signal.h>
+#include <linux/tracehook.h>
 #include <linux/cn_proc.h>
 #include <linux/getcpu.h>
 #include <linux/task_io_accounting_ops.h>
@@ -1736,6 +1737,45 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			else
 				error = PR_MCE_KILL_DEFAULT;
 			break;
+		case PR_DETACH: {
+			struct task_struct *p;
+			struct pid_namespace *pid_ns = task_active_pid_ns(me);
+			int notif = DEATH_REAP;
+			error = -EPERM;
+			/* not detaching from init */
+			if (same_thread_group(me->real_parent,
+					pid_ns->child_reaper))
+				break;
+			if (arg2 & ~0x7f)
+				break;
+			write_lock_irq(&tasklist_lock);
+			me->detach_code = arg2 << 8;
+			notif = do_signal_parent(me, me->exit_signal,
+					CLD_DETACHED, arg2);
+			if (notif != DEATH_REAP && thread_group_leader(me)) {
+				list_add_tail(&me->detached_sibling,
+					&me->real_parent->detached_children);
+				me->detaching = 1;
+			}
+			if (!task_ptrace(me))
+				me->parent = pid_ns->child_reaper;
+			me->real_parent = pid_ns->child_reaper;
+			if (thread_group_leader(me)) {
+				list_move_tail(&me->sibling,
+						&me->real_parent->children);
+				/* reparent threads */
+				p = me;
+				while_each_thread(me, p) {
+					if (!task_ptrace(p))
+						p->parent = pid_ns->child_reaper;
+					p->real_parent = pid_ns->child_reaper;
+				}
+			}
+			me->exit_signal = SIGCHLD;
+			write_unlock_irq(&tasklist_lock);
+			error = 0;
+			break;
+		}
 		default:
 			error = -EINVAL;
 			break;

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-04-05 16:45                         ` Oleg Nesterov
  2011-04-05 17:51                           ` Stas Sergeev
@ 2011-04-08 10:51                           ` Stas Sergeev
  2011-04-08 18:55                             ` Oleg Nesterov
  2011-04-11 11:15                           ` Stas Sergeev
                                             ` (3 subsequent siblings)
  5 siblings, 1 reply; 57+ messages in thread
From: Stas Sergeev @ 2011-04-08 10:51 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Linux kernel

[-- Attachment #1: Type: text/plain, Size: 170 bytes --]

Hi Oleg.

I updated patch to fix the race between wait_task_detached()
and wait_task_zombie() by using is_detaching flag.
Here's the patch.
What problems do remain here?

[-- Attachment #2: 01_sigpar.diff --]
[-- Type: text/plain, Size: 3645 bytes --]

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 777d8a5..e74882f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2096,6 +2096,7 @@ extern int kill_pgrp(struct pid *pid, int sig, int priv);
 extern int kill_pid(struct pid *pid, int sig, int priv);
 extern int kill_proc_info(int, struct siginfo *, pid_t);
 extern int do_notify_parent(struct task_struct *, int);
+extern int do_signal_parent(struct task_struct *, int, int, int);
 extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
 extern void force_sig(int, struct task_struct *);
 extern int send_sig(int, struct task_struct *, int);
diff --git a/kernel/signal.c b/kernel/signal.c
index 4e3cff1..54b93c7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1434,14 +1434,8 @@ ret:
 	return ret;
 }
 
-/*
- * Let a parent know about the death of a child.
- * For a stopped/continued status change, use do_notify_parent_cldstop instead.
- *
- * Returns -1 if our parent ignored us and so we've switched to
- * self-reaping, or else @sig.
- */
-int do_notify_parent(struct task_struct *tsk, int sig)
+int do_signal_parent(struct task_struct *tsk, int sig, int sicode,
+	int sistatus)
 {
 	struct siginfo info;
 	unsigned long flags;
@@ -1450,11 +1444,8 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 
 	BUG_ON(sig == -1);
 
- 	/* do_notify_parent_cldstop should have been called instead.  */
- 	BUG_ON(task_is_stopped_or_traced(tsk));
-
-	BUG_ON(!task_ptrace(tsk) &&
-	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
+	/* do_notify_parent_cldstop should have been called instead.  */
+	BUG_ON(task_is_stopped_or_traced(tsk));
 
 	info.si_signo = sig;
 	info.si_errno = 0;
@@ -1480,15 +1471,8 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 	info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
 				tsk->signal->stime));
 
-	info.si_status = tsk->exit_code & 0x7f;
-	if (tsk->exit_code & 0x80)
-		info.si_code = CLD_DUMPED;
-	else if (tsk->exit_code & 0x7f)
-		info.si_code = CLD_KILLED;
-	else {
-		info.si_code = CLD_EXITED;
-		info.si_status = tsk->exit_code >> 8;
-	}
+	info.si_code = sicode;
+	info.si_status = sistatus;
 
 	psig = tsk->parent->sighand;
 	spin_lock_irqsave(&psig->siglock, flags);
@@ -1510,9 +1494,11 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 		 * is implementation-defined: we do (if you don't want
 		 * it, just use SIG_IGN instead).
 		 */
-		ret = tsk->exit_signal = -1;
+		tsk->exit_signal = -1;
 		if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
 			sig = -1;
+		/* reap process now, rather than promoting to zombie */
+		ret = DEATH_REAP;
 	}
 	if (valid_signal(sig) && sig > 0)
 		__group_send_sig_info(sig, &info, tsk->parent);
@@ -1522,6 +1508,33 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 	return ret;
 }
 
+/*
+ * Let a parent know about the death of a child.
+ * For a stopped/continued status change, use do_notify_parent_cldstop instead.
+ *
+ * Returns -1 if our parent ignored us and so we've switched to
+ * self-reaping, or else @sig.
+ */
+int do_notify_parent(struct task_struct *tsk, int sig)
+{
+	int sicode, sistatus;
+
+	BUG_ON(!task_ptrace(tsk) &&
+	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
+
+	sistatus = tsk->exit_code & 0x7f;
+	if (tsk->exit_code & 0x80)
+		sicode = CLD_DUMPED;
+	else if (tsk->exit_code & 0x7f)
+		sicode = CLD_KILLED;
+	else {
+		sicode = CLD_EXITED;
+		sistatus = tsk->exit_code >> 8;
+	}
+
+	return do_signal_parent(tsk, sig, sicode, sistatus);
+}
+
 static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
 {
 	struct siginfo info;

[-- Attachment #3: 02_cwaittsk.diff --]
[-- Type: text/plain, Size: 3108 bytes --]

diff --git a/kernel/exit.c b/kernel/exit.c
index f9a45eb..2aa64e8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1507,21 +1507,11 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
 	return retval;
 }
 
-/*
- * Consider @p for a wait by @parent.
- *
- * -ECHILD should be in ->notask_error before the first call.
- * Returns nonzero for a final return, when we have unlocked tasklist_lock.
- * Returns zero if the search for a child should continue;
- * then ->notask_error is 0 if @p is an eligible child,
- * or another error from security_task_wait(), or still -ECHILD.
- */
-static int wait_consider_task(struct wait_opts *wo, int ptrace,
-				struct task_struct *p)
+static int can_wait_task_common(struct wait_opts *wo, struct task_struct *p)
 {
 	int ret = eligible_child(wo, p);
 	if (!ret)
-		return ret;
+		return 0;
 
 	ret = security_task_wait(p);
 	if (unlikely(ret < 0)) {
@@ -1537,7 +1527,25 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 		return 0;
 	}
 
-	if (likely(!ptrace) && unlikely(task_ptrace(p))) {
+	if (p->exit_state == EXIT_DEAD)
+		return 0;
+
+	return 1;
+}
+
+static int can_wait_task_ptrace(struct wait_opts *wo, struct task_struct *p)
+{
+	/* don't worry, gcc will optimize away this function :) */
+	return can_wait_task_common(wo, p);
+}
+
+static int can_wait_task(struct wait_opts *wo, struct task_struct *p)
+{
+	int ret = can_wait_task_common(wo, p);
+	if (!ret)
+		return 0;
+
+	if (unlikely(task_ptrace(p))) {
 		/*
 		 * This child is hidden by ptrace.
 		 * We aren't allowed to see it now, but eventually we will.
@@ -1546,9 +1554,21 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 		return 0;
 	}
 
-	if (p->exit_state == EXIT_DEAD)
-		return 0;
+	return 1;
+}
 
+/*
+ * Consider @p for a wait by @parent.
+ *
+ * -ECHILD should be in ->notask_error before the first call.
+ * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns zero if the search for a child should continue;
+ * then ->notask_error is 0 if @p is an eligible child,
+ * or another error from security_task_wait(), or still -ECHILD.
+ */
+static int wait_consider_task(struct wait_opts *wo, int ptrace,
+				struct task_struct *p)
+{
 	/*
 	 * We don't reap group leaders with subthreads.
 	 */
@@ -1578,10 +1598,14 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
  */
 static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
 {
+	int ret;
 	struct task_struct *p;
 
 	list_for_each_entry(p, &tsk->children, sibling) {
-		int ret = wait_consider_task(wo, 0, p);
+		ret = can_wait_task(wo, p);
+		if (!ret)
+			continue;
+		ret = wait_consider_task(wo, 0, p);
 		if (ret)
 			return ret;
 	}
@@ -1594,7 +1618,10 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
 	struct task_struct *p;
 
 	list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
-		int ret = wait_consider_task(wo, 1, p);
+		int ret = can_wait_task_ptrace(wo, p);
+		if (!ret)
+			continue;
+		ret = wait_consider_task(wo, 1, p);
 		if (ret)
 			return ret;
 	}

[-- Attachment #4: pr_detach3.diff --]
[-- Type: text/plain, Size: 8157 bytes --]

commit b05a48839018ec852ed4c7bd867534df47643d82
Author: Stas <stas@stas.(none)>
Date:   Thu Apr 7 12:04:19 2011 +0400

    implement PR_DETACH

diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
index 942d30b..1da9c20 100644
--- a/include/asm-generic/siginfo.h
+++ b/include/asm-generic/siginfo.h
@@ -218,7 +218,8 @@ typedef struct siginfo {
 #define CLD_TRAPPED	(__SI_CHLD|4)	/* traced child has trapped */
 #define CLD_STOPPED	(__SI_CHLD|5)	/* child has stopped */
 #define CLD_CONTINUED	(__SI_CHLD|6)	/* stopped child has continued */
-#define NSIGCHLD	6
+#define CLD_DETACHED	(__SI_CHLD|7)	/* child has detached */
+#define NSIGCHLD	7
 
 /*
  * SIGPOLL si_codes
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index caa151f..fdf71a9 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -158,6 +158,8 @@ extern struct cred init_cred;
 	.parent		= &tsk,						\
 	.children	= LIST_HEAD_INIT(tsk.children),			\
 	.sibling	= LIST_HEAD_INIT(tsk.sibling),			\
+	.detached_children	= LIST_HEAD_INIT(tsk.detached_children),\
+	.detached_sibling	= LIST_HEAD_INIT(tsk.detached_sibling),	\
 	.group_leader	= &tsk,						\
 	RCU_INIT_POINTER(.real_cred, &init_cred),			\
 	RCU_INIT_POINTER(.cred, &init_cred),				\
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index a3baeb2..fbd2451 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -102,4 +102,6 @@
 
 #define PR_MCE_KILL_GET 34
 
+#define PR_DETACH 35
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e74882f..c8a1741 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1260,6 +1260,9 @@ struct task_struct {
 /* task state */
 	int exit_state;
 	int exit_code, exit_signal;
+	int detach_code;
+	int detaching;
+	int is_detaching:1;
 	int pdeath_signal;  /*  The signal sent when the parent dies  */
 	/* ??? */
 	unsigned int personality;
@@ -1292,6 +1295,8 @@ struct task_struct {
 	 */
 	struct list_head children;	/* list of my children */
 	struct list_head sibling;	/* linkage in my parent's children list */
+	struct list_head detached_children;	/* list of my detached children */
+	struct list_head detached_sibling;	/* linkage in my parent's detached children list */
 	struct task_struct *group_leader;	/* threadgroup leader */
 
 	/*
diff --git a/kernel/exit.c b/kernel/exit.c
index 2aa64e8..a2c5cfb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -69,6 +69,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 
 		list_del_rcu(&p->tasks);
 		list_del_init(&p->sibling);
+		list_del_init(&p->detached_sibling);
 		__this_cpu_dec(process_counts);
 	}
 	list_del_rcu(&p->thread_group);
@@ -804,6 +805,17 @@ static void forget_original_parent(struct task_struct *father)
 		} while_each_thread(p, t);
 		reparent_leader(father, p, &dead_children);
 	}
+	list_for_each_entry_safe(p, n, &father->detached_children,
+			detached_sibling) {
+		int signal;
+		p->detaching = 0;
+		p->is_detaching = 0;
+		list_del_init(&p->detached_sibling);
+		if (p->exit_state == EXIT_ZOMBIE) {
+			signal = do_notify_parent(p, SIGCHLD);
+			BUG_ON(signal == DEATH_REAP);
+		}
+	}
 	write_unlock_irq(&tasklist_lock);
 
 	BUG_ON(!list_empty(&father->children));
@@ -858,7 +870,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 		tsk->exit_signal = SIGCHLD;
 
 	signal = tracehook_notify_death(tsk, &cookie, group_dead);
-	if (signal >= 0)
+	/* delay parent notification for detaching tasks */
+	if (signal >= 0 && !tsk->is_detaching)
 		signal = do_notify_parent(tsk, signal);
 
 	tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
@@ -1507,6 +1520,54 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
 	return retval;
 }
 
+static int wait_task_detached(struct wait_opts *wo, struct task_struct *p)
+{
+	int dt, signal, retval = 0;
+	pid_t pid;
+	uid_t uid;
+
+	if (!likely(wo->wo_flags & WEXITED))
+		return 0;
+
+	if (unlikely(wo->wo_flags & WNOWAIT)) {
+		get_task_struct(p);
+		read_unlock(&tasklist_lock);
+		pid = task_pid_vnr(p);
+		uid = __task_cred(p)->uid;
+		return wait_noreap_copyout(wo, p, pid, uid, CLD_DETACHED,
+			p->detach_code >> 8);
+	}
+
+	dt = xchg(&p->detaching, 0);
+	if (dt != 1)
+		return 0;
+	get_task_struct(p);
+	read_unlock(&tasklist_lock);
+
+	write_lock_irq(&tasklist_lock);
+	list_del_init(&p->detached_sibling);
+	if (p->exit_state == EXIT_ZOMBIE) {
+		signal = do_notify_parent(p, SIGCHLD);
+		BUG_ON(signal == DEATH_REAP);
+	}
+	p->is_detaching = 0;
+	write_unlock_irq(&tasklist_lock);
+
+	if (wo->wo_stat)
+		retval = put_user(p->detach_code, wo->wo_stat);
+
+	if (!retval) {
+		pid = task_pid_vnr(p);
+		uid = __task_cred(p)->uid;
+		retval = wait_noreap_copyout(wo, p, pid, uid, CLD_DETACHED,
+			p->detach_code >> 8);
+	} else {
+		put_task_struct(p);
+	}
+
+	return retval;
+}
+
 static int can_wait_task_common(struct wait_opts *wo, struct task_struct *p)
 {
 	int ret = eligible_child(wo, p);
@@ -1572,7 +1633,8 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 	/*
 	 * We don't reap group leaders with subthreads.
 	 */
-	if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
+	if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p) &&
+			!p->is_detaching)
 		return wait_task_zombie(wo, p);
 
 	/*
@@ -1610,6 +1672,15 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
 			return ret;
 	}
 
+	list_for_each_entry(p, &tsk->detached_children, detached_sibling) {
+		ret = can_wait_task(wo, p);
+		if (!ret)
+			continue;
+		ret = wait_task_detached(wo, p);
+		if (ret)
+			return ret;
+	}
+
 	return 0;
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 25e4291..feadef7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1070,6 +1070,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	copy_flags(clone_flags, p);
 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
+	INIT_LIST_HEAD(&p->detached_children);
+	INIT_LIST_HEAD(&p->detached_sibling);
 	rcu_copy_process(p);
 	p->vfork_done = NULL;
 	spin_lock_init(&p->alloc_lock);
@@ -1233,6 +1235,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
 	p->pdeath_signal = 0;
 	p->exit_state = 0;
+	p->detaching = 0;
+	p->is_detaching = 0;
 
 	/*
 	 * Ok, make it visible to the rest of the system.
diff --git a/kernel/sys.c b/kernel/sys.c
index 18da702..4e1d1e9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -28,6 +28,7 @@
 #include <linux/suspend.h>
 #include <linux/tty.h>
 #include <linux/signal.h>
+#include <linux/tracehook.h>
 #include <linux/cn_proc.h>
 #include <linux/getcpu.h>
 #include <linux/task_io_accounting_ops.h>
@@ -1736,6 +1737,46 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			else
 				error = PR_MCE_KILL_DEFAULT;
 			break;
+		case PR_DETACH: {
+			struct task_struct *p;
+			struct pid_namespace *pid_ns = task_active_pid_ns(me);
+			int notif = DEATH_REAP;
+			error = -EPERM;
+			/* not detaching from init */
+			if (same_thread_group(me->real_parent,
+					pid_ns->child_reaper))
+				break;
+			if (arg2 & ~0x7f)
+				break;
+			write_lock_irq(&tasklist_lock);
+			me->detach_code = arg2 << 8;
+			notif = do_signal_parent(me, me->exit_signal,
+					CLD_DETACHED, arg2);
+			if (notif != DEATH_REAP && thread_group_leader(me)) {
+				list_add_tail(&me->detached_sibling,
+					&me->real_parent->detached_children);
+				me->detaching = 1;
+				me->is_detaching = 1;
+			}
+			if (!task_ptrace(me))
+				me->parent = pid_ns->child_reaper;
+			me->real_parent = pid_ns->child_reaper;
+			if (thread_group_leader(me)) {
+				list_move_tail(&me->sibling,
+						&me->real_parent->children);
+				/* reparent threads */
+				p = me;
+				while_each_thread(me, p) {
+					if (!task_ptrace(p))
+						p->parent = pid_ns->child_reaper;
+					p->real_parent = pid_ns->child_reaper;
+				}
+			}
+			me->exit_signal = SIGCHLD;
+			write_unlock_irq(&tasklist_lock);
+			error = 0;
+			break;
+		}
 		default:
 			error = -EINVAL;
 			break;

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-02-23 13:50 [path][rfc] add PR_DETACH prctl command Stas Sergeev
  2011-02-23 19:14 ` Oleg Nesterov
@ 2011-04-08 18:13 ` Bryan Donlan
  2011-04-08 20:26   ` Stas Sergeev
  1 sibling, 1 reply; 57+ messages in thread
From: Bryan Donlan @ 2011-04-08 18:13 UTC (permalink / raw)
  To: Stas Sergeev; +Cc: Linux kernel, Oleg Nesterov

On Wed, Feb 23, 2011 at 08:50, Stas Sergeev <stsp@aknet.ru> wrote:
> Hi.
>
> The attched patch adds the PR_DETACH prctl command.
> It is needed for those rare but unfortunate cases, where
> you can't daemonize your process before creating a thread.
> The effect of this command is similar to the fork() and then
> exit() on parent, except that:
> 1. PID does not change
> 2. Threads are not destroyed
>
> It would be nice to know what people think about such an
> approach.

I can't comment on the patch itself, but, if your application knows it
might have to daemonize after spinning up threads, why not simply
fork() immediately on startup, and have the parent simply wait forever
for either the child to die or for a daemonize signal from the child?
If done early enough it shouldn't tie up too much memory, and it
wouldn't require any of these invasive kernel changes. As an added
bonus, it's portable to all unixen :)

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-04-08 10:51                           ` Stas Sergeev
@ 2011-04-08 18:55                             ` Oleg Nesterov
  2011-04-08 20:16                               ` Stas Sergeev
  0 siblings, 1 reply; 57+ messages in thread
From: Oleg Nesterov @ 2011-04-08 18:55 UTC (permalink / raw)
  To: Stas Sergeev; +Cc: Linux kernel

On 04/08, Stas Sergeev wrote:
>
> I updated patch to fix the race between wait_task_detached()
> and wait_task_zombie() by using is_detaching flag.
> Here's the patch.
> What problems do remain here?

Stas, sorry, I simply have no time to read this patch. Not to mention,
you know that I dislike it in any case.

I only quickly looked at the changes in sys_prctl and they are definitely
wrong.

> +			me->real_parent = pid_ns->child_reaper;
> +			if (thread_group_leader(me)) {
> +				list_move_tail(&me->sibling,
> +						&me->real_parent->children);
> +				/* reparent threads */
> +				p = me;
> +				while_each_thread(me, p) {
> +					if (!task_ptrace(p))
> +						p->parent = pid_ns->child_reaper;
> +					p->real_parent = pid_ns->child_reaper;

Why do you check thread_group_leader() ???? What if the caller is not
the leader?

I told this many times, and I got lost. If you reparent, you should
always reparent the whole group.

Also, you are doing do_signal_parent(me). I do not know what it does
(once again, I didn't read the patch), but this looks suspicious.
A sub-thread shouldn't notify the parent.

Oleg.


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-04-08 18:55                             ` Oleg Nesterov
@ 2011-04-08 20:16                               ` Stas Sergeev
  0 siblings, 0 replies; 57+ messages in thread
From: Stas Sergeev @ 2011-04-08 20:16 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Linux kernel

08.04.2011 22:55, Oleg Nesterov wrote:
> Why do you check thread_group_leader() ???? What if the caller is not
> the leader?
>
> I told this many times, and I got lost. If you reparent, you should
> always reparent the whole group.
Arrgh, OK, hope I realize that now, sorry! There were lots of other
problems, so I was constantly forgetting about this one. Or,
maybe, I couldn't get away with the fact that any sub-thread
can reparent the whole group. OK, will definitely fix this time. :))
Thanks!

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-04-08 18:13 ` [path][rfc] add PR_DETACH prctl command Bryan Donlan
@ 2011-04-08 20:26   ` Stas Sergeev
  2011-04-08 20:52     ` Bryan Donlan
  0 siblings, 1 reply; 57+ messages in thread
From: Stas Sergeev @ 2011-04-08 20:26 UTC (permalink / raw)
  To: Bryan Donlan; +Cc: Linux kernel, Oleg Nesterov

08.04.2011 22:13, Bryan Donlan wrote:
> I can't comment on the patch itself, but, if your application knows it
> might have to daemonize after spinning up threads, why not simply
> fork() immediately on startup, and have the parent simply wait forever
> for either the child to die or for a daemonize signal from the child?
> If done early enough it shouldn't tie up too much memory, and it
> wouldn't require any of these invasive kernel changes. As an added
> bonus, it's portable to all unixen :)
Yes, thats almost always true. Except when you deal with
the vendor-provided poorly-coded drivers and libs, where
you get the drivers initialized only via the lib. And the
initialization process must be finished before the boot-up
can continue, but that's not the whole story: only the
process that initialized that lib, can then work with it.
And of course that lib creates a dozen of threads...
OK, you've got the idea. :)
But anyway. Yes, not everyone have to deal with such a
nightmare, this is very rare. But people being confused
by the fact that daemon() silently loses threads, are not
rare. So I just wonder: even if the workaround is simple,
why searching for the workaround at all? If you need
2 syscalls to detach from parent, and you loose the threads
in a process, then why not to have a single call, that
detaches without loosing threads?
But in any case, I am mostly inclinced to leave that patch
for my project only.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-04-08 20:26   ` Stas Sergeev
@ 2011-04-08 20:52     ` Bryan Donlan
  2011-04-08 21:14       ` Stas Sergeev
  0 siblings, 1 reply; 57+ messages in thread
From: Bryan Donlan @ 2011-04-08 20:52 UTC (permalink / raw)
  To: Stas Sergeev; +Cc: Linux kernel, Oleg Nesterov

2011/4/8 Stas Sergeev <stsp@aknet.ru>:
> 08.04.2011 22:13, Bryan Donlan wrote:
>>
>> I can't comment on the patch itself, but, if your application knows it
>> might have to daemonize after spinning up threads, why not simply
>> fork() immediately on startup, and have the parent simply wait forever
>> for either the child to die or for a daemonize signal from the child?
>> If done early enough it shouldn't tie up too much memory, and it
>> wouldn't require any of these invasive kernel changes. As an added
>> bonus, it's portable to all unixen :)
>
> Yes, thats almost always true. Except when you deal with
> the vendor-provided poorly-coded drivers and libs, where
> you get the drivers initialized only via the lib. And the
> initialization process must be finished before the boot-up
> can continue, but that's not the whole story: only the
> process that initialized that lib, can then work with it.
> And of course that lib creates a dozen of threads...
> OK, you've got the idea. :)

Still, you can workaround this by either:
a) Load the vendor library via dlopen()
b) Use a separate launcher executable to handle the fork-and-wait

Either of these approaches are far simpler than patching the kernel,
more portable, and much, much easier to get right. In fact, here's a
quick and dirty implementation of the latter:

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

void sigusr1_handler(int unused)
{
        (void)unused;

        _exit(0);
}

int main(int argc, char **argv)
{
        int waitstatus;
        pid_t child, parent = getpid();

        (void)argc;

        signal(SIGUSR1, sigusr1_handler);

        child = fork();
        if (child < 0) {
                perror("Fork failed");
                return EXIT_FAILURE;
        }
        if (child == 0) {
                char envbuf[20];
                sprintf(envbuf, "%d", (int)parent);
                setenv("LAUNCHER_PID", envbuf, 1);

                execvp(argv[1], &argv[1]);
                perror("Launching child failed");
                _exit(EXIT_FAILURE);
        }

        wait(&waitstatus);

        if (WIFEXITED(waitstatus))
                _exit(WEXITSTATUS(waitstatus));

        _exit(256);
}

Just kill(atoi(getenv("LAUNCHER_PID")), SIGUSR1) to detach. Much
easier than doing some very racy things in the kernel, no? It's
certainly more obvious that this ought to be correct in the face of
races with its parent :)

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-04-08 20:52     ` Bryan Donlan
@ 2011-04-08 21:14       ` Stas Sergeev
  2011-04-08 21:25         ` Bryan Donlan
  0 siblings, 1 reply; 57+ messages in thread
From: Stas Sergeev @ 2011-04-08 21:14 UTC (permalink / raw)
  To: Bryan Donlan; +Cc: Linux kernel

09.04.2011 00:52, Bryan Donlan wrote:
> Still, you can workaround this by either:
Yes, sure.

> a) Load the vendor library via dlopen()
Too much to dlsym(), and by the way, what does this give?
I _have to_ init that lib early at bootup, so I don't see how
dlopen will help, could you clarify?

> b) Use a separate launcher executable to handle the fork-and-wait
Yes, that's certainly possible, except that I was trying the
semaphore instead of a signal, as in your example, but its
the same.

> Just kill(atoi(getenv("LAUNCHER_PID")), SIGUSR1) to detach. Much
> easier than doing some very racy things in the kernel, no? It's
> certainly more obvious that this ought to be correct in the face of
> races with its parent :)
But what races do you mean? Yes, the workaround is a
workaround, and it works. And is simpler to implement. :)
But what problems do you see with the PR_DETACH approach,
except for the bugs in my patch? :) I mean, the fact that
daemon() silently loses threads, sounds like a limitation,
after all.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-04-08 21:14       ` Stas Sergeev
@ 2011-04-08 21:25         ` Bryan Donlan
  2011-04-08 21:38           ` Stas Sergeev
  0 siblings, 1 reply; 57+ messages in thread
From: Bryan Donlan @ 2011-04-08 21:25 UTC (permalink / raw)
  To: Stas Sergeev; +Cc: Linux kernel, Oleg Nesterov

2011/4/8 Stas Sergeev <stsp@aknet.ru>:
> 09.04.2011 00:52, Bryan Donlan wrote:
>>
>> Still, you can workaround this by either:
>
> Yes, sure.
>
>> a) Load the vendor library via dlopen()
>
> Too much to dlsym(), and by the way, what does this give?
> I _have to_ init that lib early at bootup, so I don't see how
> dlopen will help, could you clarify?

Using dlopen will allow you to perform the fork() prior to running the
library's initialization code.

>> Just kill(atoi(getenv("LAUNCHER_PID")), SIGUSR1) to detach. Much
>> easier than doing some very racy things in the kernel, no? It's
>> certainly more obvious that this ought to be correct in the face of
>> races with its parent :)
>
> But what races do you mean? Yes, the workaround is a
> workaround, and it works. And is simpler to implement. :)
> But what problems do you see with the PR_DETACH approach,
> except for the bugs in my patch? :) I mean, the fact that
> daemon() silently loses threads, sounds like a limitation,
> after all.

As I said, I can't comment on the patch - I'm not familiar with that
part of the kernel, so I don't know what kind of races may be lurking.
But based on Oleg's comments, it seems clear to me that implementing
your PR_DETACH is quite a complex thing to be doing. It is true that
being able to daemonize() without losing threads would be a nice thing
to have; I just have my doubts that it's worth the potential bugs that
such a change might introduce, when it's only needed in a somewhat
rare case, and when perfectly good workarounds exist in userspace.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-04-08 21:25         ` Bryan Donlan
@ 2011-04-08 21:38           ` Stas Sergeev
  0 siblings, 0 replies; 57+ messages in thread
From: Stas Sergeev @ 2011-04-08 21:38 UTC (permalink / raw)
  To: Bryan Donlan; +Cc: Linux kernel, Oleg Nesterov

09.04.2011 01:25, Bryan Donlan wrote:
> Using dlopen will allow you to perform the fork() prior to running the
> library's initialization code.
But I don't even need that: I init it manually, there
is no automatic ctors there.

> As I said, I can't comment on the patch - I'm not familiar with that
> part of the kernel, so I don't know what kind of races may be lurking.
> But based on Oleg's comments, it seems clear to me that implementing
> your PR_DETACH is quite a complex thing to be doing.
Yes, that's true, but treat that as my personal tour to the
kernel internals. I wasn't realizing the complexity initially,
and now its almost done, so I wonder what the result will
look like, it will it worth all the troubles.

>   It is true that
> being able to daemonize() without losing threads would be a nice thing
> to have; I just have my doubts that it's worth the potential bugs that
> such a change might introduce, when it's only needed in a somewhat
> rare case, and when perfectly good workarounds exist in userspace.
Yes, I share that concerns too. And it is very unlikely that
I will propose that patch for inclusion, but I think it is possible
to code that up small and simple, in which case maybe even
Oleg will not hate it all that much. ;))

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command
  2011-04-05 16:45                         ` Oleg Nesterov
  2011-04-05 17:51                           ` Stas Sergeev
  2011-04-08 10:51                           ` Stas Sergeev
@ 2011-04-11 11:15                           ` Stas Sergeev
  2011-04-19 14:44                           ` [path][rfc] add PR_DETACH prctl command [1/3] Stas Sergeev
                                             ` (2 subsequent siblings)
  5 siblings, 0 replies; 57+ messages in thread
From: Stas Sergeev @ 2011-04-11 11:15 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Linux kernel

[-- Attachment #1: Type: text/plain, Size: 88 bytes --]

Hi Oleg.

In the attached patch I finally reparent the whole group.
How does that look?

[-- Attachment #2: pr_detach4.diff --]
[-- Type: text/plain, Size: 8023 bytes --]

commit c95ea73afce29a9c47bab29787d6eb014a8de9e6
Author: Stas <stas@stas.(none)>
Date:   Mon Apr 11 15:06:33 2011 +0400

    implement PR_DETACH

diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
index 942d30b..1da9c20 100644
--- a/include/asm-generic/siginfo.h
+++ b/include/asm-generic/siginfo.h
@@ -218,7 +218,8 @@ typedef struct siginfo {
 #define CLD_TRAPPED	(__SI_CHLD|4)	/* traced child has trapped */
 #define CLD_STOPPED	(__SI_CHLD|5)	/* child has stopped */
 #define CLD_CONTINUED	(__SI_CHLD|6)	/* stopped child has continued */
-#define NSIGCHLD	6
+#define CLD_DETACHED	(__SI_CHLD|7)	/* child has detached */
+#define NSIGCHLD	7
 
 /*
  * SIGPOLL si_codes
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index caa151f..fdf71a9 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -158,6 +158,8 @@ extern struct cred init_cred;
 	.parent		= &tsk,						\
 	.children	= LIST_HEAD_INIT(tsk.children),			\
 	.sibling	= LIST_HEAD_INIT(tsk.sibling),			\
+	.detached_children	= LIST_HEAD_INIT(tsk.detached_children),\
+	.detached_sibling	= LIST_HEAD_INIT(tsk.detached_sibling),	\
 	.group_leader	= &tsk,						\
 	RCU_INIT_POINTER(.real_cred, &init_cred),			\
 	RCU_INIT_POINTER(.cred, &init_cred),				\
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index a3baeb2..fbd2451 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -102,4 +102,6 @@
 
 #define PR_MCE_KILL_GET 34
 
+#define PR_DETACH 35
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e74882f..c8a1741 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1260,6 +1260,9 @@ struct task_struct {
 /* task state */
 	int exit_state;
 	int exit_code, exit_signal;
+	int detach_code;
+	int detaching;
+	int is_detaching:1;
 	int pdeath_signal;  /*  The signal sent when the parent dies  */
 	/* ??? */
 	unsigned int personality;
@@ -1292,6 +1295,8 @@ struct task_struct {
 	 */
 	struct list_head children;	/* list of my children */
 	struct list_head sibling;	/* linkage in my parent's children list */
+	struct list_head detached_children;	/* list of my detached children */
+	struct list_head detached_sibling;	/* linkage in my parent's detached children list */
 	struct task_struct *group_leader;	/* threadgroup leader */
 
 	/*
diff --git a/kernel/exit.c b/kernel/exit.c
index 2aa64e8..a2c5cfb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -69,6 +69,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 
 		list_del_rcu(&p->tasks);
 		list_del_init(&p->sibling);
+		list_del_init(&p->detached_sibling);
 		__this_cpu_dec(process_counts);
 	}
 	list_del_rcu(&p->thread_group);
@@ -804,6 +805,17 @@ static void forget_original_parent(struct task_struct *father)
 		} while_each_thread(p, t);
 		reparent_leader(father, p, &dead_children);
 	}
+	list_for_each_entry_safe(p, n, &father->detached_children,
+			detached_sibling) {
+		int signal;
+		p->detaching = 0;
+		p->is_detaching = 0;
+		list_del_init(&p->detached_sibling);
+		if (p->exit_state == EXIT_ZOMBIE) {
+			signal = do_notify_parent(p, SIGCHLD);
+			BUG_ON(signal == DEATH_REAP);
+		}
+	}
 	write_unlock_irq(&tasklist_lock);
 
 	BUG_ON(!list_empty(&father->children));
@@ -858,7 +870,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 		tsk->exit_signal = SIGCHLD;
 
 	signal = tracehook_notify_death(tsk, &cookie, group_dead);
-	if (signal >= 0)
+	/* delay parent notification for detaching tasks */
+	if (signal >= 0 && !tsk->is_detaching)
 		signal = do_notify_parent(tsk, signal);
 
 	tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
@@ -1507,6 +1520,54 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
 	return retval;
 }
 
+static int wait_task_detached(struct wait_opts *wo, struct task_struct *p)
+{
+	int dt, signal, retval = 0;
+	pid_t pid;
+	uid_t uid;
+
+	if (!likely(wo->wo_flags & WEXITED))
+		return 0;
+
+	if (unlikely(wo->wo_flags & WNOWAIT)) {
+		get_task_struct(p);
+		read_unlock(&tasklist_lock);
+		pid = task_pid_vnr(p);
+		uid = __task_cred(p)->uid;
+		return wait_noreap_copyout(wo, p, pid, uid, CLD_DETACHED,
+			p->detach_code >> 8);
+	}
+
+	dt = xchg(&p->detaching, 0);
+	if (dt != 1)
+		return 0;
+	get_task_struct(p);
+	read_unlock(&tasklist_lock);
+
+	write_lock_irq(&tasklist_lock);
+	list_del_init(&p->detached_sibling);
+	if (p->exit_state == EXIT_ZOMBIE) {
+		signal = do_notify_parent(p, SIGCHLD);
+		BUG_ON(signal == DEATH_REAP);
+	}
+	p->is_detaching = 0;
+	write_unlock_irq(&tasklist_lock);
+
+	if (wo->wo_stat)
+		retval = put_user(p->detach_code, wo->wo_stat);
+
+	if (!retval) {
+		pid = task_pid_vnr(p);
+		uid = __task_cred(p)->uid;
+		retval = wait_noreap_copyout(wo, p, pid, uid, CLD_DETACHED,
+			p->detach_code >> 8);
+	} else {
+		put_task_struct(p);
+	}
+
+	return retval;
+}
+
 static int can_wait_task_common(struct wait_opts *wo, struct task_struct *p)
 {
 	int ret = eligible_child(wo, p);
@@ -1572,7 +1633,8 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 	/*
 	 * We don't reap group leaders with subthreads.
 	 */
-	if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
+	if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p) &&
+			!p->is_detaching)
 		return wait_task_zombie(wo, p);
 
 	/*
@@ -1610,6 +1672,15 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
 			return ret;
 	}
 
+	list_for_each_entry(p, &tsk->detached_children, detached_sibling) {
+		ret = can_wait_task(wo, p);
+		if (!ret)
+			continue;
+		ret = wait_task_detached(wo, p);
+		if (ret)
+			return ret;
+	}
+
 	return 0;
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 25e4291..feadef7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1070,6 +1070,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	copy_flags(clone_flags, p);
 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
+	INIT_LIST_HEAD(&p->detached_children);
+	INIT_LIST_HEAD(&p->detached_sibling);
 	rcu_copy_process(p);
 	p->vfork_done = NULL;
 	spin_lock_init(&p->alloc_lock);
@@ -1233,6 +1235,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
 	p->pdeath_signal = 0;
 	p->exit_state = 0;
+	p->detaching = 0;
+	p->is_detaching = 0;
 
 	/*
 	 * Ok, make it visible to the rest of the system.
diff --git a/kernel/sys.c b/kernel/sys.c
index 18da702..1d88ee8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -28,6 +28,7 @@
 #include <linux/suspend.h>
 #include <linux/tty.h>
 #include <linux/signal.h>
+#include <linux/tracehook.h>
 #include <linux/cn_proc.h>
 #include <linux/getcpu.h>
 #include <linux/task_io_accounting_ops.h>
@@ -1736,6 +1737,41 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			else
 				error = PR_MCE_KILL_DEFAULT;
 			break;
+		case PR_DETACH: {
+			struct task_struct *p, *leader;
+			int notif;
+			struct pid_namespace *pid_ns = task_active_pid_ns(me);
+			error = -EPERM;
+			/* not detaching from init */
+			if (same_thread_group(me->real_parent,
+					pid_ns->child_reaper))
+				break;
+			if (arg2 & ~0x7f)
+				break;
+			write_lock_irq(&tasklist_lock);
+			leader = me->group_leader;
+			leader->detach_code = arg2 << 8;
+			notif = do_signal_parent(leader, leader->exit_signal,
+					CLD_DETACHED, arg2);
+			if (notif != DEATH_REAP) {
+				list_add_tail(&leader->detached_sibling,
+					&leader->real_parent->detached_children);
+				leader->detaching = 1;
+				leader->is_detaching = 1;
+			}
+			p = leader;
+			do {
+				if (!task_ptrace(p))
+					p->parent = pid_ns->child_reaper;
+				p->real_parent = pid_ns->child_reaper;
+			} while_each_thread(leader, p);
+			list_move_tail(&leader->sibling,
+					&leader->real_parent->children);
+			leader->exit_signal = SIGCHLD;
+			write_unlock_irq(&tasklist_lock);
+			error = 0;
+			break;
+		}
 		default:
 			error = -EINVAL;
 			break;

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command [1/3]
  2011-04-05 16:45                         ` Oleg Nesterov
                                             ` (2 preceding siblings ...)
  2011-04-11 11:15                           ` Stas Sergeev
@ 2011-04-19 14:44                           ` Stas Sergeev
  2011-04-19 14:50                           ` [path][rfc] add PR_DETACH prctl command [2/3] Stas Sergeev
  2011-04-19 14:54                           ` [path][rfc] add PR_DETACH prctl command [3/3] Stas Sergeev
  5 siblings, 0 replies; 57+ messages in thread
From: Stas Sergeev @ 2011-04-19 14:44 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Linux kernel

[-- Attachment #1: Type: text/plain, Size: 216 bytes --]

Reposting per Oleg's request.

The attached patch adds do_signal_parent() function.
It is the same as do_notify_parent(), but can be used
when the child is not terminating, but (for example)
detaches with PR_DETACH.

[-- Attachment #2: 01_sigpar.diff --]
[-- Type: text/plain, Size: 3645 bytes --]

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 777d8a5..e74882f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2096,6 +2096,7 @@ extern int kill_pgrp(struct pid *pid, int sig, int priv);
 extern int kill_pid(struct pid *pid, int sig, int priv);
 extern int kill_proc_info(int, struct siginfo *, pid_t);
 extern int do_notify_parent(struct task_struct *, int);
+extern int do_signal_parent(struct task_struct *, int, int, int);
 extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
 extern void force_sig(int, struct task_struct *);
 extern int send_sig(int, struct task_struct *, int);
diff --git a/kernel/signal.c b/kernel/signal.c
index 4e3cff1..54b93c7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1434,14 +1434,8 @@ ret:
 	return ret;
 }
 
-/*
- * Let a parent know about the death of a child.
- * For a stopped/continued status change, use do_notify_parent_cldstop instead.
- *
- * Returns -1 if our parent ignored us and so we've switched to
- * self-reaping, or else @sig.
- */
-int do_notify_parent(struct task_struct *tsk, int sig)
+int do_signal_parent(struct task_struct *tsk, int sig, int sicode,
+	int sistatus)
 {
 	struct siginfo info;
 	unsigned long flags;
@@ -1450,11 +1444,8 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 
 	BUG_ON(sig == -1);
 
- 	/* do_notify_parent_cldstop should have been called instead.  */
- 	BUG_ON(task_is_stopped_or_traced(tsk));
-
-	BUG_ON(!task_ptrace(tsk) &&
-	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
+	/* do_notify_parent_cldstop should have been called instead.  */
+	BUG_ON(task_is_stopped_or_traced(tsk));
 
 	info.si_signo = sig;
 	info.si_errno = 0;
@@ -1480,15 +1471,8 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 	info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
 				tsk->signal->stime));
 
-	info.si_status = tsk->exit_code & 0x7f;
-	if (tsk->exit_code & 0x80)
-		info.si_code = CLD_DUMPED;
-	else if (tsk->exit_code & 0x7f)
-		info.si_code = CLD_KILLED;
-	else {
-		info.si_code = CLD_EXITED;
-		info.si_status = tsk->exit_code >> 8;
-	}
+	info.si_code = sicode;
+	info.si_status = sistatus;
 
 	psig = tsk->parent->sighand;
 	spin_lock_irqsave(&psig->siglock, flags);
@@ -1510,9 +1494,11 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 		 * is implementation-defined: we do (if you don't want
 		 * it, just use SIG_IGN instead).
 		 */
-		ret = tsk->exit_signal = -1;
+		tsk->exit_signal = -1;
 		if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
 			sig = -1;
+		/* reap process now, rather than promoting to zombie */
+		ret = DEATH_REAP;
 	}
 	if (valid_signal(sig) && sig > 0)
 		__group_send_sig_info(sig, &info, tsk->parent);
@@ -1522,6 +1508,33 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 	return ret;
 }
 
+/*
+ * Let a parent know about the death of a child.
+ * For a stopped/continued status change, use do_notify_parent_cldstop instead.
+ *
+ * Returns -1 if our parent ignored us and so we've switched to
+ * self-reaping, or else @sig.
+ */
+int do_notify_parent(struct task_struct *tsk, int sig)
+{
+	int sicode, sistatus;
+
+	BUG_ON(!task_ptrace(tsk) &&
+	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
+
+	sistatus = tsk->exit_code & 0x7f;
+	if (tsk->exit_code & 0x80)
+		sicode = CLD_DUMPED;
+	else if (tsk->exit_code & 0x7f)
+		sicode = CLD_KILLED;
+	else {
+		sicode = CLD_EXITED;
+		sistatus = tsk->exit_code >> 8;
+	}
+
+	return do_signal_parent(tsk, sig, sicode, sistatus);
+}
+
 static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
 {
 	struct siginfo info;

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command [2/3]
  2011-04-05 16:45                         ` Oleg Nesterov
                                             ` (3 preceding siblings ...)
  2011-04-19 14:44                           ` [path][rfc] add PR_DETACH prctl command [1/3] Stas Sergeev
@ 2011-04-19 14:50                           ` Stas Sergeev
  2011-04-19 14:54                           ` [path][rfc] add PR_DETACH prctl command [3/3] Stas Sergeev
  5 siblings, 0 replies; 57+ messages in thread
From: Stas Sergeev @ 2011-04-19 14:50 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Linux kernel

[-- Attachment #1: Type: text/plain, Size: 297 bytes --]

The attached patch adds the can_wait_task() and
can_wait_task_ptrace() functions by splitting out the
checks from wait_consider_task().

		int ret = wait_consider_task(wo, 0, p);

gets then replaced by

		ret = can_wait_task(wo, p);
		if (!ret)
			continue;
		ret = wait_consider_task(wo, 0, p);


[-- Attachment #2: 02_cwaittsk.diff --]
[-- Type: text/plain, Size: 3108 bytes --]

diff --git a/kernel/exit.c b/kernel/exit.c
index f9a45eb..2aa64e8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1507,21 +1507,11 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
 	return retval;
 }
 
-/*
- * Consider @p for a wait by @parent.
- *
- * -ECHILD should be in ->notask_error before the first call.
- * Returns nonzero for a final return, when we have unlocked tasklist_lock.
- * Returns zero if the search for a child should continue;
- * then ->notask_error is 0 if @p is an eligible child,
- * or another error from security_task_wait(), or still -ECHILD.
- */
-static int wait_consider_task(struct wait_opts *wo, int ptrace,
-				struct task_struct *p)
+static int can_wait_task_common(struct wait_opts *wo, struct task_struct *p)
 {
 	int ret = eligible_child(wo, p);
 	if (!ret)
-		return ret;
+		return 0;
 
 	ret = security_task_wait(p);
 	if (unlikely(ret < 0)) {
@@ -1537,7 +1527,25 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 		return 0;
 	}
 
-	if (likely(!ptrace) && unlikely(task_ptrace(p))) {
+	if (p->exit_state == EXIT_DEAD)
+		return 0;
+
+	return 1;
+}
+
+static int can_wait_task_ptrace(struct wait_opts *wo, struct task_struct *p)
+{
+	/* don't worry, gcc will optimize away this function :) */
+	return can_wait_task_common(wo, p);
+}
+
+static int can_wait_task(struct wait_opts *wo, struct task_struct *p)
+{
+	int ret = can_wait_task_common(wo, p);
+	if (!ret)
+		return 0;
+
+	if (unlikely(task_ptrace(p))) {
 		/*
 		 * This child is hidden by ptrace.
 		 * We aren't allowed to see it now, but eventually we will.
@@ -1546,9 +1554,21 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 		return 0;
 	}
 
-	if (p->exit_state == EXIT_DEAD)
-		return 0;
+	return 1;
+}
 
+/*
+ * Consider @p for a wait by @parent.
+ *
+ * -ECHILD should be in ->notask_error before the first call.
+ * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns zero if the search for a child should continue;
+ * then ->notask_error is 0 if @p is an eligible child,
+ * or another error from security_task_wait(), or still -ECHILD.
+ */
+static int wait_consider_task(struct wait_opts *wo, int ptrace,
+				struct task_struct *p)
+{
 	/*
 	 * We don't reap group leaders with subthreads.
 	 */
@@ -1578,10 +1598,14 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
  */
 static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
 {
+	int ret;
 	struct task_struct *p;
 
 	list_for_each_entry(p, &tsk->children, sibling) {
-		int ret = wait_consider_task(wo, 0, p);
+		ret = can_wait_task(wo, p);
+		if (!ret)
+			continue;
+		ret = wait_consider_task(wo, 0, p);
 		if (ret)
 			return ret;
 	}
@@ -1594,7 +1618,10 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
 	struct task_struct *p;
 
 	list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
-		int ret = wait_consider_task(wo, 1, p);
+		int ret = can_wait_task_ptrace(wo, p);
+		if (!ret)
+			continue;
+		ret = wait_consider_task(wo, 1, p);
 		if (ret)
 			return ret;
 	}

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command [3/3]
  2011-04-05 16:45                         ` Oleg Nesterov
                                             ` (4 preceding siblings ...)
  2011-04-19 14:50                           ` [path][rfc] add PR_DETACH prctl command [2/3] Stas Sergeev
@ 2011-04-19 14:54                           ` Stas Sergeev
  2011-04-19 14:58                             ` Alan Cox
  5 siblings, 1 reply; 57+ messages in thread
From: Stas Sergeev @ 2011-04-19 14:54 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Linux kernel

[-- Attachment #1: Type: text/plain, Size: 330 bytes --]

The attached patch implements the PR_DETACH prctl
command. It detaches the entire process group from
its parent, allowing the parent to still read the detach
code with normal wait(). If the process then exits, the
notification of the new parent is delayed till the old parent
does either wait() to read the detach code, or exits.

[-- Attachment #2: pr_detach4.diff --]
[-- Type: text/plain, Size: 8023 bytes --]

commit c95ea73afce29a9c47bab29787d6eb014a8de9e6
Author: Stas <stas@stas.(none)>
Date:   Mon Apr 11 15:06:33 2011 +0400

    implement PR_DETACH

diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
index 942d30b..1da9c20 100644
--- a/include/asm-generic/siginfo.h
+++ b/include/asm-generic/siginfo.h
@@ -218,7 +218,8 @@ typedef struct siginfo {
 #define CLD_TRAPPED	(__SI_CHLD|4)	/* traced child has trapped */
 #define CLD_STOPPED	(__SI_CHLD|5)	/* child has stopped */
 #define CLD_CONTINUED	(__SI_CHLD|6)	/* stopped child has continued */
-#define NSIGCHLD	6
+#define CLD_DETACHED	(__SI_CHLD|7)	/* child has detached */
+#define NSIGCHLD	7
 
 /*
  * SIGPOLL si_codes
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index caa151f..fdf71a9 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -158,6 +158,8 @@ extern struct cred init_cred;
 	.parent		= &tsk,						\
 	.children	= LIST_HEAD_INIT(tsk.children),			\
 	.sibling	= LIST_HEAD_INIT(tsk.sibling),			\
+	.detached_children	= LIST_HEAD_INIT(tsk.detached_children),\
+	.detached_sibling	= LIST_HEAD_INIT(tsk.detached_sibling),	\
 	.group_leader	= &tsk,						\
 	RCU_INIT_POINTER(.real_cred, &init_cred),			\
 	RCU_INIT_POINTER(.cred, &init_cred),				\
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index a3baeb2..fbd2451 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -102,4 +102,6 @@
 
 #define PR_MCE_KILL_GET 34
 
+#define PR_DETACH 35
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e74882f..c8a1741 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1260,6 +1260,9 @@ struct task_struct {
 /* task state */
 	int exit_state;
 	int exit_code, exit_signal;
+	int detach_code;
+	int detaching;
+	int is_detaching:1;
 	int pdeath_signal;  /*  The signal sent when the parent dies  */
 	/* ??? */
 	unsigned int personality;
@@ -1292,6 +1295,8 @@ struct task_struct {
 	 */
 	struct list_head children;	/* list of my children */
 	struct list_head sibling;	/* linkage in my parent's children list */
+	struct list_head detached_children;	/* list of my detached children */
+	struct list_head detached_sibling;	/* linkage in my parent's detached children list */
 	struct task_struct *group_leader;	/* threadgroup leader */
 
 	/*
diff --git a/kernel/exit.c b/kernel/exit.c
index 2aa64e8..a2c5cfb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -69,6 +69,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 
 		list_del_rcu(&p->tasks);
 		list_del_init(&p->sibling);
+		list_del_init(&p->detached_sibling);
 		__this_cpu_dec(process_counts);
 	}
 	list_del_rcu(&p->thread_group);
@@ -804,6 +805,17 @@ static void forget_original_parent(struct task_struct *father)
 		} while_each_thread(p, t);
 		reparent_leader(father, p, &dead_children);
 	}
+	list_for_each_entry_safe(p, n, &father->detached_children,
+			detached_sibling) {
+		int signal;
+		p->detaching = 0;
+		p->is_detaching = 0;
+		list_del_init(&p->detached_sibling);
+		if (p->exit_state == EXIT_ZOMBIE) {
+			signal = do_notify_parent(p, SIGCHLD);
+			BUG_ON(signal == DEATH_REAP);
+		}
+	}
 	write_unlock_irq(&tasklist_lock);
 
 	BUG_ON(!list_empty(&father->children));
@@ -858,7 +870,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 		tsk->exit_signal = SIGCHLD;
 
 	signal = tracehook_notify_death(tsk, &cookie, group_dead);
-	if (signal >= 0)
+	/* delay parent notification for detaching tasks */
+	if (signal >= 0 && !tsk->is_detaching)
 		signal = do_notify_parent(tsk, signal);
 
 	tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
@@ -1507,6 +1520,54 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
 	return retval;
 }
 
+static int wait_task_detached(struct wait_opts *wo, struct task_struct *p)
+{
+	int dt, signal, retval = 0;
+	pid_t pid;
+	uid_t uid;
+
+	if (!likely(wo->wo_flags & WEXITED))
+		return 0;
+
+	if (unlikely(wo->wo_flags & WNOWAIT)) {
+		get_task_struct(p);
+		read_unlock(&tasklist_lock);
+		pid = task_pid_vnr(p);
+		uid = __task_cred(p)->uid;
+		return wait_noreap_copyout(wo, p, pid, uid, CLD_DETACHED,
+			p->detach_code >> 8);
+	}
+
+	dt = xchg(&p->detaching, 0);
+	if (dt != 1)
+		return 0;
+	get_task_struct(p);
+	read_unlock(&tasklist_lock);
+
+	write_lock_irq(&tasklist_lock);
+	list_del_init(&p->detached_sibling);
+	if (p->exit_state == EXIT_ZOMBIE) {
+		signal = do_notify_parent(p, SIGCHLD);
+		BUG_ON(signal == DEATH_REAP);
+	}
+	p->is_detaching = 0;
+	write_unlock_irq(&tasklist_lock);
+
+	if (wo->wo_stat)
+		retval = put_user(p->detach_code, wo->wo_stat);
+
+	if (!retval) {
+		pid = task_pid_vnr(p);
+		uid = __task_cred(p)->uid;
+		retval = wait_noreap_copyout(wo, p, pid, uid, CLD_DETACHED,
+			p->detach_code >> 8);
+	} else {
+		put_task_struct(p);
+	}
+
+	return retval;
+}
+
 static int can_wait_task_common(struct wait_opts *wo, struct task_struct *p)
 {
 	int ret = eligible_child(wo, p);
@@ -1572,7 +1633,8 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 	/*
 	 * We don't reap group leaders with subthreads.
 	 */
-	if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
+	if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p) &&
+			!p->is_detaching)
 		return wait_task_zombie(wo, p);
 
 	/*
@@ -1610,6 +1672,15 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
 			return ret;
 	}
 
+	list_for_each_entry(p, &tsk->detached_children, detached_sibling) {
+		ret = can_wait_task(wo, p);
+		if (!ret)
+			continue;
+		ret = wait_task_detached(wo, p);
+		if (ret)
+			return ret;
+	}
+
 	return 0;
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 25e4291..feadef7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1070,6 +1070,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	copy_flags(clone_flags, p);
 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
+	INIT_LIST_HEAD(&p->detached_children);
+	INIT_LIST_HEAD(&p->detached_sibling);
 	rcu_copy_process(p);
 	p->vfork_done = NULL;
 	spin_lock_init(&p->alloc_lock);
@@ -1233,6 +1235,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
 	p->pdeath_signal = 0;
 	p->exit_state = 0;
+	p->detaching = 0;
+	p->is_detaching = 0;
 
 	/*
 	 * Ok, make it visible to the rest of the system.
diff --git a/kernel/sys.c b/kernel/sys.c
index 18da702..1d88ee8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -28,6 +28,7 @@
 #include <linux/suspend.h>
 #include <linux/tty.h>
 #include <linux/signal.h>
+#include <linux/tracehook.h>
 #include <linux/cn_proc.h>
 #include <linux/getcpu.h>
 #include <linux/task_io_accounting_ops.h>
@@ -1736,6 +1737,41 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			else
 				error = PR_MCE_KILL_DEFAULT;
 			break;
+		case PR_DETACH: {
+			struct task_struct *p, *leader;
+			int notif;
+			struct pid_namespace *pid_ns = task_active_pid_ns(me);
+			error = -EPERM;
+			/* not detaching from init */
+			if (same_thread_group(me->real_parent,
+					pid_ns->child_reaper))
+				break;
+			if (arg2 & ~0x7f)
+				break;
+			write_lock_irq(&tasklist_lock);
+			leader = me->group_leader;
+			leader->detach_code = arg2 << 8;
+			notif = do_signal_parent(leader, leader->exit_signal,
+					CLD_DETACHED, arg2);
+			if (notif != DEATH_REAP) {
+				list_add_tail(&leader->detached_sibling,
+					&leader->real_parent->detached_children);
+				leader->detaching = 1;
+				leader->is_detaching = 1;
+			}
+			p = leader;
+			do {
+				if (!task_ptrace(p))
+					p->parent = pid_ns->child_reaper;
+				p->real_parent = pid_ns->child_reaper;
+			} while_each_thread(leader, p);
+			list_move_tail(&leader->sibling,
+					&leader->real_parent->children);
+			leader->exit_signal = SIGCHLD;
+			write_unlock_irq(&tasklist_lock);
+			error = 0;
+			break;
+		}
 		default:
 			error = -EINVAL;
 			break;

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command [3/3]
  2011-04-19 14:54                           ` [path][rfc] add PR_DETACH prctl command [3/3] Stas Sergeev
@ 2011-04-19 14:58                             ` Alan Cox
  2011-04-19 15:08                               ` Stas Sergeev
  0 siblings, 1 reply; 57+ messages in thread
From: Alan Cox @ 2011-04-19 14:58 UTC (permalink / raw)
  To: Stas Sergeev; +Cc: Oleg Nesterov, Linux kernel

On Tue, 19 Apr 2011 18:54:34 +0400
Stas Sergeev <stsp@aknet.ru> wrote:

> The attached patch implements the PR_DETACH prctl
> command. It detaches the entire process group from
> its parent, allowing the parent to still read the detach
> code with normal wait(). If the process then exits, the
> notification of the new parent is delayed till the old parent
> does either wait() to read the detach code, or exits.

What is the use case for this and why won't existing process group
interfaces do the job ?

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command [3/3]
  2011-04-19 14:58                             ` Alan Cox
@ 2011-04-19 15:08                               ` Stas Sergeev
  2011-04-19 15:54                                 ` Alan Cox
  0 siblings, 1 reply; 57+ messages in thread
From: Stas Sergeev @ 2011-04-19 15:08 UTC (permalink / raw)
  To: Alan Cox; +Cc: Oleg Nesterov, Linux kernel

19.04.2011 18:58, Alan Cox wrote:
>> The attached patch implements the PR_DETACH prctl
>> command. It detaches the entire process group from
>> its parent, allowing the parent to still read the detach
>> code with normal wait(). If the process then exits, the
>> notification of the new parent is delayed till the old parent
>> does either wait() to read the detach code, or exits.
> What is the use case for this and why won't existing process group
> interfaces do the job ?
The use case is to daemonize the process with threads.
You first need to detach it from its parent, then use TIOCNOTTY
ioctl to detach from the tty (TIOCNOTTY_GRP doesn't seem
to exist too, though, but might be easy to implement).
There are a few workarounds to that that I am aware of,
but what exactly interfaces do you have in mind? I have
found nothing that allows to do the same without a workarounds
like this:
https://lkml.org/lkml/2011/4/8/278
The current way of detaching, which is a fork/exit combo,
loses all threads, so, when you call daemon() and you had
threads, you are a toast.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command [3/3]
  2011-04-19 15:08                               ` Stas Sergeev
@ 2011-04-19 15:54                                 ` Alan Cox
  2011-04-19 16:13                                   ` Oleg Nesterov
                                                     ` (3 more replies)
  0 siblings, 4 replies; 57+ messages in thread
From: Alan Cox @ 2011-04-19 15:54 UTC (permalink / raw)
  To: Stas Sergeev; +Cc: Oleg Nesterov, Linux kernel

> The use case is to daemonize the process with threads.
> You first need to detach it from its parent, then use TIOCNOTTY
> ioctl to detach from the tty (TIOCNOTTY_GRP doesn't seem
> to exist too, though, but might be easy to implement).
> There are a few workarounds to that that I am aware of,
> but what exactly interfaces do you have in mind? I have
> found nothing that allows to do the same without a workarounds
> like this:
> https://lkml.org/lkml/2011/4/8/278
> The current way of detaching, which is a fork/exit combo,
> loses all threads, so, when you call daemon() and you had
> threads, you are a toast.

Yes - you need to detach then create the threads.

The reason I ask is you appear to add overhead to various hot paths and
you add 48 bytes to each task struct if I read the code right. Thats half
a megabyte on a server running a pile of java gunge !

Alan

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command [3/3]
  2011-04-19 15:54                                 ` Alan Cox
@ 2011-04-19 16:13                                   ` Oleg Nesterov
  2011-04-19 16:29                                     ` Oleg Nesterov
  2011-04-19 16:19                                   ` Stas Sergeev
                                                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 57+ messages in thread
From: Oleg Nesterov @ 2011-04-19 16:13 UTC (permalink / raw)
  To: Alan Cox; +Cc: Stas Sergeev, Linux kernel

On 04/19, Alan Cox wrote:
>
> > The use case is to daemonize the process with threads.
> > You first need to detach it from its parent, then use TIOCNOTTY
> > ioctl to detach from the tty (TIOCNOTTY_GRP doesn't seem
> > to exist too, though, but might be easy to implement).
> > There are a few workarounds to that that I am aware of,
> > but what exactly interfaces do you have in mind? I have
> > found nothing that allows to do the same without a workarounds
> > like this:
> > https://lkml.org/lkml/2011/4/8/278
> > The current way of detaching, which is a fork/exit combo,
> > loses all threads, so, when you call daemon() and you had
> > threads, you are a toast.
>
> Yes - you need to detach then create the threads.
>
> The reason I ask is you appear to add overhead to various hot paths

... and complicates this code.

> and
> you add 48 bytes to each task struct if I read the code right. Thats half
> a megabyte on a server running a pile of java gunge !

... to add the non-portable (and _imho_ unneeded) feauture.

IOW, personally I do not like this change in any case. But I am not
going to argue if someone acks this new feauture.




I'll try to check these patches from the correctness pov tomorrow,
but to be honest I hope someone will nack them before I start ;)

Oleg.


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command [3/3]
  2011-04-19 15:54                                 ` Alan Cox
  2011-04-19 16:13                                   ` Oleg Nesterov
@ 2011-04-19 16:19                                   ` Stas Sergeev
  2011-04-20 13:12                                   ` [path][rfc] add PR_DETACH prctl command [1/2] Stas Sergeev
  2011-04-20 13:14                                   ` [path][rfc] add PR_DETACH prctl command [2/2] Stas Sergeev
  3 siblings, 0 replies; 57+ messages in thread
From: Stas Sergeev @ 2011-04-19 16:19 UTC (permalink / raw)
  To: Alan Cox; +Cc: Oleg Nesterov, Linux kernel

19.04.2011 19:54, Alan Cox wrote:
>> The use case is to daemonize the process with threads.
>> You first need to detach it from its parent, then use TIOCNOTTY
>> ioctl to detach from the tty (TIOCNOTTY_GRP doesn't seem
>> to exist too, though, but might be easy to implement).
>> There are a few workarounds to that that I am aware of,
>> but what exactly interfaces do you have in mind? I have
>> found nothing that allows to do the same without a workarounds
>> like this:
>> https://lkml.org/lkml/2011/4/8/278
>> The current way of detaching, which is a fork/exit combo,
>> loses all threads, so, when you call daemon() and you had
>> threads, you are a toast.
> Yes - you need to detach then create the threads.
That's painful sometimes:
https://lkml.org/lkml/2011/4/8/266

> The reason I ask is you appear to add overhead to various hot paths and
> you add 48 bytes to each task struct if I read the code right. Thats half
> a megabyte on a server running a pile of java gunge !
I don't think there is a real code overhead: the new list is
always empty (unless you detach). But the memory overhead
is real. Can be shrunk though.
Well, there is always an option to leave that patch for my own
project if people dislike it too much.
But I'd like to know the margins, i.e. how much memory is acceptable?
Eg, if it only uses 8 extra bytes (one pointer), is it acceptable then,
or not in any case? What is the policy?

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command [3/3]
  2011-04-19 16:13                                   ` Oleg Nesterov
@ 2011-04-19 16:29                                     ` Oleg Nesterov
  2011-04-19 16:54                                       ` Stas Sergeev
  0 siblings, 1 reply; 57+ messages in thread
From: Oleg Nesterov @ 2011-04-19 16:29 UTC (permalink / raw)
  To: Alan Cox; +Cc: Stas Sergeev, Linux kernel

On 04/19, Oleg Nesterov wrote:
>
> I'll try to check these patches from the correctness pov tomorrow,
> but to be honest I hope someone will nack them before I start ;)

OK, I briefly looked at 3/3. It looks certainly wrong.

	notif = do_signal_parent(...);
	if (notif != DEATH_REAP) {
	....

do_signal_parent() must not return DEATH_REAP (this means that
leader->exit_signal becomes -1), but this can happen and this is bug.

I can be wrong, but iirc this was already discussed, probably when
you sent the very first version which had the same problem. That is
why (in particular) do_notify_parent() does

	BUG_ON(tsk->group_leader != tsk || !thread_group_empty(tsk))

but you simply removed this check to hide the problem.



Also. I didn't actually read the patch yet, but iiuc: if a task T does
PR_DETACH and then exits, init can't reap it until the old parent does
wait. This can confuse the poor admin, he can see the zombies with
ppid = 1 and there is no way to understand why.

Oleg.


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command [3/3]
  2011-04-19 16:29                                     ` Oleg Nesterov
@ 2011-04-19 16:54                                       ` Stas Sergeev
  2011-04-19 17:20                                         ` Oleg Nesterov
  0 siblings, 1 reply; 57+ messages in thread
From: Stas Sergeev @ 2011-04-19 16:54 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Linux kernel

19.04.2011 20:29, Oleg Nesterov wrote:
>> I'll try to check these patches from the correctness pov tomorrow,
>> but to be honest I hope someone will nack them before I start ;)
> OK, I briefly looked at 3/3. It looks certainly wrong.
>
> 	notif = do_signal_parent(...);
> 	if (notif != DEATH_REAP) {
> 	....
>
> do_signal_parent() must not return DEATH_REAP (this means that
> leader->exit_signal becomes -1), but this can happen and this is bug.
>
Could you please clarify this a bit: according to the comments
in signal.c:
---
                  * We are exiting and our parent doesn't care.  POSIX.1
                  * defines special semantics for setting SIGCHLD to SIG_IGN
                  * or setting the SA_NOCLDWAIT flag: we should be reaped
                  * automatically and not left for our parent's wait4 call.
---
That's how I understand it: if DEATH_REAP is returned, the
parent ignores SIGCHILD, and in this case I am not allowing
it to read the detach code with wait(). What is the bug?

> Also. I didn't actually read the patch yet, but iiuc: if a task T does
> PR_DETACH and then exits, init can't reap it until the old parent does
> wait. This can confuse the poor admin, he can see the zombies with
> ppid = 1 and there is no way to understand why.
Oh my. :)) I guess you are not going to suggest a solution to
that "problem", other than to rip the patch? :)

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command [3/3]
  2011-04-19 16:54                                       ` Stas Sergeev
@ 2011-04-19 17:20                                         ` Oleg Nesterov
  2011-04-19 17:41                                           ` Stas Sergeev
  0 siblings, 1 reply; 57+ messages in thread
From: Oleg Nesterov @ 2011-04-19 17:20 UTC (permalink / raw)
  To: Stas Sergeev; +Cc: Linux kernel

On 04/19, Stas Sergeev wrote:
>
> 19.04.2011 20:29, Oleg Nesterov wrote:
>>> I'll try to check these patches from the correctness pov tomorrow,
>>> but to be honest I hope someone will nack them before I start ;)
>> OK, I briefly looked at 3/3. It looks certainly wrong.
>>
>> 	notif = do_signal_parent(...);
>> 	if (notif != DEATH_REAP) {
>> 	....
>>
>> do_signal_parent() must not return DEATH_REAP (this means that
>> leader->exit_signal becomes -1), but this can happen and this is bug.
>>
> Could you please clarify this a bit: according to the comments
> in signal.c:
> ---
>                  * We are exiting and our parent doesn't care.  POSIX.1
>                  * defines special semantics for setting SIGCHLD to SIG_IGN
>                  * or setting the SA_NOCLDWAIT flag: we should be reaped
>                  * automatically and not left for our parent's wait4 call.
> ---
> That's how I understand it: if DEATH_REAP is returned, the
> parent ignores SIGCHILD, and in this case I am not allowing
> it to read the detach code with wait(). What is the bug?

Indeed. But, once again, that is why do_notify_parent() expects the dead
tsk! Please note that if it returns DEATH_REAP it sets ->exit_signal = -1.
And this is _only_ allowed if the leader is already dead and we are going
to reap it.

>> Also. I didn't actually read the patch yet, but iiuc: if a task T does
>> PR_DETACH and then exits, init can't reap it until the old parent does
>> wait. This can confuse the poor admin, he can see the zombies with
>> ppid = 1 and there is no way to understand why.
> Oh my. :)) I guess you are not going to suggest a solution to
> that "problem", other than to rip the patch? :)

Cough... well, the mentioned solution looks very nice to me ;)

Oleg.


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command [3/3]
  2011-04-19 17:20                                         ` Oleg Nesterov
@ 2011-04-19 17:41                                           ` Stas Sergeev
  2011-04-19 18:17                                             ` Oleg Nesterov
  0 siblings, 1 reply; 57+ messages in thread
From: Stas Sergeev @ 2011-04-19 17:41 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Linux kernel

19.04.2011 21:20, Oleg Nesterov wrote:

>>> do_signal_parent() must not return DEATH_REAP (this means that
>>> leader->exit_signal becomes -1), but this can happen and this is bug.
>>>
>> Could you please clarify this a bit: according to the comments
>> in signal.c:
>> ---
>>                   * We are exiting and our parent doesn't care.  POSIX.1
>>                   * defines special semantics for setting SIGCHLD to SIG_IGN
>>                   * or setting the SA_NOCLDWAIT flag: we should be reaped
>>                   * automatically and not left for our parent's wait4 call.
>> ---
>> That's how I understand it: if DEATH_REAP is returned, the
>> parent ignores SIGCHILD, and in this case I am not allowing
>> it to read the detach code with wait(). What is the bug?
> Indeed. But, once again, that is why do_notify_parent() expects the dead
> tsk! Please note that if it returns DEATH_REAP it sets ->exit_signal = -1.
> And this is _only_ allowed if the leader is already dead and we are going
> to reap it.
Ah, so, by saying "do_signal_parent() must not return DEATH_REAP (this means that
leader->exit_signal becomes -1)", you actually meant
"do_signal_parent(), when returning DEATH_REAP, must not
set ->exit_signal = -1, because only do_notify_parent()
can do that"? In other words, do_signal_parent() can return
DEATH_REAP after all?
If so - will fix, thanks.


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command [3/3]
  2011-04-19 17:41                                           ` Stas Sergeev
@ 2011-04-19 18:17                                             ` Oleg Nesterov
  0 siblings, 0 replies; 57+ messages in thread
From: Oleg Nesterov @ 2011-04-19 18:17 UTC (permalink / raw)
  To: Stas Sergeev; +Cc: Linux kernel

On 04/19, Stas Sergeev wrote:
>
> 19.04.2011 21:20, Oleg Nesterov wrote:
>
>>>> do_signal_parent() must not return DEATH_REAP (this means that
>>>> leader->exit_signal becomes -1), but this can happen and this is bug.
>>>>
>>> Could you please clarify this a bit: according to the comments
>>> in signal.c:
>>> ---
>>>                   * We are exiting and our parent doesn't care.  POSIX.1
>>>                   * defines special semantics for setting SIGCHLD to SIG_IGN
>>>                   * or setting the SA_NOCLDWAIT flag: we should be reaped
>>>                   * automatically and not left for our parent's wait4 call.
>>> ---
>>> That's how I understand it: if DEATH_REAP is returned, the
>>> parent ignores SIGCHILD, and in this case I am not allowing
>>> it to read the detach code with wait(). What is the bug?
>> Indeed. But, once again, that is why do_notify_parent() expects the dead
>> tsk! Please note that if it returns DEATH_REAP it sets ->exit_signal = -1.
>> And this is _only_ allowed if the leader is already dead and we are going
>> to reap it.
> Ah, so, by saying "do_signal_parent() must not return DEATH_REAP (this means that
> leader->exit_signal becomes -1)", you actually meant
> "do_signal_parent(), when returning DEATH_REAP, must not
> set ->exit_signal = -1,

Yes.

> because only do_notify_parent()
> can do that"?

because we can only do this if we are going to reap the task

> If so - will fix, thanks.

Stas, please do not trim CC. I am very glad Alan looked at this patch,
I hope he will participate. Better yet, add Linus too as I already asked ;)

Oleg.


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command [1/2]
  2011-04-19 15:54                                 ` Alan Cox
  2011-04-19 16:13                                   ` Oleg Nesterov
  2011-04-19 16:19                                   ` Stas Sergeev
@ 2011-04-20 13:12                                   ` Stas Sergeev
  2011-04-20 13:14                                   ` [path][rfc] add PR_DETACH prctl command [2/2] Stas Sergeev
  3 siblings, 0 replies; 57+ messages in thread
From: Stas Sergeev @ 2011-04-20 13:12 UTC (permalink / raw)
  To: Alan Cox; +Cc: Oleg Nesterov, Linux kernel

[-- Attachment #1: Type: text/plain, Size: 389 bytes --]

Hi Alan, Oleg.

I changed the patch the way you asked.
It now consumes much fewer per-thread bytes
(2 bytes + padding), and does much fewer suspicious
things on the hot path. And no zombies with ppid=1.

The attached patch adds do_signal_parent() function.
It is the same as do_notify_parent(), but can be used
when the child is not terminating, but (for example)
detaches with PR_DETACH.

[-- Attachment #2: 01_sigpar.diff --]
[-- Type: text/plain, Size: 3688 bytes --]

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 777d8a5..e74882f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2096,6 +2096,7 @@ extern int kill_pgrp(struct pid *pid, int sig, int priv);
 extern int kill_pid(struct pid *pid, int sig, int priv);
 extern int kill_proc_info(int, struct siginfo *, pid_t);
 extern int do_notify_parent(struct task_struct *, int);
+extern int do_signal_parent(struct task_struct *, int, int, int);
 extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
 extern void force_sig(int, struct task_struct *);
 extern int send_sig(int, struct task_struct *, int);
diff --git a/kernel/signal.c b/kernel/signal.c
index 4e3cff1..3e71e27 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1434,14 +1434,8 @@ ret:
 	return ret;
 }
 
-/*
- * Let a parent know about the death of a child.
- * For a stopped/continued status change, use do_notify_parent_cldstop instead.
- *
- * Returns -1 if our parent ignored us and so we've switched to
- * self-reaping, or else @sig.
- */
-int do_notify_parent(struct task_struct *tsk, int sig)
+int do_signal_parent(struct task_struct *tsk, int sig, int sicode,
+	int sistatus)
 {
 	struct siginfo info;
 	unsigned long flags;
@@ -1450,11 +1444,8 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 
 	BUG_ON(sig == -1);
 
- 	/* do_notify_parent_cldstop should have been called instead.  */
- 	BUG_ON(task_is_stopped_or_traced(tsk));
-
-	BUG_ON(!task_ptrace(tsk) &&
-	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
+	/* do_notify_parent_cldstop should have been called instead.  */
+	BUG_ON(task_is_stopped_or_traced(tsk));
 
 	info.si_signo = sig;
 	info.si_errno = 0;
@@ -1480,15 +1471,8 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 	info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
 				tsk->signal->stime));
 
-	info.si_status = tsk->exit_code & 0x7f;
-	if (tsk->exit_code & 0x80)
-		info.si_code = CLD_DUMPED;
-	else if (tsk->exit_code & 0x7f)
-		info.si_code = CLD_KILLED;
-	else {
-		info.si_code = CLD_EXITED;
-		info.si_status = tsk->exit_code >> 8;
-	}
+	info.si_code = sicode;
+	info.si_status = sistatus;
 
 	psig = tsk->parent->sighand;
 	spin_lock_irqsave(&psig->siglock, flags);
@@ -1510,9 +1494,10 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 		 * is implementation-defined: we do (if you don't want
 		 * it, just use SIG_IGN instead).
 		 */
-		ret = tsk->exit_signal = -1;
 		if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
 			sig = -1;
+		/* reap process now, rather than promoting to zombie */
+		ret = DEATH_REAP;
 	}
 	if (valid_signal(sig) && sig > 0)
 		__group_send_sig_info(sig, &info, tsk->parent);
@@ -1522,6 +1507,36 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 	return ret;
 }
 
+/*
+ * Let a parent know about the death of a child.
+ * For a stopped/continued status change, use do_notify_parent_cldstop instead.
+ *
+ * Returns -1 if our parent ignored us and so we've switched to
+ * self-reaping, or else @sig.
+ */
+int do_notify_parent(struct task_struct *tsk, int sig)
+{
+	int sicode, sistatus, ret;
+
+	BUG_ON(!task_ptrace(tsk) &&
+	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
+
+	sistatus = tsk->exit_code & 0x7f;
+	if (tsk->exit_code & 0x80)
+		sicode = CLD_DUMPED;
+	else if (tsk->exit_code & 0x7f)
+		sicode = CLD_KILLED;
+	else {
+		sicode = CLD_EXITED;
+		sistatus = tsk->exit_code >> 8;
+	}
+
+	ret = do_signal_parent(tsk, sig, sicode, sistatus);
+	if (ret == DEATH_REAP)
+		tsk->exit_signal = -1;
+	return ret;
+}
+
 static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
 {
 	struct siginfo info;

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command [2/2]
  2011-04-19 15:54                                 ` Alan Cox
                                                     ` (2 preceding siblings ...)
  2011-04-20 13:12                                   ` [path][rfc] add PR_DETACH prctl command [1/2] Stas Sergeev
@ 2011-04-20 13:14                                   ` Stas Sergeev
  2011-04-20 16:50                                     ` Oleg Nesterov
  3 siblings, 1 reply; 57+ messages in thread
From: Stas Sergeev @ 2011-04-20 13:14 UTC (permalink / raw)
  To: Alan Cox; +Cc: Oleg Nesterov, Linux kernel

[-- Attachment #1: Type: text/plain, Size: 230 bytes --]

The attached patch implements the PR_DETACH prctl
command. It detaches the entire process group from
its parent, allowing the parent to still read the detach
code with normal wait().
Can be used to daemonize process with threads.

[-- Attachment #2: pr_detach5.diff --]
[-- Type: text/plain, Size: 6177 bytes --]

commit fede2db736225c61f3b60031e1b967b622e532af
Author: Stas <stas@stas.(none)>
Date:   Wed Apr 20 17:04:50 2011 +0400

    implement PR_DETACH

diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
index 942d30b..1da9c20 100644
--- a/include/asm-generic/siginfo.h
+++ b/include/asm-generic/siginfo.h
@@ -218,7 +218,8 @@ typedef struct siginfo {
 #define CLD_TRAPPED	(__SI_CHLD|4)	/* traced child has trapped */
 #define CLD_STOPPED	(__SI_CHLD|5)	/* child has stopped */
 #define CLD_CONTINUED	(__SI_CHLD|6)	/* stopped child has continued */
-#define NSIGCHLD	6
+#define CLD_DETACHED	(__SI_CHLD|7)	/* child has detached */
+#define NSIGCHLD	7
 
 /*
  * SIGPOLL si_codes
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index a3baeb2..fbd2451 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -102,4 +102,6 @@
 
 #define PR_MCE_KILL_GET 34
 
+#define PR_DETACH 35
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e74882f..7515e62 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1260,6 +1260,9 @@ struct task_struct {
 /* task state */
 	int exit_state;
 	int exit_code, exit_signal;
+	uint8_t detach_code;
+	uint8_t detaching;
+	int pr_detached:1;
 	int pdeath_signal;  /*  The signal sent when the parent dies  */
 	/* ??? */
 	unsigned int personality;
diff --git a/kernel/exit.c b/kernel/exit.c
index f9a45eb..8dafa02 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -751,6 +751,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
 				struct list_head *dead)
 {
 	list_move_tail(&p->sibling, &p->real_parent->children);
+	p->pr_detached = 0;
 
 	if (task_detached(p))
 		return;
@@ -1309,8 +1310,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 
 	retval = wo->wo_rusage
 		? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
-	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
-		? p->signal->group_exit_code : p->exit_code;
+	if (!p->pr_detached)
+		status = (p->signal->flags & SIGNAL_GROUP_EXIT)
+			? p->signal->group_exit_code : p->exit_code;
+	else
+		status = p->detach_code << 8;
 	if (!retval && wo->wo_stat)
 		retval = put_user(status, wo->wo_stat);
 
@@ -1322,7 +1326,10 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 	if (!retval && infop) {
 		int why;
 
-		if ((status & 0x7f) == 0) {
+		if (p->pr_detached) {
+			why = CLD_DETACHED;
+			status >>= 8;
+		} else if ((status & 0x7f) == 0) {
 			why = CLD_EXITED;
 			status >>= 8;
 		} else {
@@ -1507,6 +1514,45 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
 	return retval;
 }
 
+static int wait_task_detached(struct wait_opts *wo, struct task_struct *p)
+{
+	int dt, retval = 0;
+	pid_t pid;
+	uid_t uid;
+
+	if (!likely(wo->wo_flags & WEXITED))
+		return 0;
+
+	if (unlikely(wo->wo_flags & WNOWAIT)) {
+		get_task_struct(p);
+		read_unlock(&tasklist_lock);
+		pid = task_pid_vnr(p);
+		uid = __task_cred(p)->uid;
+		return wait_noreap_copyout(wo, p, pid, uid, CLD_DETACHED,
+			p->detach_code);
+	}
+
+	dt = xchg(&p->detaching, 0);
+	if (!dt)
+		return 0;
+	get_task_struct(p);
+	read_unlock(&tasklist_lock);
+
+	if (wo->wo_stat)
+		retval = put_user(p->detach_code << 8, wo->wo_stat);
+
+	if (!retval) {
+		pid = task_pid_vnr(p);
+		uid = __task_cred(p)->uid;
+		retval = wait_noreap_copyout(wo, p, pid, uid, CLD_DETACHED,
+			p->detach_code);
+	} else {
+		put_task_struct(p);
+	}
+
+	return retval;
+}
+
 /*
  * Consider @p for a wait by @parent.
  *
@@ -1555,6 +1601,12 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 	if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
 		return wait_task_zombie(wo, p);
 
+	if (p->pr_detached) {
+		if (p->detaching)
+			return wait_task_detached(wo, p);
+		return 0;
+	}
+
 	/*
 	 * It's stopped or running now, so it might
 	 * later continue, exit, or stop again.
diff --git a/kernel/fork.c b/kernel/fork.c
index 25e4291..d3c3991 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1233,6 +1233,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
 	p->pdeath_signal = 0;
 	p->exit_state = 0;
+	p->pr_detached = 0;
 
 	/*
 	 * Ok, make it visible to the rest of the system.
diff --git a/kernel/signal.c b/kernel/signal.c
index 3e71e27..125a3c0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1518,6 +1518,15 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 {
 	int sicode, sistatus, ret;
 
+	if (tsk->pr_detached) {
+		/* if parent did not read detach code, promote to zombie */
+		if (tsk->detaching)
+			return sig;
+		/* otherwise terminate */
+		tsk->exit_signal = -1;
+		return DEATH_REAP;
+	}
+
 	BUG_ON(!task_ptrace(tsk) &&
 	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
 
diff --git a/kernel/sys.c b/kernel/sys.c
index 18da702..5b6a5ea 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -28,6 +28,7 @@
 #include <linux/suspend.h>
 #include <linux/tty.h>
 #include <linux/signal.h>
+#include <linux/tracehook.h>
 #include <linux/cn_proc.h>
 #include <linux/getcpu.h>
 #include <linux/task_io_accounting_ops.h>
@@ -1736,6 +1737,35 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			else
 				error = PR_MCE_KILL_DEFAULT;
 			break;
+		case PR_DETACH: {
+			int notif;
+			struct task_struct *leader;
+			struct pid_namespace *pid_ns;
+			error = -EPERM;
+			if (arg2 & ~0x7f)
+				break;
+			write_lock_irq(&tasklist_lock);
+			pid_ns = task_active_pid_ns(me);
+			leader = me->group_leader;
+			if (leader->pr_detached)
+				goto unlock;
+			/* not detaching from init */
+			if (same_thread_group(leader->real_parent,
+					pid_ns->child_reaper))
+				goto unlock;
+			leader->detach_code = arg2;
+			notif = do_signal_parent(leader, leader->exit_signal,
+					CLD_DETACHED, arg2);
+			if (notif != DEATH_REAP)
+				leader->detaching = 1;
+			else
+				leader->detaching = 0;
+			leader->pr_detached = 1;
+			error = 0;
+unlock:
+			write_unlock_irq(&tasklist_lock);
+			break;
+		}
 		default:
 			error = -EINVAL;
 			break;

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command [2/2]
  2011-04-20 13:14                                   ` [path][rfc] add PR_DETACH prctl command [2/2] Stas Sergeev
@ 2011-04-20 16:50                                     ` Oleg Nesterov
  2011-04-20 18:45                                       ` Stas Sergeev
  2011-04-21 10:02                                       ` Stas Sergeev
  0 siblings, 2 replies; 57+ messages in thread
From: Oleg Nesterov @ 2011-04-20 16:50 UTC (permalink / raw)
  To: Stas Sergeev; +Cc: Alan Cox, Linux kernel

On 04/20, Stas Sergeev wrote:
>
> The attached patch implements the PR_DETACH prctl
> command. It detaches the entire process group from
> its parent, allowing the parent to still read the detach
> code with normal wait().
> Can be used to daemonize process with threads.

Now I do not understand this patch at all.

Stas, I already asked you, please describe the semantics of PR_DETACH.
Looking at the code, I can only see it was changed again, but I have
no idea what is the supposed behaviour.

At first glance, this version does not reparent the caller to init,
but still PR_DETACH checks ->real_parent != /sbin/init for unknown
reason...

IIUC, PR_DETACH simply fools ->real_parent so that it think this child
exits, while the child in fact runs after that but do_wait() can't see it.

Why? I don't understand the point.

And. To hide the pr_detached task from do_wait(). you changed
do_notify_parent() to returnd DEATH_REAP. I guess you did this to ensure
exit_notify() paths will set EXIT_DEAD and thus do_wait() can't see this
child again. This looks very ugly I must admit. And not 100% correct, we
notify the parent twice if ->detaching != 0. Also, because of this logic,
once wait_task_detached() drops tasklist it can race with exit_notify()
doing release_task(). Yes, task_struct can't go away, but we can't trust
task_pid_vnr(p). Hmm, the change in reparent_leader doesn't look right,
at least in case we reparent to sub-thread.

And of course, this breaks ptrace _completely_. I didn't read the patch
further, perhaps there are more problems.



Stas, I am sorry, I am tired ;) You are sending more and more versions
with the different semantics and they all are buggy. Please add Linus
and ask him if he is going to accept the new feature, and please describe
exactly what you are trying to achieve. Until then I do not want to spend
my time trying to understand yet another implementation of the hack which
I personally dislike.

Oleg.


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command [2/2]
  2011-04-20 16:50                                     ` Oleg Nesterov
@ 2011-04-20 18:45                                       ` Stas Sergeev
  2011-04-20 19:33                                         ` Oleg Nesterov
  2011-04-21 10:02                                       ` Stas Sergeev
  1 sibling, 1 reply; 57+ messages in thread
From: Stas Sergeev @ 2011-04-20 18:45 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Alan Cox, Linux kernel

20.04.2011 20:50, Oleg Nesterov wrote:
>> The attached patch implements the PR_DETACH prctl
>> command. It detaches the entire process group from
>> its parent, allowing the parent to still read the detach
>> code with normal wait().
>> Can be used to daemonize process with threads.
> At first glance, this version does not reparent the caller to init,
> but still PR_DETACH checks ->real_parent != /sbin/init for unknown
> reason...
The reason is that the task could have been reparented
to ini by some other means (parent died) - in this case no
detaching.

> IIUC, PR_DETACH simply fools ->real_parent so that it think this child
> exits, while the child in fact runs after that but do_wait() can't see it.
It can check si_code to see what actually happened.

> Why? I don't understand the point.
It was the same also when reparenting was done: the
original parent was only able to read the detach code,
and nothing more. _Nothing_ was changed, the semantic
is completely the same! The implementation changed only.


> And. To hide the pr_detached task from do_wait(). you changed
> do_notify_parent() to returnd DEATH_REAP.
No, its hidden by the check in wait_consider_task().
do_notify_parent() was changed only to not allow the
second notification to the same parent. Again, this is
to completely preserve the old semantic, where's the
original parent was always notified only once.
So, the original parent gets only one notification on
PR_DETACH, and on exit() - it does not. Nothing was
changed.

>   I guess you did this to ensure
> exit_notify() paths will set EXIT_DEAD and thus do_wait() can't see this
> child again. This looks very ugly I must admit. And not 100% correct, we
> notify the parent twice if ->detaching != 0.
No: the do_dignal_parent() is not called in that case too.

> task_pid_vnr(p). Hmm, the change in reparent_leader doesn't look right,
> at least in case we reparent to sub-thread.
Why not? Whereever we reparented, I allow reparenting again.
But, more importantly, I unhide that task in wait() and allow the
notifications again, so without this change it can't work.

> And of course, this breaks ptrace _completely_.
Why? What exactly breaks?

> Stas, I am sorry, I am tired ;) You are sending more and more versions
> with the different semantics and they all are buggy.
I am not that sure the last ones are very buggy.
Of course some bugs are possible, but at least that semantic
was a kind of "agreed on", and didn't change at all.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command [2/2]
  2011-04-20 18:45                                       ` Stas Sergeev
@ 2011-04-20 19:33                                         ` Oleg Nesterov
  2011-04-20 20:35                                           ` Stas Sergeev
  0 siblings, 1 reply; 57+ messages in thread
From: Oleg Nesterov @ 2011-04-20 19:33 UTC (permalink / raw)
  To: Stas Sergeev; +Cc: Alan Cox, Linux kernel

On 04/20, Stas Sergeev wrote:
>
> 20.04.2011 20:50, Oleg Nesterov wrote:
>>> The attached patch implements the PR_DETACH prctl
>>> command. It detaches the entire process group from
>>> its parent, allowing the parent to still read the detach
>>> code with normal wait().
>>> Can be used to daemonize process with threads.
>> At first glance, this version does not reparent the caller to init,
>> but still PR_DETACH checks ->real_parent != /sbin/init for unknown
>> reason...
> The reason is that the task could have been reparented
> to ini by some other means (parent died)

This is clear,

- in this case no
> detaching.

Why? Now that we do not reparent PR_DETACH could works in this case too.
Nevermind, this is minor and probably makes sense, forget.

I still do not understand the point of PR_DETACH. Why do you think it is
needed without reparenting? OK, I do not really care ;)

>> IIUC, PR_DETACH simply fools ->real_parent so that it think this child
>> exits, while the child in fact runs after that but do_wait() can't see it.
> It can check si_code to see what actually happened.
>
>> Why? I don't understand the point.
> It was the same also when reparenting was done: the
> original parent was only able to read the detach code,
> and nothing more. _Nothing_ was changed, the semantic
> is completely the same!

The same? what about, say, ppid?

>> And. To hide the pr_detached task from do_wait(). you changed
>> do_notify_parent() to returnd DEATH_REAP.
> No, its hidden by the check in wait_consider_task().
> do_notify_parent() was changed only to not allow the
> second notification to the same parent.

Not only. Please look at your own code ;) wait_consider_task() checks
exit_state == EXIT_ZOMBIE before p->pr_detached, and thus do_notify_parent()
haas to return DEATH_REAP so that the caller will set EXIT_DEAD. Otherwise
the old parent could see EXIT_ZOMBIE && pr_detached task again.

> Again, this is
> to completely preserve the old semantic, where's the
> original parent was always notified only once.

Of course, it should be notified only once. but afaics it can be notified
twice.

>> This looks very ugly I must admit. And not 100% correct, we
>> notify the parent twice if ->detaching != 0.
>
> No: the do_dignal_parent() is not called in that case too.

OK, I misread this code. In fact I missed something else, can't recall
what I meant. Probably I was wrong anyway.

But. What if the child stops after PR_DETACH? It will notify the parent
anyway, no? And this can even happen after wait(WEXITED) succeeded.

>> task_pid_vnr(p). Hmm, the change in reparent_leader doesn't look right,
>> at least in case we reparent to sub-thread.
> Why not? Whereever we reparented, I allow reparenting again.

Even if the child reparents to the original parent's sub-thread?

> But, more importantly, I unhide that task in wait() and allow the
> notifications again, so without this change it can't work.

This is clear but see above.

>> And of course, this breaks ptrace _completely_.
> Why? What exactly breaks?

Everything. ptrace relies on do_wait(). wait_consider_task() doesn't
work if pr_detached (except for WEXITED of course).

>> Stas, I am sorry, I am tired ;) You are sending more and more versions
>> with the different semantics and they all are buggy.
> I am not that sure the last ones are very buggy.

Well, I am not sure.

> but at least that semantic
> was a kind of "agreed on", and didn't change at all.

Cough. So far nobody except you agrees with this semantics ;)

Oleg.


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command [2/2]
  2011-04-20 19:33                                         ` Oleg Nesterov
@ 2011-04-20 20:35                                           ` Stas Sergeev
  2011-04-21 20:00                                             ` Oleg Nesterov
  0 siblings, 1 reply; 57+ messages in thread
From: Stas Sergeev @ 2011-04-20 20:35 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Alan Cox, Linux kernel

20.04.2011 23:33, Oleg Nesterov wrote:

> I still do not understand the point of PR_DETACH. Why do you think it is
> needed without reparenting? OK, I do not really care ;)
Hmm, but that's really interesting, I wonder why do
you ask this. In my eyes, the reparenting was just a
"trick", which is now avoided for good. And I saved
a lot of per-thread memory by throwing that, and
shrunk the patch twice. Why do you think no-reparenting
changed something real? Could this change any use case?

>>> And. To hide the pr_detached task from do_wait(). you changed
>>> do_notify_parent() to returnd DEATH_REAP.
>> No, its hidden by the check in wait_consider_task().
>> do_notify_parent() was changed only to not allow the
>> second notification to the same parent.
> Not only. Please look at your own code ;) wait_consider_task() checks
> exit_state == EXIT_ZOMBIE before p->pr_detached, and thus do_notify_parent()
> haas to return DEATH_REAP so that the caller will set EXIT_DEAD. Otherwise
> the old parent could see EXIT_ZOMBIE&&  pr_detached task again.
Yes, but that's not to hide from do_wait().
At least as far as I understand, exit_notify() does
release_task() in this case, so that's not hiding: I
literally terminate the child this way.
Or am I missing something?

> But. What if the child stops after PR_DETACH? It will notify the parent
> anyway, no?
Ah, so that should be disallowed too, will fix.

>>> task_pid_vnr(p). Hmm, the change in reparent_leader doesn't look right,
>>> at least in case we reparent to sub-thread.
>> Why not? Whereever we reparented, I allow reparenting again.
> Even if the child reparents to the original parent's sub-thread?
OK. :)

> Everything. ptrace relies on do_wait(). wait_consider_task() doesn't
> work if pr_detached (except for WEXITED of course).
OK.

>> I am not that sure the last ones are very buggy.
> Well, I am not sure.
OK, at least the above 2 bugs are trivial to fix... :))
Thanks!


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command [2/2]
  2011-04-20 16:50                                     ` Oleg Nesterov
  2011-04-20 18:45                                       ` Stas Sergeev
@ 2011-04-21 10:02                                       ` Stas Sergeev
  2011-04-21 20:15                                         ` Oleg Nesterov
  1 sibling, 1 reply; 57+ messages in thread
From: Stas Sergeev @ 2011-04-21 10:02 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Alan Cox, Linux kernel

[-- Attachment #1: Type: text/plain, Size: 664 bytes --]

Hi Oleg.

Attaching the fixed patch.
Changes:
- in wait_task_consider(), pr_detach hides the process
only in !ptrace case
- reparenting within the group is no longer re-enables detaching
- in do_notify_parent_cldstop(), do not allow notifications of
real_parent (still allow for ptrace parent)
- scared by your question about reparenting, I reserved the
flags argument for the future extensions. :) Reparenting may
depend on a flag.

The attached patch implements the PR_DETACH prctl
command. It detaches the entire process group from
its parent, allowing the parent to still read the detach
code with normal wait().
Can be used to daemonize process with threads.

[-- Attachment #2: pr_detach6.diff --]
[-- Type: text/plain, Size: 7396 bytes --]

commit 0654195220b22090ed8f0648ce7d176374932dfa
Author: Stas <stas@stas.(none)>
Date:   Thu Apr 21 13:48:48 2011 +0400

    implement PR_DETACH

diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
index 942d30b..1da9c20 100644
--- a/include/asm-generic/siginfo.h
+++ b/include/asm-generic/siginfo.h
@@ -218,7 +218,8 @@ typedef struct siginfo {
 #define CLD_TRAPPED	(__SI_CHLD|4)	/* traced child has trapped */
 #define CLD_STOPPED	(__SI_CHLD|5)	/* child has stopped */
 #define CLD_CONTINUED	(__SI_CHLD|6)	/* stopped child has continued */
-#define NSIGCHLD	6
+#define CLD_DETACHED	(__SI_CHLD|7)	/* child has detached */
+#define NSIGCHLD	7
 
 /*
  * SIGPOLL si_codes
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index a3baeb2..fbd2451 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -102,4 +102,6 @@
 
 #define PR_MCE_KILL_GET 34
 
+#define PR_DETACH 35
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e74882f..7515e62 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1260,6 +1260,9 @@ struct task_struct {
 /* task state */
 	int exit_state;
 	int exit_code, exit_signal;
+	uint8_t detach_code;
+	uint8_t detaching;
+	int pr_detached:1;
 	int pdeath_signal;  /*  The signal sent when the parent dies  */
 	/* ??? */
 	unsigned int personality;
diff --git a/kernel/exit.c b/kernel/exit.c
index f9a45eb..d1de141 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -761,6 +761,9 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
 	if (same_thread_group(p->real_parent, father))
 		return;
 
+	/* detached process re-attaches to new parent */
+	p->pr_detached = 0;
+
 	/* We don't want people slaying init.  */
 	p->exit_signal = SIGCHLD;
 
@@ -1309,8 +1312,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 
 	retval = wo->wo_rusage
 		? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
-	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
-		? p->signal->group_exit_code : p->exit_code;
+	if (!p->pr_detached)
+		status = (p->signal->flags & SIGNAL_GROUP_EXIT)
+			? p->signal->group_exit_code : p->exit_code;
+	else
+		status = p->detach_code << 8;
 	if (!retval && wo->wo_stat)
 		retval = put_user(status, wo->wo_stat);
 
@@ -1322,7 +1328,10 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 	if (!retval && infop) {
 		int why;
 
-		if ((status & 0x7f) == 0) {
+		if (p->pr_detached) {
+			why = CLD_DETACHED;
+			status >>= 8;
+		} else if ((status & 0x7f) == 0) {
 			why = CLD_EXITED;
 			status >>= 8;
 		} else {
@@ -1472,9 +1481,6 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
 	if (!unlikely(wo->wo_flags & WCONTINUED))
 		return 0;
 
-	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
-		return 0;
-
 	spin_lock_irq(&p->sighand->siglock);
 	/* Re-check with the lock held.  */
 	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
@@ -1507,6 +1513,45 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
 	return retval;
 }
 
+static int wait_task_detached(struct wait_opts *wo, struct task_struct *p)
+{
+	int dt, retval = 0;
+	pid_t pid;
+	uid_t uid;
+
+	if (!likely(wo->wo_flags & WEXITED))
+		return 0;
+
+	if (unlikely(wo->wo_flags & WNOWAIT)) {
+		get_task_struct(p);
+		read_unlock(&tasklist_lock);
+		pid = task_pid_vnr(p);
+		uid = __task_cred(p)->uid;
+		return wait_noreap_copyout(wo, p, pid, uid, CLD_DETACHED,
+			p->detach_code);
+	}
+
+	dt = xchg(&p->detaching, 0);
+	if (!dt)
+		return 0;
+	get_task_struct(p);
+	read_unlock(&tasklist_lock);
+
+	if (wo->wo_stat)
+		retval = put_user(p->detach_code << 8, wo->wo_stat);
+
+	if (!retval) {
+		pid = task_pid_vnr(p);
+		uid = __task_cred(p)->uid;
+		retval = wait_noreap_copyout(wo, p, pid, uid, CLD_DETACHED,
+			p->detach_code);
+	} else {
+		put_task_struct(p);
+	}
+
+	return retval;
+}
+
 /*
  * Consider @p for a wait by @parent.
  *
@@ -1555,6 +1600,14 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 	if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
 		return wait_task_zombie(wo, p);
 
+	if (unlikely(p->pr_detached)) {
+		if (p->detaching)
+			return wait_task_detached(wo, p);
+		/* pr_detached tasks are hidden from parent */
+		if (!ptrace)
+			return 0;
+	}
+
 	/*
 	 * It's stopped or running now, so it might
 	 * later continue, exit, or stop again.
@@ -1564,7 +1617,10 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 	if (task_stopped_code(p, ptrace))
 		return wait_task_stopped(wo, ptrace, p);
 
-	return wait_task_continued(wo, p);
+	if (p->signal->flags & SIGNAL_STOP_CONTINUED)
+		return wait_task_continued(wo, p);
+
+	return 0;
 }
 
 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index 25e4291..d3c3991 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1233,6 +1233,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
 	p->pdeath_signal = 0;
 	p->exit_state = 0;
+	p->pr_detached = 0;
 
 	/*
 	 * Ok, make it visible to the rest of the system.
diff --git a/kernel/signal.c b/kernel/signal.c
index 3e71e27..5c64bd9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1518,6 +1518,15 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 {
 	int sicode, sistatus, ret;
 
+	if (tsk->pr_detached) {
+		/* if parent did not read detach code, promote to zombie */
+		if (tsk->detaching)
+			return sig;
+		/* otherwise terminate */
+		tsk->exit_signal = -1;
+		return DEATH_REAP;
+	}
+
 	BUG_ON(!task_ptrace(tsk) &&
 	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
 
@@ -1547,6 +1556,9 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
 	if (task_ptrace(tsk))
 		parent = tsk->parent;
 	else {
+		/* pr_detached task is hidden from real_parent */
+		if (tsk->pr_detached)
+			return;
 		tsk = tsk->group_leader;
 		parent = tsk->real_parent;
 	}
diff --git a/kernel/sys.c b/kernel/sys.c
index 18da702..acf2f69 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -28,6 +28,7 @@
 #include <linux/suspend.h>
 #include <linux/tty.h>
 #include <linux/signal.h>
+#include <linux/tracehook.h>
 #include <linux/cn_proc.h>
 #include <linux/getcpu.h>
 #include <linux/task_io_accounting_ops.h>
@@ -1736,6 +1737,39 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			else
 				error = PR_MCE_KILL_DEFAULT;
 			break;
+		case PR_DETACH: {
+			int notif;
+			struct task_struct *leader;
+			struct pid_namespace *pid_ns;
+			unsigned long flags = arg3;
+			error = -EPERM;
+			if (arg2 & ~0x7f)
+				break;
+			/* reserve flags for the future extensions */
+			if (flags)
+				break;
+			write_lock_irq(&tasklist_lock);
+			pid_ns = task_active_pid_ns(me);
+			leader = me->group_leader;
+			if (leader->pr_detached)
+				goto unlock;
+			/* not detaching from init */
+			if (same_thread_group(leader->real_parent,
+					pid_ns->child_reaper))
+				goto unlock;
+			leader->detach_code = arg2;
+			notif = do_signal_parent(leader, leader->exit_signal,
+					CLD_DETACHED, arg2);
+			if (notif != DEATH_REAP)
+				leader->detaching = 1;
+			else
+				leader->detaching = 0;
+			leader->pr_detached = 1;
+			error = 0;
+unlock:
+			write_unlock_irq(&tasklist_lock);
+			break;
+		}
 		default:
 			error = -EINVAL;
 			break;

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command [2/2]
  2011-04-20 20:35                                           ` Stas Sergeev
@ 2011-04-21 20:00                                             ` Oleg Nesterov
  2011-04-21 20:11                                               ` Stas Sergeev
  0 siblings, 1 reply; 57+ messages in thread
From: Oleg Nesterov @ 2011-04-21 20:00 UTC (permalink / raw)
  To: Stas Sergeev; +Cc: Alan Cox, Linux kernel

On 04/21, Stas Sergeev wrote:
> 20.04.2011 23:33, Oleg Nesterov wrote:
>
>> I still do not understand the point of PR_DETACH. Why do you think it is
>> needed without reparenting? OK, I do not really care ;)
> Hmm, but that's really interesting, I wonder why do
> you ask this.

Because I do not understand why do we need this feature. And I strongly
dislike the complications this (wrong) patch adds.

> In my eyes, the reparenting was just a
> "trick",

Sure, I din't like the previous semantics too.


Once again, last time... No need to convince me. Please convince
someone who can ack this hack authoritatively. I can't anyway. And
I'm afraid nobody except Linus can decide whether we need it or not.

>From my side - nack. What can I do if I do not agree with you?

>>>> And. To hide the pr_detached task from do_wait(). you changed
>>>> do_notify_parent() to returnd DEATH_REAP.
>>> No, its hidden by the check in wait_consider_task().
>>> do_notify_parent() was changed only to not allow the
>>> second notification to the same parent.
>> Not only. Please look at your own code ;) wait_consider_task() checks
>> exit_state == EXIT_ZOMBIE before p->pr_detached, and thus do_notify_parent()
>> haas to return DEATH_REAP so that the caller will set EXIT_DEAD. Otherwise
>> the old parent could see EXIT_ZOMBIE&&  pr_detached task again.
> Yes, but that's not to hide from do_wait().
> At least as far as I understand, exit_notify() does
> release_task() in this case, so that's not hiding: I
> literally terminate the child this way.

Yes. But there is a window before release_task() does this, we should
change task->state.

But I forgot where did we start... perhaps I missed something or meant
something else. Perhaps I disliked the fact wait_consider_task() checks
EXIT_ZOMBIE before pr_detached... Nevermind.

Oleg.


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command [2/2]
  2011-04-21 20:00                                             ` Oleg Nesterov
@ 2011-04-21 20:11                                               ` Stas Sergeev
  0 siblings, 0 replies; 57+ messages in thread
From: Stas Sergeev @ 2011-04-21 20:11 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Linux kernel

22.04.2011 00:00, Oleg Nesterov wrote:
> Once again, last time... No need to convince me.
Please, I did _never_ tried to convince you in anything.
The question was only why you asked "Why do you think it is
needed without reparenting?", as if without reparenting
it does something different. That was a question about
the semantic only.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command [2/2]
  2011-04-21 10:02                                       ` Stas Sergeev
@ 2011-04-21 20:15                                         ` Oleg Nesterov
  2011-04-21 20:32                                           ` Stas Sergeev
  0 siblings, 1 reply; 57+ messages in thread
From: Oleg Nesterov @ 2011-04-21 20:15 UTC (permalink / raw)
  To: Stas Sergeev; +Cc: Alan Cox, Linux kernel

On 04/21, Stas Sergeev wrote:
>
> Attaching the fixed patch.
> Changes:
> - in wait_task_consider(), pr_detach hides the process
> only in !ptrace case
> ...
> @@ -1555,6 +1600,14 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
>  	if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
>  		return wait_task_zombie(wo, p);
>  
> +	if (unlikely(p->pr_detached)) {
> +		if (p->detaching)
> +			return wait_task_detached(wo, p);
> +		/* pr_detached tasks are hidden from parent */
> +		if (!ptrace)
> +			return 0;
> +	}

Hmm... I guess this should fix ptrace? How? I dont understand this at all.

Stas, I bet you didn't test your patch. If the old parent traces the child
WEXITED should not succeed until the child really exits. Otherwise it is
easy to escape ptrace.

Oh. And I bet there are other problems. Say, exec can change the leader...
Easy to fix, but note that we need more and more stupid pr_detached special
cases.

Stas, sorry. I am not going to looks at the next versions. Until you
convince lkml we need this feature.

Oleg.


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [path][rfc] add PR_DETACH prctl command [2/2]
  2011-04-21 20:15                                         ` Oleg Nesterov
@ 2011-04-21 20:32                                           ` Stas Sergeev
  0 siblings, 0 replies; 57+ messages in thread
From: Stas Sergeev @ 2011-04-21 20:32 UTC (permalink / raw)
  To: Oleg Nesterov; +Cc: Alan Cox, Linux kernel

22.04.2011 00:15, Oleg Nesterov wrote:
> Stas, I bet you didn't test your patch.
Not with ptrace, yes. :(

>   If the old parent traces the child
What is "old parent" without reparenting?

> Oh. And I bet there are other problems. Say, exec can change the leader...
> Easy to fix, but note that we need more and more stupid pr_detached special
> cases.
>
> Stas, sorry. I am not going to looks at the next versions. Until you
> convince lkml we need this feature.
OK, if you think there are really to much cases
that I missed, then this patch might be really more
intrusive than I suspected.
I got tired of it too, so fare well. :)
Thanks for your time.

^ permalink raw reply	[flat|nested] 57+ messages in thread

end of thread, other threads:[~2011-04-21 20:33 UTC | newest]

Thread overview: 57+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-02-23 13:50 [path][rfc] add PR_DETACH prctl command Stas Sergeev
2011-02-23 19:14 ` Oleg Nesterov
2011-02-23 20:35   ` Stas Sergeev
2011-02-24 13:29     ` Oleg Nesterov
2011-02-24 15:13       ` Stas Sergeev
2011-02-24 15:32         ` Oleg Nesterov
2011-03-31 16:10           ` Stas Sergeev
2011-03-31 17:02             ` Oleg Nesterov
2011-03-31 17:47               ` Stas Sergeev
2011-03-31 18:18                 ` Oleg Nesterov
2011-03-31 20:58                   ` Stas Sergeev
2011-04-02 13:55                     ` Oleg Nesterov
2011-04-02 18:20                       ` Stas Sergeev
2011-04-02 22:00                       ` Stas Sergeev
2011-04-01 17:02               ` Stas Sergeev
2011-04-02 14:06                 ` Oleg Nesterov
2011-04-04 14:34               ` Stas Sergeev
2011-04-04 16:03                 ` Oleg Nesterov
2011-04-04 20:05                   ` Stas Sergeev
2011-04-05 15:15                     ` Oleg Nesterov
2011-04-05 16:25                       ` Stas Sergeev
2011-04-05 16:45                         ` Oleg Nesterov
2011-04-05 17:51                           ` Stas Sergeev
2011-04-08 10:51                           ` Stas Sergeev
2011-04-08 18:55                             ` Oleg Nesterov
2011-04-08 20:16                               ` Stas Sergeev
2011-04-11 11:15                           ` Stas Sergeev
2011-04-19 14:44                           ` [path][rfc] add PR_DETACH prctl command [1/3] Stas Sergeev
2011-04-19 14:50                           ` [path][rfc] add PR_DETACH prctl command [2/3] Stas Sergeev
2011-04-19 14:54                           ` [path][rfc] add PR_DETACH prctl command [3/3] Stas Sergeev
2011-04-19 14:58                             ` Alan Cox
2011-04-19 15:08                               ` Stas Sergeev
2011-04-19 15:54                                 ` Alan Cox
2011-04-19 16:13                                   ` Oleg Nesterov
2011-04-19 16:29                                     ` Oleg Nesterov
2011-04-19 16:54                                       ` Stas Sergeev
2011-04-19 17:20                                         ` Oleg Nesterov
2011-04-19 17:41                                           ` Stas Sergeev
2011-04-19 18:17                                             ` Oleg Nesterov
2011-04-19 16:19                                   ` Stas Sergeev
2011-04-20 13:12                                   ` [path][rfc] add PR_DETACH prctl command [1/2] Stas Sergeev
2011-04-20 13:14                                   ` [path][rfc] add PR_DETACH prctl command [2/2] Stas Sergeev
2011-04-20 16:50                                     ` Oleg Nesterov
2011-04-20 18:45                                       ` Stas Sergeev
2011-04-20 19:33                                         ` Oleg Nesterov
2011-04-20 20:35                                           ` Stas Sergeev
2011-04-21 20:00                                             ` Oleg Nesterov
2011-04-21 20:11                                               ` Stas Sergeev
2011-04-21 10:02                                       ` Stas Sergeev
2011-04-21 20:15                                         ` Oleg Nesterov
2011-04-21 20:32                                           ` Stas Sergeev
2011-04-08 18:13 ` [path][rfc] add PR_DETACH prctl command Bryan Donlan
2011-04-08 20:26   ` Stas Sergeev
2011-04-08 20:52     ` Bryan Donlan
2011-04-08 21:14       ` Stas Sergeev
2011-04-08 21:25         ` Bryan Donlan
2011-04-08 21:38           ` Stas Sergeev

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).