linux-hardening.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC] Get siginfo from unreaped task
@ 2022-02-12  4:28 Kees Cook
  2022-02-12 11:23 ` Robert Święcki
  0 siblings, 1 reply; 7+ messages in thread
From: Kees Cook @ 2022-02-12  4:28 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Kees Cook, Robert Święcki, Jann Horn, Andy Lutomirski,
	Will Drewry, linux-api, linux-kernel, linux-hardening

Make siginfo available through PTRACE_GETSIGINFO after process death,
without needing to have already used PTRACE_ATTACH. Uses 48 more bytes
in task_struct, though I bet there might be somewhere else we could
stash a copy of it?

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/sched.h                         |   1 +
 kernel/ptrace.c                               |  12 +-
 kernel/signal.c                               |   4 +
 tools/testing/selftests/seccomp/seccomp_bpf.c | 119 ++++++++++++++++++
 4 files changed, 134 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f5b2be39a78c..e40789e801ef 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1178,6 +1178,7 @@ struct task_struct {
 #endif
 	/* Ptrace state: */
 	unsigned long			ptrace_message;
+	kernel_siginfo_t		death_siginfo;
 	kernel_siginfo_t		*last_siginfo;
 
 	struct task_io_accounting	ioac;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index eea265082e97..990839c57842 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1304,8 +1304,16 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
 
 	ret = ptrace_check_attach(child, request == PTRACE_KILL ||
 				  request == PTRACE_INTERRUPT);
-	if (ret < 0)
-		goto out_put_task_struct;
+	if (ret < 0) {
+		/*
+		 * Allow PTRACE_GETSIGINFO if process is dead
+		 * and we could otherwise ptrace it.
+		 */
+		if (request != PTRACE_GETSIGINFO ||
+		    !child->exit_state ||
+		    !ptrace_may_access(child, PTRACE_MODE_READ_REALCREDS))
+			goto out_put_task_struct;
+	}
 
 	ret = arch_ptrace(child, request, addr, data);
 	if (ret || request != PTRACE_DETACH)
diff --git a/kernel/signal.c b/kernel/signal.c
index 9b04631acde8..41f6ba6b7aa7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2825,6 +2825,10 @@ bool get_signal(struct ksignal *ksig)
 		}
 
 	fatal:
+		/* Allow siginfo to be queried until reaped. */
+		copy_siginfo(&current->death_siginfo, &ksig->info);
+		current->last_siginfo = &current->death_siginfo;
+
 		spin_unlock_irq(&sighand->siglock);
 		if (unlikely(cgroup_task_frozen(current)))
 			cgroup_leave_frozen(true);
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 9d126d7fabdb..d2bbf9e32f22 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -268,6 +268,10 @@ struct seccomp_notif_addfd_big {
 #define SECCOMP_FILTER_FLAG_TSYNC_ESRCH (1UL << 4)
 #endif
 
+#ifndef SYS_SECCOMP
+#define SYS_SECCOMP	1
+#endif
+
 #ifndef seccomp
 int seccomp(unsigned int op, unsigned int flags, void *args)
 {
@@ -765,6 +769,121 @@ TEST_SIGNAL(KILL_one_arg_six, SIGSYS)
 	close(fd);
 }
 
+FIXTURE(SIGINFO) {
+	pid_t child_pid;
+};
+
+FIXTURE_SETUP(SIGINFO)
+{
+	self->child_pid = 0;
+}
+
+FIXTURE_TEARDOWN(SIGINFO)
+{
+	if (self->child_pid > 0)
+		waitpid(self->child_pid, NULL, WNOHANG);
+}
+
+TEST_F(SIGINFO, child)
+{
+	int status;
+	siginfo_t info = { };
+	/* Kill only when calling __NR_prctl. */
+	struct sock_filter filter[] = {
+		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+			offsetof(struct seccomp_data, nr)),
+		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_PROCESS | 0xBA),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+	};
+	struct sock_fprog prog = {
+		.len = (unsigned short)ARRAY_SIZE(filter),
+		.filter = filter,
+	};
+
+	self->child_pid = fork();
+	ASSERT_LE(0, self->child_pid);
+	if (self->child_pid == 0) {
+		ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+			TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+		}
+		ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog));
+		prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
+		/* Should have died now. */
+		_exit(37);
+	}
+
+	/* Check siginfo_t contents. */
+	EXPECT_EQ(waitid(P_PID, self->child_pid, &info, WEXITED | WNOWAIT), 0);
+#if 0
+	struct {
+	        int si_signo;
+	        int si_code;
+	        int si_errno;
+	        union __sifields _sifields;
+	}
+
+	/* SIGCHLD */
+	struct {
+		__kernel_pid_t _pid;	/* which child */
+		__kernel_uid32_t _uid;	/* sender's uid */
+		int _status;		/* exit code */
+		__ARCH_SI_CLOCK_T _utime;
+		__ARCH_SI_CLOCK_T _stime;
+	} _sigchld;
+#endif
+	ASSERT_EQ(info.si_signo, SIGCHLD);
+	EXPECT_TRUE(info.si_code == CLD_KILLED || info.si_code == CLD_DUMPED);
+	EXPECT_TRUE(info.si_errno == 0);
+	EXPECT_EQ(info.si_pid, self->child_pid);
+
+	ASSERT_TRUE(WIFSIGNALED(info.si_status));
+	/* TODO: why doesn't this WCOREDUMP() agree with below? */
+	/* EXPECT_TRUE(WCOREDUMP(status)); */
+	EXPECT_EQ(WTERMSIG(info.si_status), SIGSYS);
+
+	memset(&info, 0, sizeof(info));
+	ASSERT_EQ(ptrace(PTRACE_GETSIGINFO, self->child_pid, NULL, &info), 0);
+#if 0
+	/* SIGSYS */
+	struct {
+		void __user *_call_addr;/* calling user insn */
+		int _syscall;		/* triggering system call number */
+		unsigned int _arch;	/* AUDIT_ARCH_* of syscall */
+	} _sigsys;
+
+	info.si_signo = SIGSYS;
+	info.si_code = SYS_SECCOMP;
+	info.si_call_addr = (void __user *)KSTK_EIP(current);
+	info.si_errno = reason;
+	info.si_arch = syscall_get_arch(current);
+	info.si_syscall = syscall;
+
+#endif
+	ASSERT_EQ(info.si_signo, SIGSYS);
+	EXPECT_EQ(info.si_code, SYS_SECCOMP);
+	/*
+	 * The syscall will have happened somewhere near the libc
+	 * prctl implementation.
+	 */
+	EXPECT_TRUE(info.si_call_addr >= (void *)prctl &&
+		    info.si_call_addr <= (void *)prctl + PAGE_SIZE) {
+		TH_LOG("info.si_call_addr: %p", info.si_call_addr);
+		TH_LOG("prctl            : %p", prctl);
+	}
+	EXPECT_EQ(info.si_errno, 0xBA);
+	/* EXPECT_EQ(info.si_arch, ...native arch...); */
+	EXPECT_EQ(info.si_syscall, __NR_prctl);
+
+	/* Check status contents. */
+	ASSERT_EQ(waitpid(self->child_pid, &status, 0), self->child_pid);
+	ASSERT_TRUE(WIFSIGNALED(status));
+	/* TODO: why doesn't this WCOREDUMP() agree with above? */
+	/* EXPECT_TRUE(WCOREDUMP(status)); */
+	EXPECT_EQ(WTERMSIG(status), SIGSYS);
+	self->child_pid = 0;
+}
+
 /* This is a thread task to die via seccomp filter violation. */
 void *kill_thread(void *data)
 {
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [RFC] Get siginfo from unreaped task
  2022-02-12  4:28 [RFC] Get siginfo from unreaped task Kees Cook
@ 2022-02-12 11:23 ` Robert Święcki
  2022-02-13  2:32   ` Andy Lutomirski
  0 siblings, 1 reply; 7+ messages in thread
From: Robert Święcki @ 2022-02-12 11:23 UTC (permalink / raw)
  To: Kees Cook
  Cc: Eric W. Biederman, Jann Horn, Andy Lutomirski, Will Drewry,
	linux-api, linux-kernel, linux-hardening

sob., 12 lut 2022 o 05:28 Kees Cook <keescook@chromium.org> napisał(a):
>
> Make siginfo available through PTRACE_GETSIGINFO after process death,
> without needing to have already used PTRACE_ATTACH. Uses 48 more bytes
> in task_struct, though I bet there might be somewhere else we could
> stash a copy of it?

An alternative way of accessing this info could be abusing the
waitid() interface, with some additional, custom to Linux, flag

waitid(P_ALL, 0, &si, __WCHILDSIGINFO);

which would change what is put into si.

But maybe ptrace() is better, because it's mostly incompatible with
other OSes anyway on the behavior/flag level, while waitd() seems to
be POSIX/BSD standard, even if Linux specifies some additional flags.

-- 
Robert Święcki

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC] Get siginfo from unreaped task
  2022-02-12 11:23 ` Robert Święcki
@ 2022-02-13  2:32   ` Andy Lutomirski
  2022-02-13  8:52     ` Christian Brauner
  0 siblings, 1 reply; 7+ messages in thread
From: Andy Lutomirski @ 2022-02-13  2:32 UTC (permalink / raw)
  To: Robert Święcki
  Cc: Kees Cook, Eric W. Biederman, Jann Horn, Will Drewry, linux-api,
	linux-kernel, linux-hardening


> On Feb 12, 2022, at 3:24 AM, Robert Święcki <robert@swiecki.net> wrote:
> 
> sob., 12 lut 2022 o 05:28 Kees Cook <keescook@chromium.org> napisał(a):
>> 
>> Make siginfo available through PTRACE_GETSIGINFO after process death,
>> without needing to have already used PTRACE_ATTACH. Uses 48 more bytes
>> in task_struct, though I bet there might be somewhere else we could
>> stash a copy of it?
> 
> An alternative way of accessing this info could be abusing the
> waitid() interface, with some additional, custom to Linux, flag
> 
> waitid(P_ALL, 0, &si, __WCHILDSIGINFO);
> 
> which would change what is put into si.
> 
> But maybe ptrace() is better, because it's mostly incompatible with
> other OSes anyway on the behavior/flag level, while waitd() seems to
> be POSIX/BSD standard, even if Linux specifies some additional flags.
> 
> 

I had a kind of opposite thought, which is that it would be very nice to be able to get all the waitid() data without reaping a process or even necessarily being its parent.  Maybe these can be combined?  A new waitid() option like you’re suggesting could add siginfo (and might need permissions).  And we could have a different waitid() flag that says “maybe not my child, don’t reap” (and also needs permissions).

Although the “don’t reap” thing is fundamentally racy. What a sane process manager actually wants is an interface to read all this info from a pidfd, which means it all needs to get stuck in struct pid. And task_struct needs a completion or wait queue so you can actually wait for a pidfd to exit (unless someone already did this — I had patches a while back).  And this would be awesome.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC] Get siginfo from unreaped task
  2022-02-13  2:32   ` Andy Lutomirski
@ 2022-02-13  8:52     ` Christian Brauner
  2022-02-14 20:07       ` Kees Cook
  0 siblings, 1 reply; 7+ messages in thread
From: Christian Brauner @ 2022-02-13  8:52 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Robert Święcki, Kees Cook, Eric W. Biederman,
	Jann Horn, Will Drewry, linux-api, linux-kernel, linux-hardening

On Sat, Feb 12, 2022 at 06:32:08PM -0800, Andy Lutomirski wrote:
> 
> > On Feb 12, 2022, at 3:24 AM, Robert Święcki <robert@swiecki.net> wrote:
> > 
> > sob., 12 lut 2022 o 05:28 Kees Cook <keescook@chromium.org> napisał(a):
> >> 
> >> Make siginfo available through PTRACE_GETSIGINFO after process death,
> >> without needing to have already used PTRACE_ATTACH. Uses 48 more bytes
> >> in task_struct, though I bet there might be somewhere else we could
> >> stash a copy of it?
> > 
> > An alternative way of accessing this info could be abusing the
> > waitid() interface, with some additional, custom to Linux, flag
> > 
> > waitid(P_ALL, 0, &si, __WCHILDSIGINFO);
> > 
> > which would change what is put into si.
> > 
> > But maybe ptrace() is better, because it's mostly incompatible with
> > other OSes anyway on the behavior/flag level, while waitd() seems to
> > be POSIX/BSD standard, even if Linux specifies some additional flags.
> > 
> > 
> 
> I had a kind of opposite thought, which is that it would be very nice
> to be able to get all the waitid() data without reaping a process or
> even necessarily being its parent.  Maybe these can be combined?  A
> new waitid() option like you’re suggesting could add siginfo (and
> might need permissions).  And we could have a different waitid() flag
> that says “maybe not my child, don’t reap” (and also needs
> permissions).
> 
> Although the “don’t reap” thing is fundamentally racy. What a sane
> process manager actually wants is an interface to read all this info
> from a pidfd, which means it all needs to get stuck in struct pid. And

/me briefly pops out from vacation

Agreed and not just siginfo I would expect(?). We already came to that
conclusion when we first introduced them.

> task_struct needs a completion or wait queue so you can actually wait
> for a pidfd to exit (unless someone already did this — I had patches a
> while back).  And this would be awesome.

Currently, you can wait for a pidfd to exit via polling and you can use
a pidfd to pass it to waitid(P_PIDFD, pidfd, ...).

/me pops back into vacation

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC] Get siginfo from unreaped task
  2022-02-13  8:52     ` Christian Brauner
@ 2022-02-14 20:07       ` Kees Cook
  2022-02-14 22:08         ` Robert Święcki
  2022-02-15  9:01         ` Christian Brauner
  0 siblings, 2 replies; 7+ messages in thread
From: Kees Cook @ 2022-02-14 20:07 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Andy Lutomirski, Robert Święcki, Eric W. Biederman,
	Jann Horn, Will Drewry, linux-api, linux-kernel, linux-hardening

On Sun, Feb 13, 2022 at 09:52:12AM +0100, Christian Brauner wrote:
> On Sat, Feb 12, 2022 at 06:32:08PM -0800, Andy Lutomirski wrote:
> > 
> > > On Feb 12, 2022, at 3:24 AM, Robert Święcki <robert@swiecki.net> wrote:
> > > 
> > > sob., 12 lut 2022 o 05:28 Kees Cook <keescook@chromium.org> napisał(a):
> > >> 
> > >> Make siginfo available through PTRACE_GETSIGINFO after process death,
> > >> without needing to have already used PTRACE_ATTACH. Uses 48 more bytes
> > >> in task_struct, though I bet there might be somewhere else we could
> > >> stash a copy of it?
> > > 
> > > An alternative way of accessing this info could be abusing the
> > > waitid() interface, with some additional, custom to Linux, flag
> > > 
> > > waitid(P_ALL, 0, &si, __WCHILDSIGINFO);
> > > 
> > > which would change what is put into si.
> > > 
> > > But maybe ptrace() is better, because it's mostly incompatible with
> > > other OSes anyway on the behavior/flag level, while waitd() seems to
> > > be POSIX/BSD standard, even if Linux specifies some additional flags.
> > > 
> > > 
> > 
> > I had a kind of opposite thought, which is that it would be very nice
> > to be able to get all the waitid() data without reaping a process or
> > even necessarily being its parent.  Maybe these can be combined?  A
> > new waitid() option like you’re suggesting could add siginfo (and
> > might need permissions).  And we could have a different waitid() flag
> > that says “maybe not my child, don’t reap” (and also needs
> > permissions).
> > 
> > Although the “don’t reap” thing is fundamentally racy. What a sane
> > process manager actually wants is an interface to read all this info
> > from a pidfd, which means it all needs to get stuck in struct pid. And
> 
> /me briefly pops out from vacation
> 
> Agreed and not just siginfo I would expect(?). We already came to that
> conclusion when we first introduced them.
> 
> > task_struct needs a completion or wait queue so you can actually wait
> > for a pidfd to exit (unless someone already did this — I had patches a
> > while back).  And this would be awesome.
> 
> Currently, you can wait for a pidfd to exit via polling and you can use
> a pidfd to pass it to waitid(P_PIDFD, pidfd, ...).
> 
> /me pops back into vacation

Right, so waitid already has all the infrastructure for this, so I think
adding it there makes a lot of sense. Here's what I've got:



diff --git a/include/linux/sched.h b/include/linux/sched.h
index f5b2be39a78c..e40789e801ef 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1178,6 +1178,7 @@ struct task_struct {
 #endif
 	/* Ptrace state: */
 	unsigned long			ptrace_message;
+	kernel_siginfo_t		death_siginfo;
 	kernel_siginfo_t		*last_siginfo;
 
 	struct task_io_accounting	ioac;
diff --git a/kernel/signal.c b/kernel/signal.c
index 9b04631acde8..41f6ba6b7aa7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2825,6 +2825,10 @@ bool get_signal(struct ksignal *ksig)
 		}
 
 	fatal:
+		/* Allow siginfo to be queried until reaped. */
+		copy_siginfo(&current->death_siginfo, &ksig->info);
+		current->last_siginfo = &current->death_siginfo;
+
 		spin_unlock_irq(&sighand->siglock);
 		if (unlikely(cgroup_task_frozen(current)))
 			cgroup_leave_frozen(true);
diff --git a/include/uapi/linux/wait.h b/include/uapi/linux/wait.h
index 85b809fc9f11..7258cd4510ba 100644
--- a/include/uapi/linux/wait.h
+++ b/include/uapi/linux/wait.h
@@ -9,6 +9,7 @@
 #define WCONTINUED	0x00000008
 #define WNOWAIT		0x01000000	/* Don't reap, just poll status.  */
 
+#define __WCHILDSIGINFO	0x10000000	/* Report child's siginfo. */
 #define __WNOTHREAD	0x20000000	/* Don't wait on children of other threads in this group */
 #define __WALL		0x40000000	/* Wait on all children, regardless of type */
 #define __WCLONE	0x80000000	/* Wait only on non-SIGCHLD children */
diff --git a/kernel/exit.c b/kernel/exit.c
index d54efddd378b..70ecb996cecd 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -953,6 +953,7 @@ struct waitid_info {
 	uid_t uid;
 	int status;
 	int cause;
+	kernel_siginfo_t siginfo;
 };
 
 struct wait_opts {
@@ -964,7 +965,7 @@ struct wait_opts {
 	int			wo_stat;
 	struct rusage		*wo_rusage;
 
-	wait_queue_entry_t		child_wait;
+	wait_queue_entry_t	child_wait;
 	int			notask_error;
 };
 
@@ -1012,11 +1013,16 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 	int state, status;
 	pid_t pid = task_pid_vnr(p);
 	uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
-	struct waitid_info *infop;
+	struct waitid_info *infop = wo->wo_info;
 
 	if (!likely(wo->wo_flags & WEXITED))
 		return 0;
 
+	/* Before WNOWAIT so a copy can be extracted without reaping. */
+	if (unlikely(wo->wo_flags & __WCHILDSIGINFO)) {
+		if (infop && p->last_siginfo)
+			copy_siginfo(&infop->siginfo, p->last_siginfo);
+	}
 	if (unlikely(wo->wo_flags & WNOWAIT)) {
 		status = (p->signal->flags & SIGNAL_GROUP_EXIT)
 			? p->signal->group_exit_code : p->exit_code;
@@ -1121,7 +1127,6 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 		release_task(p);
 
 out_info:
-	infop = wo->wo_info;
 	if (infop) {
 		if ((status & 0x7f) == 0) {
 			infop->cause = CLD_EXITED;
@@ -1564,7 +1569,7 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
 	unsigned int f_flags = 0;
 
 	if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
-			__WNOTHREAD|__WCLONE|__WALL))
+			__WNOTHREAD|__WCLONE|__WALL|__WCHILDSIGINFO))
 		return -EINVAL;
 	if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
 		return -EINVAL;
@@ -1638,6 +1645,10 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 	if (!infop)
 		return err;
 
+	/* __WCHILDSIGINFO */
+	if (info->siginfo.signo)
+		return copy_siginfo_to_user(infop, &info->siginfo);
+
 	if (!user_write_access_begin(infop, sizeof(*infop)))
 		return -EFAULT;
 
@@ -1781,6 +1792,12 @@ COMPAT_SYSCALL_DEFINE5(waitid,
 	if (!infop)
 		return err;
 
+	/* __WCHILDSIGINFO */
+	if (info->siginfo.signo)
+		return copy_siginfo_to_user32(
+				(struct compat_siginfo __user *)infop,
+				&info->siginfo);
+
 	if (!user_write_access_begin(infop, sizeof(*infop)))
 		return -EFAULT;
 


One usability question I have is:

- if the process just exited normally, should it return an empty
  siginfo, or should it ignore __WCHILDSIGINFO? (I have it ignoring it
  above.)


-- 
Kees Cook

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [RFC] Get siginfo from unreaped task
  2022-02-14 20:07       ` Kees Cook
@ 2022-02-14 22:08         ` Robert Święcki
  2022-02-15  9:01         ` Christian Brauner
  1 sibling, 0 replies; 7+ messages in thread
From: Robert Święcki @ 2022-02-14 22:08 UTC (permalink / raw)
  To: Kees Cook
  Cc: Christian Brauner, Andy Lutomirski, Eric W. Biederman, Jann Horn,
	Will Drewry, linux-api, linux-kernel, linux-hardening

pon., 14 lut 2022 o 21:07 Kees Cook <keescook@chromium.org> napisał(a):

> > > I had a kind of opposite thought, which is that it would be very nice
> > > to be able to get all the waitid() data without reaping a process or
> > > even necessarily being its parent.  Maybe these can be combined?  A
> > > new waitid() option like you’re suggesting could add siginfo (and
> > > might need permissions).  And we could have a different waitid() flag
> > > that says “maybe not my child, don’t reap” (and also needs
> > > permissions).
> > >
> > > Although the “don’t reap” thing is fundamentally racy. What a sane
> > > process manager actually wants is an interface to read all this info
> > > from a pidfd, which means it all needs to get stuck in struct pid. And
> >
> > /me briefly pops out from vacation
> >
> > Agreed and not just siginfo I would expect(?). We already came to that
> > conclusion when we first introduced them.
> >
> > > task_struct needs a completion or wait queue so you can actually wait
> > > for a pidfd to exit (unless someone already did this — I had patches a
> > > while back).  And this would be awesome.
> >
> > Currently, you can wait for a pidfd to exit via polling and you can use
> > a pidfd to pass it to waitid(P_PIDFD, pidfd, ...).
> >
> > /me pops back into vacation
>
> Right, so waitid already has all the infrastructure for this, so I think
> adding it there makes a lot of sense. Here's what I've got:
>
> One usability question I have is:
>
> - if the process just exited normally, should it return an empty
>   siginfo, or should it ignore __WCHILDSIGINFO? (I have it ignoring it
>   above.)

Maybe ENODATA as return code, in order to make it obvious to the
caller that the siginfo is missing? In the end if somebody requests
it, they probably should have already checked that the child process
was killed by a signal, by using WNOWAIT with waitid() or wait4().

Additionally, there might be a problem with input params. The waitid()
can take P_ALL or P_PGID as arguments. If waitid() returns with 0, the
pid of the child process generating the event can be read from
si->si_pid. But siginfo_t is a union of multiple different structs,
many of them missing si_pid. So, such a flag (__WCHILDSIGINFO) would
make sense with P_PID and P_PIDFD only, maybe this should be
explicitly checked for?

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC] Get siginfo from unreaped task
  2022-02-14 20:07       ` Kees Cook
  2022-02-14 22:08         ` Robert Święcki
@ 2022-02-15  9:01         ` Christian Brauner
  1 sibling, 0 replies; 7+ messages in thread
From: Christian Brauner @ 2022-02-15  9:01 UTC (permalink / raw)
  To: Kees Cook
  Cc: Andy Lutomirski, Robert Święcki, Eric W. Biederman,
	Jann Horn, Will Drewry, linux-api, linux-kernel, linux-hardening

On Mon, Feb 14, 2022 at 12:07:25PM -0800, Kees Cook wrote:
> On Sun, Feb 13, 2022 at 09:52:12AM +0100, Christian Brauner wrote:
> > On Sat, Feb 12, 2022 at 06:32:08PM -0800, Andy Lutomirski wrote:
> > > 
> > > > On Feb 12, 2022, at 3:24 AM, Robert Święcki <robert@swiecki.net> wrote:
> > > > 
> > > > sob., 12 lut 2022 o 05:28 Kees Cook <keescook@chromium.org> napisał(a):
> > > >> 
> > > >> Make siginfo available through PTRACE_GETSIGINFO after process death,
> > > >> without needing to have already used PTRACE_ATTACH. Uses 48 more bytes
> > > >> in task_struct, though I bet there might be somewhere else we could
> > > >> stash a copy of it?
> > > > 
> > > > An alternative way of accessing this info could be abusing the
> > > > waitid() interface, with some additional, custom to Linux, flag
> > > > 
> > > > waitid(P_ALL, 0, &si, __WCHILDSIGINFO);
> > > > 
> > > > which would change what is put into si.
> > > > 
> > > > But maybe ptrace() is better, because it's mostly incompatible with
> > > > other OSes anyway on the behavior/flag level, while waitd() seems to
> > > > be POSIX/BSD standard, even if Linux specifies some additional flags.
> > > > 
> > > > 
> > > 
> > > I had a kind of opposite thought, which is that it would be very nice
> > > to be able to get all the waitid() data without reaping a process or
> > > even necessarily being its parent.  Maybe these can be combined?  A
> > > new waitid() option like you’re suggesting could add siginfo (and
> > > might need permissions).  And we could have a different waitid() flag
> > > that says “maybe not my child, don’t reap” (and also needs
> > > permissions).
> > > 
> > > Although the “don’t reap” thing is fundamentally racy. What a sane
> > > process manager actually wants is an interface to read all this info
> > > from a pidfd, which means it all needs to get stuck in struct pid. And
> > 
> > /me briefly pops out from vacation
> > 
> > Agreed and not just siginfo I would expect(?). We already came to that
> > conclusion when we first introduced them.
> > 
> > > task_struct needs a completion or wait queue so you can actually wait
> > > for a pidfd to exit (unless someone already did this — I had patches a
> > > while back).  And this would be awesome.
> > 
> > Currently, you can wait for a pidfd to exit via polling and you can use
> > a pidfd to pass it to waitid(P_PIDFD, pidfd, ...).
> > 
> > /me pops back into vacation
> 
> Right, so waitid already has all the infrastructure for this, so I think
> adding it there makes a lot of sense. Here's what I've got:
> 
> 
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index f5b2be39a78c..e40789e801ef 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1178,6 +1178,7 @@ struct task_struct {
>  #endif
>  	/* Ptrace state: */
>  	unsigned long			ptrace_message;
> +	kernel_siginfo_t		death_siginfo;
>  	kernel_siginfo_t		*last_siginfo;
>  
>  	struct task_io_accounting	ioac;
> diff --git a/kernel/signal.c b/kernel/signal.c
> index 9b04631acde8..41f6ba6b7aa7 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -2825,6 +2825,10 @@ bool get_signal(struct ksignal *ksig)
>  		}
>  
>  	fatal:
> +		/* Allow siginfo to be queried until reaped. */
> +		copy_siginfo(&current->death_siginfo, &ksig->info);
> +		current->last_siginfo = &current->death_siginfo;
> +
>  		spin_unlock_irq(&sighand->siglock);
>  		if (unlikely(cgroup_task_frozen(current)))
>  			cgroup_leave_frozen(true);
> diff --git a/include/uapi/linux/wait.h b/include/uapi/linux/wait.h
> index 85b809fc9f11..7258cd4510ba 100644
> --- a/include/uapi/linux/wait.h
> +++ b/include/uapi/linux/wait.h
> @@ -9,6 +9,7 @@
>  #define WCONTINUED	0x00000008
>  #define WNOWAIT		0x01000000	/* Don't reap, just poll status.  */
>  
> +#define __WCHILDSIGINFO	0x10000000	/* Report child's siginfo. */
>  #define __WNOTHREAD	0x20000000	/* Don't wait on children of other threads in this group */
>  #define __WALL		0x40000000	/* Wait on all children, regardless of type */
>  #define __WCLONE	0x80000000	/* Wait only on non-SIGCHLD children */
> diff --git a/kernel/exit.c b/kernel/exit.c
> index d54efddd378b..70ecb996cecd 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -953,6 +953,7 @@ struct waitid_info {
>  	uid_t uid;
>  	int status;
>  	int cause;
> +	kernel_siginfo_t siginfo;
>  };
>  
>  struct wait_opts {
> @@ -964,7 +965,7 @@ struct wait_opts {
>  	int			wo_stat;
>  	struct rusage		*wo_rusage;
>  
> -	wait_queue_entry_t		child_wait;
> +	wait_queue_entry_t	child_wait;
>  	int			notask_error;
>  };
>  
> @@ -1012,11 +1013,16 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
>  	int state, status;
>  	pid_t pid = task_pid_vnr(p);
>  	uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
> -	struct waitid_info *infop;
> +	struct waitid_info *infop = wo->wo_info;
>  
>  	if (!likely(wo->wo_flags & WEXITED))
>  		return 0;
>  
> +	/* Before WNOWAIT so a copy can be extracted without reaping. */
> +	if (unlikely(wo->wo_flags & __WCHILDSIGINFO)) {
> +		if (infop && p->last_siginfo)
> +			copy_siginfo(&infop->siginfo, p->last_siginfo);
> +	}
>  	if (unlikely(wo->wo_flags & WNOWAIT)) {
>  		status = (p->signal->flags & SIGNAL_GROUP_EXIT)
>  			? p->signal->group_exit_code : p->exit_code;
> @@ -1121,7 +1127,6 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
>  		release_task(p);
>  
>  out_info:
> -	infop = wo->wo_info;
>  	if (infop) {
>  		if ((status & 0x7f) == 0) {
>  			infop->cause = CLD_EXITED;
> @@ -1564,7 +1569,7 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
>  	unsigned int f_flags = 0;
>  
>  	if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
> -			__WNOTHREAD|__WCLONE|__WALL))
> +			__WNOTHREAD|__WCLONE|__WALL|__WCHILDSIGINFO))
>  		return -EINVAL;
>  	if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
>  		return -EINVAL;
> @@ -1638,6 +1645,10 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
>  	if (!infop)
>  		return err;
>  
> +	/* __WCHILDSIGINFO */
> +	if (info->siginfo.signo)
> +		return copy_siginfo_to_user(infop, &info->siginfo);
> +
>  	if (!user_write_access_begin(infop, sizeof(*infop)))
>  		return -EFAULT;
>  
> @@ -1781,6 +1792,12 @@ COMPAT_SYSCALL_DEFINE5(waitid,
>  	if (!infop)
>  		return err;
>  
> +	/* __WCHILDSIGINFO */
> +	if (info->siginfo.signo)
> +		return copy_siginfo_to_user32(
> +				(struct compat_siginfo __user *)infop,
> +				&info->siginfo);
> +
>  	if (!user_write_access_begin(infop, sizeof(*infop)))
>  		return -EFAULT;
>  
> 
> 
> One usability question I have is:
> 
> - if the process just exited normally, should it return an empty
>   siginfo, or should it ignore __WCHILDSIGINFO? (I have it ignoring it
>   above.)

I would model this after the regular waitid() call which seems to always
fill in that info? But note that afaict, there is a potential behavioral
difference between getting the siginfo from a no-reaped task and a
reaped task now. copy_siginfo_to_user*() simply copies all of siginfo
to the user and clears additional fields. In contrast, regular waitid()
only fills in specific fields and leaves additional fields as they were
before. It might not matter in practice but if you're doing this then
this behavior should be documented on the manpage.

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2022-02-15  9:02 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-02-12  4:28 [RFC] Get siginfo from unreaped task Kees Cook
2022-02-12 11:23 ` Robert Święcki
2022-02-13  2:32   ` Andy Lutomirski
2022-02-13  8:52     ` Christian Brauner
2022-02-14 20:07       ` Kees Cook
2022-02-14 22:08         ` Robert Święcki
2022-02-15  9:01         ` Christian Brauner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).