* [PATCH 2/2] tests: add pidfd_open() tests
2019-05-15 10:03 [PATCH 1/2] pid: add pidfd_open() Christian Brauner
@ 2019-05-15 10:04 ` Christian Brauner
2019-05-15 12:29 ` [PATCH 1/2] pid: add pidfd_open() Geert Uytterhoeven
` (3 subsequent siblings)
4 siblings, 0 replies; 18+ messages in thread
From: Christian Brauner @ 2019-05-15 10:04 UTC (permalink / raw)
To: jannh, oleg, viro, torvalds, linux-kernel, arnd, dhowells
Cc: akpm, cyphar, ebiederm, elena.reshetova, keescook, luto, luto,
tglx, linux-alpha, linux-arm-kernel, linux-ia64, linux-m68k,
linux-mips, linux-parisc, linuxppc-dev, linux-s390, linux-sh,
sparclinux, linux-xtensa, linux-api, linux-arch, linux-kselftest,
Christian Brauner, Michael Kerrisk (man-pages)
This adds testing for the new pidfd_open() syscalls. Specifically, we test:
- that no invalid flags can be passed to pidfd_open()
- that no invalid pid can be passed to pidfd_open()
- that a pidfd can be retrieved with pidfd_open()
- that the retrieved pidfd references the correct pid
Signed-off-by: Christian Brauner <christian@brauner.io>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Jann Horn <jannh@google.com>
Cc: David Howells <dhowells@redhat.com>
Cc: "Michael Kerrisk (man-pages)" <mtk.manpages@gmail.com>
Cc: Andy Lutomirsky <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-api@vger.kernel.org
---
tools/testing/selftests/pidfd/Makefile | 2 +-
tools/testing/selftests/pidfd/pidfd.h | 57 ++++++
.../testing/selftests/pidfd/pidfd_open_test.c | 170 ++++++++++++++++++
tools/testing/selftests/pidfd/pidfd_test.c | 41 +----
4 files changed, 229 insertions(+), 41 deletions(-)
create mode 100644 tools/testing/selftests/pidfd/pidfd.h
create mode 100644 tools/testing/selftests/pidfd/pidfd_open_test.c
diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile
index deaf8073bc06..b36c0be70848 100644
--- a/tools/testing/selftests/pidfd/Makefile
+++ b/tools/testing/selftests/pidfd/Makefile
@@ -1,6 +1,6 @@
CFLAGS += -g -I../../../../usr/include/
-TEST_GEN_PROGS := pidfd_test
+TEST_GEN_PROGS := pidfd_test pidfd_open_test
include ../lib.mk
diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h
new file mode 100644
index 000000000000..8452e910463f
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __PIDFD_H
+#define __PIDFD_H
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/mount.h>
+
+#include "../kselftest.h"
+
+/*
+ * The kernel reserves 300 pids via RESERVED_PIDS in kernel/pid.c
+ * That means, when it wraps around any pid < 300 will be skipped.
+ * So we need to use a pid > 300 in order to test recycling.
+ */
+#define PID_RECYCLE 1000
+
+/*
+ * Define a few custom error codes for the child process to clearly indicate
+ * what is happening. This way we can tell the difference between a system
+ * error, a test error, etc.
+ */
+#define PIDFD_PASS 0
+#define PIDFD_FAIL 1
+#define PIDFD_ERROR 2
+#define PIDFD_SKIP 3
+#define PIDFD_XFAIL 4
+
+int wait_for_pid(pid_t pid)
+{
+ int status, ret;
+
+again:
+ ret = waitpid(pid, &status, 0);
+ if (ret == -1) {
+ if (errno == EINTR)
+ goto again;
+
+ return -1;
+ }
+
+ if (!WIFEXITED(status))
+ return -1;
+
+ return WEXITSTATUS(status);
+}
+
+
+#endif /* __PIDFD_H */
diff --git a/tools/testing/selftests/pidfd/pidfd_open_test.c b/tools/testing/selftests/pidfd/pidfd_open_test.c
new file mode 100644
index 000000000000..9b073c1ac618
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_open_test.c
@@ -0,0 +1,170 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/mount.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "pidfd.h"
+#include "../kselftest.h"
+
+static inline int sys_pidfd_open(pid_t pid, unsigned int flags)
+{
+ return syscall(__NR_pidfd_open, pid, flags);
+}
+
+static int safe_int(const char *numstr, int *converted)
+{
+ char *err = NULL;
+ long sli;
+
+ errno = 0;
+ sli = strtol(numstr, &err, 0);
+ if (errno == ERANGE && (sli == LONG_MAX || sli == LONG_MIN))
+ return -ERANGE;
+
+ if (errno != 0 && sli == 0)
+ return -EINVAL;
+
+ if (err == numstr || *err != '\0')
+ return -EINVAL;
+
+ if (sli > INT_MAX || sli < INT_MIN)
+ return -ERANGE;
+
+ *converted = (int)sli;
+ return 0;
+}
+
+static int char_left_gc(const char *buffer, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len; i++) {
+ if (buffer[i] == ' ' ||
+ buffer[i] == '\t')
+ continue;
+
+ return i;
+ }
+
+ return 0;
+}
+
+static int char_right_gc(const char *buffer, size_t len)
+{
+ int i;
+
+ for (i = len - 1; i >= 0; i--) {
+ if (buffer[i] == ' ' ||
+ buffer[i] == '\t' ||
+ buffer[i] == '\n' ||
+ buffer[i] == '\0')
+ continue;
+
+ return i + 1;
+ }
+
+ return 0;
+}
+
+static char *trim_whitespace_in_place(char *buffer)
+{
+ buffer += char_left_gc(buffer, strlen(buffer));
+ buffer[char_right_gc(buffer, strlen(buffer))] = '\0';
+ return buffer;
+}
+
+static pid_t get_pid_from_fdinfo_file(int pidfd, const char *key, size_t keylen)
+{
+ int ret;
+ char path[512];
+ FILE *f;
+ size_t n = 0;
+ pid_t result = -1;
+ char *line = NULL;
+
+ snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", pidfd);
+
+ f = fopen(path, "re");
+ if (!f)
+ return -1;
+
+ while (getline(&line, &n, f) != -1) {
+ char *numstr;
+
+ if (strncmp(line, key, keylen))
+ continue;
+
+ numstr = trim_whitespace_in_place(line + 4);
+ ret = safe_int(numstr, &result);
+ if (ret < 0)
+ goto out;
+
+ break;
+ }
+
+out:
+ free(line);
+ fclose(f);
+ return result;
+}
+
+int main(int argc, char **argv)
+{
+ int pidfd = -1, ret = 1;
+ pid_t pid;
+
+ pidfd = sys_pidfd_open(-1, 0);
+ if (pidfd >= 0) {
+ ksft_print_msg(
+ "%s - succeeded to open pidfd for invalid pid -1\n",
+ strerror(errno));
+ goto on_error;
+ }
+ ksft_test_result_pass("do not allow invalid pid test: passed\n");
+ ksft_inc_pass_cnt();
+
+ pidfd = sys_pidfd_open(getpid(), 1);
+ if (pidfd >= 0) {
+ ksft_print_msg(
+ "%s - succeeded to open pidfd with invalid flag value specified\n",
+ strerror(errno));
+ goto on_error;
+ }
+ ksft_test_result_pass("do not allow invalid flag test: passed\n");
+ ksft_inc_pass_cnt();
+
+ pidfd = sys_pidfd_open(getpid(), 0);
+ if (pidfd < 0) {
+ ksft_print_msg("%s - failed to open pidfd\n", strerror(errno));
+ goto on_error;
+ }
+ ksft_test_result_pass("open a new pidfd test: passed\n");
+ ksft_inc_pass_cnt();
+
+ pid = get_pid_from_fdinfo_file(pidfd, "Pid:", sizeof("Pid:") - 1);
+ ksft_print_msg("pidfd %d refers to process with pid %d\n", pidfd, pid);
+
+ ret = 0;
+
+on_error:
+ if (pidfd >= 0)
+ close(pidfd);
+
+ return !ret ? ksft_exit_pass() : ksft_exit_fail();
+}
diff --git a/tools/testing/selftests/pidfd/pidfd_test.c b/tools/testing/selftests/pidfd/pidfd_test.c
index d59378a93782..f01de87249c9 100644
--- a/tools/testing/selftests/pidfd/pidfd_test.c
+++ b/tools/testing/selftests/pidfd/pidfd_test.c
@@ -14,6 +14,7 @@
#include <sys/wait.h>
#include <unistd.h>
+#include "pidfd.h"
#include "../kselftest.h"
static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info,
@@ -62,28 +63,6 @@ static int test_pidfd_send_signal_simple_success(void)
return 0;
}
-static int wait_for_pid(pid_t pid)
-{
- int status, ret;
-
-again:
- ret = waitpid(pid, &status, 0);
- if (ret == -1) {
- if (errno == EINTR)
- goto again;
-
- return -1;
- }
-
- if (ret != pid)
- goto again;
-
- if (!WIFEXITED(status))
- return -1;
-
- return WEXITSTATUS(status);
-}
-
static int test_pidfd_send_signal_exited_fail(void)
{
int pidfd, ret, saved_errno;
@@ -128,13 +107,6 @@ static int test_pidfd_send_signal_exited_fail(void)
return 0;
}
-/*
- * The kernel reserves 300 pids via RESERVED_PIDS in kernel/pid.c
- * That means, when it wraps around any pid < 300 will be skipped.
- * So we need to use a pid > 300 in order to test recycling.
- */
-#define PID_RECYCLE 1000
-
/*
* Maximum number of cycles we allow. This is equivalent to PID_MAX_DEFAULT.
* If users set a higher limit or we have cycled PIDFD_MAX_DEFAULT number of
@@ -143,17 +115,6 @@ static int test_pidfd_send_signal_exited_fail(void)
*/
#define PIDFD_MAX_DEFAULT 0x8000
-/*
- * Define a few custom error codes for the child process to clearly indicate
- * what is happening. This way we can tell the difference between a system
- * error, a test error, etc.
- */
-#define PIDFD_PASS 0
-#define PIDFD_FAIL 1
-#define PIDFD_ERROR 2
-#define PIDFD_SKIP 3
-#define PIDFD_XFAIL 4
-
static int test_pidfd_send_signal_recycled_pid_fail(void)
{
int i, ret;
--
2.21.0
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] pid: add pidfd_open()
2019-05-15 10:03 [PATCH 1/2] pid: add pidfd_open() Christian Brauner
2019-05-15 10:04 ` [PATCH 2/2] tests: add pidfd_open() tests Christian Brauner
@ 2019-05-15 12:29 ` Geert Uytterhoeven
2019-05-15 14:00 ` Yann Droneaud
` (2 subsequent siblings)
4 siblings, 0 replies; 18+ messages in thread
From: Geert Uytterhoeven @ 2019-05-15 12:29 UTC (permalink / raw)
To: Christian Brauner
Cc: Jann Horn, Oleg Nesterov, Al Viro, torvalds,
Linux Kernel Mailing List, Arnd Bergmann, David Howells,
linux-ia64, Linux-sh list, linux-mips,
open list:KERNEL SELFTEST FRAMEWORK, sparclinux, elena.reshetova,
Linux-Arch, linux-s390, linux-xtensa, Kees Cook, linux-m68k,
Andy Lutomirski, Thomas Gleixner, Linux ARM, Parisc List,
Linux API, cyphar, Andy Lutomirski, Eric W. Biederman, alpha,
Andrew Morton, linuxppc-dev
On Wed, May 15, 2019 at 12:04 PM Christian Brauner <christian@brauner.io> wrote:
> This adds the pidfd_open() syscall. It allows a caller to retrieve pollable
> pidfds for a process which did not get created via CLONE_PIDFD, i.e. for a
> process that is created via traditional fork()/clone() calls that is only
> referenced by a PID:
>
> int pidfd = pidfd_open(1234, 0);
> ret = pidfd_send_signal(pidfd, SIGSTOP, NULL, 0);
>
> With the introduction of pidfds through CLONE_PIDFD it is possible to
> created pidfds at process creation time.
> However, a lot of processes get created with traditional PID-based calls
> such as fork() or clone() (without CLONE_PIDFD). For these processes a
> caller can currently not create a pollable pidfd. This is a huge problem
> for Android's low memory killer (LMK) and service managers such as systemd.
> Both are examples of tools that want to make use of pidfds to get reliable
> notification of process exit for non-parents (pidfd polling) and race-free
> signal sending (pidfd_send_signal()). They intend to switch to this API for
> process supervision/management as soon as possible. Having no way to get
> pollable pidfds from PID-only processes is one of the biggest blockers for
> them in adopting this api. With pidfd_open() making it possible to retrieve
> pidfd for PID-based processes we enable them to adopt this api.
>
> In line with Arnd's recent changes to consolidate syscall numbers across
> architectures, I have added the pidfd_open() syscall to all architectures
> at the same time.
>
> Signed-off-by: Christian Brauner <christian@brauner.io>
> arch/m68k/kernel/syscalls/syscall.tbl | 1 +
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Gr{oetje,eeting}s,
Geert
--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org
In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] pid: add pidfd_open()
2019-05-15 10:03 [PATCH 1/2] pid: add pidfd_open() Christian Brauner
2019-05-15 10:04 ` [PATCH 2/2] tests: add pidfd_open() tests Christian Brauner
2019-05-15 12:29 ` [PATCH 1/2] pid: add pidfd_open() Geert Uytterhoeven
@ 2019-05-15 14:00 ` Yann Droneaud
2019-05-15 14:16 ` Christian Brauner
2019-05-15 14:38 ` Oleg Nesterov
2019-05-15 17:45 ` Daniel Colascione
4 siblings, 1 reply; 18+ messages in thread
From: Yann Droneaud @ 2019-05-15 14:00 UTC (permalink / raw)
To: Christian Brauner, jannh, oleg, viro, torvalds, linux-kernel,
arnd, dhowells
Cc: akpm, cyphar, ebiederm, elena.reshetova, keescook, luto, luto,
tglx, linux-alpha, linux-arm-kernel, linux-ia64, linux-m68k,
linux-mips, linux-parisc, linuxppc-dev, linux-s390, linux-sh,
sparclinux, linux-xtensa, linux-api, linux-arch, linux-kselftest
Hi,
Le mercredi 15 mai 2019 à 12:03 +0200, Christian Brauner a écrit :
>
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 20881598bdfa..237d18d6ecb8 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -451,6 +452,53 @@ struct pid *find_ge_pid(int nr, struct
> pid_namespace *ns)
> return idr_get_next(&ns->idr, &nr);
> }
>
> +/**
> + * pidfd_open() - Open new pid file descriptor.
> + *
> + * @pid: pid for which to retrieve a pidfd
> + * @flags: flags to pass
> + *
> + * This creates a new pid file descriptor with the O_CLOEXEC flag set for
> + * the process identified by @pid. Currently, the process identified by
> + * @pid must be a thread-group leader. This restriction currently exists
> + * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
> + * be used with CLONE_THREAD) and pidfd polling (only supports thread group
> + * leaders).
> + *
Would it be possible to create file descriptor with "restricted"
operation ?
- O_RDONLY: waiting for process completion allowed (for example)
- O_WRONLY: sending process signal allowed
For example, a process could send over a Unix socket a process a pidfd,
allowing this to only wait for completion, but not sending signal ?
I see the permission check is not done in pidfd_open(), so what prevent
a user from sending a signal to another user owned process ?
If it's in pidfd_send_signal(), then, passing the socket through
SCM_RIGHT won't be useful if the target process is not owned by the
same user, or root.
> + * Return: On success, a cloexec pidfd is returned.
> + * On error, a negative errno number will be returned.
> + */
> +SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
> +{
> + int fd, ret;
> + struct pid *p;
> + struct task_struct *tsk;
> +
> + if (flags)
> + return -EINVAL;
> +
> + if (pid <= 0)
> + return -EINVAL;
> +
> + p = find_get_pid(pid);
> + if (!p)
> + return -ESRCH;
> +
> + rcu_read_lock();
> + tsk = pid_task(p, PIDTYPE_PID);
> + if (!tsk)
> + ret = -ESRCH;
> + else if (unlikely(!thread_group_leader(tsk)))
> + ret = -EINVAL;
> + else
> + ret = 0;
> + rcu_read_unlock();
> +
> + fd = ret ?: pidfd_create(p);
> + put_pid(p);
> + return fd;
> +}
> +
> void __init pid_idr_init(void)
> {
> /* Verify no one has done anything silly: */
Regards.
--
Yann Droneaud
OPTEYA
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] pid: add pidfd_open()
2019-05-15 14:00 ` Yann Droneaud
@ 2019-05-15 14:16 ` Christian Brauner
2019-05-15 14:51 ` Aleksa Sarai
2019-05-15 15:29 ` Yann Droneaud
0 siblings, 2 replies; 18+ messages in thread
From: Christian Brauner @ 2019-05-15 14:16 UTC (permalink / raw)
To: Yann Droneaud
Cc: jannh, oleg, viro, torvalds, linux-kernel, arnd, dhowells, akpm,
cyphar, ebiederm, elena.reshetova, keescook, luto, luto, tglx,
linux-alpha, linux-arm-kernel, linux-ia64, linux-m68k,
linux-mips, linux-parisc, linuxppc-dev, linux-s390, linux-sh,
sparclinux, linux-xtensa, linux-api, linux-arch, linux-kselftest
On Wed, May 15, 2019 at 04:00:20PM +0200, Yann Droneaud wrote:
> Hi,
>
> Le mercredi 15 mai 2019 à 12:03 +0200, Christian Brauner a écrit :
> >
> > diff --git a/kernel/pid.c b/kernel/pid.c
> > index 20881598bdfa..237d18d6ecb8 100644
> > --- a/kernel/pid.c
> > +++ b/kernel/pid.c
> > @@ -451,6 +452,53 @@ struct pid *find_ge_pid(int nr, struct
> > pid_namespace *ns)
> > return idr_get_next(&ns->idr, &nr);
> > }
> >
> > +/**
> > + * pidfd_open() - Open new pid file descriptor.
> > + *
> > + * @pid: pid for which to retrieve a pidfd
> > + * @flags: flags to pass
> > + *
> > + * This creates a new pid file descriptor with the O_CLOEXEC flag set for
> > + * the process identified by @pid. Currently, the process identified by
> > + * @pid must be a thread-group leader. This restriction currently exists
> > + * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
> > + * be used with CLONE_THREAD) and pidfd polling (only supports thread group
> > + * leaders).
> > + *
>
> Would it be possible to create file descriptor with "restricted"
> operation ?
>
> - O_RDONLY: waiting for process completion allowed (for example)
> - O_WRONLY: sending process signal allowed
Yes, something like this is likely going to be possible in the future.
We had discussion around this. But mapping this to O_RDONLY and O_WRONLY
is not the right model. It makes more sense to have specialized flags
that restrict actions.
>
> For example, a process could send over a Unix socket a process a pidfd,
> allowing this to only wait for completion, but not sending signal ?
>
> I see the permission check is not done in pidfd_open(), so what prevent
> a user from sending a signal to another user owned process ?
That's supposed to be possible. You can do the same right now already
with pids. Tools like LMK need this probably very much.
Permission checking for signals is done at send time right now.
And if you can't signal via a pid you can't signal via a pidfd as
they're both subject to the same permissions checks.
>
> If it's in pidfd_send_signal(), then, passing the socket through
> SCM_RIGHT won't be useful if the target process is not owned by the
> same user, or root.
>
> > + * Return: On success, a cloexec pidfd is returned.
> > + * On error, a negative errno number will be returned.
> > + */
> > +SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
> > +{
> > + int fd, ret;
> > + struct pid *p;
> > + struct task_struct *tsk;
> > +
> > + if (flags)
> > + return -EINVAL;
> > +
> > + if (pid <= 0)
> > + return -EINVAL;
> > +
> > + p = find_get_pid(pid);
> > + if (!p)
> > + return -ESRCH;
> > +
> > + rcu_read_lock();
> > + tsk = pid_task(p, PIDTYPE_PID);
> > + if (!tsk)
> > + ret = -ESRCH;
> > + else if (unlikely(!thread_group_leader(tsk)))
> > + ret = -EINVAL;
> > + else
> > + ret = 0;
> > + rcu_read_unlock();
> > +
> > + fd = ret ?: pidfd_create(p);
> > + put_pid(p);
> > + return fd;
> > +}
> > +
> > void __init pid_idr_init(void)
> > {
> > /* Verify no one has done anything silly: */
>
> Regards.
>
> --
> Yann Droneaud
> OPTEYA
>
>
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] pid: add pidfd_open()
2019-05-15 14:16 ` Christian Brauner
@ 2019-05-15 14:51 ` Aleksa Sarai
2019-05-15 15:29 ` Yann Droneaud
1 sibling, 0 replies; 18+ messages in thread
From: Aleksa Sarai @ 2019-05-15 14:51 UTC (permalink / raw)
To: Christian Brauner
Cc: Yann Droneaud, jannh, oleg, viro, torvalds, linux-kernel, arnd,
dhowells, akpm, ebiederm, elena.reshetova, keescook, luto, luto,
tglx, linux-alpha, linux-arm-kernel, linux-ia64, linux-m68k,
linux-mips, linux-parisc, linuxppc-dev, linux-s390, linux-sh,
sparclinux, linux-xtensa, linux-api, linux-arch, linux-kselftest
[-- Attachment #1: Type: text/plain, Size: 805 bytes --]
On 2019-05-15, Christian Brauner <christian@brauner.io> wrote:
> On Wed, May 15, 2019 at 04:00:20PM +0200, Yann Droneaud wrote:
> > Would it be possible to create file descriptor with "restricted"
> > operation ?
> >
> > - O_RDONLY: waiting for process completion allowed (for example)
> > - O_WRONLY: sending process signal allowed
>
> Yes, something like this is likely going to be possible in the future.
> We had discussion around this. But mapping this to O_RDONLY and O_WRONLY
> is not the right model. It makes more sense to have specialized flags
> that restrict actions.
Not to mention that the O_* flags have silly values which we shouldn't
replicate in new syscalls IMHO.
--
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] pid: add pidfd_open()
2019-05-15 14:16 ` Christian Brauner
2019-05-15 14:51 ` Aleksa Sarai
@ 2019-05-15 15:29 ` Yann Droneaud
1 sibling, 0 replies; 18+ messages in thread
From: Yann Droneaud @ 2019-05-15 15:29 UTC (permalink / raw)
To: Christian Brauner
Cc: jannh, oleg, viro, torvalds, linux-kernel, arnd, dhowells, akpm,
cyphar, ebiederm, elena.reshetova, keescook, luto, luto, tglx,
linux-alpha, linux-arm-kernel, linux-ia64, linux-m68k,
linux-mips, linux-parisc, linuxppc-dev, linux-s390, linux-sh,
sparclinux, linux-xtensa, linux-api, linux-arch, linux-kselftest
Hi,
Le mercredi 15 mai 2019 à 16:16 +0200, Christian Brauner a écrit :
> On Wed, May 15, 2019 at 04:00:20PM +0200, Yann Droneaud wrote:
> > Le mercredi 15 mai 2019 à 12:03 +0200, Christian Brauner a écrit :
> > > diff --git a/kernel/pid.c b/kernel/pid.c
> > > index 20881598bdfa..237d18d6ecb8 100644
> > > --- a/kernel/pid.c
> > > +++ b/kernel/pid.c
> > > @@ -451,6 +452,53 @@ struct pid *find_ge_pid(int nr, struct
> > > pid_namespace *ns)
> > > return idr_get_next(&ns->idr, &nr);
> > > }
> > >
> > > +/**
> > > + * pidfd_open() - Open new pid file descriptor.
> > > + *
> > > + * @pid: pid for which to retrieve a pidfd
> > > + * @flags: flags to pass
> > > + *
> > > + * This creates a new pid file descriptor with the O_CLOEXEC flag set for
> > > + * the process identified by @pid. Currently, the process identified by
> > > + * @pid must be a thread-group leader. This restriction currently exists
> > > + * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
> > > + * be used with CLONE_THREAD) and pidfd polling (only supports thread group
> > > + * leaders).
> > > + *
> >
> > Would it be possible to create file descriptor with "restricted"
> > operation ?
> >
> > - O_RDONLY: waiting for process completion allowed (for example)
> > - O_WRONLY: sending process signal allowed
>
> Yes, something like this is likely going to be possible in the future.
> We had discussion around this. But mapping this to O_RDONLY and O_WRONLY
> is not the right model. It makes more sense to have specialized flags
> that restrict actions.
Yes, dedicated flags are the way to go. I've used the old open() flags
here as examples as an echo of the O_CLOEXEC flag used in the comment.
> > For example, a process could send over a Unix socket a process a pidfd,
> > allowing this to only wait for completion, but not sending signal ?
> >
> > I see the permission check is not done in pidfd_open(), so what prevent
> > a user from sending a signal to another user owned process ?
>
> That's supposed to be possible. You can do the same right now already
> with pids. Tools like LMK need this probably very much.
> Permission checking for signals is done at send time right now.
> And if you can't signal via a pid you can't signal via a pidfd as
> they're both subject to the same permissions checks.
>
I would have expect it to behave like most other file descriptor,
permission check done at opening time, which allow more privileged
process to open the file descriptor, then pass it to a less privileged
process, or change its own privileged with setuid() and such. Then the
less privileged process can act on behalf of the privileged process
through the file descriptor.
> > If it's in pidfd_send_signal(), then, passing the socket through
> > SCM_RIGHT won't be useful if the target process is not owned by the
> > same user, or root.
> >
If the permission check is done at sending time, the scenario above
case cannot be implemented.
Sending a pidfd through SCM_RIGHT is only useful if the receiver
process is equally or more privileged than the sender then.
For isolation purpose, I would have expect to be able to give a right
to send a signal to a highly privileged process a specific less
privileged process though Unix socket.
But I can't come up with a specific use case. So I dunno.
Regards.
--
Yann Droneaud
OPTEYA
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] pid: add pidfd_open()
2019-05-15 10:03 [PATCH 1/2] pid: add pidfd_open() Christian Brauner
` (2 preceding siblings ...)
2019-05-15 14:00 ` Yann Droneaud
@ 2019-05-15 14:38 ` Oleg Nesterov
2019-05-15 14:49 ` Christian Brauner
2019-05-15 15:35 ` Oleg Nesterov
2019-05-15 17:45 ` Daniel Colascione
4 siblings, 2 replies; 18+ messages in thread
From: Oleg Nesterov @ 2019-05-15 14:38 UTC (permalink / raw)
To: Christian Brauner
Cc: jannh, viro, torvalds, linux-kernel, arnd, dhowells, akpm,
cyphar, ebiederm, elena.reshetova, keescook, luto, luto, tglx,
linux-alpha, linux-arm-kernel, linux-ia64, linux-m68k,
linux-mips, linux-parisc, linuxppc-dev, linux-s390, linux-sh,
sparclinux, linux-xtensa, linux-api, linux-arch, linux-kselftest
On 05/15, Christian Brauner wrote:
>
> +SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
> +{
> + int fd, ret;
> + struct pid *p;
> + struct task_struct *tsk;
> +
> + if (flags)
> + return -EINVAL;
> +
> + if (pid <= 0)
> + return -EINVAL;
> +
> + p = find_get_pid(pid);
> + if (!p)
> + return -ESRCH;
> +
> + rcu_read_lock();
> + tsk = pid_task(p, PIDTYPE_PID);
You do not need find_get_pid() before rcu_lock and put_pid() at the end.
You can just do find_vpid() under rcu_read_lock().
> + if (!tsk)
> + ret = -ESRCH;
> + else if (unlikely(!thread_group_leader(tsk)))
> + ret = -EINVAL;
it seems that you can do a single check
tsk = pid_task(p, PIDTYPE_TGID);
if (!tsk)
ret = -ESRCH;
this even looks more correct if we race with exec changing the leader.
Oleg.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] pid: add pidfd_open()
2019-05-15 14:38 ` Oleg Nesterov
@ 2019-05-15 14:49 ` Christian Brauner
2019-05-15 15:19 ` Oleg Nesterov
2019-05-15 15:35 ` Oleg Nesterov
1 sibling, 1 reply; 18+ messages in thread
From: Christian Brauner @ 2019-05-15 14:49 UTC (permalink / raw)
To: Oleg Nesterov
Cc: jannh, viro, torvalds, linux-kernel, arnd, dhowells, akpm,
cyphar, ebiederm, elena.reshetova, keescook, luto, luto, tglx,
linux-alpha, linux-arm-kernel, linux-ia64, linux-m68k,
linux-mips, linux-parisc, linuxppc-dev, linux-s390, linux-sh,
sparclinux, linux-xtensa, linux-api, linux-arch, linux-kselftest
On Wed, May 15, 2019 at 04:38:58PM +0200, Oleg Nesterov wrote:
> On 05/15, Christian Brauner wrote:
> >
> > +SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
> > +{
> > + int fd, ret;
> > + struct pid *p;
> > + struct task_struct *tsk;
> > +
> > + if (flags)
> > + return -EINVAL;
> > +
> > + if (pid <= 0)
> > + return -EINVAL;
> > +
> > + p = find_get_pid(pid);
> > + if (!p)
> > + return -ESRCH;
> > +
> > + rcu_read_lock();
> > + tsk = pid_task(p, PIDTYPE_PID);
>
> You do not need find_get_pid() before rcu_lock and put_pid() at the end.
> You can just do find_vpid() under rcu_read_lock().
Will do.
>
> > + if (!tsk)
> > + ret = -ESRCH;
> > + else if (unlikely(!thread_group_leader(tsk)))
> > + ret = -EINVAL;
>
> it seems that you can do a single check
>
> tsk = pid_task(p, PIDTYPE_TGID);
> if (!tsk)
> ret = -ESRCH;
>
> this even looks more correct if we race with exec changing the leader.
The logic here being that you can only reach the thread_group leader
from struct pid if PIDTYPE_PID == PIDTYPE_TGID for this struct pid?
Thanks, Oleg.
Christian
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] pid: add pidfd_open()
2019-05-15 14:49 ` Christian Brauner
@ 2019-05-15 15:19 ` Oleg Nesterov
2019-05-15 15:30 ` Christian Brauner
0 siblings, 1 reply; 18+ messages in thread
From: Oleg Nesterov @ 2019-05-15 15:19 UTC (permalink / raw)
To: Christian Brauner
Cc: jannh, viro, torvalds, linux-kernel, arnd, dhowells, akpm,
cyphar, ebiederm, elena.reshetova, keescook, luto, luto, tglx,
linux-alpha, linux-arm-kernel, linux-ia64, linux-m68k,
linux-mips, linux-parisc, linuxppc-dev, linux-s390, linux-sh,
sparclinux, linux-xtensa, linux-api, linux-arch, linux-kselftest
On 05/15, Christian Brauner wrote:
>
> On Wed, May 15, 2019 at 04:38:58PM +0200, Oleg Nesterov wrote:
> >
> > it seems that you can do a single check
> >
> > tsk = pid_task(p, PIDTYPE_TGID);
> > if (!tsk)
> > ret = -ESRCH;
> >
> > this even looks more correct if we race with exec changing the leader.
>
> The logic here being that you can only reach the thread_group leader
> from struct pid if PIDTYPE_PID == PIDTYPE_TGID for this struct pid?
Not exactly... it is not that PIDTYPE_PID == PIDTYPE_TGID for this pid,
struct pid has no "type" or something like this.
The logic is that pid->tasks[PIDTYPE_XXX] is the list of task which use
this pid as "XXX" type.
For example, clone(CLONE_THREAD) creates a pid which has a single non-
empty list, pid->tasks[PIDTYPE_PID]. This pid can't be used as TGID or
SID.
So if pid_task(PIDTYPE_TGID) returns non-NULL we know that this pid was
used for a group-leader, see copy_process() which does
if (thread_group_leader(p))
attach_pid(p, PIDTYPE_TGID);
If we race with exec which changes the leader pid_task(TGID) can return
the old leader. We do not care, but this means that we should not check
thread_group_leader().
Oleg.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] pid: add pidfd_open()
2019-05-15 15:19 ` Oleg Nesterov
@ 2019-05-15 15:30 ` Christian Brauner
0 siblings, 0 replies; 18+ messages in thread
From: Christian Brauner @ 2019-05-15 15:30 UTC (permalink / raw)
To: Oleg Nesterov
Cc: jannh, viro, torvalds, linux-kernel, arnd, dhowells, akpm,
cyphar, ebiederm, elena.reshetova, keescook, luto, luto, tglx,
linux-alpha, linux-arm-kernel, linux-ia64, linux-m68k,
linux-mips, linux-parisc, linuxppc-dev, linux-s390, linux-sh,
sparclinux, linux-xtensa, linux-api, linux-arch, linux-kselftest
On Wed, May 15, 2019 at 05:19:13PM +0200, Oleg Nesterov wrote:
> On 05/15, Christian Brauner wrote:
> >
> > On Wed, May 15, 2019 at 04:38:58PM +0200, Oleg Nesterov wrote:
> > >
> > > it seems that you can do a single check
> > >
> > > tsk = pid_task(p, PIDTYPE_TGID);
> > > if (!tsk)
> > > ret = -ESRCH;
> > >
> > > this even looks more correct if we race with exec changing the leader.
> >
> > The logic here being that you can only reach the thread_group leader
> > from struct pid if PIDTYPE_PID == PIDTYPE_TGID for this struct pid?
>
> Not exactly... it is not that PIDTYPE_PID == PIDTYPE_TGID for this pid,
> struct pid has no "type" or something like this.
>
> The logic is that pid->tasks[PIDTYPE_XXX] is the list of task which use
> this pid as "XXX" type.
>
> For example, clone(CLONE_THREAD) creates a pid which has a single non-
> empty list, pid->tasks[PIDTYPE_PID]. This pid can't be used as TGID or
> SID.
>
> So if pid_task(PIDTYPE_TGID) returns non-NULL we know that this pid was
> used for a group-leader, see copy_process() which does
Ah, this was what I was asking myself when I worked on thread-specific
signal sending. This clarifies quite a lot of things!
Though I wonder how one reliably gets a the PGID or SID from a
PIDTYPE_PID.
>
> if (thread_group_leader(p))
> attach_pid(p, PIDTYPE_TGID);
>
>
> If we race with exec which changes the leader pid_task(TGID) can return
> the old leader. We do not care, but this means that we should not check
> thread_group_leader().
Nice!
Thank you, Oleg! :)
Christian
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] pid: add pidfd_open()
2019-05-15 14:38 ` Oleg Nesterov
2019-05-15 14:49 ` Christian Brauner
@ 2019-05-15 15:35 ` Oleg Nesterov
2019-05-15 15:40 ` Christian Brauner
1 sibling, 1 reply; 18+ messages in thread
From: Oleg Nesterov @ 2019-05-15 15:35 UTC (permalink / raw)
To: Christian Brauner
Cc: jannh, viro, torvalds, linux-kernel, arnd, dhowells, akpm,
cyphar, ebiederm, elena.reshetova, keescook, luto, luto, tglx,
linux-alpha, linux-arm-kernel, linux-ia64, linux-m68k,
linux-mips, linux-parisc, linuxppc-dev, linux-s390, linux-sh,
sparclinux, linux-xtensa, linux-api, linux-arch, linux-kselftest
On 05/15, Oleg Nesterov wrote:
>
> On 05/15, Christian Brauner wrote:
> >
> > +SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
> > +{
> > + int fd, ret;
> > + struct pid *p;
> > + struct task_struct *tsk;
> > +
> > + if (flags)
> > + return -EINVAL;
> > +
> > + if (pid <= 0)
> > + return -EINVAL;
> > +
> > + p = find_get_pid(pid);
> > + if (!p)
> > + return -ESRCH;
> > +
> > + rcu_read_lock();
> > + tsk = pid_task(p, PIDTYPE_PID);
>
> You do not need find_get_pid() before rcu_lock and put_pid() at the end.
> You can just do find_vpid() under rcu_read_lock().
Ah, sorry. Somehow I forgot you need to call pidfd_create(pid), you can't
do this under rcu_read_lock().
So I was wrong, you can't avoid get/put_pid.
Oleg.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] pid: add pidfd_open()
2019-05-15 15:35 ` Oleg Nesterov
@ 2019-05-15 15:40 ` Christian Brauner
0 siblings, 0 replies; 18+ messages in thread
From: Christian Brauner @ 2019-05-15 15:40 UTC (permalink / raw)
To: Oleg Nesterov
Cc: jannh, viro, torvalds, linux-kernel, arnd, dhowells, akpm,
cyphar, ebiederm, elena.reshetova, keescook, luto, luto, tglx,
linux-alpha, linux-arm-kernel, linux-ia64, linux-m68k,
linux-mips, linux-parisc, linuxppc-dev, linux-s390, linux-sh,
sparclinux, linux-xtensa, linux-api, linux-arch, linux-kselftest
On Wed, May 15, 2019 at 05:35:15PM +0200, Oleg Nesterov wrote:
> On 05/15, Oleg Nesterov wrote:
> >
> > On 05/15, Christian Brauner wrote:
> > >
> > > +SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
> > > +{
> > > + int fd, ret;
> > > + struct pid *p;
> > > + struct task_struct *tsk;
> > > +
> > > + if (flags)
> > > + return -EINVAL;
> > > +
> > > + if (pid <= 0)
> > > + return -EINVAL;
> > > +
> > > + p = find_get_pid(pid);
> > > + if (!p)
> > > + return -ESRCH;
> > > +
> > > + rcu_read_lock();
> > > + tsk = pid_task(p, PIDTYPE_PID);
> >
> > You do not need find_get_pid() before rcu_lock and put_pid() at the end.
> > You can just do find_vpid() under rcu_read_lock().
>
> Ah, sorry. Somehow I forgot you need to call pidfd_create(pid), you can't
> do this under rcu_read_lock().
>
> So I was wrong, you can't avoid get/put_pid.
Yeah, I haven't made any changes yet.
Christian
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] pid: add pidfd_open()
2019-05-15 10:03 [PATCH 1/2] pid: add pidfd_open() Christian Brauner
` (3 preceding siblings ...)
2019-05-15 14:38 ` Oleg Nesterov
@ 2019-05-15 17:45 ` Daniel Colascione
2019-05-16 13:08 ` Christian Brauner
4 siblings, 1 reply; 18+ messages in thread
From: Daniel Colascione @ 2019-05-15 17:45 UTC (permalink / raw)
To: Christian Brauner
Cc: Jann Horn, Oleg Nesterov, Al Viro, Linus Torvalds, linux-kernel,
Arnd Bergmann, David Howells, Andrew Morton, Aleksa Sarai,
Eric W. Biederman, elena.reshetova, Kees Cook, Andy Lutomirski,
Andy Lutomirski, Thomas Gleixner, linux-alpha, linux-arm-kernel,
linux-ia64, linux-m68k, linux-mips, linux-parisc, linuxppc-dev,
linux-s390, linux-sh, sparclinux, linux-xtensa, Linux API,
linux-arch, open list:KERNEL SELFTEST FRAMEWORK
On Wed, May 15, 2019 at 3:04 AM Christian Brauner <christian@brauner.io> wrote:
>
> This adds the pidfd_open() syscall. It allows a caller to retrieve pollable
> pidfds for a process which did not get created via CLONE_PIDFD, i.e. for a
> process that is created via traditional fork()/clone() calls that is only
> referenced by a PID:
Thanks for doing this work. I'm really looking forward to this new
approach to process management.
> int pidfd = pidfd_open(1234, 0);
> ret = pidfd_send_signal(pidfd, SIGSTOP, NULL, 0);
>
> With the introduction of pidfds through CLONE_PIDFD it is possible to
> created pidfds at process creation time.
> However, a lot of processes get created with traditional PID-based calls
> such as fork() or clone() (without CLONE_PIDFD). For these processes a
> caller can currently not create a pollable pidfd. This is a huge problem
> for Android's low memory killer (LMK) and service managers such as systemd.
> Both are examples of tools that want to make use of pidfds to get reliable
> notification of process exit for non-parents (pidfd polling) and race-free
> signal sending (pidfd_send_signal()). They intend to switch to this API for
> process supervision/management as soon as possible. Having no way to get
> pollable pidfds from PID-only processes is one of the biggest blockers for
> them in adopting this api. With pidfd_open() making it possible to retrieve
> pidfd for PID-based processes we enable them to adopt this api.
>
> In line with Arnd's recent changes to consolidate syscall numbers across
> architectures, I have added the pidfd_open() syscall to all architectures
> at the same time.
I'm glad it's easier now.
> arch/alpha/kernel/syscalls/syscall.tbl | 1 +
> arch/arm64/include/asm/unistd32.h | 2 +
> arch/ia64/kernel/syscalls/syscall.tbl | 1 +
> arch/m68k/kernel/syscalls/syscall.tbl | 1 +
> arch/microblaze/kernel/syscalls/syscall.tbl | 1 +
> arch/mips/kernel/syscalls/syscall_n32.tbl | 1 +
> arch/parisc/kernel/syscalls/syscall.tbl | 1 +
> arch/powerpc/kernel/syscalls/syscall.tbl | 1 +
> arch/s390/kernel/syscalls/syscall.tbl | 1 +
> arch/sh/kernel/syscalls/syscall.tbl | 1 +
> arch/sparc/kernel/syscalls/syscall.tbl | 1 +
> arch/x86/entry/syscalls/syscall_32.tbl | 1 +
> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
> arch/xtensa/kernel/syscalls/syscall.tbl | 1 +
It'd be nice to arrange the system call tables so that we need to
change only one file when adding a new system call.
[Snip system call wiring]
> --- a/include/linux/pid.h
> +++ b/include/linux/pid.h
> @@ -67,6 +67,7 @@ struct pid
> extern struct pid init_struct_pid;
>
> extern const struct file_operations pidfd_fops;
> +extern int pidfd_create(struct pid *pid);
>
> static inline struct pid *get_pid(struct pid *pid)
> {
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index e2870fe1be5b..989055e0b501 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -929,6 +929,7 @@ asmlinkage long sys_clock_adjtime32(clockid_t which_clock,
> struct old_timex32 __user *tx);
> asmlinkage long sys_syncfs(int fd);
> asmlinkage long sys_setns(int fd, int nstype);
> +asmlinkage long sys_pidfd_open(pid_t pid, unsigned int flags);
> asmlinkage long sys_sendmmsg(int fd, struct mmsghdr __user *msg,
> unsigned int vlen, unsigned flags);
> asmlinkage long sys_process_vm_readv(pid_t pid,
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index dee7292e1df6..94a257a93d20 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -832,9 +832,11 @@ __SYSCALL(__NR_io_uring_setup, sys_io_uring_setup)
> __SYSCALL(__NR_io_uring_enter, sys_io_uring_enter)
> #define __NR_io_uring_register 427
> __SYSCALL(__NR_io_uring_register, sys_io_uring_register)
> +#define __NR_pidfd_open 428
> +__SYSCALL(__NR_pidfd_open, sys_pidfd_open)
>
> #undef __NR_syscalls
> -#define __NR_syscalls 428
> +#define __NR_syscalls 429
>
> /*
> * 32 bit systems traditionally used different
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 737db1828437..980cc1d2b8d4 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1714,7 +1714,7 @@ const struct file_operations pidfd_fops = {
> * Return: On success, a cloexec pidfd is returned.
> * On error, a negative errno number will be returned.
> */
> -static int pidfd_create(struct pid *pid)
> +int pidfd_create(struct pid *pid)
> {
> int fd;
>
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 20881598bdfa..237d18d6ecb8 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -38,6 +38,7 @@
> #include <linux/syscalls.h>
> #include <linux/proc_ns.h>
> #include <linux/proc_fs.h>
> +#include <linux/sched/signal.h>
> #include <linux/sched/task.h>
> #include <linux/idr.h>
>
> @@ -451,6 +452,53 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
> return idr_get_next(&ns->idr, &nr);
> }
>
> +/**
> + * pidfd_open() - Open new pid file descriptor.
> + *
> + * @pid: pid for which to retrieve a pidfd
> + * @flags: flags to pass
> + *
> + * This creates a new pid file descriptor with the O_CLOEXEC flag set for
> + * the process identified by @pid. Currently, the process identified by
> + * @pid must be a thread-group leader. This restriction currently exists
> + * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
> + * be used with CLONE_THREAD) and pidfd polling (only supports thread group
> + * leaders).
> + *
> + * Return: On success, a cloexec pidfd is returned.
> + * On error, a negative errno number will be returned.
> + */
> +SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
> +{
> + int fd, ret;
> + struct pid *p;
> + struct task_struct *tsk;
> +
> + if (flags)
> + return -EINVAL;
If we support blocking operations on pidfds, we'll want to be able to
put them in non-blocking mode. Does it make sense to accept and ignore
O_NONBLOCK here now?
> + if (pid <= 0)
> + return -EINVAL;
WDYT of defining pid == 0 to mean "open myself"?
> + p = find_get_pid(pid);
> + if (!p)
> + return -ESRCH;
> +
> + rcu_read_lock();
> + tsk = pid_task(p, PIDTYPE_PID);
> + if (!tsk)
> + ret = -ESRCH;
> + else if (unlikely(!thread_group_leader(tsk)))
> + ret = -EINVAL;
> + else
> + ret = 0;
> + rcu_read_unlock();
> +
> + fd = ret ?: pidfd_create(p);
> + put_pid(p);
> + return fd;
> +}
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] pid: add pidfd_open()
2019-05-15 17:45 ` Daniel Colascione
@ 2019-05-16 13:08 ` Christian Brauner
2019-05-16 14:03 ` Jann Horn
2019-05-16 14:53 ` Aleksa Sarai
0 siblings, 2 replies; 18+ messages in thread
From: Christian Brauner @ 2019-05-16 13:08 UTC (permalink / raw)
To: Daniel Colascione
Cc: Jann Horn, Oleg Nesterov, Al Viro, Linus Torvalds, linux-kernel,
Arnd Bergmann, David Howells, Andrew Morton, Aleksa Sarai,
Eric W. Biederman, elena.reshetova, Kees Cook, Andy Lutomirski,
Andy Lutomirski, Thomas Gleixner, linux-alpha, linux-arm-kernel,
linux-ia64, linux-m68k, linux-mips, linux-parisc, linuxppc-dev,
linux-s390, linux-sh, sparclinux, linux-xtensa, Linux API,
linux-arch, open list:KERNEL SELFTEST FRAMEWORK
On Wed, May 15, 2019 at 10:45:06AM -0700, Daniel Colascione wrote:
> On Wed, May 15, 2019 at 3:04 AM Christian Brauner <christian@brauner.io> wrote:
> >
> > This adds the pidfd_open() syscall. It allows a caller to retrieve pollable
> > pidfds for a process which did not get created via CLONE_PIDFD, i.e. for a
> > process that is created via traditional fork()/clone() calls that is only
> > referenced by a PID:
>
> Thanks for doing this work. I'm really looking forward to this new
> approach to process management.
Thanks! Glad to hear!
>
> > int pidfd = pidfd_open(1234, 0);
> > ret = pidfd_send_signal(pidfd, SIGSTOP, NULL, 0);
> >
> > With the introduction of pidfds through CLONE_PIDFD it is possible to
> > created pidfds at process creation time.
> > However, a lot of processes get created with traditional PID-based calls
> > such as fork() or clone() (without CLONE_PIDFD). For these processes a
> > caller can currently not create a pollable pidfd. This is a huge problem
> > for Android's low memory killer (LMK) and service managers such as systemd.
> > Both are examples of tools that want to make use of pidfds to get reliable
> > notification of process exit for non-parents (pidfd polling) and race-free
> > signal sending (pidfd_send_signal()). They intend to switch to this API for
> > process supervision/management as soon as possible. Having no way to get
> > pollable pidfds from PID-only processes is one of the biggest blockers for
> > them in adopting this api. With pidfd_open() making it possible to retrieve
> > pidfd for PID-based processes we enable them to adopt this api.
> >
> > In line with Arnd's recent changes to consolidate syscall numbers across
> > architectures, I have added the pidfd_open() syscall to all architectures
> > at the same time.
>
> I'm glad it's easier now.
>
> > arch/alpha/kernel/syscalls/syscall.tbl | 1 +
> > arch/arm64/include/asm/unistd32.h | 2 +
> > arch/ia64/kernel/syscalls/syscall.tbl | 1 +
> > arch/m68k/kernel/syscalls/syscall.tbl | 1 +
> > arch/microblaze/kernel/syscalls/syscall.tbl | 1 +
> > arch/mips/kernel/syscalls/syscall_n32.tbl | 1 +
> > arch/parisc/kernel/syscalls/syscall.tbl | 1 +
> > arch/powerpc/kernel/syscalls/syscall.tbl | 1 +
> > arch/s390/kernel/syscalls/syscall.tbl | 1 +
> > arch/sh/kernel/syscalls/syscall.tbl | 1 +
> > arch/sparc/kernel/syscalls/syscall.tbl | 1 +
> > arch/x86/entry/syscalls/syscall_32.tbl | 1 +
> > arch/x86/entry/syscalls/syscall_64.tbl | 1 +
> > arch/xtensa/kernel/syscalls/syscall.tbl | 1 +
>
> It'd be nice to arrange the system call tables so that we need to
> change only one file when adding a new system call.
>
> [Snip system call wiring]
>
> > --- a/include/linux/pid.h
> > +++ b/include/linux/pid.h
> > @@ -67,6 +67,7 @@ struct pid
> > extern struct pid init_struct_pid;
> >
> > extern const struct file_operations pidfd_fops;
> > +extern int pidfd_create(struct pid *pid);
> >
> > static inline struct pid *get_pid(struct pid *pid)
> > {
> > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> > index e2870fe1be5b..989055e0b501 100644
> > --- a/include/linux/syscalls.h
> > +++ b/include/linux/syscalls.h
> > @@ -929,6 +929,7 @@ asmlinkage long sys_clock_adjtime32(clockid_t which_clock,
> > struct old_timex32 __user *tx);
> > asmlinkage long sys_syncfs(int fd);
> > asmlinkage long sys_setns(int fd, int nstype);
> > +asmlinkage long sys_pidfd_open(pid_t pid, unsigned int flags);
> > asmlinkage long sys_sendmmsg(int fd, struct mmsghdr __user *msg,
> > unsigned int vlen, unsigned flags);
> > asmlinkage long sys_process_vm_readv(pid_t pid,
> > diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> > index dee7292e1df6..94a257a93d20 100644
> > --- a/include/uapi/asm-generic/unistd.h
> > +++ b/include/uapi/asm-generic/unistd.h
> > @@ -832,9 +832,11 @@ __SYSCALL(__NR_io_uring_setup, sys_io_uring_setup)
> > __SYSCALL(__NR_io_uring_enter, sys_io_uring_enter)
> > #define __NR_io_uring_register 427
> > __SYSCALL(__NR_io_uring_register, sys_io_uring_register)
> > +#define __NR_pidfd_open 428
> > +__SYSCALL(__NR_pidfd_open, sys_pidfd_open)
> >
> > #undef __NR_syscalls
> > -#define __NR_syscalls 428
> > +#define __NR_syscalls 429
> >
> > /*
> > * 32 bit systems traditionally used different
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index 737db1828437..980cc1d2b8d4 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -1714,7 +1714,7 @@ const struct file_operations pidfd_fops = {
> > * Return: On success, a cloexec pidfd is returned.
> > * On error, a negative errno number will be returned.
> > */
> > -static int pidfd_create(struct pid *pid)
> > +int pidfd_create(struct pid *pid)
> > {
> > int fd;
> >
> > diff --git a/kernel/pid.c b/kernel/pid.c
> > index 20881598bdfa..237d18d6ecb8 100644
> > --- a/kernel/pid.c
> > +++ b/kernel/pid.c
> > @@ -38,6 +38,7 @@
> > #include <linux/syscalls.h>
> > #include <linux/proc_ns.h>
> > #include <linux/proc_fs.h>
> > +#include <linux/sched/signal.h>
> > #include <linux/sched/task.h>
> > #include <linux/idr.h>
> >
> > @@ -451,6 +452,53 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
> > return idr_get_next(&ns->idr, &nr);
> > }
> >
> > +/**
> > + * pidfd_open() - Open new pid file descriptor.
> > + *
> > + * @pid: pid for which to retrieve a pidfd
> > + * @flags: flags to pass
> > + *
> > + * This creates a new pid file descriptor with the O_CLOEXEC flag set for
> > + * the process identified by @pid. Currently, the process identified by
> > + * @pid must be a thread-group leader. This restriction currently exists
> > + * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
> > + * be used with CLONE_THREAD) and pidfd polling (only supports thread group
> > + * leaders).
> > + *
> > + * Return: On success, a cloexec pidfd is returned.
> > + * On error, a negative errno number will be returned.
> > + */
> > +SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
> > +{
> > + int fd, ret;
> > + struct pid *p;
> > + struct task_struct *tsk;
> > +
> > + if (flags)
> > + return -EINVAL;
>
> If we support blocking operations on pidfds, we'll want to be able to
> put them in non-blocking mode. Does it make sense to accept and ignore
> O_NONBLOCK here now?
Hm, is there a race condition you see that would prevent you from using
fcntl()? If you're ok with using fcntl() I would argue we should hold on
tight to every bit we have for flags.
>
> > + if (pid <= 0)
> > + return -EINVAL;
>
> WDYT of defining pid == 0 to mean "open myself"?
I'm torn. It be a nice shortcut of course but pid being 0 is usually an
indicator for child processes. So unless the getpid() before
pidfd_open() is an issue I'd say let's leave it as is. If you really
want the shortcut might -1 be better?
>
> > + p = find_get_pid(pid);
> > + if (!p)
> > + return -ESRCH;
> > +
> > + rcu_read_lock();
> > + tsk = pid_task(p, PIDTYPE_PID);
> > + if (!tsk)
> > + ret = -ESRCH;
> > + else if (unlikely(!thread_group_leader(tsk)))
> > + ret = -EINVAL;
> > + else
> > + ret = 0;
> > + rcu_read_unlock();
> > +
> > + fd = ret ?: pidfd_create(p);
> > + put_pid(p);
> > + return fd;
> > +}
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] pid: add pidfd_open()
2019-05-16 13:08 ` Christian Brauner
@ 2019-05-16 14:03 ` Jann Horn
2019-05-16 14:05 ` Christian Brauner
2019-05-16 14:53 ` Aleksa Sarai
1 sibling, 1 reply; 18+ messages in thread
From: Jann Horn @ 2019-05-16 14:03 UTC (permalink / raw)
To: Christian Brauner, Daniel Colascione
Cc: Oleg Nesterov, Al Viro, Linus Torvalds, linux-kernel,
Arnd Bergmann, David Howells, Andrew Morton, Aleksa Sarai,
Eric W. Biederman, Elena Reshetova, Kees Cook, Andy Lutomirski,
Andy Lutomirski, Thomas Gleixner, linux-alpha, linux-arm-kernel,
linux-ia64, linux-m68k, linux-mips, linux-parisc, linuxppc-dev,
linux-s390, linux-sh, sparclinux, linux-xtensa, Linux API,
linux-arch, open list:KERNEL SELFTEST FRAMEWORK
On Thu, May 16, 2019 at 3:08 PM Christian Brauner <christian@brauner.io> wrote:
> On Wed, May 15, 2019 at 10:45:06AM -0700, Daniel Colascione wrote:
> > On Wed, May 15, 2019 at 3:04 AM Christian Brauner <christian@brauner.io> wrote:
> > >
> > > This adds the pidfd_open() syscall. It allows a caller to retrieve pollable
> > > pidfds for a process which did not get created via CLONE_PIDFD, i.e. for a
> > > process that is created via traditional fork()/clone() calls that is only
> > > referenced by a PID:
[...]
> > > +/**
> > > + * pidfd_open() - Open new pid file descriptor.
> > > + *
> > > + * @pid: pid for which to retrieve a pidfd
> > > + * @flags: flags to pass
> > > + *
> > > + * This creates a new pid file descriptor with the O_CLOEXEC flag set for
> > > + * the process identified by @pid. Currently, the process identified by
> > > + * @pid must be a thread-group leader. This restriction currently exists
> > > + * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
> > > + * be used with CLONE_THREAD) and pidfd polling (only supports thread group
> > > + * leaders).
> > > + *
> > > + * Return: On success, a cloexec pidfd is returned.
> > > + * On error, a negative errno number will be returned.
> > > + */
> > > +SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
> > > +{
[...]
> > > + if (pid <= 0)
> > > + return -EINVAL;
> >
> > WDYT of defining pid == 0 to mean "open myself"?
>
> I'm torn. It be a nice shortcut of course but pid being 0 is usually an
> indicator for child processes. So unless the getpid() before
> pidfd_open() is an issue I'd say let's leave it as is. If you really
> want the shortcut might -1 be better?
Joining the bikeshed painting club: Please don't allow either 0 or -1
as shortcut for "self". James Forshaw found an Android security bug a
while back (https://bugs.chromium.org/p/project-zero/issues/detail?id=727)
that passed a PID to getpidcon(), except that the PID was 0
(placeholder for oneway binder transactions), and then the service
thought it was talking to itself. You could pick some other number and
provide a #define for that, but I think pidfd_open(getpid(), ...)
makes more sense.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] pid: add pidfd_open()
2019-05-16 14:03 ` Jann Horn
@ 2019-05-16 14:05 ` Christian Brauner
0 siblings, 0 replies; 18+ messages in thread
From: Christian Brauner @ 2019-05-16 14:05 UTC (permalink / raw)
To: Jann Horn
Cc: Daniel Colascione, Oleg Nesterov, Al Viro, Linus Torvalds,
linux-kernel, Arnd Bergmann, David Howells, Andrew Morton,
Aleksa Sarai, Eric W. Biederman, Elena Reshetova, Kees Cook,
Andy Lutomirski, Andy Lutomirski, Thomas Gleixner, linux-alpha,
linux-arm-kernel, linux-ia64, linux-m68k, linux-mips,
linux-parisc, linuxppc-dev, linux-s390, linux-sh, sparclinux,
linux-xtensa, Linux API, linux-arch,
open list:KERNEL SELFTEST FRAMEWORK
On Thu, May 16, 2019 at 04:03:27PM +0200, Jann Horn wrote:
> On Thu, May 16, 2019 at 3:08 PM Christian Brauner <christian@brauner.io> wrote:
> > On Wed, May 15, 2019 at 10:45:06AM -0700, Daniel Colascione wrote:
> > > On Wed, May 15, 2019 at 3:04 AM Christian Brauner <christian@brauner.io> wrote:
> > > >
> > > > This adds the pidfd_open() syscall. It allows a caller to retrieve pollable
> > > > pidfds for a process which did not get created via CLONE_PIDFD, i.e. for a
> > > > process that is created via traditional fork()/clone() calls that is only
> > > > referenced by a PID:
> [...]
> > > > +/**
> > > > + * pidfd_open() - Open new pid file descriptor.
> > > > + *
> > > > + * @pid: pid for which to retrieve a pidfd
> > > > + * @flags: flags to pass
> > > > + *
> > > > + * This creates a new pid file descriptor with the O_CLOEXEC flag set for
> > > > + * the process identified by @pid. Currently, the process identified by
> > > > + * @pid must be a thread-group leader. This restriction currently exists
> > > > + * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
> > > > + * be used with CLONE_THREAD) and pidfd polling (only supports thread group
> > > > + * leaders).
> > > > + *
> > > > + * Return: On success, a cloexec pidfd is returned.
> > > > + * On error, a negative errno number will be returned.
> > > > + */
> > > > +SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
> > > > +{
> [...]
> > > > + if (pid <= 0)
> > > > + return -EINVAL;
> > >
> > > WDYT of defining pid == 0 to mean "open myself"?
> >
> > I'm torn. It be a nice shortcut of course but pid being 0 is usually an
> > indicator for child processes. So unless the getpid() before
> > pidfd_open() is an issue I'd say let's leave it as is. If you really
> > want the shortcut might -1 be better?
>
> Joining the bikeshed painting club: Please don't allow either 0 or -1
> as shortcut for "self". James Forshaw found an Android security bug a
> while back (https://bugs.chromium.org/p/project-zero/issues/detail?id=727)
> that passed a PID to getpidcon(), except that the PID was 0
> (placeholder for oneway binder transactions), and then the service
> thought it was talking to itself. You could pick some other number and
> provide a #define for that, but I think pidfd_open(getpid(), ...)
> makes more sense.
Yes, I agree. I left it as is for v1, i.e. no shortcut; getpid() should
do.
Christian
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/2] pid: add pidfd_open()
2019-05-16 13:08 ` Christian Brauner
2019-05-16 14:03 ` Jann Horn
@ 2019-05-16 14:53 ` Aleksa Sarai
1 sibling, 0 replies; 18+ messages in thread
From: Aleksa Sarai @ 2019-05-16 14:53 UTC (permalink / raw)
To: Christian Brauner
Cc: Daniel Colascione, Jann Horn, Oleg Nesterov, Al Viro,
Linus Torvalds, linux-kernel, Arnd Bergmann, David Howells,
Andrew Morton, Eric W. Biederman, elena.reshetova, Kees Cook,
Andy Lutomirski, Andy Lutomirski, Thomas Gleixner, linux-alpha,
linux-arm-kernel, linux-ia64, linux-m68k, linux-mips,
linux-parisc, linuxppc-dev, linux-s390, linux-sh, sparclinux,
linux-xtensa, Linux API, linux-arch,
open list:KERNEL SELFTEST FRAMEWORK
[-- Attachment #1: Type: text/plain, Size: 921 bytes --]
On 2019-05-16, Christian Brauner <christian@brauner.io> wrote:
> On Wed, May 15, 2019 at 10:45:06AM -0700, Daniel Colascione wrote:
> > On Wed, May 15, 2019 at 3:04 AM Christian Brauner <christian@brauner.io> wrote:
> > > + if (pid <= 0)
> > > + return -EINVAL;
> >
> > WDYT of defining pid == 0 to mean "open myself"?
>
> I'm torn. It be a nice shortcut of course but pid being 0 is usually an
> indicator for child processes. So unless the getpid() before
> pidfd_open() is an issue I'd say let's leave it as is. If you really
> want the shortcut might -1 be better?
I'd suggest not using negative numbers, and instead reserving them for
PIDTYPE_TGID if we ever want to have that in the future. IMHO, doing
pfd = pidfd_open(getpid(), 0);
is not the end of the world.
--
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 18+ messages in thread