* Re: [PATCH v5 2/3] pid: Introduce pidfd_getfd syscall
2019-12-20 23:28 [PATCH v5 2/3] pid: Introduce pidfd_getfd syscall Sargun Dhillon
@ 2019-12-21 0:27 ` Aleksa Sarai
2019-12-21 1:45 ` Sargun Dhillon
2019-12-22 12:47 ` Christian Brauner
2019-12-24 11:09 ` kbuild test robot
2 siblings, 1 reply; 7+ messages in thread
From: Aleksa Sarai @ 2019-12-21 0:27 UTC (permalink / raw)
To: Sargun Dhillon
Cc: linux-kernel, containers, linux-api, linux-fsdevel, ealvarez,
arnd, jannh, gpascutto, jld, oleg, luto, viro, christian.brauner
[-- Attachment #1: Type: text/plain, Size: 17449 bytes --]
On 2019-12-20, Sargun Dhillon <sargun@sargun.me> wrote:
> This syscall allows for the retrieval of file descriptors from other
> processes, based on their pidfd. This is possible using ptrace, and
> injection of parasitic code along with using SCM_RIGHTS to move
> file descriptors between a tracee and a tracer. Unfortunately, ptrace
> comes with a high cost of requiring the process to be stopped, and
> breaks debuggers. This does not require stopping the process under
> manipulation.
>
> One reason to use this is to allow sandboxers to take actions on file
> descriptors on the behalf of another process. For example, this can be
> combined with seccomp-bpf's user notification to do on-demand fd
> extraction and take privileged actions. For example, it can be used
> to bind a socket to a privileged port.
>
> /* prototype */
> /*
> * pidfd_getfd_options is an extensible struct which can have options
> * added to it. If options is NULL, size, and it will be ignored be
> * ignored, otherwise, size should be set to sizeof(*options). If
> * option is newer than the current kernel version, E2BIG will be
> * returned.
> */
> struct pidfd_getfd_options {};
> long pidfd_getfd(int pidfd, int fd, unsigned int flags,
> struct pidfd_getfd_options *options, size_t size);
>
> /* testing */
> Ran self-test suite on x86_64
>
> Signed-off-by: Sargun Dhillon <sargun@sargun.me>
> ---
> MAINTAINERS | 1 +
> arch/alpha/kernel/syscalls/syscall.tbl | 1 +
> arch/arm/tools/syscall.tbl | 1 +
> arch/arm64/include/asm/unistd.h | 2 +-
> arch/arm64/include/asm/unistd32.h | 2 +
> arch/ia64/kernel/syscalls/syscall.tbl | 1 +
> arch/m68k/kernel/syscalls/syscall.tbl | 1 +
> arch/microblaze/kernel/syscalls/syscall.tbl | 1 +
> arch/mips/kernel/syscalls/syscall_n32.tbl | 1 +
> arch/mips/kernel/syscalls/syscall_n64.tbl | 1 +
> arch/mips/kernel/syscalls/syscall_o32.tbl | 1 +
> arch/parisc/kernel/syscalls/syscall.tbl | 1 +
> arch/powerpc/kernel/syscalls/syscall.tbl | 1 +
> arch/s390/kernel/syscalls/syscall.tbl | 1 +
> arch/sh/kernel/syscalls/syscall.tbl | 1 +
> arch/sparc/kernel/syscalls/syscall.tbl | 1 +
> arch/x86/entry/syscalls/syscall_32.tbl | 1 +
> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
> arch/xtensa/kernel/syscalls/syscall.tbl | 1 +
> include/linux/syscalls.h | 4 +
> include/uapi/asm-generic/unistd.h | 3 +-
> include/uapi/linux/pidfd.h | 10 ++
> kernel/pid.c | 115 ++++++++++++++++++++
> 23 files changed, 151 insertions(+), 2 deletions(-)
> create mode 100644 include/uapi/linux/pidfd.h
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index cc0a4a8ae06a..bc370ff59dbf 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -13014,6 +13014,7 @@ M: Christian Brauner <christian@brauner.io>
> L: linux-kernel@vger.kernel.org
> S: Maintained
> T: git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git
> +F: include/uapi/linux/pidfd.h
> F: samples/pidfd/
> F: tools/testing/selftests/pidfd/
> F: tools/testing/selftests/clone3/
> diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
> index 8e13b0b2928d..d1cac0d657b7 100644
> --- a/arch/alpha/kernel/syscalls/syscall.tbl
> +++ b/arch/alpha/kernel/syscalls/syscall.tbl
> @@ -475,3 +475,4 @@
> 543 common fspick sys_fspick
> 544 common pidfd_open sys_pidfd_open
> # 545 reserved for clone3
> +548 common pidfd_getfd sys_pidfd
> diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
> index 6da7dc4d79cc..ba045e2f3a60 100644
> --- a/arch/arm/tools/syscall.tbl
> +++ b/arch/arm/tools/syscall.tbl
> @@ -449,3 +449,4 @@
> 433 common fspick sys_fspick
> 434 common pidfd_open sys_pidfd_open
> 435 common clone3 sys_clone3
> +438 common pidfd_getfd sys_pidfd_getfd
> diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
> index 2629a68b8724..b722e47377a5 100644
> --- a/arch/arm64/include/asm/unistd.h
> +++ b/arch/arm64/include/asm/unistd.h
> @@ -38,7 +38,7 @@
> #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5)
> #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800)
>
> -#define __NR_compat_syscalls 436
> +#define __NR_compat_syscalls 439
> #endif
>
> #define __ARCH_WANT_SYS_CLONE
> diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
> index 94ab29cf4f00..a8da97a2de41 100644
> --- a/arch/arm64/include/asm/unistd32.h
> +++ b/arch/arm64/include/asm/unistd32.h
> @@ -879,6 +879,8 @@ __SYSCALL(__NR_fspick, sys_fspick)
> __SYSCALL(__NR_pidfd_open, sys_pidfd_open)
> #define __NR_clone3 435
> __SYSCALL(__NR_clone3, sys_clone3)
> +#define __NR_pidfd_getfd 438
> +__SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
>
> /*
> * Please add new compat syscalls above this comment and update
> diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
> index 36d5faf4c86c..2b11adfc860c 100644
> --- a/arch/ia64/kernel/syscalls/syscall.tbl
> +++ b/arch/ia64/kernel/syscalls/syscall.tbl
> @@ -356,3 +356,4 @@
> 433 common fspick sys_fspick
> 434 common pidfd_open sys_pidfd_open
> # 435 reserved for clone3
> +438 common pidfd_getfd sys_pidfd_getfd
> diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
> index a88a285a0e5f..44e879e98459 100644
> --- a/arch/m68k/kernel/syscalls/syscall.tbl
> +++ b/arch/m68k/kernel/syscalls/syscall.tbl
> @@ -435,3 +435,4 @@
> 433 common fspick sys_fspick
> 434 common pidfd_open sys_pidfd_open
> # 435 reserved for clone3
> +438 common pidfd_getfd sys_pidfd_getfd
> diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
> index 09b0cd7dab0a..7afa00125cc4 100644
> --- a/arch/microblaze/kernel/syscalls/syscall.tbl
> +++ b/arch/microblaze/kernel/syscalls/syscall.tbl
> @@ -441,3 +441,4 @@
> 433 common fspick sys_fspick
> 434 common pidfd_open sys_pidfd_open
> 435 common clone3 sys_clone3
> +438 common pidfd_getfd sys_pidfd_getfd
> diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
> index e7c5ab38e403..856d5ba34461 100644
> --- a/arch/mips/kernel/syscalls/syscall_n32.tbl
> +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
> @@ -374,3 +374,4 @@
> 433 n32 fspick sys_fspick
> 434 n32 pidfd_open sys_pidfd_open
> 435 n32 clone3 __sys_clone3
> +438 n32 pidfd_getfd sys_pidfd_getfd
> diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
> index 13cd66581f3b..2db6075352f3 100644
> --- a/arch/mips/kernel/syscalls/syscall_n64.tbl
> +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
> @@ -350,3 +350,4 @@
> 433 n64 fspick sys_fspick
> 434 n64 pidfd_open sys_pidfd_open
> 435 n64 clone3 __sys_clone3
> +438 n64 pidfd_getfd sys_pidfd_getfd
> diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
> index 353539ea4140..e9f9d4a9b105 100644
> --- a/arch/mips/kernel/syscalls/syscall_o32.tbl
> +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
> @@ -423,3 +423,4 @@
> 433 o32 fspick sys_fspick
> 434 o32 pidfd_open sys_pidfd_open
> 435 o32 clone3 __sys_clone3
> +438 o32 pidfd_getfd sys_pidfd_getfd
> diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
> index 285ff516150c..c58c7eb144ca 100644
> --- a/arch/parisc/kernel/syscalls/syscall.tbl
> +++ b/arch/parisc/kernel/syscalls/syscall.tbl
> @@ -433,3 +433,4 @@
> 433 common fspick sys_fspick
> 434 common pidfd_open sys_pidfd_open
> 435 common clone3 sys_clone3_wrapper
> +438 common pidfd_getfd sys_pidfd_getfd
> diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
> index 43f736ed47f2..707609bfe3ea 100644
> --- a/arch/powerpc/kernel/syscalls/syscall.tbl
> +++ b/arch/powerpc/kernel/syscalls/syscall.tbl
> @@ -517,3 +517,4 @@
> 433 common fspick sys_fspick
> 434 common pidfd_open sys_pidfd_open
> 435 nospu clone3 ppc_clone3
> +438 common pidfd_getfd sys_pidfd_getfd
> diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
> index 3054e9c035a3..185cd624face 100644
> --- a/arch/s390/kernel/syscalls/syscall.tbl
> +++ b/arch/s390/kernel/syscalls/syscall.tbl
> @@ -438,3 +438,4 @@
> 433 common fspick sys_fspick sys_fspick
> 434 common pidfd_open sys_pidfd_open sys_pidfd_open
> 435 common clone3 sys_clone3 sys_clone3
> +438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd
> diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
> index b5ed26c4c005..88f90895aad8 100644
> --- a/arch/sh/kernel/syscalls/syscall.tbl
> +++ b/arch/sh/kernel/syscalls/syscall.tbl
> @@ -438,3 +438,4 @@
> 433 common fspick sys_fspick
> 434 common pidfd_open sys_pidfd_open
> # 435 reserved for clone3
> +438 common pidfd_getfd sys_pidfd_getfd
> diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
> index 8c8cc7537fb2..218df6a2326e 100644
> --- a/arch/sparc/kernel/syscalls/syscall.tbl
> +++ b/arch/sparc/kernel/syscalls/syscall.tbl
> @@ -481,3 +481,4 @@
> 433 common fspick sys_fspick
> 434 common pidfd_open sys_pidfd_open
> # 435 reserved for clone3
> +438 common pidfd_getfd sys_pidfd_getfd
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 15908eb9b17e..9c3101b65e0f 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -440,3 +440,4 @@
> 433 i386 fspick sys_fspick __ia32_sys_fspick
> 434 i386 pidfd_open sys_pidfd_open __ia32_sys_pidfd_open
> 435 i386 clone3 sys_clone3 __ia32_sys_clone3
> +438 i386 pidfd_getfd sys_pidfd_getfd __ia32_sys_pidfd_getfd
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index c29976eca4a8..cef85db75a62 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -357,6 +357,7 @@
> 433 common fspick __x64_sys_fspick
> 434 common pidfd_open __x64_sys_pidfd_open
> 435 common clone3 __x64_sys_clone3/ptregs
> +438 common pidfd_getfd __x64_sys_pidfd_getfd
>
> #
> # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
> index 25f4de729a6d..ae15183def12 100644
> --- a/arch/xtensa/kernel/syscalls/syscall.tbl
> +++ b/arch/xtensa/kernel/syscalls/syscall.tbl
> @@ -406,3 +406,4 @@
> 433 common fspick sys_fspick
> 434 common pidfd_open sys_pidfd_open
> 435 common clone3 sys_clone3
> +438 common pidfd_getfd sys_pidfd_getfd
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 2960dedcfde8..62fe706329d1 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -69,6 +69,7 @@ struct rseq;
> union bpf_attr;
> struct io_uring_params;
> struct clone_args;
> +struct pidfd_getfd_options;
>
> #include <linux/types.h>
> #include <linux/aio_abi.h>
> @@ -1000,6 +1001,9 @@ asmlinkage long sys_fspick(int dfd, const char __user *path, unsigned int flags)
> asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
> siginfo_t __user *info,
> unsigned int flags);
> +asmlinkage long sys_pidfd_getfd(int pidfd, int fd,
> + struct pidfd_getfd_options __user *options,
> + size_t, usize);
>
> /*
> * Architecture-specific system calls
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 1fc8faa6e973..f358488366f6 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -850,9 +850,10 @@ __SYSCALL(__NR_pidfd_open, sys_pidfd_open)
> #define __NR_clone3 435
> __SYSCALL(__NR_clone3, sys_clone3)
> #endif
> +#define __NR_pidfd_getfd 438
>
> #undef __NR_syscalls
> -#define __NR_syscalls 436
> +#define __NR_syscalls 439
>
> /*
> * 32 bit systems traditionally used different
> diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
> new file mode 100644
> index 000000000000..0a3fc922661d
> --- /dev/null
> +++ b/include/uapi/linux/pidfd.h
> @@ -0,0 +1,10 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +#ifndef _UAPI_LINUX_PIDFD_H
> +#define _UAPI_LINUX_PIDFD_H
> +
> +struct pidfd_getfd_options {};
Are empty structs well-defined in C (from memory, some compilers make
them non-zero in size)? Since we probably plan to add a flags field in
the future anyway, why not just have a __u64 flags which must be zeroed?
> +
> +#define PIDFD_GETFD_OPTIONS_SIZE_VER0 0
> +#define PIDFD_GETFD_OPTIONS_SIZE_LATEST PIDFD_GETFD_OPTIONS_SIZE_VER0
> +
> +#endif /* _UAPI_LINUX_PIDFD_H */
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 2278e249141d..2a9cb4be383f 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -42,6 +42,7 @@
> #include <linux/sched/signal.h>
> #include <linux/sched/task.h>
> #include <linux/idr.h>
> +#include <uapi/linux/pidfd.h>
>
> struct pid init_struct_pid = {
> .count = REFCOUNT_INIT(1),
> @@ -578,3 +579,117 @@ void __init pid_idr_init(void)
> init_pid_ns.pid_cachep = KMEM_CACHE(pid,
> SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
> }
> +
> +static struct file *__pidfd_getfd_fget_task(struct task_struct *task, u32 fd)
> +{
> + struct file *file;
> + int ret;
> +
> + ret = mutex_lock_killable(&task->signal->cred_guard_mutex);
> + if (ret)
> + return ERR_PTR(ret);
> +
> + if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS)) {
> + file = ERR_PTR(-EPERM);
> + goto out;
> + }
> +
> + file = fget_task(task, fd);
> + if (!file)
> + file = ERR_PTR(-EBADF);
> +
> +out:
> + mutex_unlock(&task->signal->cred_guard_mutex);
> + return file;
> +}
> +
> +static long pidfd_getfd(struct pid *pid, u32 fd)
> +{
> + struct task_struct *task;
> + struct file *file;
> + int ret, retfd;
> +
> + task = get_pid_task(pid, PIDTYPE_PID);
> + if (!task)
> + return -ESRCH;
> +
> + file = __pidfd_getfd_fget_task(task, fd);
> + put_task_struct(task);
> + if (IS_ERR(file))
> + return PTR_ERR(file);
> +
> + retfd = get_unused_fd_flags(O_CLOEXEC);
> + if (retfd < 0) {
> + ret = retfd;
> + goto out;
> + }
> +
> + /*
> + * security_file_receive must come last since it may have side effects
> + * and cannot be reversed.
> + */
> + ret = security_file_receive(file);
> + if (ret)
> + goto out_put_fd;
> +
> + fd_install(retfd, file);
> + return retfd;
> +
> +out_put_fd:
> + put_unused_fd(retfd);
> +out:
> + fput(file);
> + return ret;
> +}
> +
> +/**
> + * sys_pidfd_getfd() - Get a file descriptor from another process
> + *
> + * @pidfd: file descriptor of the process
> + * @fd: the file descriptor number to get
> + * @options: options on how to get the fd
> + * @usize: the size of options
> + *
> + * This syscall requires that the process has the ability to ptrace the
> + * process represented by the pidfd. It will return a duplicated version
> + * of the file descriptor on success. The process who which is having
> + * its file descriptor taken is otherwise unaffected. If options is NULL
> + * it is ignored along with usize.
> + *
> + * Return: On success, a file descriptor with cloexec is returned.
> + * On error, a negative errno number will be returned.
> + */
> +SYSCALL_DEFINE4(pidfd_getfd, int, pidfd, int, fd,
> + struct pidfd_getfd_options __user *, options, size_t, usize)
> +{
> + struct pid *pid;
> + struct fd f;
> + int ret;
> +
> + BUILD_BUG_ON(sizeof(struct pidfd_getfd_options) != PIDFD_GETFD_OPTIONS_SIZE_LATEST);
> +
> + /*
> + * options is currently unused, verify it's unset or if it is set,
> + * ensure that size is 0.
> + *
> + * In the future, this will need to adopt copy_struct_from_user.
> + */
> + if (options && usize > PIDFD_GETFD_OPTIONS_SIZE_VER0)
> + return -E2BIG;
I wouldn't suggest doing this. I understand that this is simpler, but it
will cause problems once we add a field to pidfd_getfd_options -- newer
programs compiled with a larger struct won't work on older kernels with
this check (even if the struct is completely zeroed).
If copy_struct_from_user() doesn't work with usize == 0, I would *much*
prefer a patch which fixes that (effectively copy_struct_from_user()
with a usize == 0 should just be a call to check_zeroed_user()).
> + f = fdget(pidfd);
> + if (!f.file)
> + return -EBADF;
> +
> + pid = pidfd_pid(f.file);
> + if (IS_ERR(pid)) {
> + ret = PTR_ERR(pid);
> + goto out;
> + }
> +
> + ret = pidfd_getfd(pid, fd);
> +
> +out:
> + fdput(f);
> + return ret;
> +}
> --
> 2.20.1
--
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v5 2/3] pid: Introduce pidfd_getfd syscall
2019-12-21 0:27 ` Aleksa Sarai
@ 2019-12-21 1:45 ` Sargun Dhillon
0 siblings, 0 replies; 7+ messages in thread
From: Sargun Dhillon @ 2019-12-21 1:45 UTC (permalink / raw)
To: Aleksa Sarai
Cc: LKML, Linux Containers, Linux API, Linux FS-devel Mailing List,
Emilio Cobos Álvarez, Arnd Bergmann, Jann Horn,
Gian-Carlo Pascutto, Jed Davis, Oleg Nesterov, Andy Lutomirski,
Al Viro, Christian Brauner
On Fri, Dec 20, 2019 at 4:27 PM Aleksa Sarai <asarai@suse.de> wrote:
>
> On 2019-12-20, Sargun Dhillon <sargun@sargun.me> wrote:
> > diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
> > new file mode 100644
> > index 000000000000..0a3fc922661d
> > --- /dev/null
> > +++ b/include/uapi/linux/pidfd.h
> > @@ -0,0 +1,10 @@
> > +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> > +#ifndef _UAPI_LINUX_PIDFD_H
> > +#define _UAPI_LINUX_PIDFD_H
> > +
> > +struct pidfd_getfd_options {};
>
> Are empty structs well-defined in C (from memory, some compilers make
> them non-zero in size)? Since we probably plan to add a flags field in
> the future anyway, why not just have a __u64 flags which must be zeroed?
>
It's allowed in GCC:
https://gcc.gnu.org/onlinedocs/gcc-8.1.0/gcc/Empty-Structures.html
I can add an __aligned_u64 flags for now, and just say something like
"reserved". This will also solve the latter issue, and I'll just use
copy_struct_from_user,
as long as Christian is okay with having an unused (reserved) flag member.
> > + f = fdget(pidfd);
> > + if (!f.file)
> > + return -EBADF;
> > +
> > + pid = pidfd_pid(f.file);
> > + if (IS_ERR(pid)) {
> > + ret = PTR_ERR(pid);
> > + goto out;
> > + }
> > +
> > + ret = pidfd_getfd(pid, fd);
> > +
> > +out:
> > + fdput(f);
> > + return ret;
> > +}
> > --
> > 2.20.1
>
> --
> Aleksa Sarai
> Senior Software Engineer (Containers)
> SUSE Linux GmbH
> <https://www.cyphar.com/>
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v5 2/3] pid: Introduce pidfd_getfd syscall
2019-12-20 23:28 [PATCH v5 2/3] pid: Introduce pidfd_getfd syscall Sargun Dhillon
2019-12-21 0:27 ` Aleksa Sarai
@ 2019-12-22 12:47 ` Christian Brauner
2019-12-22 18:36 ` Sargun Dhillon
2019-12-24 11:09 ` kbuild test robot
2 siblings, 1 reply; 7+ messages in thread
From: Christian Brauner @ 2019-12-22 12:47 UTC (permalink / raw)
To: Sargun Dhillon
Cc: linux-kernel, containers, linux-api, linux-fsdevel, tycho, jannh,
cyphar, oleg, luto, viro, gpascutto, ealvarez, fweimer, jld,
arnd
On Fri, Dec 20, 2019 at 11:28:13PM +0000, Sargun Dhillon wrote:
> This syscall allows for the retrieval of file descriptors from other
> processes, based on their pidfd. This is possible using ptrace, and
> injection of parasitic code along with using SCM_RIGHTS to move
> file descriptors between a tracee and a tracer. Unfortunately, ptrace
> comes with a high cost of requiring the process to be stopped, and
> breaks debuggers. This does not require stopping the process under
> manipulation.
>
> One reason to use this is to allow sandboxers to take actions on file
> descriptors on the behalf of another process. For example, this can be
> combined with seccomp-bpf's user notification to do on-demand fd
> extraction and take privileged actions. For example, it can be used
> to bind a socket to a privileged port.
>
> /* prototype */
> /*
> * pidfd_getfd_options is an extensible struct which can have options
> * added to it. If options is NULL, size, and it will be ignored be
> * ignored, otherwise, size should be set to sizeof(*options). If
> * option is newer than the current kernel version, E2BIG will be
> * returned.
> */
> struct pidfd_getfd_options {};
> long pidfd_getfd(int pidfd, int fd, unsigned int flags,
> struct pidfd_getfd_options *options, size_t size);
The prototype advertises a flags argument but the actual
+SYSCALL_DEFINE4(pidfd_getfd, int, pidfd, int, fd,
+ struct pidfd_getfd_options __user *, options, size_t, usize)
does not have a flags argument...
I think having a flags argument makes a lot of sense.
I'm not sure what to think about the struct. I agree with Aleksa that
having an empty struct is not a great idea. From a design perspective it
seems very out of place. If we do a struct at all putting at least a
single reserved field in there might makes more sense.
In general, I think we need to have a _concrete_ reason why putting a
struct versioned by size as arguments for this syscall.
That means we need to have at least a concrete example for a new feature
for this syscall where a flag would not convey enough information.
And I'm not sure that there is a good one... I guess one thing I can
think of is that a caller might want dup-like semantics, i.e. a caller
might want to say:
pidfd_getfd(<pidfd>, <fd-to-get>, <fd-number-to-want>, <flags>, ...)
such that after pidfd_getfd() returns <fd-to-get> corresponds to
<fd-number-to-want> in the caller. But that can also be achieved via:
int fd = pidfd_getfd(<pidfd>, <fd-to-get>, <flags>, ...)
int final_fd = dup3(fd, <newfd>, O_CLOEXEC)
>
> /* testing */
> Ran self-test suite on x86_64
+1
>
> Signed-off-by: Sargun Dhillon <sargun@sargun.me>
> ---
> MAINTAINERS | 1 +
> arch/alpha/kernel/syscalls/syscall.tbl | 1 +
> arch/arm/tools/syscall.tbl | 1 +
> arch/arm64/include/asm/unistd.h | 2 +-
> arch/arm64/include/asm/unistd32.h | 2 +
> arch/ia64/kernel/syscalls/syscall.tbl | 1 +
> arch/m68k/kernel/syscalls/syscall.tbl | 1 +
> arch/microblaze/kernel/syscalls/syscall.tbl | 1 +
> arch/mips/kernel/syscalls/syscall_n32.tbl | 1 +
> arch/mips/kernel/syscalls/syscall_n64.tbl | 1 +
> arch/mips/kernel/syscalls/syscall_o32.tbl | 1 +
> arch/parisc/kernel/syscalls/syscall.tbl | 1 +
> arch/powerpc/kernel/syscalls/syscall.tbl | 1 +
> arch/s390/kernel/syscalls/syscall.tbl | 1 +
> arch/sh/kernel/syscalls/syscall.tbl | 1 +
> arch/sparc/kernel/syscalls/syscall.tbl | 1 +
> arch/x86/entry/syscalls/syscall_32.tbl | 1 +
> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
> arch/xtensa/kernel/syscalls/syscall.tbl | 1 +
> include/linux/syscalls.h | 4 +
> include/uapi/asm-generic/unistd.h | 3 +-
> include/uapi/linux/pidfd.h | 10 ++
> kernel/pid.c | 115 ++++++++++++++++++++
> 23 files changed, 151 insertions(+), 2 deletions(-)
> create mode 100644 include/uapi/linux/pidfd.h
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index cc0a4a8ae06a..bc370ff59dbf 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -13014,6 +13014,7 @@ M: Christian Brauner <christian@brauner.io>
> L: linux-kernel@vger.kernel.org
> S: Maintained
> T: git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git
> +F: include/uapi/linux/pidfd.h
> F: samples/pidfd/
> F: tools/testing/selftests/pidfd/
> F: tools/testing/selftests/clone3/
> diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
> index 8e13b0b2928d..d1cac0d657b7 100644
> --- a/arch/alpha/kernel/syscalls/syscall.tbl
> +++ b/arch/alpha/kernel/syscalls/syscall.tbl
> @@ -475,3 +475,4 @@
> 543 common fspick sys_fspick
> 544 common pidfd_open sys_pidfd_open
> # 545 reserved for clone3
> +548 common pidfd_getfd sys_pidfd
> diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
> index 6da7dc4d79cc..ba045e2f3a60 100644
> --- a/arch/arm/tools/syscall.tbl
> +++ b/arch/arm/tools/syscall.tbl
> @@ -449,3 +449,4 @@
> 433 common fspick sys_fspick
> 434 common pidfd_open sys_pidfd_open
> 435 common clone3 sys_clone3
> +438 common pidfd_getfd sys_pidfd_getfd
> diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
> index 2629a68b8724..b722e47377a5 100644
> --- a/arch/arm64/include/asm/unistd.h
> +++ b/arch/arm64/include/asm/unistd.h
> @@ -38,7 +38,7 @@
> #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5)
> #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800)
>
> -#define __NR_compat_syscalls 436
> +#define __NR_compat_syscalls 439
> #endif
>
> #define __ARCH_WANT_SYS_CLONE
> diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
> index 94ab29cf4f00..a8da97a2de41 100644
> --- a/arch/arm64/include/asm/unistd32.h
> +++ b/arch/arm64/include/asm/unistd32.h
> @@ -879,6 +879,8 @@ __SYSCALL(__NR_fspick, sys_fspick)
> __SYSCALL(__NR_pidfd_open, sys_pidfd_open)
> #define __NR_clone3 435
> __SYSCALL(__NR_clone3, sys_clone3)
> +#define __NR_pidfd_getfd 438
> +__SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
>
> /*
> * Please add new compat syscalls above this comment and update
> diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
> index 36d5faf4c86c..2b11adfc860c 100644
> --- a/arch/ia64/kernel/syscalls/syscall.tbl
> +++ b/arch/ia64/kernel/syscalls/syscall.tbl
> @@ -356,3 +356,4 @@
> 433 common fspick sys_fspick
> 434 common pidfd_open sys_pidfd_open
> # 435 reserved for clone3
> +438 common pidfd_getfd sys_pidfd_getfd
> diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
> index a88a285a0e5f..44e879e98459 100644
> --- a/arch/m68k/kernel/syscalls/syscall.tbl
> +++ b/arch/m68k/kernel/syscalls/syscall.tbl
> @@ -435,3 +435,4 @@
> 433 common fspick sys_fspick
> 434 common pidfd_open sys_pidfd_open
> # 435 reserved for clone3
> +438 common pidfd_getfd sys_pidfd_getfd
> diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
> index 09b0cd7dab0a..7afa00125cc4 100644
> --- a/arch/microblaze/kernel/syscalls/syscall.tbl
> +++ b/arch/microblaze/kernel/syscalls/syscall.tbl
> @@ -441,3 +441,4 @@
> 433 common fspick sys_fspick
> 434 common pidfd_open sys_pidfd_open
> 435 common clone3 sys_clone3
> +438 common pidfd_getfd sys_pidfd_getfd
> diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
> index e7c5ab38e403..856d5ba34461 100644
> --- a/arch/mips/kernel/syscalls/syscall_n32.tbl
> +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
> @@ -374,3 +374,4 @@
> 433 n32 fspick sys_fspick
> 434 n32 pidfd_open sys_pidfd_open
> 435 n32 clone3 __sys_clone3
> +438 n32 pidfd_getfd sys_pidfd_getfd
> diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
> index 13cd66581f3b..2db6075352f3 100644
> --- a/arch/mips/kernel/syscalls/syscall_n64.tbl
> +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
> @@ -350,3 +350,4 @@
> 433 n64 fspick sys_fspick
> 434 n64 pidfd_open sys_pidfd_open
> 435 n64 clone3 __sys_clone3
> +438 n64 pidfd_getfd sys_pidfd_getfd
> diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
> index 353539ea4140..e9f9d4a9b105 100644
> --- a/arch/mips/kernel/syscalls/syscall_o32.tbl
> +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
> @@ -423,3 +423,4 @@
> 433 o32 fspick sys_fspick
> 434 o32 pidfd_open sys_pidfd_open
> 435 o32 clone3 __sys_clone3
> +438 o32 pidfd_getfd sys_pidfd_getfd
> diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
> index 285ff516150c..c58c7eb144ca 100644
> --- a/arch/parisc/kernel/syscalls/syscall.tbl
> +++ b/arch/parisc/kernel/syscalls/syscall.tbl
> @@ -433,3 +433,4 @@
> 433 common fspick sys_fspick
> 434 common pidfd_open sys_pidfd_open
> 435 common clone3 sys_clone3_wrapper
> +438 common pidfd_getfd sys_pidfd_getfd
> diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
> index 43f736ed47f2..707609bfe3ea 100644
> --- a/arch/powerpc/kernel/syscalls/syscall.tbl
> +++ b/arch/powerpc/kernel/syscalls/syscall.tbl
> @@ -517,3 +517,4 @@
> 433 common fspick sys_fspick
> 434 common pidfd_open sys_pidfd_open
> 435 nospu clone3 ppc_clone3
> +438 common pidfd_getfd sys_pidfd_getfd
> diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
> index 3054e9c035a3..185cd624face 100644
> --- a/arch/s390/kernel/syscalls/syscall.tbl
> +++ b/arch/s390/kernel/syscalls/syscall.tbl
> @@ -438,3 +438,4 @@
> 433 common fspick sys_fspick sys_fspick
> 434 common pidfd_open sys_pidfd_open sys_pidfd_open
> 435 common clone3 sys_clone3 sys_clone3
> +438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd
> diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
> index b5ed26c4c005..88f90895aad8 100644
> --- a/arch/sh/kernel/syscalls/syscall.tbl
> +++ b/arch/sh/kernel/syscalls/syscall.tbl
> @@ -438,3 +438,4 @@
> 433 common fspick sys_fspick
> 434 common pidfd_open sys_pidfd_open
> # 435 reserved for clone3
> +438 common pidfd_getfd sys_pidfd_getfd
> diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
> index 8c8cc7537fb2..218df6a2326e 100644
> --- a/arch/sparc/kernel/syscalls/syscall.tbl
> +++ b/arch/sparc/kernel/syscalls/syscall.tbl
> @@ -481,3 +481,4 @@
> 433 common fspick sys_fspick
> 434 common pidfd_open sys_pidfd_open
> # 435 reserved for clone3
> +438 common pidfd_getfd sys_pidfd_getfd
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 15908eb9b17e..9c3101b65e0f 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -440,3 +440,4 @@
> 433 i386 fspick sys_fspick __ia32_sys_fspick
> 434 i386 pidfd_open sys_pidfd_open __ia32_sys_pidfd_open
> 435 i386 clone3 sys_clone3 __ia32_sys_clone3
> +438 i386 pidfd_getfd sys_pidfd_getfd __ia32_sys_pidfd_getfd
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index c29976eca4a8..cef85db75a62 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -357,6 +357,7 @@
> 433 common fspick __x64_sys_fspick
> 434 common pidfd_open __x64_sys_pidfd_open
> 435 common clone3 __x64_sys_clone3/ptregs
> +438 common pidfd_getfd __x64_sys_pidfd_getfd
>
> #
> # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
> index 25f4de729a6d..ae15183def12 100644
> --- a/arch/xtensa/kernel/syscalls/syscall.tbl
> +++ b/arch/xtensa/kernel/syscalls/syscall.tbl
> @@ -406,3 +406,4 @@
> 433 common fspick sys_fspick
> 434 common pidfd_open sys_pidfd_open
> 435 common clone3 sys_clone3
> +438 common pidfd_getfd sys_pidfd_getfd
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 2960dedcfde8..62fe706329d1 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -69,6 +69,7 @@ struct rseq;
> union bpf_attr;
> struct io_uring_params;
> struct clone_args;
> +struct pidfd_getfd_options;
>
> #include <linux/types.h>
> #include <linux/aio_abi.h>
> @@ -1000,6 +1001,9 @@ asmlinkage long sys_fspick(int dfd, const char __user *path, unsigned int flags)
> asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
> siginfo_t __user *info,
> unsigned int flags);
> +asmlinkage long sys_pidfd_getfd(int pidfd, int fd,
> + struct pidfd_getfd_options __user *options,
> + size_t, usize);
>
> /*
> * Architecture-specific system calls
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 1fc8faa6e973..f358488366f6 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -850,9 +850,10 @@ __SYSCALL(__NR_pidfd_open, sys_pidfd_open)
> #define __NR_clone3 435
> __SYSCALL(__NR_clone3, sys_clone3)
> #endif
> +#define __NR_pidfd_getfd 438
>
> #undef __NR_syscalls
> -#define __NR_syscalls 436
> +#define __NR_syscalls 439
>
> /*
> * 32 bit systems traditionally used different
> diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
> new file mode 100644
> index 000000000000..0a3fc922661d
> --- /dev/null
> +++ b/include/uapi/linux/pidfd.h
> @@ -0,0 +1,10 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +#ifndef _UAPI_LINUX_PIDFD_H
> +#define _UAPI_LINUX_PIDFD_H
> +
> +struct pidfd_getfd_options {};
> +
> +#define PIDFD_GETFD_OPTIONS_SIZE_VER0 0
> +#define PIDFD_GETFD_OPTIONS_SIZE_LATEST PIDFD_GETFD_OPTIONS_SIZE_VER0
> +
> +#endif /* _UAPI_LINUX_PIDFD_H */
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 2278e249141d..2a9cb4be383f 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -42,6 +42,7 @@
> #include <linux/sched/signal.h>
> #include <linux/sched/task.h>
> #include <linux/idr.h>
> +#include <uapi/linux/pidfd.h>
>
> struct pid init_struct_pid = {
> .count = REFCOUNT_INIT(1),
> @@ -578,3 +579,117 @@ void __init pid_idr_init(void)
> init_pid_ns.pid_cachep = KMEM_CACHE(pid,
> SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
> }
> +
> +static struct file *__pidfd_getfd_fget_task(struct task_struct *task, u32 fd)
> +{
> + struct file *file;
> + int ret;
> +
> + ret = mutex_lock_killable(&task->signal->cred_guard_mutex);
> + if (ret)
> + return ERR_PTR(ret);
> +
> + if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS)) {
> + file = ERR_PTR(-EPERM);
> + goto out;
> + }
> +
> + file = fget_task(task, fd);
> + if (!file)
> + file = ERR_PTR(-EBADF);
> +
> +out:
> + mutex_unlock(&task->signal->cred_guard_mutex);
> + return file;
> +}
> +
> +static long pidfd_getfd(struct pid *pid, u32 fd)
> +{
> + struct task_struct *task;
> + struct file *file;
> + int ret, retfd;
> +
> + task = get_pid_task(pid, PIDTYPE_PID);
> + if (!task)
> + return -ESRCH;
> +
> + file = __pidfd_getfd_fget_task(task, fd);
> + put_task_struct(task);
> + if (IS_ERR(file))
> + return PTR_ERR(file);
> +
> + retfd = get_unused_fd_flags(O_CLOEXEC);
> + if (retfd < 0) {
> + ret = retfd;
> + goto out;
> + }
> +
> + /*
> + * security_file_receive must come last since it may have side effects
> + * and cannot be reversed.
> + */
> + ret = security_file_receive(file);
> + if (ret)
> + goto out_put_fd;
> +
> + fd_install(retfd, file);
> + return retfd;
> +
> +out_put_fd:
> + put_unused_fd(retfd);
> +out:
> + fput(file);
> + return ret;
> +}
> +
> +/**
> + * sys_pidfd_getfd() - Get a file descriptor from another process
> + *
> + * @pidfd: file descriptor of the process
> + * @fd: the file descriptor number to get
> + * @options: options on how to get the fd
> + * @usize: the size of options
> + *
> + * This syscall requires that the process has the ability to ptrace the
> + * process represented by the pidfd. It will return a duplicated version
> + * of the file descriptor on success. The process who which is having
s/who which/which/
> + * its file descriptor taken is otherwise unaffected. If options is NULL
> + * it is ignored along with usize.
> + *
> + * Return: On success, a file descriptor with cloexec is returned.
> + * On error, a negative errno number will be returned.
> + */
> +SYSCALL_DEFINE4(pidfd_getfd, int, pidfd, int, fd,
> + struct pidfd_getfd_options __user *, options, size_t, usize)
> +{
> + struct pid *pid;
> + struct fd f;
> + int ret;
> +
> + BUILD_BUG_ON(sizeof(struct pidfd_getfd_options) != PIDFD_GETFD_OPTIONS_SIZE_LATEST);
> +
> + /*
> + * options is currently unused, verify it's unset or if it is set,
> + * ensure that size is 0.
> + *
> + * In the future, this will need to adopt copy_struct_from_user.
> + */
> + if (options && usize > PIDFD_GETFD_OPTIONS_SIZE_VER0)
> + return -E2BIG;
> +
> + f = fdget(pidfd);
> + if (!f.file)
> + return -EBADF;
> +
> + pid = pidfd_pid(f.file);
> + if (IS_ERR(pid)) {
> + ret = PTR_ERR(pid);
> + goto out;
> + }
> +
> + ret = pidfd_getfd(pid, fd);
> +
> +out:
> + fdput(f);
> + return ret;
> +}
> --
> 2.20.1
>
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v5 2/3] pid: Introduce pidfd_getfd syscall
2019-12-22 12:47 ` Christian Brauner
@ 2019-12-22 18:36 ` Sargun Dhillon
2019-12-22 20:15 ` Christian Brauner
0 siblings, 1 reply; 7+ messages in thread
From: Sargun Dhillon @ 2019-12-22 18:36 UTC (permalink / raw)
To: Christian Brauner
Cc: LKML, Linux Containers, Linux API, Linux FS-devel Mailing List,
Tycho Andersen, Jann Horn, Aleksa Sarai, Oleg Nesterov,
Andy Lutomirski, Al Viro, Gian-Carlo Pascutto,
Emilio Cobos Álvarez, Florian Weimer, Jed Davis,
Arnd Bergmann
, On Sun, Dec 22, 2019 at 4:48 AM Christian Brauner
<christian.brauner@ubuntu.com> wrote:
>
> On Fri, Dec 20, 2019 at 11:28:13PM +0000, Sargun Dhillon wrote:
> > This syscall allows for the retrieval of file descriptors from other
> > processes, based on their pidfd. This is possible using ptrace, and
> > injection of parasitic code along with using SCM_RIGHTS to move
> > file descriptors between a tracee and a tracer. Unfortunately, ptrace
> > comes with a high cost of requiring the process to be stopped, and
> > breaks debuggers. This does not require stopping the process under
> > manipulation.
> >
> > One reason to use this is to allow sandboxers to take actions on file
> > descriptors on the behalf of another process. For example, this can be
> > combined with seccomp-bpf's user notification to do on-demand fd
> > extraction and take privileged actions. For example, it can be used
> > to bind a socket to a privileged port.
> >
> > /* prototype */
> > /*
> > * pidfd_getfd_options is an extensible struct which can have options
> > * added to it. If options is NULL, size, and it will be ignored be
> > * ignored, otherwise, size should be set to sizeof(*options). If
> > * option is newer than the current kernel version, E2BIG will be
> > * returned.
> > */
> > struct pidfd_getfd_options {};
> > long pidfd_getfd(int pidfd, int fd, unsigned int flags,
> > struct pidfd_getfd_options *options, size_t size);
That's embarrassing. This was supposed to read:
long pidfd_getfd(int pidfd, int fd, struct pidfd_get_options *options,
size_t size);
>
> The prototype advertises a flags argument but the actual
>
> +SYSCALL_DEFINE4(pidfd_getfd, int, pidfd, int, fd,
> + struct pidfd_getfd_options __user *, options, size_t, usize)
>
> does not have a flags argument...
>
> I think having a flags argument makes a lot of sense.
>
> I'm not sure what to think about the struct. I agree with Aleksa that
> having an empty struct is not a great idea. From a design perspective it
> seems very out of place. If we do a struct at all putting at least a
> single reserved field in there might makes more sense.
>
> In general, I think we need to have a _concrete_ reason why putting a
> struct versioned by size as arguments for this syscall.
> That means we need to have at least a concrete example for a new feature
> for this syscall where a flag would not convey enough information.
I can think of at least two reasons we need flags:
* Clearing cgroup flags
* Closing the process under manipulation's FD when we fetch it.
The original reason for wanting to have two places where we can put
flags was to have a different field for fd flags vs. call flags. I'm not sure
there's any flags you'd want to set.
Given this, if we want to go down the route of a syscall, we should just
leave it as a __u64 flags, and drop the pointer to the struct, if we're
not worried about that.
>
> And I'm not sure that there is a good one... I guess one thing I can
> think of is that a caller might want dup-like semantics, i.e. a caller
> might want to say:
>
> pidfd_getfd(<pidfd>, <fd-to-get>, <fd-number-to-want>, <flags>, ...)
>
> such that after pidfd_getfd() returns <fd-to-get> corresponds to
> <fd-number-to-want> in the caller. But that can also be achieved via:
> int fd = pidfd_getfd(<pidfd>, <fd-to-get>, <flags>, ...)
> int final_fd = dup3(fd, <newfd>, O_CLOEXEC)
>
> >
> > /* testing */
> > Ran self-test suite on x86_64
>
> +1
>
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v5 2/3] pid: Introduce pidfd_getfd syscall
2019-12-22 18:36 ` Sargun Dhillon
@ 2019-12-22 20:15 ` Christian Brauner
0 siblings, 0 replies; 7+ messages in thread
From: Christian Brauner @ 2019-12-22 20:15 UTC (permalink / raw)
To: Sargun Dhillon
Cc: Emilio Cobos Álvarez, Arnd Bergmann, Jann Horn,
Gian-Carlo Pascutto, Linux API, Linux Containers, Jed Davis,
LKML, Oleg Nesterov, Al Viro, Linux FS-devel Mailing List,
Andy Lutomirski
On Sun, Dec 22, 2019 at 10:36:42AM -0800, Sargun Dhillon wrote:
> , On Sun, Dec 22, 2019 at 4:48 AM Christian Brauner
> <christian.brauner@ubuntu.com> wrote:
> >
> > On Fri, Dec 20, 2019 at 11:28:13PM +0000, Sargun Dhillon wrote:
> > > This syscall allows for the retrieval of file descriptors from other
> > > processes, based on their pidfd. This is possible using ptrace, and
> > > injection of parasitic code along with using SCM_RIGHTS to move
> > > file descriptors between a tracee and a tracer. Unfortunately, ptrace
> > > comes with a high cost of requiring the process to be stopped, and
> > > breaks debuggers. This does not require stopping the process under
> > > manipulation.
> > >
> > > One reason to use this is to allow sandboxers to take actions on file
> > > descriptors on the behalf of another process. For example, this can be
> > > combined with seccomp-bpf's user notification to do on-demand fd
> > > extraction and take privileged actions. For example, it can be used
> > > to bind a socket to a privileged port.
> > >
> > > /* prototype */
> > > /*
> > > * pidfd_getfd_options is an extensible struct which can have options
> > > * added to it. If options is NULL, size, and it will be ignored be
> > > * ignored, otherwise, size should be set to sizeof(*options). If
> > > * option is newer than the current kernel version, E2BIG will be
> > > * returned.
> > > */
> > > struct pidfd_getfd_options {};
> > > long pidfd_getfd(int pidfd, int fd, unsigned int flags,
> > > struct pidfd_getfd_options *options, size_t size);
> That's embarrassing. This was supposed to read:
> long pidfd_getfd(int pidfd, int fd, struct pidfd_get_options *options,
> size_t size);
>
> >
> > The prototype advertises a flags argument but the actual
> >
> > +SYSCALL_DEFINE4(pidfd_getfd, int, pidfd, int, fd,
> > + struct pidfd_getfd_options __user *, options, size_t, usize)
> >
> > does not have a flags argument...
> >
> > I think having a flags argument makes a lot of sense.
> >
> > I'm not sure what to think about the struct. I agree with Aleksa that
> > having an empty struct is not a great idea. From a design perspective it
> > seems very out of place. If we do a struct at all putting at least a
> > single reserved field in there might makes more sense.
> >
> > In general, I think we need to have a _concrete_ reason why putting a
> > struct versioned by size as arguments for this syscall.
> > That means we need to have at least a concrete example for a new feature
> > for this syscall where a flag would not convey enough information.
> I can think of at least two reasons we need flags:
> * Clearing cgroup flags
> * Closing the process under manipulation's FD when we fetch it.
>
> The original reason for wanting to have two places where we can put
> flags was to have a different field for fd flags vs. call flags. I'm not sure
> there's any flags you'd want to set.
>
> Given this, if we want to go down the route of a syscall, we should just
> leave it as a __u64 flags, and drop the pointer to the struct, if we're
I think it needs to be an unsigned int. Having a 64bit register arg is
really messy on 32bit and means you need to have a compat syscall
implementation which handles this.
Christian
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v5 2/3] pid: Introduce pidfd_getfd syscall
2019-12-20 23:28 [PATCH v5 2/3] pid: Introduce pidfd_getfd syscall Sargun Dhillon
2019-12-21 0:27 ` Aleksa Sarai
2019-12-22 12:47 ` Christian Brauner
@ 2019-12-24 11:09 ` kbuild test robot
2 siblings, 0 replies; 7+ messages in thread
From: kbuild test robot @ 2019-12-24 11:09 UTC (permalink / raw)
To: Sargun Dhillon
Cc: kbuild-all, linux-kernel, containers, linux-api, linux-fsdevel,
tycho, jannh, cyphar, christian.brauner, oleg, luto, viro,
gpascutto, ealvarez, fweimer, jld, arnd
[-- Attachment #1: Type: text/plain, Size: 9301 bytes --]
Hi Sargun,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on kselftest/next]
[also build test ERROR on arm64/for-next/core linus/master v5.5-rc3]
[cannot apply to tip/x86/asm next-20191220]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]
url: https://github.com/0day-ci/linux/commits/Sargun-Dhillon/Add-pidfd_getfd-syscall/20191224-061915
base: https://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git next
config: i386-randconfig-a001-20191224 (attached as .config)
compiler: gcc-4.9 (Debian 4.9.2-10+deb8u1) 4.9.2
reproduce:
# save the attached .config to linux build tree
make ARCH=i386
If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>
All errors (new ones prefixed by >>):
In file included from drivers//ptp/ptp_clock.c:16:0:
>> include/linux/syscalls.h:1006:13: error: unknown type name 'usize'
size_t, usize);
^
vim +/usize +1006 include/linux/syscalls.h
864
865 /* mm/, CONFIG_MMU only */
866 asmlinkage long sys_swapon(const char __user *specialfile, int swap_flags);
867 asmlinkage long sys_swapoff(const char __user *specialfile);
868 asmlinkage long sys_mprotect(unsigned long start, size_t len,
869 unsigned long prot);
870 asmlinkage long sys_msync(unsigned long start, size_t len, int flags);
871 asmlinkage long sys_mlock(unsigned long start, size_t len);
872 asmlinkage long sys_munlock(unsigned long start, size_t len);
873 asmlinkage long sys_mlockall(int flags);
874 asmlinkage long sys_munlockall(void);
875 asmlinkage long sys_mincore(unsigned long start, size_t len,
876 unsigned char __user * vec);
877 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
878 asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
879 unsigned long prot, unsigned long pgoff,
880 unsigned long flags);
881 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
882 unsigned long mode,
883 const unsigned long __user *nmask,
884 unsigned long maxnode,
885 unsigned flags);
886 asmlinkage long sys_get_mempolicy(int __user *policy,
887 unsigned long __user *nmask,
888 unsigned long maxnode,
889 unsigned long addr, unsigned long flags);
890 asmlinkage long sys_set_mempolicy(int mode, const unsigned long __user *nmask,
891 unsigned long maxnode);
892 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
893 const unsigned long __user *from,
894 const unsigned long __user *to);
895 asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
896 const void __user * __user *pages,
897 const int __user *nodes,
898 int __user *status,
899 int flags);
900
901 asmlinkage long sys_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig,
902 siginfo_t __user *uinfo);
903 asmlinkage long sys_perf_event_open(
904 struct perf_event_attr __user *attr_uptr,
905 pid_t pid, int cpu, int group_fd, unsigned long flags);
906 asmlinkage long sys_accept4(int, struct sockaddr __user *, int __user *, int);
907 asmlinkage long sys_recvmmsg(int fd, struct mmsghdr __user *msg,
908 unsigned int vlen, unsigned flags,
909 struct __kernel_timespec __user *timeout);
910 asmlinkage long sys_recvmmsg_time32(int fd, struct mmsghdr __user *msg,
911 unsigned int vlen, unsigned flags,
912 struct old_timespec32 __user *timeout);
913
914 asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr,
915 int options, struct rusage __user *ru);
916 asmlinkage long sys_prlimit64(pid_t pid, unsigned int resource,
917 const struct rlimit64 __user *new_rlim,
918 struct rlimit64 __user *old_rlim);
919 asmlinkage long sys_fanotify_init(unsigned int flags, unsigned int event_f_flags);
920 asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags,
921 u64 mask, int fd,
922 const char __user *pathname);
923 asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name,
924 struct file_handle __user *handle,
925 int __user *mnt_id, int flag);
926 asmlinkage long sys_open_by_handle_at(int mountdirfd,
927 struct file_handle __user *handle,
928 int flags);
929 asmlinkage long sys_clock_adjtime(clockid_t which_clock,
930 struct __kernel_timex __user *tx);
931 asmlinkage long sys_clock_adjtime32(clockid_t which_clock,
932 struct old_timex32 __user *tx);
933 asmlinkage long sys_syncfs(int fd);
934 asmlinkage long sys_setns(int fd, int nstype);
935 asmlinkage long sys_pidfd_open(pid_t pid, unsigned int flags);
936 asmlinkage long sys_sendmmsg(int fd, struct mmsghdr __user *msg,
937 unsigned int vlen, unsigned flags);
938 asmlinkage long sys_process_vm_readv(pid_t pid,
939 const struct iovec __user *lvec,
940 unsigned long liovcnt,
941 const struct iovec __user *rvec,
942 unsigned long riovcnt,
943 unsigned long flags);
944 asmlinkage long sys_process_vm_writev(pid_t pid,
945 const struct iovec __user *lvec,
946 unsigned long liovcnt,
947 const struct iovec __user *rvec,
948 unsigned long riovcnt,
949 unsigned long flags);
950 asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
951 unsigned long idx1, unsigned long idx2);
952 asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
953 asmlinkage long sys_sched_setattr(pid_t pid,
954 struct sched_attr __user *attr,
955 unsigned int flags);
956 asmlinkage long sys_sched_getattr(pid_t pid,
957 struct sched_attr __user *attr,
958 unsigned int size,
959 unsigned int flags);
960 asmlinkage long sys_renameat2(int olddfd, const char __user *oldname,
961 int newdfd, const char __user *newname,
962 unsigned int flags);
963 asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
964 void __user *uargs);
965 asmlinkage long sys_getrandom(char __user *buf, size_t count,
966 unsigned int flags);
967 asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags);
968 asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
969 asmlinkage long sys_execveat(int dfd, const char __user *filename,
970 const char __user *const __user *argv,
971 const char __user *const __user *envp, int flags);
972 asmlinkage long sys_userfaultfd(int flags);
973 asmlinkage long sys_membarrier(int cmd, int flags);
974 asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
975 asmlinkage long sys_copy_file_range(int fd_in, loff_t __user *off_in,
976 int fd_out, loff_t __user *off_out,
977 size_t len, unsigned int flags);
978 asmlinkage long sys_preadv2(unsigned long fd, const struct iovec __user *vec,
979 unsigned long vlen, unsigned long pos_l, unsigned long pos_h,
980 rwf_t flags);
981 asmlinkage long sys_pwritev2(unsigned long fd, const struct iovec __user *vec,
982 unsigned long vlen, unsigned long pos_l, unsigned long pos_h,
983 rwf_t flags);
984 asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
985 unsigned long prot, int pkey);
986 asmlinkage long sys_pkey_alloc(unsigned long flags, unsigned long init_val);
987 asmlinkage long sys_pkey_free(int pkey);
988 asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
989 unsigned mask, struct statx __user *buffer);
990 asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
991 int flags, uint32_t sig);
992 asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags);
993 asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path,
994 int to_dfd, const char __user *to_path,
995 unsigned int ms_flags);
996 asmlinkage long sys_fsopen(const char __user *fs_name, unsigned int flags);
997 asmlinkage long sys_fsconfig(int fs_fd, unsigned int cmd, const char __user *key,
998 const void __user *value, int aux);
999 asmlinkage long sys_fsmount(int fs_fd, unsigned int flags, unsigned int ms_flags);
1000 asmlinkage long sys_fspick(int dfd, const char __user *path, unsigned int flags);
1001 asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
1002 siginfo_t __user *info,
1003 unsigned int flags);
1004 asmlinkage long sys_pidfd_getfd(int pidfd, int fd,
1005 struct pidfd_getfd_options __user *options,
> 1006 size_t, usize);
1007
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 29697 bytes --]
^ permalink raw reply [flat|nested] 7+ messages in thread