* [PATCH RFC v3 1/2] nsfs: replace proc_ns_fget() with proc_ns_fdget() @ 2015-09-25 13:52 ` Konstantin Khlebnikov 0 siblings, 0 replies; 37+ messages in thread From: Konstantin Khlebnikov @ 2015-09-25 13:52 UTC (permalink / raw) To: linux-api, containers, linux-kernel Cc: Roman Gushchin, Serge Hallyn, Oleg Nesterov, Eric W. Biederman, Chen Fan, Andrew Morton, Linus Torvalds, Stéphane Graber New helper function returns pointer to struct ns_common, can check namespace type and uses struct fd for returning file reference: this saves couple atomic operations for single-threaded applications. Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru> --- fs/nsfs.c | 20 +++++++++++++------- include/linux/proc_ns.h | 4 +++- kernel/nsproxy.c | 15 +++++---------- net/core/net_namespace.c | 18 ++++++------------ 4 files changed, 27 insertions(+), 30 deletions(-) diff --git a/fs/nsfs.c b/fs/nsfs.c index 8f20d6016e20..a2e803c97960 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -119,21 +119,27 @@ int ns_get_name(char *buf, size_t size, struct task_struct *task, return res; } -struct file *proc_ns_fget(int fd) +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref) { - struct file *file; + struct ns_common *ns; + struct fd f; - file = fget(fd); - if (!file) + f = fdget(fd); + if (!f.file) return ERR_PTR(-EBADF); - if (file->f_op != &ns_file_operations) + if (f.file->f_op != &ns_file_operations) + goto out_invalid; + + ns = get_proc_ns(file_inode(f.file)); + if (nstype && (ns->ops->type != nstype)) goto out_invalid; - return file; + *fd_ref = f; + return ns; out_invalid: - fput(file); + fdput(f); return ERR_PTR(-EINVAL); } diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 42dfc615dbf8..406ea4617606 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -9,6 +9,7 @@ struct pid_namespace; struct nsproxy; struct path; +struct fd; struct proc_ns_operations { const char *name; @@ -65,7 +66,8 @@ static inline int ns_alloc_inum(struct ns_common *ns) #define ns_free_inum(ns) proc_free_inum((ns)->inum) -extern struct file *proc_ns_fget(int fd); +extern struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref); + #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) extern void *ns_get_path(struct path *path, struct task_struct *task, const struct proc_ns_operations *ns_ops); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 49746c81ad8d..689b929a0fcb 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -222,18 +222,13 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) { struct task_struct *tsk = current; struct nsproxy *new_nsproxy; - struct file *file; + struct fd fd_ref; struct ns_common *ns; int err; - file = proc_ns_fget(fd); - if (IS_ERR(file)) - return PTR_ERR(file); - - err = -EINVAL; - ns = get_proc_ns(file_inode(file)); - if (nstype && (ns->ops->type != nstype)) - goto out; + ns = proc_ns_fdget(fd, nstype, &fd_ref); + if (IS_ERR(ns)) + return PTR_ERR(ns); new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); if (IS_ERR(new_nsproxy)) { @@ -248,7 +243,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) } switch_task_namespaces(tsk, new_nsproxy); out: - fput(file); + fdput(fd_ref); return err; } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 2c2eb1b629b1..ffc5e6df86dc 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -465,21 +465,15 @@ EXPORT_SYMBOL_GPL(__put_net); struct net *get_net_ns_by_fd(int fd) { - struct file *file; struct ns_common *ns; + struct fd fd_ref; struct net *net; - file = proc_ns_fget(fd); - if (IS_ERR(file)) - return ERR_CAST(file); - - ns = get_proc_ns(file_inode(file)); - if (ns->ops == &netns_operations) - net = get_net(container_of(ns, struct net, ns)); - else - net = ERR_PTR(-EINVAL); - - fput(file); + ns = proc_ns_fdget(fd, CLONE_NEWNET, &fd_ref); + if (IS_ERR(ns)) + return ERR_CAST(ns); + net = get_net(container_of(ns, struct net, ns)); + fdput(fd_ref); return net; } ^ permalink raw reply related [flat|nested] 37+ messages in thread
* [PATCH RFC v3 1/2] nsfs: replace proc_ns_fget() with proc_ns_fdget() @ 2015-09-25 13:52 ` Konstantin Khlebnikov 0 siblings, 0 replies; 37+ messages in thread From: Konstantin Khlebnikov @ 2015-09-25 13:52 UTC (permalink / raw) To: linux-api-u79uwXL29TY76Z2rM5mHXA, containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, linux-kernel-u79uwXL29TY76Z2rM5mHXA Cc: Roman Gushchin, Serge Hallyn, Oleg Nesterov, Eric W. Biederman, Chen Fan, Andrew Morton, Linus Torvalds, Stéphane Graber New helper function returns pointer to struct ns_common, can check namespace type and uses struct fd for returning file reference: this saves couple atomic operations for single-threaded applications. Signed-off-by: Konstantin Khlebnikov <khlebnikov-XoJtRXgx1JseBXzfvpsJ4g@public.gmane.org> --- fs/nsfs.c | 20 +++++++++++++------- include/linux/proc_ns.h | 4 +++- kernel/nsproxy.c | 15 +++++---------- net/core/net_namespace.c | 18 ++++++------------ 4 files changed, 27 insertions(+), 30 deletions(-) diff --git a/fs/nsfs.c b/fs/nsfs.c index 8f20d6016e20..a2e803c97960 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -119,21 +119,27 @@ int ns_get_name(char *buf, size_t size, struct task_struct *task, return res; } -struct file *proc_ns_fget(int fd) +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref) { - struct file *file; + struct ns_common *ns; + struct fd f; - file = fget(fd); - if (!file) + f = fdget(fd); + if (!f.file) return ERR_PTR(-EBADF); - if (file->f_op != &ns_file_operations) + if (f.file->f_op != &ns_file_operations) + goto out_invalid; + + ns = get_proc_ns(file_inode(f.file)); + if (nstype && (ns->ops->type != nstype)) goto out_invalid; - return file; + *fd_ref = f; + return ns; out_invalid: - fput(file); + fdput(f); return ERR_PTR(-EINVAL); } diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 42dfc615dbf8..406ea4617606 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -9,6 +9,7 @@ struct pid_namespace; struct nsproxy; struct path; +struct fd; struct proc_ns_operations { const char *name; @@ -65,7 +66,8 @@ static inline int ns_alloc_inum(struct ns_common *ns) #define ns_free_inum(ns) proc_free_inum((ns)->inum) -extern struct file *proc_ns_fget(int fd); +extern struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref); + #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) extern void *ns_get_path(struct path *path, struct task_struct *task, const struct proc_ns_operations *ns_ops); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 49746c81ad8d..689b929a0fcb 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -222,18 +222,13 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) { struct task_struct *tsk = current; struct nsproxy *new_nsproxy; - struct file *file; + struct fd fd_ref; struct ns_common *ns; int err; - file = proc_ns_fget(fd); - if (IS_ERR(file)) - return PTR_ERR(file); - - err = -EINVAL; - ns = get_proc_ns(file_inode(file)); - if (nstype && (ns->ops->type != nstype)) - goto out; + ns = proc_ns_fdget(fd, nstype, &fd_ref); + if (IS_ERR(ns)) + return PTR_ERR(ns); new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); if (IS_ERR(new_nsproxy)) { @@ -248,7 +243,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) } switch_task_namespaces(tsk, new_nsproxy); out: - fput(file); + fdput(fd_ref); return err; } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 2c2eb1b629b1..ffc5e6df86dc 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -465,21 +465,15 @@ EXPORT_SYMBOL_GPL(__put_net); struct net *get_net_ns_by_fd(int fd) { - struct file *file; struct ns_common *ns; + struct fd fd_ref; struct net *net; - file = proc_ns_fget(fd); - if (IS_ERR(file)) - return ERR_CAST(file); - - ns = get_proc_ns(file_inode(file)); - if (ns->ops == &netns_operations) - net = get_net(container_of(ns, struct net, ns)); - else - net = ERR_PTR(-EINVAL); - - fput(file); + ns = proc_ns_fdget(fd, CLONE_NEWNET, &fd_ref); + if (IS_ERR(ns)) + return ERR_CAST(ns); + net = get_net(container_of(ns, struct net, ns)); + fdput(fd_ref); return net; } ^ permalink raw reply related [flat|nested] 37+ messages in thread
* [PATCH RFC v3 2/2] pidns: introduce syscall getvpid 2015-09-25 13:52 ` Konstantin Khlebnikov @ 2015-09-25 13:52 ` Konstantin Khlebnikov -1 siblings, 0 replies; 37+ messages in thread From: Konstantin Khlebnikov @ 2015-09-25 13:52 UTC (permalink / raw) To: linux-api-u79uwXL29TY76Z2rM5mHXA, containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, linux-kernel-u79uwXL29TY76Z2rM5mHXA Cc: Roman Gushchin, Serge Hallyn, Oleg Nesterov, Eric W. Biederman, Chen Fan, Andrew Morton, Linus Torvalds pid_t getvpid(pid_t pid, int source, int target); This syscall converts pid from source pid-namespace into pid visible in target pid-namespace. If pid is unreachable from target namespace then getvpid() returns zero. Namespaces are defined by file descriptors pointing to entries in proc (/proc/[pid]/ns/pid). If argument is negative then current pid namespace is used. If pid is negative then getvpid() returns pid of parent task for -pid. Possible error codes: ESRCH - task not found EBADF - closed file descriptor EINVAL - not pid-namespace file descriptor Such conversion is required for interaction between processes from different pid-namespaces. For example system service at host system who provide access to restricted set of privileged operations for clients from containers have to convert pids back and forward. Recent kernels expose virtual pids in /proc/[pid]/status:NSpid, but this interface works only in one way and even that is non-trivial. Other option is passing pids with credentials via unix socket, but this solution requires a lot of preparation and CAP_SYS_ADMIN for sending arbitrary pids. This syscall works in both directions, it's fast and simple. Examples: getvpid(pid, ns, -1) - get pid in our pid namespace getvpid(pid, -1, ns) - get pid in container getvpid(pid, -1, ns) > 0 - is pid is reachable from container? getvpid(1, ns1, ns2) > 0 - is ns1 inside ns2? getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2? getvpid(1, ns, -1) - get init task of pid-namespace getvpid(-1, ns, -1) - get reaper of init task in parent pid-namespace getvpid(-pid, -1, -1) - get ppid by pid Signed-off-by: Konstantin Khlebnikov <khlebnikov-XoJtRXgx1JseBXzfvpsJ4g@public.gmane.org> -- v1: https://lkml.org/lkml/2015/9/15/411 v2: https://lkml.org/lkml/2015/9/24/278 v3: * use proc_ns_fdget() * update description * rebase to next-20150925 * fix conflict with mlock2 --- arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + include/linux/syscalls.h | 1 + include/uapi/asm-generic/unistd.h | 4 ++- kernel/sys.c | 51 ++++++++++++++++++++++++++++++++ 5 files changed, 57 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 143ef9f37932..c36c2c65d204 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -383,3 +383,4 @@ 374 i386 userfaultfd sys_userfaultfd 375 i386 membarrier sys_membarrier 376 i386 mlock2 sys_mlock2 +377 i386 getvpid sys_getvpid diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 314a90bfc09c..90bbbc7fdbe0 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -332,6 +332,7 @@ 323 common userfaultfd sys_userfaultfd 324 common membarrier sys_membarrier 325 common mlock2 sys_mlock2 +326 common getvpid sys_getvpid # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a156b82dd14c..dbb5638260b5 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -222,6 +222,7 @@ asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __us asmlinkage long sys_alarm(unsigned int seconds); asmlinkage long sys_getpid(void); asmlinkage long sys_getppid(void); +asmlinkage long sys_getvpid(pid_t pid, int source, int target); asmlinkage long sys_getuid(void); asmlinkage long sys_geteuid(void); asmlinkage long sys_getgid(void); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 1324b0292ec2..2c1123130f90 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -715,9 +715,11 @@ __SYSCALL(__NR_userfaultfd, sys_userfaultfd) __SYSCALL(__NR_membarrier, sys_membarrier) #define __NR_mlock2 284 __SYSCALL(__NR_mlock2, sys_mlock2) +#define __NR_mlock2 285 +__SYSCALL(__NR_getvpid, sys_getvpid) #undef __NR_syscalls -#define __NR_syscalls 285 +#define __NR_syscalls 286 /* * All syscalls below here should go away really, diff --git a/kernel/sys.c b/kernel/sys.c index fa2f2f671a5c..1e28a36b84fa 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -46,6 +46,7 @@ #include <linux/syscalls.h> #include <linux/kprobes.h> #include <linux/user_namespace.h> +#include <linux/proc_ns.h> #include <linux/binfmts.h> #include <linux/sched.h> @@ -855,6 +856,56 @@ SYSCALL_DEFINE0(getppid) return pid; } +SYSCALL_DEFINE3(getvpid, pid_t, pid, int, source, int, target) +{ + struct pid_namespace *source_ns, *target_ns; + struct fd source_fd = {}, target_fd = {}; + struct pid *struct_pid; + struct ns_common *ns; + pid_t result; + + if (source >= 0) { + ns = proc_ns_fdget(source, CLONE_NEWPID, &source_fd); + result = PTR_ERR(ns); + if (IS_ERR(ns)) + goto out; + source_ns = container_of(ns, struct pid_namespace, ns); + } else + source_ns = task_active_pid_ns(current); + + if (target >= 0) { + ns = proc_ns_fdget(target, CLONE_NEWPID, &target_fd); + result = PTR_ERR(ns); + if (IS_ERR(ns)) + goto out; + target_ns = container_of(ns, struct pid_namespace, ns); + } else + target_ns = task_active_pid_ns(current); + + rcu_read_lock(); + struct_pid = find_pid_ns(abs(pid), source_ns); + + if (struct_pid && pid < 0) { + struct task_struct *task; + + task = pid_task(struct_pid, PIDTYPE_PID); + if (task) + task = rcu_dereference(task->real_parent); + struct_pid = task ? task_pid(task) : NULL; + } + + if (struct_pid) + result = pid_nr_ns(struct_pid, target_ns); + else + result = -ESRCH; + rcu_read_unlock(); + +out: + fdput(target_fd); + fdput(source_fd); + return result; +} + SYSCALL_DEFINE0(getuid) { /* Only we change this so SMP safe */ ^ permalink raw reply related [flat|nested] 37+ messages in thread
* [PATCH RFC v3 2/2] pidns: introduce syscall getvpid @ 2015-09-25 13:52 ` Konstantin Khlebnikov 0 siblings, 0 replies; 37+ messages in thread From: Konstantin Khlebnikov @ 2015-09-25 13:52 UTC (permalink / raw) To: linux-api, containers, linux-kernel Cc: Roman Gushchin, Serge Hallyn, Oleg Nesterov, Eric W. Biederman, Chen Fan, Andrew Morton, Linus Torvalds, Stéphane Graber pid_t getvpid(pid_t pid, int source, int target); This syscall converts pid from source pid-namespace into pid visible in target pid-namespace. If pid is unreachable from target namespace then getvpid() returns zero. Namespaces are defined by file descriptors pointing to entries in proc (/proc/[pid]/ns/pid). If argument is negative then current pid namespace is used. If pid is negative then getvpid() returns pid of parent task for -pid. Possible error codes: ESRCH - task not found EBADF - closed file descriptor EINVAL - not pid-namespace file descriptor Such conversion is required for interaction between processes from different pid-namespaces. For example system service at host system who provide access to restricted set of privileged operations for clients from containers have to convert pids back and forward. Recent kernels expose virtual pids in /proc/[pid]/status:NSpid, but this interface works only in one way and even that is non-trivial. Other option is passing pids with credentials via unix socket, but this solution requires a lot of preparation and CAP_SYS_ADMIN for sending arbitrary pids. This syscall works in both directions, it's fast and simple. Examples: getvpid(pid, ns, -1) - get pid in our pid namespace getvpid(pid, -1, ns) - get pid in container getvpid(pid, -1, ns) > 0 - is pid is reachable from container? getvpid(1, ns1, ns2) > 0 - is ns1 inside ns2? getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2? getvpid(1, ns, -1) - get init task of pid-namespace getvpid(-1, ns, -1) - get reaper of init task in parent pid-namespace getvpid(-pid, -1, -1) - get ppid by pid Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru> -- v1: https://lkml.org/lkml/2015/9/15/411 v2: https://lkml.org/lkml/2015/9/24/278 v3: * use proc_ns_fdget() * update description * rebase to next-20150925 * fix conflict with mlock2 --- arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + include/linux/syscalls.h | 1 + include/uapi/asm-generic/unistd.h | 4 ++- kernel/sys.c | 51 ++++++++++++++++++++++++++++++++ 5 files changed, 57 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 143ef9f37932..c36c2c65d204 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -383,3 +383,4 @@ 374 i386 userfaultfd sys_userfaultfd 375 i386 membarrier sys_membarrier 376 i386 mlock2 sys_mlock2 +377 i386 getvpid sys_getvpid diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 314a90bfc09c..90bbbc7fdbe0 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -332,6 +332,7 @@ 323 common userfaultfd sys_userfaultfd 324 common membarrier sys_membarrier 325 common mlock2 sys_mlock2 +326 common getvpid sys_getvpid # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a156b82dd14c..dbb5638260b5 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -222,6 +222,7 @@ asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __us asmlinkage long sys_alarm(unsigned int seconds); asmlinkage long sys_getpid(void); asmlinkage long sys_getppid(void); +asmlinkage long sys_getvpid(pid_t pid, int source, int target); asmlinkage long sys_getuid(void); asmlinkage long sys_geteuid(void); asmlinkage long sys_getgid(void); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 1324b0292ec2..2c1123130f90 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -715,9 +715,11 @@ __SYSCALL(__NR_userfaultfd, sys_userfaultfd) __SYSCALL(__NR_membarrier, sys_membarrier) #define __NR_mlock2 284 __SYSCALL(__NR_mlock2, sys_mlock2) +#define __NR_mlock2 285 +__SYSCALL(__NR_getvpid, sys_getvpid) #undef __NR_syscalls -#define __NR_syscalls 285 +#define __NR_syscalls 286 /* * All syscalls below here should go away really, diff --git a/kernel/sys.c b/kernel/sys.c index fa2f2f671a5c..1e28a36b84fa 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -46,6 +46,7 @@ #include <linux/syscalls.h> #include <linux/kprobes.h> #include <linux/user_namespace.h> +#include <linux/proc_ns.h> #include <linux/binfmts.h> #include <linux/sched.h> @@ -855,6 +856,56 @@ SYSCALL_DEFINE0(getppid) return pid; } +SYSCALL_DEFINE3(getvpid, pid_t, pid, int, source, int, target) +{ + struct pid_namespace *source_ns, *target_ns; + struct fd source_fd = {}, target_fd = {}; + struct pid *struct_pid; + struct ns_common *ns; + pid_t result; + + if (source >= 0) { + ns = proc_ns_fdget(source, CLONE_NEWPID, &source_fd); + result = PTR_ERR(ns); + if (IS_ERR(ns)) + goto out; + source_ns = container_of(ns, struct pid_namespace, ns); + } else + source_ns = task_active_pid_ns(current); + + if (target >= 0) { + ns = proc_ns_fdget(target, CLONE_NEWPID, &target_fd); + result = PTR_ERR(ns); + if (IS_ERR(ns)) + goto out; + target_ns = container_of(ns, struct pid_namespace, ns); + } else + target_ns = task_active_pid_ns(current); + + rcu_read_lock(); + struct_pid = find_pid_ns(abs(pid), source_ns); + + if (struct_pid && pid < 0) { + struct task_struct *task; + + task = pid_task(struct_pid, PIDTYPE_PID); + if (task) + task = rcu_dereference(task->real_parent); + struct_pid = task ? task_pid(task) : NULL; + } + + if (struct_pid) + result = pid_nr_ns(struct_pid, target_ns); + else + result = -ESRCH; + rcu_read_unlock(); + +out: + fdput(target_fd); + fdput(source_fd); + return result; +} + SYSCALL_DEFINE0(getuid) { /* Only we change this so SMP safe */ ^ permalink raw reply related [flat|nested] 37+ messages in thread
* Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid 2015-09-25 13:52 ` Konstantin Khlebnikov @ 2015-09-28 4:12 ` kbuild test robot -1 siblings, 0 replies; 37+ messages in thread From: kbuild test robot @ 2015-09-28 4:12 UTC (permalink / raw) To: Konstantin Khlebnikov Cc: Roman Gushchin, linux-api-u79uwXL29TY76Z2rM5mHXA, containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Oleg Nesterov, kbuild-all-JC7UmRfGjtg, Chen Fan, Andrew Morton, Linus Torvalds, Eric W. Biederman [-- Attachment #1: Type: text/plain, Size: 4263 bytes --] Hi Konstantin, [auto build test results on next-20150925 -- if it's inappropriate base, please ignore] config: openrisc-or1ksim_defconfig (attached as .config) reproduce: wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross git checkout 7bea85b3c058d69527d79b94338025f218a0655e # save the attached .config to linux build tree make.cross ARCH=openrisc All warnings (new ones prefixed by >>): In file included from include/asm-generic/unistd.h:1:0, from arch/openrisc/include/uapi/asm/unistd.h:26, from <stdin>:2: >> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition <stdin>:1307:2: warning: #warning syscall getvpid not implemented -- In file included from include/asm-generic/unistd.h:1:0, from arch/openrisc/include/uapi/asm/unistd.h:26, from include/uapi/linux/unistd.h:7, from arch/openrisc/kernel/setup.c:25: >> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition -- In file included from include/asm-generic/unistd.h:1:0, from arch/openrisc/include/uapi/asm/unistd.h:26, from include/uapi/linux/unistd.h:7, from include/linux/syscalls.h:78, from arch/openrisc/kernel/sys_call_table.c:17: >> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition In file included from include/asm-generic/unistd.h:1:0, from arch/openrisc/include/uapi/asm/unistd.h:26, from arch/openrisc/kernel/sys_call_table.c:27: include/uapi/asm-generic/unistd.h:716:0: warning: "__NR_mlock2" redefined include/uapi/asm-generic/unistd.h:718:0: note: this is the location of the previous definition >> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition include/uapi/asm-generic/unistd.h:719:1: error: '__NR_getvpid' undeclared here (not in a function) include/uapi/asm-generic/unistd.h:719:1: error: array index in initializer not of integer type include/uapi/asm-generic/unistd.h:719:1: error: (near initialization for 'sys_call_table') -- kernel/time/Kconfig:155:warning: range is invalid In file included from include/asm-generic/unistd.h:1:0, from arch/openrisc/include/uapi/asm/unistd.h:26, from <stdin>:2: >> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition <stdin>:1307:2: warning: #warning syscall getvpid not implemented vim +/__NR_mlock2 +718 include/uapi/asm-generic/unistd.h 702 #define __NR_seccomp 277 703 __SYSCALL(__NR_seccomp, sys_seccomp) 704 #define __NR_getrandom 278 705 __SYSCALL(__NR_getrandom, sys_getrandom) 706 #define __NR_memfd_create 279 707 __SYSCALL(__NR_memfd_create, sys_memfd_create) 708 #define __NR_bpf 280 709 __SYSCALL(__NR_bpf, sys_bpf) 710 #define __NR_execveat 281 711 __SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat) 712 #define __NR_userfaultfd 282 713 __SYSCALL(__NR_userfaultfd, sys_userfaultfd) 714 #define __NR_membarrier 283 715 __SYSCALL(__NR_membarrier, sys_membarrier) 716 #define __NR_mlock2 284 717 __SYSCALL(__NR_mlock2, sys_mlock2) > 718 #define __NR_mlock2 285 719 __SYSCALL(__NR_getvpid, sys_getvpid) 720 721 #undef __NR_syscalls 722 #define __NR_syscalls 286 723 724 /* 725 * All syscalls below here should go away really, 726 * these are provided for both review and as a porting --- 0-DAY kernel test infrastructure Open Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation [-- Attachment #2: .config.gz --] [-- Type: application/octet-stream, Size: 6887 bytes --] [-- Attachment #3: Type: text/plain, Size: 205 bytes --] _______________________________________________ Containers mailing list Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org https://lists.linuxfoundation.org/mailman/listinfo/containers ^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid @ 2015-09-28 4:12 ` kbuild test robot 0 siblings, 0 replies; 37+ messages in thread From: kbuild test robot @ 2015-09-28 4:12 UTC (permalink / raw) To: Konstantin Khlebnikov Cc: kbuild-all, linux-api, containers, linux-kernel, Roman Gushchin, Serge Hallyn, Oleg Nesterov, Eric W. Biederman, Chen Fan, Andrew Morton, Linus Torvalds, Stéphane Graber [-- Attachment #1: Type: text/plain, Size: 4263 bytes --] Hi Konstantin, [auto build test results on next-20150925 -- if it's inappropriate base, please ignore] config: openrisc-or1ksim_defconfig (attached as .config) reproduce: wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross git checkout 7bea85b3c058d69527d79b94338025f218a0655e # save the attached .config to linux build tree make.cross ARCH=openrisc All warnings (new ones prefixed by >>): In file included from include/asm-generic/unistd.h:1:0, from arch/openrisc/include/uapi/asm/unistd.h:26, from <stdin>:2: >> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition <stdin>:1307:2: warning: #warning syscall getvpid not implemented -- In file included from include/asm-generic/unistd.h:1:0, from arch/openrisc/include/uapi/asm/unistd.h:26, from include/uapi/linux/unistd.h:7, from arch/openrisc/kernel/setup.c:25: >> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition -- In file included from include/asm-generic/unistd.h:1:0, from arch/openrisc/include/uapi/asm/unistd.h:26, from include/uapi/linux/unistd.h:7, from include/linux/syscalls.h:78, from arch/openrisc/kernel/sys_call_table.c:17: >> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition In file included from include/asm-generic/unistd.h:1:0, from arch/openrisc/include/uapi/asm/unistd.h:26, from arch/openrisc/kernel/sys_call_table.c:27: include/uapi/asm-generic/unistd.h:716:0: warning: "__NR_mlock2" redefined include/uapi/asm-generic/unistd.h:718:0: note: this is the location of the previous definition >> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition include/uapi/asm-generic/unistd.h:719:1: error: '__NR_getvpid' undeclared here (not in a function) include/uapi/asm-generic/unistd.h:719:1: error: array index in initializer not of integer type include/uapi/asm-generic/unistd.h:719:1: error: (near initialization for 'sys_call_table') -- kernel/time/Kconfig:155:warning: range is invalid In file included from include/asm-generic/unistd.h:1:0, from arch/openrisc/include/uapi/asm/unistd.h:26, from <stdin>:2: >> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition <stdin>:1307:2: warning: #warning syscall getvpid not implemented vim +/__NR_mlock2 +718 include/uapi/asm-generic/unistd.h 702 #define __NR_seccomp 277 703 __SYSCALL(__NR_seccomp, sys_seccomp) 704 #define __NR_getrandom 278 705 __SYSCALL(__NR_getrandom, sys_getrandom) 706 #define __NR_memfd_create 279 707 __SYSCALL(__NR_memfd_create, sys_memfd_create) 708 #define __NR_bpf 280 709 __SYSCALL(__NR_bpf, sys_bpf) 710 #define __NR_execveat 281 711 __SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat) 712 #define __NR_userfaultfd 282 713 __SYSCALL(__NR_userfaultfd, sys_userfaultfd) 714 #define __NR_membarrier 283 715 __SYSCALL(__NR_membarrier, sys_membarrier) 716 #define __NR_mlock2 284 717 __SYSCALL(__NR_mlock2, sys_mlock2) > 718 #define __NR_mlock2 285 719 __SYSCALL(__NR_getvpid, sys_getvpid) 720 721 #undef __NR_syscalls 722 #define __NR_syscalls 286 723 724 /* 725 * All syscalls below here should go away really, 726 * these are provided for both review and as a porting --- 0-DAY kernel test infrastructure Open Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation [-- Attachment #2: .config.gz --] [-- Type: application/octet-stream, Size: 6887 bytes --] ^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid 2015-09-25 13:52 ` Konstantin Khlebnikov @ 2015-09-28 16:22 ` Eric W. Biederman -1 siblings, 0 replies; 37+ messages in thread From: Eric W. Biederman @ 2015-09-28 16:22 UTC (permalink / raw) To: Konstantin Khlebnikov Cc: Roman Gushchin, linux-api-u79uwXL29TY76Z2rM5mHXA, containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Oleg Nesterov, Chen Fan, Andrew Morton, Linus Torvalds Konstantin Khlebnikov <khlebnikov-XoJtRXgx1JseBXzfvpsJ4g@public.gmane.org> writes: > pid_t getvpid(pid_t pid, int source, int target); > > This syscall converts pid from source pid-namespace into pid visible > in target pid-namespace. If pid is unreachable from target namespace > then getvpid() returns zero. Two minor things. Can we please call this translate_pid? getvpid does not really cover what this syscall does. Can you please split wiring up into a separate patch? You goofed it up this round and it just adds noise in reviewing the core syscall. > Namespaces are defined by file descriptors pointing to entries in > proc (/proc/[pid]/ns/pid). If argument is negative then current pid > namespace is used. > > If pid is negative then getvpid() returns pid of parent task for -pid. > > Possible error codes: > ESRCH - task not found > EBADF - closed file descriptor > EINVAL - not pid-namespace file descriptor > > Such conversion is required for interaction between processes from > different pid-namespaces. For example system service at host system > who provide access to restricted set of privileged operations for > clients from containers have to convert pids back and forward. > > Recent kernels expose virtual pids in /proc/[pid]/status:NSpid, but > this interface works only in one way and even that is non-trivial. > > Other option is passing pids with credentials via unix socket, but > this solution requires a lot of preparation and CAP_SYS_ADMIN for > sending arbitrary pids. > > This syscall works in both directions, it's fast and simple. > > Examples: > getvpid(pid, ns, -1) - get pid in our pid namespace > getvpid(pid, -1, ns) - get pid in container > getvpid(pid, -1, ns) > 0 - is pid is reachable from container? > getvpid(1, ns1, ns2) > 0 - is ns1 inside ns2? > getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2? > getvpid(1, ns, -1) - get init task of pid-namespace > getvpid(-1, ns, -1) - get reaper of init task in parent pid-namespace > getvpid(-pid, -1, -1) - get ppid by pid > > Signed-off-by: Konstantin Khlebnikov <khlebnikov-XoJtRXgx1JseBXzfvpsJ4g@public.gmane.org> > > -- > > v1: https://lkml.org/lkml/2015/9/15/411 > v2: https://lkml.org/lkml/2015/9/24/278 > v3: > * use proc_ns_fdget() > * update description > * rebase to next-20150925 > * fix conflict with mlock2 > --- > arch/x86/entry/syscalls/syscall_32.tbl | 1 + > arch/x86/entry/syscalls/syscall_64.tbl | 1 + > include/linux/syscalls.h | 1 + > include/uapi/asm-generic/unistd.h | 4 ++- > kernel/sys.c | 51 ++++++++++++++++++++++++++++++++ > 5 files changed, 57 insertions(+), 1 deletion(-) > > diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl > index 143ef9f37932..c36c2c65d204 100644 > --- a/arch/x86/entry/syscalls/syscall_32.tbl > +++ b/arch/x86/entry/syscalls/syscall_32.tbl > @@ -383,3 +383,4 @@ > 374 i386 userfaultfd sys_userfaultfd > 375 i386 membarrier sys_membarrier > 376 i386 mlock2 sys_mlock2 > +377 i386 getvpid sys_getvpid > diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl > index 314a90bfc09c..90bbbc7fdbe0 100644 > --- a/arch/x86/entry/syscalls/syscall_64.tbl > +++ b/arch/x86/entry/syscalls/syscall_64.tbl > @@ -332,6 +332,7 @@ > 323 common userfaultfd sys_userfaultfd > 324 common membarrier sys_membarrier > 325 common mlock2 sys_mlock2 > +326 common getvpid sys_getvpid > > # > # x32-specific system call numbers start at 512 to avoid cache impact > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h > index a156b82dd14c..dbb5638260b5 100644 > --- a/include/linux/syscalls.h > +++ b/include/linux/syscalls.h > @@ -222,6 +222,7 @@ asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __us > asmlinkage long sys_alarm(unsigned int seconds); > asmlinkage long sys_getpid(void); > asmlinkage long sys_getppid(void); > +asmlinkage long sys_getvpid(pid_t pid, int source, int target); > asmlinkage long sys_getuid(void); > asmlinkage long sys_geteuid(void); > asmlinkage long sys_getgid(void); > diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h > index 1324b0292ec2..2c1123130f90 100644 > --- a/include/uapi/asm-generic/unistd.h > +++ b/include/uapi/asm-generic/unistd.h > @@ -715,9 +715,11 @@ __SYSCALL(__NR_userfaultfd, sys_userfaultfd) > __SYSCALL(__NR_membarrier, sys_membarrier) > #define __NR_mlock2 284 > __SYSCALL(__NR_mlock2, sys_mlock2) > +#define __NR_mlock2 285 > +__SYSCALL(__NR_getvpid, sys_getvpid) > > #undef __NR_syscalls > -#define __NR_syscalls 285 > +#define __NR_syscalls 286 > > /* > * All syscalls below here should go away really, > diff --git a/kernel/sys.c b/kernel/sys.c > index fa2f2f671a5c..1e28a36b84fa 100644 > --- a/kernel/sys.c > +++ b/kernel/sys.c > @@ -46,6 +46,7 @@ > #include <linux/syscalls.h> > #include <linux/kprobes.h> > #include <linux/user_namespace.h> > +#include <linux/proc_ns.h> > #include <linux/binfmts.h> > > #include <linux/sched.h> > @@ -855,6 +856,56 @@ SYSCALL_DEFINE0(getppid) > return pid; > } > > +SYSCALL_DEFINE3(getvpid, pid_t, pid, int, source, int, target) > +{ > + struct pid_namespace *source_ns, *target_ns; > + struct fd source_fd = {}, target_fd = {}; > + struct pid *struct_pid; > + struct ns_common *ns; > + pid_t result; > + > + if (source >= 0) { > + ns = proc_ns_fdget(source, CLONE_NEWPID, &source_fd); > + result = PTR_ERR(ns); > + if (IS_ERR(ns)) > + goto out; > + source_ns = container_of(ns, struct pid_namespace, ns); > + } else > + source_ns = task_active_pid_ns(current); > + > + if (target >= 0) { > + ns = proc_ns_fdget(target, CLONE_NEWPID, &target_fd); > + result = PTR_ERR(ns); > + if (IS_ERR(ns)) > + goto out; > + target_ns = container_of(ns, struct pid_namespace, ns); > + } else > + target_ns = task_active_pid_ns(current); > + > + rcu_read_lock(); > + struct_pid = find_pid_ns(abs(pid), source_ns); > + > + if (struct_pid && pid < 0) { > + struct task_struct *task; > + > + task = pid_task(struct_pid, PIDTYPE_PID); > + if (task) > + task = rcu_dereference(task->real_parent); > + struct_pid = task ? task_pid(task) : NULL; > + } > + > + if (struct_pid) > + result = pid_nr_ns(struct_pid, target_ns); > + else > + result = -ESRCH; > + rcu_read_unlock(); > + > +out: > + fdput(target_fd); > + fdput(source_fd); > + return result; > +} > + > SYSCALL_DEFINE0(getuid) > { > /* Only we change this so SMP safe */ ^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid @ 2015-09-28 16:22 ` Eric W. Biederman 0 siblings, 0 replies; 37+ messages in thread From: Eric W. Biederman @ 2015-09-28 16:22 UTC (permalink / raw) To: Konstantin Khlebnikov Cc: linux-api, containers, linux-kernel, Roman Gushchin, Serge Hallyn, Oleg Nesterov, Chen Fan, Andrew Morton, Linus Torvalds, Stéphane Graber Konstantin Khlebnikov <khlebnikov@yandex-team.ru> writes: > pid_t getvpid(pid_t pid, int source, int target); > > This syscall converts pid from source pid-namespace into pid visible > in target pid-namespace. If pid is unreachable from target namespace > then getvpid() returns zero. Two minor things. Can we please call this translate_pid? getvpid does not really cover what this syscall does. Can you please split wiring up into a separate patch? You goofed it up this round and it just adds noise in reviewing the core syscall. > Namespaces are defined by file descriptors pointing to entries in > proc (/proc/[pid]/ns/pid). If argument is negative then current pid > namespace is used. > > If pid is negative then getvpid() returns pid of parent task for -pid. > > Possible error codes: > ESRCH - task not found > EBADF - closed file descriptor > EINVAL - not pid-namespace file descriptor > > Such conversion is required for interaction between processes from > different pid-namespaces. For example system service at host system > who provide access to restricted set of privileged operations for > clients from containers have to convert pids back and forward. > > Recent kernels expose virtual pids in /proc/[pid]/status:NSpid, but > this interface works only in one way and even that is non-trivial. > > Other option is passing pids with credentials via unix socket, but > this solution requires a lot of preparation and CAP_SYS_ADMIN for > sending arbitrary pids. > > This syscall works in both directions, it's fast and simple. > > Examples: > getvpid(pid, ns, -1) - get pid in our pid namespace > getvpid(pid, -1, ns) - get pid in container > getvpid(pid, -1, ns) > 0 - is pid is reachable from container? > getvpid(1, ns1, ns2) > 0 - is ns1 inside ns2? > getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2? > getvpid(1, ns, -1) - get init task of pid-namespace > getvpid(-1, ns, -1) - get reaper of init task in parent pid-namespace > getvpid(-pid, -1, -1) - get ppid by pid > > Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru> > > -- > > v1: https://lkml.org/lkml/2015/9/15/411 > v2: https://lkml.org/lkml/2015/9/24/278 > v3: > * use proc_ns_fdget() > * update description > * rebase to next-20150925 > * fix conflict with mlock2 > --- > arch/x86/entry/syscalls/syscall_32.tbl | 1 + > arch/x86/entry/syscalls/syscall_64.tbl | 1 + > include/linux/syscalls.h | 1 + > include/uapi/asm-generic/unistd.h | 4 ++- > kernel/sys.c | 51 ++++++++++++++++++++++++++++++++ > 5 files changed, 57 insertions(+), 1 deletion(-) > > diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl > index 143ef9f37932..c36c2c65d204 100644 > --- a/arch/x86/entry/syscalls/syscall_32.tbl > +++ b/arch/x86/entry/syscalls/syscall_32.tbl > @@ -383,3 +383,4 @@ > 374 i386 userfaultfd sys_userfaultfd > 375 i386 membarrier sys_membarrier > 376 i386 mlock2 sys_mlock2 > +377 i386 getvpid sys_getvpid > diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl > index 314a90bfc09c..90bbbc7fdbe0 100644 > --- a/arch/x86/entry/syscalls/syscall_64.tbl > +++ b/arch/x86/entry/syscalls/syscall_64.tbl > @@ -332,6 +332,7 @@ > 323 common userfaultfd sys_userfaultfd > 324 common membarrier sys_membarrier > 325 common mlock2 sys_mlock2 > +326 common getvpid sys_getvpid > > # > # x32-specific system call numbers start at 512 to avoid cache impact > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h > index a156b82dd14c..dbb5638260b5 100644 > --- a/include/linux/syscalls.h > +++ b/include/linux/syscalls.h > @@ -222,6 +222,7 @@ asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __us > asmlinkage long sys_alarm(unsigned int seconds); > asmlinkage long sys_getpid(void); > asmlinkage long sys_getppid(void); > +asmlinkage long sys_getvpid(pid_t pid, int source, int target); > asmlinkage long sys_getuid(void); > asmlinkage long sys_geteuid(void); > asmlinkage long sys_getgid(void); > diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h > index 1324b0292ec2..2c1123130f90 100644 > --- a/include/uapi/asm-generic/unistd.h > +++ b/include/uapi/asm-generic/unistd.h > @@ -715,9 +715,11 @@ __SYSCALL(__NR_userfaultfd, sys_userfaultfd) > __SYSCALL(__NR_membarrier, sys_membarrier) > #define __NR_mlock2 284 > __SYSCALL(__NR_mlock2, sys_mlock2) > +#define __NR_mlock2 285 > +__SYSCALL(__NR_getvpid, sys_getvpid) > > #undef __NR_syscalls > -#define __NR_syscalls 285 > +#define __NR_syscalls 286 > > /* > * All syscalls below here should go away really, > diff --git a/kernel/sys.c b/kernel/sys.c > index fa2f2f671a5c..1e28a36b84fa 100644 > --- a/kernel/sys.c > +++ b/kernel/sys.c > @@ -46,6 +46,7 @@ > #include <linux/syscalls.h> > #include <linux/kprobes.h> > #include <linux/user_namespace.h> > +#include <linux/proc_ns.h> > #include <linux/binfmts.h> > > #include <linux/sched.h> > @@ -855,6 +856,56 @@ SYSCALL_DEFINE0(getppid) > return pid; > } > > +SYSCALL_DEFINE3(getvpid, pid_t, pid, int, source, int, target) > +{ > + struct pid_namespace *source_ns, *target_ns; > + struct fd source_fd = {}, target_fd = {}; > + struct pid *struct_pid; > + struct ns_common *ns; > + pid_t result; > + > + if (source >= 0) { > + ns = proc_ns_fdget(source, CLONE_NEWPID, &source_fd); > + result = PTR_ERR(ns); > + if (IS_ERR(ns)) > + goto out; > + source_ns = container_of(ns, struct pid_namespace, ns); > + } else > + source_ns = task_active_pid_ns(current); > + > + if (target >= 0) { > + ns = proc_ns_fdget(target, CLONE_NEWPID, &target_fd); > + result = PTR_ERR(ns); > + if (IS_ERR(ns)) > + goto out; > + target_ns = container_of(ns, struct pid_namespace, ns); > + } else > + target_ns = task_active_pid_ns(current); > + > + rcu_read_lock(); > + struct_pid = find_pid_ns(abs(pid), source_ns); > + > + if (struct_pid && pid < 0) { > + struct task_struct *task; > + > + task = pid_task(struct_pid, PIDTYPE_PID); > + if (task) > + task = rcu_dereference(task->real_parent); > + struct_pid = task ? task_pid(task) : NULL; > + } > + > + if (struct_pid) > + result = pid_nr_ns(struct_pid, target_ns); > + else > + result = -ESRCH; > + rcu_read_unlock(); > + > +out: > + fdput(target_fd); > + fdput(source_fd); > + return result; > +} > + > SYSCALL_DEFINE0(getuid) > { > /* Only we change this so SMP safe */ ^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid 2015-09-25 13:52 ` Konstantin Khlebnikov @ 2015-09-28 16:57 ` Eric W. Biederman -1 siblings, 0 replies; 37+ messages in thread From: Eric W. Biederman @ 2015-09-28 16:57 UTC (permalink / raw) To: Konstantin Khlebnikov Cc: Roman Gushchin, linux-api-u79uwXL29TY76Z2rM5mHXA, containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Oleg Nesterov, Chen Fan, Andrew Morton, Linus Torvalds Konstantin Khlebnikov <khlebnikov-XoJtRXgx1JseBXzfvpsJ4g@public.gmane.org> writes: > If pid is negative then getvpid() returns pid of parent task for -pid. Now that I am noticing this. I don't think I have seen any discussion about justifying a syscall getting another processes parent pid. My apologies if I just missed it. Why do we want the the parent pid? We can we usefully do with it? Is proc really that bad of an interface? Fetching a parent pid feels like a separate logical operation from pid translation. Which makes me a bit uneasy about this part of the conversation. > Examples: > getvpid(pid, ns, -1) - get pid in our pid namespace > getvpid(pid, -1, ns) - get pid in container > getvpid(pid, -1, ns) > 0 - is pid is reachable from container? > getvpid(1, ns1, ns2) > 0 - is ns1 inside ns2? > getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2? > getvpid(1, ns, -1) - get init task of pid-namespace > getvpid(-1, ns, -1) - get reaper of init task in parent pid-namespace > getvpid(-pid, -1, -1) - get ppid by pid As I step back and pay attention to this case I am half wondering if perhaps what would be most useful is a file descriptor that refers to a pid and an updated set of system calls that takes pid file descriptors instead of pids. Something like: getpidfd(int pidnsfd, pid_t pid); waitfd(int pidfd, int *status, int options, struct rusage *rusage); killfd(int pidfd, int sig); clonefd(...); And perhaps: pid_nr_ns(int pidnsfd, int pidfd); parentfd(int pidfd); Eric ^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid @ 2015-09-28 16:57 ` Eric W. Biederman 0 siblings, 0 replies; 37+ messages in thread From: Eric W. Biederman @ 2015-09-28 16:57 UTC (permalink / raw) To: Konstantin Khlebnikov Cc: linux-api, containers, linux-kernel, Roman Gushchin, Serge Hallyn, Oleg Nesterov, Chen Fan, Andrew Morton, Linus Torvalds, Stéphane Graber Konstantin Khlebnikov <khlebnikov@yandex-team.ru> writes: > If pid is negative then getvpid() returns pid of parent task for -pid. Now that I am noticing this. I don't think I have seen any discussion about justifying a syscall getting another processes parent pid. My apologies if I just missed it. Why do we want the the parent pid? We can we usefully do with it? Is proc really that bad of an interface? Fetching a parent pid feels like a separate logical operation from pid translation. Which makes me a bit uneasy about this part of the conversation. > Examples: > getvpid(pid, ns, -1) - get pid in our pid namespace > getvpid(pid, -1, ns) - get pid in container > getvpid(pid, -1, ns) > 0 - is pid is reachable from container? > getvpid(1, ns1, ns2) > 0 - is ns1 inside ns2? > getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2? > getvpid(1, ns, -1) - get init task of pid-namespace > getvpid(-1, ns, -1) - get reaper of init task in parent pid-namespace > getvpid(-pid, -1, -1) - get ppid by pid As I step back and pay attention to this case I am half wondering if perhaps what would be most useful is a file descriptor that refers to a pid and an updated set of system calls that takes pid file descriptors instead of pids. Something like: getpidfd(int pidnsfd, pid_t pid); waitfd(int pidfd, int *status, int options, struct rusage *rusage); killfd(int pidfd, int sig); clonefd(...); And perhaps: pid_nr_ns(int pidnsfd, int pidfd); parentfd(int pidfd); Eric ^ permalink raw reply [flat|nested] 37+ messages in thread
[parent not found: <87d1x25vng.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>]
* Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid [not found] ` <87d1x25vng.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org> @ 2015-10-20 10:04 ` Konstantin Khlebnikov 0 siblings, 0 replies; 37+ messages in thread From: Konstantin Khlebnikov @ 2015-10-20 10:04 UTC (permalink / raw) To: Eric W. Biederman Cc: Roman Gushchin, linux-api-u79uwXL29TY76Z2rM5mHXA, containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Oleg Nesterov, Chen Fan, Andrew Morton, Linus Torvalds On 28.09.2015 19:57, Eric W. Biederman wrote: > Konstantin Khlebnikov <khlebnikov-XoJtRXgx1JseBXzfvpsJ4g@public.gmane.org> writes: > >> If pid is negative then getvpid() returns pid of parent task for -pid. > > Now that I am noticing this. I don't think I have seen any discussion > about justifying a syscall getting another processes parent pid. My > apologies if I just missed it. > Sorry for late response. This completely fell out of my mind after LinuxCon. > Why do we want the the parent pid? We can we usefully do with it? > Is proc really that bad of an interface? > > Fetching a parent pid feels like a separate logical operation > from pid translation. Which makes me a bit uneasy about this > part of the conversation. Yep proc interface is bad. /proc/$pid/stat is almost impossible to parse without flaws because task could set second field "comm" into any string and fake ppid - for example ") Z 1". /proc/$pid/status is better but it has more information and thus slower. This trick for distant getppid looks cheap useful: in this interface space of negative pids is free for use. > >> Examples: >> getvpid(pid, ns, -1) - get pid in our pid namespace >> getvpid(pid, -1, ns) - get pid in container >> getvpid(pid, -1, ns) > 0 - is pid is reachable from container? >> getvpid(1, ns1, ns2) > 0 - is ns1 inside ns2? >> getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2? >> getvpid(1, ns, -1) - get init task of pid-namespace >> getvpid(-1, ns, -1) - get reaper of init task in parent pid-namespace >> getvpid(-pid, -1, -1) - get ppid by pid > > As I step back and pay attention to this case I am half wondering if > perhaps what would be most useful is a file descriptor that refers > to a pid and an updated set of system calls that takes pid file > descriptors instead of pids. Fd which pins pids isn't a good idea. I think it's better to refer (but not hold) task rather than pid. For example inode of taskfd will hold small buffer for task exit status: task holds reference to its own taskfd inode and populates status when exits. Here will be no zombies and delayed reaping. Something like: task_fd = clonefd() ... select(...) exit(...) pread(task_fd, &status_rusage_etc, sizeof, 0); close(task_fd); Task pid also could be part of structure in that fd. Potentially it could provide the same information as /proc/$pid/... in effective binary format: we can read only required fields of structure and kernel can skip unneeded calculations. > > Something like: > > getpidfd(int pidnsfd, pid_t pid); > > waitfd(int pidfd, int *status, int options, struct rusage *rusage); > > killfd(int pidfd, int sig); > > clonefd(...); > > And perhaps: > pid_nr_ns(int pidnsfd, int pidfd); > > parentfd(int pidfd); > > Eric > -- Konstantin ^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid [not found] ` <87d1x25vng.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org> @ 2015-10-20 10:04 ` Konstantin Khlebnikov 0 siblings, 0 replies; 37+ messages in thread From: Konstantin Khlebnikov @ 2015-10-20 10:04 UTC (permalink / raw) To: Eric W. Biederman Cc: linux-api, containers, linux-kernel, Roman Gushchin, Serge Hallyn, Oleg Nesterov, Chen Fan, Andrew Morton, Linus Torvalds, Stéphane Graber On 28.09.2015 19:57, Eric W. Biederman wrote: > Konstantin Khlebnikov <khlebnikov@yandex-team.ru> writes: > >> If pid is negative then getvpid() returns pid of parent task for -pid. > > Now that I am noticing this. I don't think I have seen any discussion > about justifying a syscall getting another processes parent pid. My > apologies if I just missed it. > Sorry for late response. This completely fell out of my mind after LinuxCon. > Why do we want the the parent pid? We can we usefully do with it? > Is proc really that bad of an interface? > > Fetching a parent pid feels like a separate logical operation > from pid translation. Which makes me a bit uneasy about this > part of the conversation. Yep proc interface is bad. /proc/$pid/stat is almost impossible to parse without flaws because task could set second field "comm" into any string and fake ppid - for example ") Z 1". /proc/$pid/status is better but it has more information and thus slower. This trick for distant getppid looks cheap useful: in this interface space of negative pids is free for use. > >> Examples: >> getvpid(pid, ns, -1) - get pid in our pid namespace >> getvpid(pid, -1, ns) - get pid in container >> getvpid(pid, -1, ns) > 0 - is pid is reachable from container? >> getvpid(1, ns1, ns2) > 0 - is ns1 inside ns2? >> getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2? >> getvpid(1, ns, -1) - get init task of pid-namespace >> getvpid(-1, ns, -1) - get reaper of init task in parent pid-namespace >> getvpid(-pid, -1, -1) - get ppid by pid > > As I step back and pay attention to this case I am half wondering if > perhaps what would be most useful is a file descriptor that refers > to a pid and an updated set of system calls that takes pid file > descriptors instead of pids. Fd which pins pids isn't a good idea. I think it's better to refer (but not hold) task rather than pid. For example inode of taskfd will hold small buffer for task exit status: task holds reference to its own taskfd inode and populates status when exits. Here will be no zombies and delayed reaping. Something like: task_fd = clonefd() ... select(...) exit(...) pread(task_fd, &status_rusage_etc, sizeof, 0); close(task_fd); Task pid also could be part of structure in that fd. Potentially it could provide the same information as /proc/$pid/... in effective binary format: we can read only required fields of structure and kernel can skip unneeded calculations. > > Something like: > > getpidfd(int pidnsfd, pid_t pid); > > waitfd(int pidfd, int *status, int options, struct rusage *rusage); > > killfd(int pidfd, int sig); > > clonefd(...); > > And perhaps: > pid_nr_ns(int pidnsfd, int pidfd); > > parentfd(int pidfd); > > Eric > -- Konstantin ^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid @ 2015-10-20 10:04 ` Konstantin Khlebnikov 0 siblings, 0 replies; 37+ messages in thread From: Konstantin Khlebnikov @ 2015-10-20 10:04 UTC (permalink / raw) To: Eric W. Biederman Cc: linux-api-u79uwXL29TY76Z2rM5mHXA, containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Roman Gushchin, Serge Hallyn, Oleg Nesterov, Chen Fan, Andrew Morton, Linus Torvalds, Stéphane Graber On 28.09.2015 19:57, Eric W. Biederman wrote: > Konstantin Khlebnikov <khlebnikov-XoJtRXgx1JseBXzfvpsJ4g@public.gmane.org> writes: > >> If pid is negative then getvpid() returns pid of parent task for -pid. > > Now that I am noticing this. I don't think I have seen any discussion > about justifying a syscall getting another processes parent pid. My > apologies if I just missed it. > Sorry for late response. This completely fell out of my mind after LinuxCon. > Why do we want the the parent pid? We can we usefully do with it? > Is proc really that bad of an interface? > > Fetching a parent pid feels like a separate logical operation > from pid translation. Which makes me a bit uneasy about this > part of the conversation. Yep proc interface is bad. /proc/$pid/stat is almost impossible to parse without flaws because task could set second field "comm" into any string and fake ppid - for example ") Z 1". /proc/$pid/status is better but it has more information and thus slower. This trick for distant getppid looks cheap useful: in this interface space of negative pids is free for use. > >> Examples: >> getvpid(pid, ns, -1) - get pid in our pid namespace >> getvpid(pid, -1, ns) - get pid in container >> getvpid(pid, -1, ns) > 0 - is pid is reachable from container? >> getvpid(1, ns1, ns2) > 0 - is ns1 inside ns2? >> getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2? >> getvpid(1, ns, -1) - get init task of pid-namespace >> getvpid(-1, ns, -1) - get reaper of init task in parent pid-namespace >> getvpid(-pid, -1, -1) - get ppid by pid > > As I step back and pay attention to this case I am half wondering if > perhaps what would be most useful is a file descriptor that refers > to a pid and an updated set of system calls that takes pid file > descriptors instead of pids. Fd which pins pids isn't a good idea. I think it's better to refer (but not hold) task rather than pid. For example inode of taskfd will hold small buffer for task exit status: task holds reference to its own taskfd inode and populates status when exits. Here will be no zombies and delayed reaping. Something like: task_fd = clonefd() ... select(...) exit(...) pread(task_fd, &status_rusage_etc, sizeof, 0); close(task_fd); Task pid also could be part of structure in that fd. Potentially it could provide the same information as /proc/$pid/... in effective binary format: we can read only required fields of structure and kernel can skip unneeded calculations. > > Something like: > > getpidfd(int pidnsfd, pid_t pid); > > waitfd(int pidfd, int *status, int options, struct rusage *rusage); > > killfd(int pidfd, int sig); > > clonefd(...); > > And perhaps: > pid_nr_ns(int pidnsfd, int pidfd); > > parentfd(int pidfd); > > Eric > -- Konstantin ^ permalink raw reply [flat|nested] 37+ messages in thread
* [PATCH 0/1] ns: introduce proc_get_ns_by_fd() 2015-09-25 13:52 ` Konstantin Khlebnikov @ 2015-09-25 17:56 ` Oleg Nesterov -1 siblings, 0 replies; 37+ messages in thread From: Oleg Nesterov @ 2015-09-25 17:56 UTC (permalink / raw) To: Konstantin Khlebnikov Cc: Roman Gushchin, linux-api-u79uwXL29TY76Z2rM5mHXA, containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Eric W. Biederman, Chen Fan, Andrew Morton, Linus Torvalds On 09/25, Konstantin Khlebnikov wrote: > > +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref) > { > - struct file *file; > + struct ns_common *ns; > + struct fd f; > > - file = fget(fd); > - if (!file) > + f = fdget(fd); > + if (!f.file) > return ERR_PTR(-EBADF); > > - if (file->f_op != &ns_file_operations) > + if (f.file->f_op != &ns_file_operations) > + goto out_invalid; > + > + ns = get_proc_ns(file_inode(f.file)); > + if (nstype && (ns->ops->type != nstype)) > goto out_invalid; > > - return file; > + *fd_ref = f; > + return ns; > > out_invalid: > - fput(file); > + fdput(f); > return ERR_PTR(-EINVAL); > } Well yes, fdget() makes sense but this is minor. Honestly, I do not really like the new helper... I understand this is subjective, so I won't insist. But how about 1/1? We do not need fd/file at all. With this patch your sys_getvpid() can just use proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns(). Eric, what do you think? See also "TODO" in the changelog. Oleg. ^ permalink raw reply [flat|nested] 37+ messages in thread
* [PATCH 0/1] ns: introduce proc_get_ns_by_fd() @ 2015-09-25 17:56 ` Oleg Nesterov 0 siblings, 0 replies; 37+ messages in thread From: Oleg Nesterov @ 2015-09-25 17:56 UTC (permalink / raw) To: Konstantin Khlebnikov Cc: linux-api, containers, linux-kernel, Roman Gushchin, Serge Hallyn, Eric W. Biederman, Chen Fan, Andrew Morton, Linus Torvalds, Stéphane Graber On 09/25, Konstantin Khlebnikov wrote: > > +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref) > { > - struct file *file; > + struct ns_common *ns; > + struct fd f; > > - file = fget(fd); > - if (!file) > + f = fdget(fd); > + if (!f.file) > return ERR_PTR(-EBADF); > > - if (file->f_op != &ns_file_operations) > + if (f.file->f_op != &ns_file_operations) > + goto out_invalid; > + > + ns = get_proc_ns(file_inode(f.file)); > + if (nstype && (ns->ops->type != nstype)) > goto out_invalid; > > - return file; > + *fd_ref = f; > + return ns; > > out_invalid: > - fput(file); > + fdput(f); > return ERR_PTR(-EINVAL); > } Well yes, fdget() makes sense but this is minor. Honestly, I do not really like the new helper... I understand this is subjective, so I won't insist. But how about 1/1? We do not need fd/file at all. With this patch your sys_getvpid() can just use proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns(). Eric, what do you think? See also "TODO" in the changelog. Oleg. ^ permalink raw reply [flat|nested] 37+ messages in thread
[parent not found: <20150925175654.GA12504-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>]
* [PATCH 1/1] ns: introduce proc_get_ns_by_fd() 2015-09-25 17:56 ` Oleg Nesterov @ 2015-09-25 17:57 ` Oleg Nesterov -1 siblings, 0 replies; 37+ messages in thread From: Oleg Nesterov @ 2015-09-25 17:57 UTC (permalink / raw) To: Konstantin Khlebnikov Cc: Roman Gushchin, linux-api-u79uwXL29TY76Z2rM5mHXA, containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Eric W. Biederman, Chen Fan, Andrew Morton, Linus Torvalds Introduce proc_get_ns_by_fd() so that get_net_ns_by_fd() becomes one-liner. It will have another CLONE_NEWPID user soon. TODO: proc_get_ns_by_fd() can share some code with proc_ns_fget(). Signed-off-by: Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> --- fs/nsfs.c | 24 ++++++++++++++++++++++++ include/linux/proc_ns.h | 3 +++ kernel/pid_namespace.c | 6 ++++++ net/core/net_namespace.c | 23 +++++++---------------- 4 files changed, 40 insertions(+), 16 deletions(-) diff --git a/fs/nsfs.c b/fs/nsfs.c index 99521e7..72474ca 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -136,6 +136,30 @@ out_invalid: return ERR_PTR(-EINVAL); } +void *proc_get_ns_by_fd(int fd, int type) +{ + struct fd f; + struct ns_common *ns; + void *ret; + + f = fdget(fd); + if (!f.file) + return ERR_PTR(-EBADF); + + ret = ERR_PTR(-EINVAL); + if (f.file->f_op != &ns_file_operations) + goto put; + + ns = get_proc_ns(file_inode(f.file)); + if (ns->ops->type != type || !ns->ops->get_type) + goto put; + + ret = ns->ops->get_type(ns); +put: + fdput(f); + return ret; +} + static const struct super_operations nsfs_ops = { .statfs = simple_statfs, .evict_inode = nsfs_evict, diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 42dfc61..d956f89 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -16,6 +16,7 @@ struct proc_ns_operations { struct ns_common *(*get)(struct task_struct *task); void (*put)(struct ns_common *ns); int (*install)(struct nsproxy *nsproxy, struct ns_common *ns); + void *(*get_type)(struct ns_common *ns); }; extern const struct proc_ns_operations netns_operations; @@ -66,6 +67,8 @@ static inline int ns_alloc_inum(struct ns_common *ns) #define ns_free_inum(ns) proc_free_inum((ns)->inum) extern struct file *proc_ns_fget(int fd); +extern void *proc_get_ns_by_fd(int fd, int type); + #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) extern void *ns_get_path(struct path *path, struct task_struct *task, const struct proc_ns_operations *ns_ops); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a65ba13..0c87393 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -388,12 +388,18 @@ static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) return 0; } +static void *pidns_get_type(struct ns_common *ns) +{ + return get_pid_ns(to_pid_ns(ns)); +} + const struct proc_ns_operations pidns_operations = { .name = "pid", .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, + .get_type = pidns_get_type, }; static __init int pid_namespaces_init(void) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 572af00..6465dc0 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -420,22 +420,7 @@ EXPORT_SYMBOL_GPL(__put_net); struct net *get_net_ns_by_fd(int fd) { - struct file *file; - struct ns_common *ns; - struct net *net; - - file = proc_ns_fget(fd); - if (IS_ERR(file)) - return ERR_CAST(file); - - ns = get_proc_ns(file_inode(file)); - if (ns->ops == &netns_operations) - net = get_net(container_of(ns, struct net, ns)); - else - net = ERR_PTR(-EINVAL); - - fput(file); - return net; + return proc_get_ns_by_fd(fd, CLONE_NEWNET); } #else @@ -955,11 +940,17 @@ static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns) return 0; } +static void *netns_get_type(struct ns_common *ns) +{ + return get_net(to_net_ns(ns)); +} + const struct proc_ns_operations netns_operations = { .name = "net", .type = CLONE_NEWNET, .get = netns_get, .put = netns_put, .install = netns_install, + .get_type = netns_get_type, }; #endif -- 1.5.5.1 ^ permalink raw reply related [flat|nested] 37+ messages in thread
* [PATCH 1/1] ns: introduce proc_get_ns_by_fd() @ 2015-09-25 17:57 ` Oleg Nesterov 0 siblings, 0 replies; 37+ messages in thread From: Oleg Nesterov @ 2015-09-25 17:57 UTC (permalink / raw) To: Konstantin Khlebnikov Cc: linux-api, containers, linux-kernel, Roman Gushchin, Serge Hallyn, Eric W. Biederman, Chen Fan, Andrew Morton, Linus Torvalds, Stéphane Graber Introduce proc_get_ns_by_fd() so that get_net_ns_by_fd() becomes one-liner. It will have another CLONE_NEWPID user soon. TODO: proc_get_ns_by_fd() can share some code with proc_ns_fget(). Signed-off-by: Oleg Nesterov <oleg@redhat.com> --- fs/nsfs.c | 24 ++++++++++++++++++++++++ include/linux/proc_ns.h | 3 +++ kernel/pid_namespace.c | 6 ++++++ net/core/net_namespace.c | 23 +++++++---------------- 4 files changed, 40 insertions(+), 16 deletions(-) diff --git a/fs/nsfs.c b/fs/nsfs.c index 99521e7..72474ca 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -136,6 +136,30 @@ out_invalid: return ERR_PTR(-EINVAL); } +void *proc_get_ns_by_fd(int fd, int type) +{ + struct fd f; + struct ns_common *ns; + void *ret; + + f = fdget(fd); + if (!f.file) + return ERR_PTR(-EBADF); + + ret = ERR_PTR(-EINVAL); + if (f.file->f_op != &ns_file_operations) + goto put; + + ns = get_proc_ns(file_inode(f.file)); + if (ns->ops->type != type || !ns->ops->get_type) + goto put; + + ret = ns->ops->get_type(ns); +put: + fdput(f); + return ret; +} + static const struct super_operations nsfs_ops = { .statfs = simple_statfs, .evict_inode = nsfs_evict, diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 42dfc61..d956f89 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -16,6 +16,7 @@ struct proc_ns_operations { struct ns_common *(*get)(struct task_struct *task); void (*put)(struct ns_common *ns); int (*install)(struct nsproxy *nsproxy, struct ns_common *ns); + void *(*get_type)(struct ns_common *ns); }; extern const struct proc_ns_operations netns_operations; @@ -66,6 +67,8 @@ static inline int ns_alloc_inum(struct ns_common *ns) #define ns_free_inum(ns) proc_free_inum((ns)->inum) extern struct file *proc_ns_fget(int fd); +extern void *proc_get_ns_by_fd(int fd, int type); + #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) extern void *ns_get_path(struct path *path, struct task_struct *task, const struct proc_ns_operations *ns_ops); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a65ba13..0c87393 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -388,12 +388,18 @@ static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) return 0; } +static void *pidns_get_type(struct ns_common *ns) +{ + return get_pid_ns(to_pid_ns(ns)); +} + const struct proc_ns_operations pidns_operations = { .name = "pid", .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, + .get_type = pidns_get_type, }; static __init int pid_namespaces_init(void) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 572af00..6465dc0 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -420,22 +420,7 @@ EXPORT_SYMBOL_GPL(__put_net); struct net *get_net_ns_by_fd(int fd) { - struct file *file; - struct ns_common *ns; - struct net *net; - - file = proc_ns_fget(fd); - if (IS_ERR(file)) - return ERR_CAST(file); - - ns = get_proc_ns(file_inode(file)); - if (ns->ops == &netns_operations) - net = get_net(container_of(ns, struct net, ns)); - else - net = ERR_PTR(-EINVAL); - - fput(file); - return net; + return proc_get_ns_by_fd(fd, CLONE_NEWNET); } #else @@ -955,11 +940,17 @@ static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns) return 0; } +static void *netns_get_type(struct ns_common *ns) +{ + return get_net(to_net_ns(ns)); +} + const struct proc_ns_operations netns_operations = { .name = "net", .type = CLONE_NEWNET, .get = netns_get, .put = netns_put, .install = netns_install, + .get_type = netns_get_type, }; #endif -- 1.5.5.1 ^ permalink raw reply related [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd() [not found] ` <20150925175654.GA12504-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> 2015-09-25 17:57 ` Oleg Nesterov @ 2015-09-28 8:21 ` Konstantin Khlebnikov 2015-09-28 16:37 ` Eric W. Biederman 2 siblings, 0 replies; 37+ messages in thread From: Konstantin Khlebnikov @ 2015-09-28 8:21 UTC (permalink / raw) To: Oleg Nesterov Cc: Roman Gushchin, linux-api-u79uwXL29TY76Z2rM5mHXA, containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Eric W. Biederman, Chen Fan, Andrew Morton, Linus Torvalds On 25.09.2015 20:56, Oleg Nesterov wrote: > On 09/25, Konstantin Khlebnikov wrote: >> >> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref) >> { >> - struct file *file; >> + struct ns_common *ns; >> + struct fd f; >> >> - file = fget(fd); >> - if (!file) >> + f = fdget(fd); >> + if (!f.file) >> return ERR_PTR(-EBADF); >> >> - if (file->f_op != &ns_file_operations) >> + if (f.file->f_op != &ns_file_operations) >> + goto out_invalid; >> + >> + ns = get_proc_ns(file_inode(f.file)); >> + if (nstype && (ns->ops->type != nstype)) >> goto out_invalid; >> >> - return file; >> + *fd_ref = f; >> + return ns; >> >> out_invalid: >> - fput(file); >> + fdput(f); >> return ERR_PTR(-EINVAL); >> } > > Well yes, fdget() makes sense but this is minor. > > Honestly, I do not really like the new helper... I understand this > is subjective, so I won't insist. But how about 1/1? We do not need > fd/file at all. With this patch your sys_getvpid() can just use > proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns(). Hmm. My version has 0 or 2 atomic ops per get-put sequence. Your version: 2 or 4 atomic ops. Plus even in worst case pinning by struct fd theoretically scales better because it touches refcount at struct file: there're might be many of them for one namespace. > > Eric, what do you think? > > See also "TODO" in the changelog. > > Oleg. > -- Konstantin ^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd() [not found] ` <20150925175654.GA12504-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> 2015-09-25 17:57 ` Oleg Nesterov 2015-09-28 8:21 ` [PATCH 0/1] " Konstantin Khlebnikov @ 2015-09-28 16:37 ` Eric W. Biederman 2 siblings, 0 replies; 37+ messages in thread From: Eric W. Biederman @ 2015-09-28 16:37 UTC (permalink / raw) To: Oleg Nesterov Cc: Roman Gushchin, Konstantin Khlebnikov, linux-api-u79uwXL29TY76Z2rM5mHXA, containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chen Fan, Andrew Morton, Linus Torvalds Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes: > On 09/25, Konstantin Khlebnikov wrote: >> >> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref) >> { >> - struct file *file; >> + struct ns_common *ns; >> + struct fd f; >> >> - file = fget(fd); >> - if (!file) >> + f = fdget(fd); >> + if (!f.file) >> return ERR_PTR(-EBADF); >> >> - if (file->f_op != &ns_file_operations) >> + if (f.file->f_op != &ns_file_operations) >> + goto out_invalid; >> + >> + ns = get_proc_ns(file_inode(f.file)); >> + if (nstype && (ns->ops->type != nstype)) >> goto out_invalid; >> >> - return file; >> + *fd_ref = f; >> + return ns; >> >> out_invalid: >> - fput(file); >> + fdput(f); >> return ERR_PTR(-EINVAL); >> } > > Well yes, fdget() makes sense but this is minor. > > Honestly, I do not really like the new helper... I understand this > is subjective, so I won't insist. But how about 1/1? We do not need > fd/file at all. With this patch your sys_getvpid() can just use > proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns(). > > Eric, what do you think? At some level I don't care this is not exposed to userspace. However since we are going several rounds with this. Of the existing uses several of them sleep, which unfortunately means we can not use rcu locking for everything. The network namespace ones wind up taking a reference to struct net because the have the legacy pid case to deal with. Which makes we can not use fdget for all callers either. For this translate_pid rcu locking is sufficient, rcu locking is easy and doing any more than rcu locking just seems silly. So let me respectfully suggest. struct ns_common *ns_by_fd_rcu(int fd, int type) { struct files_struct *files = current->files; struct file *file; struct ns_common *ns; void *ret; file = fcheck_files(files, fd); if (!file) return ERR_PTR(-EBADF); if (file->f_mode & FMODE_PATH) return ERR_PTR(-EINVAL); if (file->f_op != &ns_file_operations) return ERR_PTR(-EINVAL); ns = get_proc_ns(file_inode(file)); if (ns->ops->type != type) return ERR_PTR(-EINVAL); return ns; } struct pid_namespace *pidns_by_fd_rcu(int fd) { struct ns_common *ns = ns_by_fd_rcu(fd, CLONE_NEWPID); if (IS_ERR(ns)) return ERR_CAST(ns); return container_of(ns, struct pid_namespace, ns); } SYSCALL_DEFINE3(translate_pid, pid_t, pid_nr, int, sourcefd, int, targetfd) { struct pid_namespace *source, *target; struct pid *pid; pid_t result; rcu_read_lock(); if (sourcefd >= 0) source = pidns_by_fd_rcu(sourcefd); else source = task_active_pid_ns(current); if (targetfd >= 0) target = pidns_by_fd_rcu(targetfd); else target = task_active_pid_ns(current); pid = find_pid_ns(pid_nr, source); result = pid_nr_ns(pid, target); if (result == 0) result = -ESRCH; rcu_read_unlock(); return result; } Eric ^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd() [not found] ` <20150925175654.GA12504-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> @ 2015-09-28 8:21 ` Konstantin Khlebnikov 2015-09-28 8:21 ` [PATCH 0/1] " Konstantin Khlebnikov 2015-09-28 16:37 ` Eric W. Biederman 2 siblings, 0 replies; 37+ messages in thread From: Konstantin Khlebnikov @ 2015-09-28 8:21 UTC (permalink / raw) To: Oleg Nesterov Cc: linux-api, containers, linux-kernel, Roman Gushchin, Serge Hallyn, Eric W. Biederman, Chen Fan, Andrew Morton, Linus Torvalds, Stéphane Graber On 25.09.2015 20:56, Oleg Nesterov wrote: > On 09/25, Konstantin Khlebnikov wrote: >> >> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref) >> { >> - struct file *file; >> + struct ns_common *ns; >> + struct fd f; >> >> - file = fget(fd); >> - if (!file) >> + f = fdget(fd); >> + if (!f.file) >> return ERR_PTR(-EBADF); >> >> - if (file->f_op != &ns_file_operations) >> + if (f.file->f_op != &ns_file_operations) >> + goto out_invalid; >> + >> + ns = get_proc_ns(file_inode(f.file)); >> + if (nstype && (ns->ops->type != nstype)) >> goto out_invalid; >> >> - return file; >> + *fd_ref = f; >> + return ns; >> >> out_invalid: >> - fput(file); >> + fdput(f); >> return ERR_PTR(-EINVAL); >> } > > Well yes, fdget() makes sense but this is minor. > > Honestly, I do not really like the new helper... I understand this > is subjective, so I won't insist. But how about 1/1? We do not need > fd/file at all. With this patch your sys_getvpid() can just use > proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns(). Hmm. My version has 0 or 2 atomic ops per get-put sequence. Your version: 2 or 4 atomic ops. Plus even in worst case pinning by struct fd theoretically scales better because it touches refcount at struct file: there're might be many of them for one namespace. > > Eric, what do you think? > > See also "TODO" in the changelog. > > Oleg. > -- Konstantin ^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd() @ 2015-09-28 8:21 ` Konstantin Khlebnikov 0 siblings, 0 replies; 37+ messages in thread From: Konstantin Khlebnikov @ 2015-09-28 8:21 UTC (permalink / raw) To: Oleg Nesterov Cc: linux-api-u79uwXL29TY76Z2rM5mHXA, containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Roman Gushchin, Serge Hallyn, Eric W. Biederman, Chen Fan, Andrew Morton, Linus Torvalds, Stéphane Graber On 25.09.2015 20:56, Oleg Nesterov wrote: > On 09/25, Konstantin Khlebnikov wrote: >> >> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref) >> { >> - struct file *file; >> + struct ns_common *ns; >> + struct fd f; >> >> - file = fget(fd); >> - if (!file) >> + f = fdget(fd); >> + if (!f.file) >> return ERR_PTR(-EBADF); >> >> - if (file->f_op != &ns_file_operations) >> + if (f.file->f_op != &ns_file_operations) >> + goto out_invalid; >> + >> + ns = get_proc_ns(file_inode(f.file)); >> + if (nstype && (ns->ops->type != nstype)) >> goto out_invalid; >> >> - return file; >> + *fd_ref = f; >> + return ns; >> >> out_invalid: >> - fput(file); >> + fdput(f); >> return ERR_PTR(-EINVAL); >> } > > Well yes, fdget() makes sense but this is minor. > > Honestly, I do not really like the new helper... I understand this > is subjective, so I won't insist. But how about 1/1? We do not need > fd/file at all. With this patch your sys_getvpid() can just use > proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns(). Hmm. My version has 0 or 2 atomic ops per get-put sequence. Your version: 2 or 4 atomic ops. Plus even in worst case pinning by struct fd theoretically scales better because it touches refcount at struct file: there're might be many of them for one namespace. > > Eric, what do you think? > > See also "TODO" in the changelog. > > Oleg. > -- Konstantin ^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd() [not found] ` <20150925175654.GA12504-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> @ 2015-09-28 16:37 ` Eric W. Biederman 2015-09-28 8:21 ` [PATCH 0/1] " Konstantin Khlebnikov 2015-09-28 16:37 ` Eric W. Biederman 2 siblings, 0 replies; 37+ messages in thread From: Eric W. Biederman @ 2015-09-28 16:37 UTC (permalink / raw) To: Oleg Nesterov Cc: Konstantin Khlebnikov, linux-api, containers, linux-kernel, Roman Gushchin, Serge Hallyn, Chen Fan, Andrew Morton, Linus Torvalds, Stéphane Graber Oleg Nesterov <oleg@redhat.com> writes: > On 09/25, Konstantin Khlebnikov wrote: >> >> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref) >> { >> - struct file *file; >> + struct ns_common *ns; >> + struct fd f; >> >> - file = fget(fd); >> - if (!file) >> + f = fdget(fd); >> + if (!f.file) >> return ERR_PTR(-EBADF); >> >> - if (file->f_op != &ns_file_operations) >> + if (f.file->f_op != &ns_file_operations) >> + goto out_invalid; >> + >> + ns = get_proc_ns(file_inode(f.file)); >> + if (nstype && (ns->ops->type != nstype)) >> goto out_invalid; >> >> - return file; >> + *fd_ref = f; >> + return ns; >> >> out_invalid: >> - fput(file); >> + fdput(f); >> return ERR_PTR(-EINVAL); >> } > > Well yes, fdget() makes sense but this is minor. > > Honestly, I do not really like the new helper... I understand this > is subjective, so I won't insist. But how about 1/1? We do not need > fd/file at all. With this patch your sys_getvpid() can just use > proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns(). > > Eric, what do you think? At some level I don't care this is not exposed to userspace. However since we are going several rounds with this. Of the existing uses several of them sleep, which unfortunately means we can not use rcu locking for everything. The network namespace ones wind up taking a reference to struct net because the have the legacy pid case to deal with. Which makes we can not use fdget for all callers either. For this translate_pid rcu locking is sufficient, rcu locking is easy and doing any more than rcu locking just seems silly. So let me respectfully suggest. struct ns_common *ns_by_fd_rcu(int fd, int type) { struct files_struct *files = current->files; struct file *file; struct ns_common *ns; void *ret; file = fcheck_files(files, fd); if (!file) return ERR_PTR(-EBADF); if (file->f_mode & FMODE_PATH) return ERR_PTR(-EINVAL); if (file->f_op != &ns_file_operations) return ERR_PTR(-EINVAL); ns = get_proc_ns(file_inode(file)); if (ns->ops->type != type) return ERR_PTR(-EINVAL); return ns; } struct pid_namespace *pidns_by_fd_rcu(int fd) { struct ns_common *ns = ns_by_fd_rcu(fd, CLONE_NEWPID); if (IS_ERR(ns)) return ERR_CAST(ns); return container_of(ns, struct pid_namespace, ns); } SYSCALL_DEFINE3(translate_pid, pid_t, pid_nr, int, sourcefd, int, targetfd) { struct pid_namespace *source, *target; struct pid *pid; pid_t result; rcu_read_lock(); if (sourcefd >= 0) source = pidns_by_fd_rcu(sourcefd); else source = task_active_pid_ns(current); if (targetfd >= 0) target = pidns_by_fd_rcu(targetfd); else target = task_active_pid_ns(current); pid = find_pid_ns(pid_nr, source); result = pid_nr_ns(pid, target); if (result == 0) result = -ESRCH; rcu_read_unlock(); return result; } Eric ^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd() @ 2015-09-28 16:37 ` Eric W. Biederman 0 siblings, 0 replies; 37+ messages in thread From: Eric W. Biederman @ 2015-09-28 16:37 UTC (permalink / raw) To: Oleg Nesterov Cc: Konstantin Khlebnikov, linux-api-u79uwXL29TY76Z2rM5mHXA, containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Roman Gushchin, Serge Hallyn, Chen Fan, Andrew Morton, Linus Torvalds, Stéphane Graber Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes: > On 09/25, Konstantin Khlebnikov wrote: >> >> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref) >> { >> - struct file *file; >> + struct ns_common *ns; >> + struct fd f; >> >> - file = fget(fd); >> - if (!file) >> + f = fdget(fd); >> + if (!f.file) >> return ERR_PTR(-EBADF); >> >> - if (file->f_op != &ns_file_operations) >> + if (f.file->f_op != &ns_file_operations) >> + goto out_invalid; >> + >> + ns = get_proc_ns(file_inode(f.file)); >> + if (nstype && (ns->ops->type != nstype)) >> goto out_invalid; >> >> - return file; >> + *fd_ref = f; >> + return ns; >> >> out_invalid: >> - fput(file); >> + fdput(f); >> return ERR_PTR(-EINVAL); >> } > > Well yes, fdget() makes sense but this is minor. > > Honestly, I do not really like the new helper... I understand this > is subjective, so I won't insist. But how about 1/1? We do not need > fd/file at all. With this patch your sys_getvpid() can just use > proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns(). > > Eric, what do you think? At some level I don't care this is not exposed to userspace. However since we are going several rounds with this. Of the existing uses several of them sleep, which unfortunately means we can not use rcu locking for everything. The network namespace ones wind up taking a reference to struct net because the have the legacy pid case to deal with. Which makes we can not use fdget for all callers either. For this translate_pid rcu locking is sufficient, rcu locking is easy and doing any more than rcu locking just seems silly. So let me respectfully suggest. struct ns_common *ns_by_fd_rcu(int fd, int type) { struct files_struct *files = current->files; struct file *file; struct ns_common *ns; void *ret; file = fcheck_files(files, fd); if (!file) return ERR_PTR(-EBADF); if (file->f_mode & FMODE_PATH) return ERR_PTR(-EINVAL); if (file->f_op != &ns_file_operations) return ERR_PTR(-EINVAL); ns = get_proc_ns(file_inode(file)); if (ns->ops->type != type) return ERR_PTR(-EINVAL); return ns; } struct pid_namespace *pidns_by_fd_rcu(int fd) { struct ns_common *ns = ns_by_fd_rcu(fd, CLONE_NEWPID); if (IS_ERR(ns)) return ERR_CAST(ns); return container_of(ns, struct pid_namespace, ns); } SYSCALL_DEFINE3(translate_pid, pid_t, pid_nr, int, sourcefd, int, targetfd) { struct pid_namespace *source, *target; struct pid *pid; pid_t result; rcu_read_lock(); if (sourcefd >= 0) source = pidns_by_fd_rcu(sourcefd); else source = task_active_pid_ns(current); if (targetfd >= 0) target = pidns_by_fd_rcu(targetfd); else target = task_active_pid_ns(current); pid = find_pid_ns(pid_nr, source); result = pid_nr_ns(pid, target); if (result == 0) result = -ESRCH; rcu_read_unlock(); return result; } Eric ^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd() [not found] ` <871tdi8pqj.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org> @ 2015-09-29 16:43 ` Oleg Nesterov 2015-09-30 2:54 ` Chen Fan 1 sibling, 0 replies; 37+ messages in thread From: Oleg Nesterov @ 2015-09-29 16:43 UTC (permalink / raw) To: Eric W. Biederman Cc: Konstantin Khlebnikov, linux-api, containers, linux-kernel, Roman Gushchin, Serge Hallyn, Chen Fan, Andrew Morton, Linus Torvalds, Stéphane Graber On 09/28, Eric W. Biederman wrote: > > Oleg Nesterov <oleg@redhat.com> writes: > > > Honestly, I do not really like the new helper... I understand this > > is subjective, so I won't insist. But how about 1/1? We do not need > > fd/file at all. With this patch your sys_getvpid() can just use > > proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns(). > > > > Eric, what do you think? > > At some level I don't care this is not exposed to userspace. I agree, this is minor. But you know, the kernel is already complicated too much, we should try to simplify/cleanup everything we can ;) > Of the existing uses several of them sleep, which unfortunately means we > can not use rcu locking for everything. The network namespace ones wind > up taking a reference to struct net because the have the legacy pid case > to deal with. Which makes we can not use fdget for all callers either. And that is why 1/1 adds proc_get_ns_by_fd/get_type which can also be used by the network namespace. > For this translate_pid rcu locking is sufficient, rcu locking is easy > and doing any more than rcu locking just seems silly. So let me > respectfully suggest. > > struct ns_common *ns_by_fd_rcu(int fd, int type) > { > struct files_struct *files = current->files; > struct file *file; > struct ns_common *ns; > void *ret; > > file = fcheck_files(files, fd); > if (!file) > return ERR_PTR(-EBADF); > > if (file->f_mode & FMODE_PATH) > return ERR_PTR(-EINVAL); > > if (file->f_op != &ns_file_operations) > return ERR_PTR(-EINVAL); > > ns = get_proc_ns(file_inode(file)); > if (ns->ops->type != type) > return ERR_PTR(-EINVAL); > > return ns; > } OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref). And in any case fcheck_files() makes more sense than fdget(), somehow I did not think about this when I sent 1/1. Hmm. and after the quick look at cleanup_net() I can't understand whether get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or not... Can it? Oleg. ^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd() @ 2015-09-29 16:43 ` Oleg Nesterov 0 siblings, 0 replies; 37+ messages in thread From: Oleg Nesterov @ 2015-09-29 16:43 UTC (permalink / raw) To: Eric W. Biederman Cc: Konstantin Khlebnikov, linux-api-u79uwXL29TY76Z2rM5mHXA, containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Roman Gushchin, Serge Hallyn, Chen Fan, Andrew Morton, Linus Torvalds, Stéphane Graber On 09/28, Eric W. Biederman wrote: > > Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes: > > > Honestly, I do not really like the new helper... I understand this > > is subjective, so I won't insist. But how about 1/1? We do not need > > fd/file at all. With this patch your sys_getvpid() can just use > > proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns(). > > > > Eric, what do you think? > > At some level I don't care this is not exposed to userspace. I agree, this is minor. But you know, the kernel is already complicated too much, we should try to simplify/cleanup everything we can ;) > Of the existing uses several of them sleep, which unfortunately means we > can not use rcu locking for everything. The network namespace ones wind > up taking a reference to struct net because the have the legacy pid case > to deal with. Which makes we can not use fdget for all callers either. And that is why 1/1 adds proc_get_ns_by_fd/get_type which can also be used by the network namespace. > For this translate_pid rcu locking is sufficient, rcu locking is easy > and doing any more than rcu locking just seems silly. So let me > respectfully suggest. > > struct ns_common *ns_by_fd_rcu(int fd, int type) > { > struct files_struct *files = current->files; > struct file *file; > struct ns_common *ns; > void *ret; > > file = fcheck_files(files, fd); > if (!file) > return ERR_PTR(-EBADF); > > if (file->f_mode & FMODE_PATH) > return ERR_PTR(-EINVAL); > > if (file->f_op != &ns_file_operations) > return ERR_PTR(-EINVAL); > > ns = get_proc_ns(file_inode(file)); > if (ns->ops->type != type) > return ERR_PTR(-EINVAL); > > return ns; > } OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref). And in any case fcheck_files() makes more sense than fdget(), somehow I did not think about this when I sent 1/1. Hmm. and after the quick look at cleanup_net() I can't understand whether get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or not... Can it? Oleg. ^ permalink raw reply [flat|nested] 37+ messages in thread
[parent not found: <20150929164315.GA16734-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>]
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd() [not found] ` <20150929164315.GA16734-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> @ 2015-09-29 17:30 ` Eric W. Biederman 0 siblings, 0 replies; 37+ messages in thread From: Eric W. Biederman @ 2015-09-29 17:30 UTC (permalink / raw) To: Oleg Nesterov Cc: Roman Gushchin, Konstantin Khlebnikov, linux-api-u79uwXL29TY76Z2rM5mHXA, containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chen Fan, Andrew Morton, Linus Torvalds Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes: > On 09/28, Eric W. Biederman wrote: >> >> Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes: >> >> > Honestly, I do not really like the new helper... I understand this >> > is subjective, so I won't insist. But how about 1/1? We do not need >> > fd/file at all. With this patch your sys_getvpid() can just use >> > proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns(). >> > >> > Eric, what do you think? >> >> At some level I don't care this is not exposed to userspace. > > I agree, this is minor. But you know, the kernel is already complicated > too much, we should try to simplify/cleanup everything we can ;) > >> Of the existing uses several of them sleep, which unfortunately means we >> can not use rcu locking for everything. The network namespace ones wind >> up taking a reference to struct net because the have the legacy pid case >> to deal with. Which makes we can not use fdget for all callers either. > > And that is why 1/1 adds proc_get_ns_by_fd/get_type which can also be used > by the network namespace. > >> For this translate_pid rcu locking is sufficient, rcu locking is easy >> and doing any more than rcu locking just seems silly. So let me >> respectfully suggest. >> >> struct ns_common *ns_by_fd_rcu(int fd, int type) >> { >> struct files_struct *files = current->files; >> struct file *file; >> struct ns_common *ns; >> void *ret; >> >> file = fcheck_files(files, fd); >> if (!file) >> return ERR_PTR(-EBADF); >> >> if (file->f_mode & FMODE_PATH) >> return ERR_PTR(-EINVAL); >> >> if (file->f_op != &ns_file_operations) >> return ERR_PTR(-EINVAL); >> >> ns = get_proc_ns(file_inode(file)); >> if (ns->ops->type != type) >> return ERR_PTR(-EINVAL); >> >> return ns; >> } > > OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref). > > And in any case fcheck_files() makes more sense than fdget(), somehow I did > not think about this when I sent 1/1. > > Hmm. and after the quick look at cleanup_net() I can't understand whether > get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or > not... Can it? Some of those places need a reference that allows them to sleep, and the code is shared with the legacy pid case so with an addition of get_net we can use ns_by_fd_rcu(). There are cases like setns that could use ns_by_fd_rcu() with code reording. We can implement get_net_ns_by_fd as: struct net *get_net_ns_by_fd(int fd) { struct net *net; rcu_read_lock(); net = net_ns_by_fd_rcu(fd); if (!IS_ERR(net)) get_net(net); rcu_read_unlock(); return net; } Which means we can achieve code sharing with the pure rcu version as a base. If the networking code did not have the legacy pid case to handle I would want to do something with struct fd, as the file descriptor already keeps the struct net reference alive and struct fd allows for sleeping. Eric ^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd() [not found] ` <20150929164315.GA16734-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> @ 2015-09-29 17:30 ` Eric W. Biederman 0 siblings, 0 replies; 37+ messages in thread From: Eric W. Biederman @ 2015-09-29 17:30 UTC (permalink / raw) To: Oleg Nesterov Cc: Konstantin Khlebnikov, linux-api, containers, linux-kernel, Roman Gushchin, Serge Hallyn, Chen Fan, Andrew Morton, Linus Torvalds, Stéphane Graber Oleg Nesterov <oleg@redhat.com> writes: > On 09/28, Eric W. Biederman wrote: >> >> Oleg Nesterov <oleg@redhat.com> writes: >> >> > Honestly, I do not really like the new helper... I understand this >> > is subjective, so I won't insist. But how about 1/1? We do not need >> > fd/file at all. With this patch your sys_getvpid() can just use >> > proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns(). >> > >> > Eric, what do you think? >> >> At some level I don't care this is not exposed to userspace. > > I agree, this is minor. But you know, the kernel is already complicated > too much, we should try to simplify/cleanup everything we can ;) > >> Of the existing uses several of them sleep, which unfortunately means we >> can not use rcu locking for everything. The network namespace ones wind >> up taking a reference to struct net because the have the legacy pid case >> to deal with. Which makes we can not use fdget for all callers either. > > And that is why 1/1 adds proc_get_ns_by_fd/get_type which can also be used > by the network namespace. > >> For this translate_pid rcu locking is sufficient, rcu locking is easy >> and doing any more than rcu locking just seems silly. So let me >> respectfully suggest. >> >> struct ns_common *ns_by_fd_rcu(int fd, int type) >> { >> struct files_struct *files = current->files; >> struct file *file; >> struct ns_common *ns; >> void *ret; >> >> file = fcheck_files(files, fd); >> if (!file) >> return ERR_PTR(-EBADF); >> >> if (file->f_mode & FMODE_PATH) >> return ERR_PTR(-EINVAL); >> >> if (file->f_op != &ns_file_operations) >> return ERR_PTR(-EINVAL); >> >> ns = get_proc_ns(file_inode(file)); >> if (ns->ops->type != type) >> return ERR_PTR(-EINVAL); >> >> return ns; >> } > > OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref). > > And in any case fcheck_files() makes more sense than fdget(), somehow I did > not think about this when I sent 1/1. > > Hmm. and after the quick look at cleanup_net() I can't understand whether > get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or > not... Can it? Some of those places need a reference that allows them to sleep, and the code is shared with the legacy pid case so with an addition of get_net we can use ns_by_fd_rcu(). There are cases like setns that could use ns_by_fd_rcu() with code reording. We can implement get_net_ns_by_fd as: struct net *get_net_ns_by_fd(int fd) { struct net *net; rcu_read_lock(); net = net_ns_by_fd_rcu(fd); if (!IS_ERR(net)) get_net(net); rcu_read_unlock(); return net; } Which means we can achieve code sharing with the pure rcu version as a base. If the networking code did not have the legacy pid case to handle I would want to do something with struct fd, as the file descriptor already keeps the struct net reference alive and struct fd allows for sleeping. Eric ^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd() @ 2015-09-29 17:30 ` Eric W. Biederman 0 siblings, 0 replies; 37+ messages in thread From: Eric W. Biederman @ 2015-09-29 17:30 UTC (permalink / raw) To: Oleg Nesterov Cc: Konstantin Khlebnikov, linux-api-u79uwXL29TY76Z2rM5mHXA, containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Roman Gushchin, Serge Hallyn, Chen Fan, Andrew Morton, Linus Torvalds, Stéphane Graber Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes: > On 09/28, Eric W. Biederman wrote: >> >> Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes: >> >> > Honestly, I do not really like the new helper... I understand this >> > is subjective, so I won't insist. But how about 1/1? We do not need >> > fd/file at all. With this patch your sys_getvpid() can just use >> > proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns(). >> > >> > Eric, what do you think? >> >> At some level I don't care this is not exposed to userspace. > > I agree, this is minor. But you know, the kernel is already complicated > too much, we should try to simplify/cleanup everything we can ;) > >> Of the existing uses several of them sleep, which unfortunately means we >> can not use rcu locking for everything. The network namespace ones wind >> up taking a reference to struct net because the have the legacy pid case >> to deal with. Which makes we can not use fdget for all callers either. > > And that is why 1/1 adds proc_get_ns_by_fd/get_type which can also be used > by the network namespace. > >> For this translate_pid rcu locking is sufficient, rcu locking is easy >> and doing any more than rcu locking just seems silly. So let me >> respectfully suggest. >> >> struct ns_common *ns_by_fd_rcu(int fd, int type) >> { >> struct files_struct *files = current->files; >> struct file *file; >> struct ns_common *ns; >> void *ret; >> >> file = fcheck_files(files, fd); >> if (!file) >> return ERR_PTR(-EBADF); >> >> if (file->f_mode & FMODE_PATH) >> return ERR_PTR(-EINVAL); >> >> if (file->f_op != &ns_file_operations) >> return ERR_PTR(-EINVAL); >> >> ns = get_proc_ns(file_inode(file)); >> if (ns->ops->type != type) >> return ERR_PTR(-EINVAL); >> >> return ns; >> } > > OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref). > > And in any case fcheck_files() makes more sense than fdget(), somehow I did > not think about this when I sent 1/1. > > Hmm. and after the quick look at cleanup_net() I can't understand whether > get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or > not... Can it? Some of those places need a reference that allows them to sleep, and the code is shared with the legacy pid case so with an addition of get_net we can use ns_by_fd_rcu(). There are cases like setns that could use ns_by_fd_rcu() with code reording. We can implement get_net_ns_by_fd as: struct net *get_net_ns_by_fd(int fd) { struct net *net; rcu_read_lock(); net = net_ns_by_fd_rcu(fd); if (!IS_ERR(net)) get_net(net); rcu_read_unlock(); return net; } Which means we can achieve code sharing with the pure rcu version as a base. If the networking code did not have the legacy pid case to handle I would want to do something with struct fd, as the file descriptor already keeps the struct net reference alive and struct fd allows for sleeping. Eric ^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd() [not found] ` <874mid16bk.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org> @ 2015-09-29 18:38 ` Oleg Nesterov 0 siblings, 0 replies; 37+ messages in thread From: Oleg Nesterov @ 2015-09-29 18:38 UTC (permalink / raw) To: Eric W. Biederman Cc: Konstantin Khlebnikov, linux-api, containers, linux-kernel, Roman Gushchin, Serge Hallyn, Chen Fan, Andrew Morton, Linus Torvalds, Stéphane Graber On 09/29, Eric W. Biederman wrote: > > Oleg Nesterov <oleg@redhat.com> writes: > > > OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref). > > > > And in any case fcheck_files() makes more sense than fdget(), somehow I did > > not think about this when I sent 1/1. > > > > Hmm. and after the quick look at cleanup_net() I can't understand whether > > get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or > > not... Can it? > > Some of those places need a reference that allows them to sleep, and the > code is shared with the legacy pid case so with an addition of get_net > we can use ns_by_fd_rcu(). There are cases like setns that could > use ns_by_fd_rcu() with code reording. > > We can implement get_net_ns_by_fd as: > struct net *get_net_ns_by_fd(int fd) > { > struct net *net; > > rcu_read_lock(); > net = net_ns_by_fd_rcu(fd); > if (!IS_ERR(net)) > get_net(net); > rcu_read_unlock(); > > return net; > } > > Which means we can achieve code sharing with the pure rcu version > as a base. Yes, this is what I meant... but don't we need maybe_get_net() ? Oleg. ^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd() @ 2015-09-29 18:38 ` Oleg Nesterov 0 siblings, 0 replies; 37+ messages in thread From: Oleg Nesterov @ 2015-09-29 18:38 UTC (permalink / raw) To: Eric W. Biederman Cc: Konstantin Khlebnikov, linux-api-u79uwXL29TY76Z2rM5mHXA, containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Roman Gushchin, Serge Hallyn, Chen Fan, Andrew Morton, Linus Torvalds, Stéphane Graber On 09/29, Eric W. Biederman wrote: > > Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes: > > > OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref). > > > > And in any case fcheck_files() makes more sense than fdget(), somehow I did > > not think about this when I sent 1/1. > > > > Hmm. and after the quick look at cleanup_net() I can't understand whether > > get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or > > not... Can it? > > Some of those places need a reference that allows them to sleep, and the > code is shared with the legacy pid case so with an addition of get_net > we can use ns_by_fd_rcu(). There are cases like setns that could > use ns_by_fd_rcu() with code reording. > > We can implement get_net_ns_by_fd as: > struct net *get_net_ns_by_fd(int fd) > { > struct net *net; > > rcu_read_lock(); > net = net_ns_by_fd_rcu(fd); > if (!IS_ERR(net)) > get_net(net); > rcu_read_unlock(); > > return net; > } > > Which means we can achieve code sharing with the pure rcu version > as a base. Yes, this is what I meant... but don't we need maybe_get_net() ? Oleg. ^ permalink raw reply [flat|nested] 37+ messages in thread
[parent not found: <20150929183833.GA21875-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>]
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd() 2015-09-29 18:38 ` Oleg Nesterov @ 2015-09-29 19:05 ` Eric W. Biederman -1 siblings, 0 replies; 37+ messages in thread From: Eric W. Biederman @ 2015-09-29 19:05 UTC (permalink / raw) To: Oleg Nesterov Cc: Roman Gushchin, Konstantin Khlebnikov, linux-api-u79uwXL29TY76Z2rM5mHXA, containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chen Fan, Andrew Morton, Linus Torvalds Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes: > On 09/29, Eric W. Biederman wrote: >> >> Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes: >> >> > OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref). >> > >> > And in any case fcheck_files() makes more sense than fdget(), somehow I did >> > not think about this when I sent 1/1. >> > >> > Hmm. and after the quick look at cleanup_net() I can't understand whether >> > get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or >> > not... Can it? >> >> Some of those places need a reference that allows them to sleep, and the >> code is shared with the legacy pid case so with an addition of get_net >> we can use ns_by_fd_rcu(). There are cases like setns that could >> use ns_by_fd_rcu() with code reording. >> >> We can implement get_net_ns_by_fd as: >> struct net *get_net_ns_by_fd(int fd) >> { >> struct net *net; >> >> rcu_read_lock(); >> net = net_ns_by_fd_rcu(fd); >> if (!IS_ERR(net)) >> get_net(net); >> rcu_read_unlock(); >> >> return net; >> } >> >> Which means we can achieve code sharing with the pure rcu version >> as a base. > > Yes, this is what I meant... but don't we need maybe_get_net() ? Let me see. The call chain is a little interesting so let's trace it. thread1 thread2 net = net_ns_by_fd_rcu(fd); fput(file) get_net(net); ____fput __fput dput dentry_kill __dentry_kill dentry_iput iput iput_final evict ->evict_inode (nsfs_evict) netns_put put_net __put_net queue_work(net_cleanup_work) ... cleanup_net synchronize_rcu. Given that someone in another thread can put the file descriptor and I don't see anything explicitly waiting until after the rcu critical section ends to put the network namespace I guess we do need maybe_get_net. In practice I think the delay logic in fput will act as an rcu barrier, but that is not guaranteed so we had better not count on it. Sigh. Anyway this is a long ways from a syscall to translate pids. Eric ^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd() @ 2015-09-29 19:05 ` Eric W. Biederman 0 siblings, 0 replies; 37+ messages in thread From: Eric W. Biederman @ 2015-09-29 19:05 UTC (permalink / raw) To: Oleg Nesterov Cc: Konstantin Khlebnikov, linux-api, containers, linux-kernel, Roman Gushchin, Serge Hallyn, Chen Fan, Andrew Morton, Linus Torvalds, Stéphane Graber Oleg Nesterov <oleg@redhat.com> writes: > On 09/29, Eric W. Biederman wrote: >> >> Oleg Nesterov <oleg@redhat.com> writes: >> >> > OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref). >> > >> > And in any case fcheck_files() makes more sense than fdget(), somehow I did >> > not think about this when I sent 1/1. >> > >> > Hmm. and after the quick look at cleanup_net() I can't understand whether >> > get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or >> > not... Can it? >> >> Some of those places need a reference that allows them to sleep, and the >> code is shared with the legacy pid case so with an addition of get_net >> we can use ns_by_fd_rcu(). There are cases like setns that could >> use ns_by_fd_rcu() with code reording. >> >> We can implement get_net_ns_by_fd as: >> struct net *get_net_ns_by_fd(int fd) >> { >> struct net *net; >> >> rcu_read_lock(); >> net = net_ns_by_fd_rcu(fd); >> if (!IS_ERR(net)) >> get_net(net); >> rcu_read_unlock(); >> >> return net; >> } >> >> Which means we can achieve code sharing with the pure rcu version >> as a base. > > Yes, this is what I meant... but don't we need maybe_get_net() ? Let me see. The call chain is a little interesting so let's trace it. thread1 thread2 net = net_ns_by_fd_rcu(fd); fput(file) get_net(net); ____fput __fput dput dentry_kill __dentry_kill dentry_iput iput iput_final evict ->evict_inode (nsfs_evict) netns_put put_net __put_net queue_work(net_cleanup_work) ... cleanup_net synchronize_rcu. Given that someone in another thread can put the file descriptor and I don't see anything explicitly waiting until after the rcu critical section ends to put the network namespace I guess we do need maybe_get_net. In practice I think the delay logic in fput will act as an rcu barrier, but that is not guaranteed so we had better not count on it. Sigh. Anyway this is a long ways from a syscall to translate pids. Eric ^ permalink raw reply [flat|nested] 37+ messages in thread
[parent not found: <874mid16bk.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>]
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd() [not found] ` <874mid16bk.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org> @ 2015-09-29 18:38 ` Oleg Nesterov 0 siblings, 0 replies; 37+ messages in thread From: Oleg Nesterov @ 2015-09-29 18:38 UTC (permalink / raw) To: Eric W. Biederman Cc: Roman Gushchin, Konstantin Khlebnikov, linux-api-u79uwXL29TY76Z2rM5mHXA, containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chen Fan, Andrew Morton, Linus Torvalds On 09/29, Eric W. Biederman wrote: > > Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes: > > > OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref). > > > > And in any case fcheck_files() makes more sense than fdget(), somehow I did > > not think about this when I sent 1/1. > > > > Hmm. and after the quick look at cleanup_net() I can't understand whether > > get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or > > not... Can it? > > Some of those places need a reference that allows them to sleep, and the > code is shared with the legacy pid case so with an addition of get_net > we can use ns_by_fd_rcu(). There are cases like setns that could > use ns_by_fd_rcu() with code reording. > > We can implement get_net_ns_by_fd as: > struct net *get_net_ns_by_fd(int fd) > { > struct net *net; > > rcu_read_lock(); > net = net_ns_by_fd_rcu(fd); > if (!IS_ERR(net)) > get_net(net); > rcu_read_unlock(); > > return net; > } > > Which means we can achieve code sharing with the pure rcu version > as a base. Yes, this is what I meant... but don't we need maybe_get_net() ? Oleg. ^ permalink raw reply [flat|nested] 37+ messages in thread
[parent not found: <871tdi8pqj.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>]
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd() [not found] ` <871tdi8pqj.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org> @ 2015-09-29 16:43 ` Oleg Nesterov 2015-09-30 2:54 ` Chen Fan 1 sibling, 0 replies; 37+ messages in thread From: Oleg Nesterov @ 2015-09-29 16:43 UTC (permalink / raw) To: Eric W. Biederman Cc: Roman Gushchin, Konstantin Khlebnikov, linux-api-u79uwXL29TY76Z2rM5mHXA, containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chen Fan, Andrew Morton, Linus Torvalds On 09/28, Eric W. Biederman wrote: > > Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes: > > > Honestly, I do not really like the new helper... I understand this > > is subjective, so I won't insist. But how about 1/1? We do not need > > fd/file at all. With this patch your sys_getvpid() can just use > > proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns(). > > > > Eric, what do you think? > > At some level I don't care this is not exposed to userspace. I agree, this is minor. But you know, the kernel is already complicated too much, we should try to simplify/cleanup everything we can ;) > Of the existing uses several of them sleep, which unfortunately means we > can not use rcu locking for everything. The network namespace ones wind > up taking a reference to struct net because the have the legacy pid case > to deal with. Which makes we can not use fdget for all callers either. And that is why 1/1 adds proc_get_ns_by_fd/get_type which can also be used by the network namespace. > For this translate_pid rcu locking is sufficient, rcu locking is easy > and doing any more than rcu locking just seems silly. So let me > respectfully suggest. > > struct ns_common *ns_by_fd_rcu(int fd, int type) > { > struct files_struct *files = current->files; > struct file *file; > struct ns_common *ns; > void *ret; > > file = fcheck_files(files, fd); > if (!file) > return ERR_PTR(-EBADF); > > if (file->f_mode & FMODE_PATH) > return ERR_PTR(-EINVAL); > > if (file->f_op != &ns_file_operations) > return ERR_PTR(-EINVAL); > > ns = get_proc_ns(file_inode(file)); > if (ns->ops->type != type) > return ERR_PTR(-EINVAL); > > return ns; > } OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref). And in any case fcheck_files() makes more sense than fdget(), somehow I did not think about this when I sent 1/1. Hmm. and after the quick look at cleanup_net() I can't understand whether get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or not... Can it? Oleg. ^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd() [not found] ` <871tdi8pqj.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org> 2015-09-29 16:43 ` Oleg Nesterov @ 2015-09-30 2:54 ` Chen Fan 1 sibling, 0 replies; 37+ messages in thread From: Chen Fan @ 2015-09-30 2:54 UTC (permalink / raw) To: Eric W. Biederman, Oleg Nesterov Cc: Roman Gushchin, Konstantin Khlebnikov, linux-api-u79uwXL29TY76Z2rM5mHXA, containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Andrew Morton, Linus Torvalds On 09/29/2015 12:37 AM, Eric W. Biederman wrote: > Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes: > >> On 09/25, Konstantin Khlebnikov wrote: >>> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref) >>> { >>> - struct file *file; >>> + struct ns_common *ns; >>> + struct fd f; >>> >>> - file = fget(fd); >>> - if (!file) >>> + f = fdget(fd); >>> + if (!f.file) >>> return ERR_PTR(-EBADF); >>> >>> - if (file->f_op != &ns_file_operations) >>> + if (f.file->f_op != &ns_file_operations) >>> + goto out_invalid; >>> + >>> + ns = get_proc_ns(file_inode(f.file)); >>> + if (nstype && (ns->ops->type != nstype)) >>> goto out_invalid; >>> >>> - return file; >>> + *fd_ref = f; >>> + return ns; >>> >>> out_invalid: >>> - fput(file); >>> + fdput(f); >>> return ERR_PTR(-EINVAL); >>> } >> Well yes, fdget() makes sense but this is minor. >> >> Honestly, I do not really like the new helper... I understand this >> is subjective, so I won't insist. But how about 1/1? We do not need >> fd/file at all. With this patch your sys_getvpid() can just use >> proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns(). >> >> Eric, what do you think? > At some level I don't care this is not exposed to userspace. > > However since we are going several rounds with this. > > Of the existing uses several of them sleep, which unfortunately means we > can not use rcu locking for everything. The network namespace ones wind > up taking a reference to struct net because the have the legacy pid case > to deal with. Which makes we can not use fdget for all callers either. > > For this translate_pid rcu locking is sufficient, rcu locking is easy > and doing any more than rcu locking just seems silly. So let me > respectfully suggest. > > struct ns_common *ns_by_fd_rcu(int fd, int type) > { > struct files_struct *files = current->files; > struct file *file; > struct ns_common *ns; > void *ret; pointer files seems no more useful. we can use fcheck(fd) instead. Thanks, > > file = fcheck_files(files, fd); > if (!file) > return ERR_PTR(-EBADF); > > if (file->f_mode & FMODE_PATH) > return ERR_PTR(-EINVAL); > > if (file->f_op != &ns_file_operations) > return ERR_PTR(-EINVAL); > > ns = get_proc_ns(file_inode(file)); > if (ns->ops->type != type) > return ERR_PTR(-EINVAL); > > return ns; > } > > struct pid_namespace *pidns_by_fd_rcu(int fd) > { > struct ns_common *ns = ns_by_fd_rcu(fd, CLONE_NEWPID); > if (IS_ERR(ns)) > return ERR_CAST(ns); > return container_of(ns, struct pid_namespace, ns); > } > > SYSCALL_DEFINE3(translate_pid, pid_t, pid_nr, int, sourcefd, int, targetfd) > { > struct pid_namespace *source, *target; > struct pid *pid; > pid_t result; > > rcu_read_lock(); > > if (sourcefd >= 0) > source = pidns_by_fd_rcu(sourcefd); > else > source = task_active_pid_ns(current); > > if (targetfd >= 0) > target = pidns_by_fd_rcu(targetfd); > else > target = task_active_pid_ns(current); > > pid = find_pid_ns(pid_nr, source); > result = pid_nr_ns(pid, target); > if (result == 0) > result = -ESRCH; > > rcu_read_unlock(); > > return result; > } > > Eric > . > ^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd() [not found] ` <871tdi8pqj.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org> @ 2015-09-30 2:54 ` Chen Fan 2015-09-30 2:54 ` Chen Fan 1 sibling, 0 replies; 37+ messages in thread From: Chen Fan @ 2015-09-30 2:54 UTC (permalink / raw) To: Eric W. Biederman, Oleg Nesterov Cc: Konstantin Khlebnikov, linux-api, containers, linux-kernel, Roman Gushchin, Serge Hallyn, Andrew Morton, Linus Torvalds, Stéphane Graber On 09/29/2015 12:37 AM, Eric W. Biederman wrote: > Oleg Nesterov <oleg@redhat.com> writes: > >> On 09/25, Konstantin Khlebnikov wrote: >>> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref) >>> { >>> - struct file *file; >>> + struct ns_common *ns; >>> + struct fd f; >>> >>> - file = fget(fd); >>> - if (!file) >>> + f = fdget(fd); >>> + if (!f.file) >>> return ERR_PTR(-EBADF); >>> >>> - if (file->f_op != &ns_file_operations) >>> + if (f.file->f_op != &ns_file_operations) >>> + goto out_invalid; >>> + >>> + ns = get_proc_ns(file_inode(f.file)); >>> + if (nstype && (ns->ops->type != nstype)) >>> goto out_invalid; >>> >>> - return file; >>> + *fd_ref = f; >>> + return ns; >>> >>> out_invalid: >>> - fput(file); >>> + fdput(f); >>> return ERR_PTR(-EINVAL); >>> } >> Well yes, fdget() makes sense but this is minor. >> >> Honestly, I do not really like the new helper... I understand this >> is subjective, so I won't insist. But how about 1/1? We do not need >> fd/file at all. With this patch your sys_getvpid() can just use >> proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns(). >> >> Eric, what do you think? > At some level I don't care this is not exposed to userspace. > > However since we are going several rounds with this. > > Of the existing uses several of them sleep, which unfortunately means we > can not use rcu locking for everything. The network namespace ones wind > up taking a reference to struct net because the have the legacy pid case > to deal with. Which makes we can not use fdget for all callers either. > > For this translate_pid rcu locking is sufficient, rcu locking is easy > and doing any more than rcu locking just seems silly. So let me > respectfully suggest. > > struct ns_common *ns_by_fd_rcu(int fd, int type) > { > struct files_struct *files = current->files; > struct file *file; > struct ns_common *ns; > void *ret; pointer files seems no more useful. we can use fcheck(fd) instead. Thanks, > > file = fcheck_files(files, fd); > if (!file) > return ERR_PTR(-EBADF); > > if (file->f_mode & FMODE_PATH) > return ERR_PTR(-EINVAL); > > if (file->f_op != &ns_file_operations) > return ERR_PTR(-EINVAL); > > ns = get_proc_ns(file_inode(file)); > if (ns->ops->type != type) > return ERR_PTR(-EINVAL); > > return ns; > } > > struct pid_namespace *pidns_by_fd_rcu(int fd) > { > struct ns_common *ns = ns_by_fd_rcu(fd, CLONE_NEWPID); > if (IS_ERR(ns)) > return ERR_CAST(ns); > return container_of(ns, struct pid_namespace, ns); > } > > SYSCALL_DEFINE3(translate_pid, pid_t, pid_nr, int, sourcefd, int, targetfd) > { > struct pid_namespace *source, *target; > struct pid *pid; > pid_t result; > > rcu_read_lock(); > > if (sourcefd >= 0) > source = pidns_by_fd_rcu(sourcefd); > else > source = task_active_pid_ns(current); > > if (targetfd >= 0) > target = pidns_by_fd_rcu(targetfd); > else > target = task_active_pid_ns(current); > > pid = find_pid_ns(pid_nr, source); > result = pid_nr_ns(pid, target); > if (result == 0) > result = -ESRCH; > > rcu_read_unlock(); > > return result; > } > > Eric > . > ^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd() @ 2015-09-30 2:54 ` Chen Fan 0 siblings, 0 replies; 37+ messages in thread From: Chen Fan @ 2015-09-30 2:54 UTC (permalink / raw) To: Eric W. Biederman, Oleg Nesterov Cc: Konstantin Khlebnikov, linux-api-u79uwXL29TY76Z2rM5mHXA, containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Roman Gushchin, Serge Hallyn, Andrew Morton, Linus Torvalds, Stéphane Graber On 09/29/2015 12:37 AM, Eric W. Biederman wrote: > Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes: > >> On 09/25, Konstantin Khlebnikov wrote: >>> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref) >>> { >>> - struct file *file; >>> + struct ns_common *ns; >>> + struct fd f; >>> >>> - file = fget(fd); >>> - if (!file) >>> + f = fdget(fd); >>> + if (!f.file) >>> return ERR_PTR(-EBADF); >>> >>> - if (file->f_op != &ns_file_operations) >>> + if (f.file->f_op != &ns_file_operations) >>> + goto out_invalid; >>> + >>> + ns = get_proc_ns(file_inode(f.file)); >>> + if (nstype && (ns->ops->type != nstype)) >>> goto out_invalid; >>> >>> - return file; >>> + *fd_ref = f; >>> + return ns; >>> >>> out_invalid: >>> - fput(file); >>> + fdput(f); >>> return ERR_PTR(-EINVAL); >>> } >> Well yes, fdget() makes sense but this is minor. >> >> Honestly, I do not really like the new helper... I understand this >> is subjective, so I won't insist. But how about 1/1? We do not need >> fd/file at all. With this patch your sys_getvpid() can just use >> proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns(). >> >> Eric, what do you think? > At some level I don't care this is not exposed to userspace. > > However since we are going several rounds with this. > > Of the existing uses several of them sleep, which unfortunately means we > can not use rcu locking for everything. The network namespace ones wind > up taking a reference to struct net because the have the legacy pid case > to deal with. Which makes we can not use fdget for all callers either. > > For this translate_pid rcu locking is sufficient, rcu locking is easy > and doing any more than rcu locking just seems silly. So let me > respectfully suggest. > > struct ns_common *ns_by_fd_rcu(int fd, int type) > { > struct files_struct *files = current->files; > struct file *file; > struct ns_common *ns; > void *ret; pointer files seems no more useful. we can use fcheck(fd) instead. Thanks, > > file = fcheck_files(files, fd); > if (!file) > return ERR_PTR(-EBADF); > > if (file->f_mode & FMODE_PATH) > return ERR_PTR(-EINVAL); > > if (file->f_op != &ns_file_operations) > return ERR_PTR(-EINVAL); > > ns = get_proc_ns(file_inode(file)); > if (ns->ops->type != type) > return ERR_PTR(-EINVAL); > > return ns; > } > > struct pid_namespace *pidns_by_fd_rcu(int fd) > { > struct ns_common *ns = ns_by_fd_rcu(fd, CLONE_NEWPID); > if (IS_ERR(ns)) > return ERR_CAST(ns); > return container_of(ns, struct pid_namespace, ns); > } > > SYSCALL_DEFINE3(translate_pid, pid_t, pid_nr, int, sourcefd, int, targetfd) > { > struct pid_namespace *source, *target; > struct pid *pid; > pid_t result; > > rcu_read_lock(); > > if (sourcefd >= 0) > source = pidns_by_fd_rcu(sourcefd); > else > source = task_active_pid_ns(current); > > if (targetfd >= 0) > target = pidns_by_fd_rcu(targetfd); > else > target = task_active_pid_ns(current); > > pid = find_pid_ns(pid_nr, source); > result = pid_nr_ns(pid, target); > if (result == 0) > result = -ESRCH; > > rcu_read_unlock(); > > return result; > } > > Eric > . > ^ permalink raw reply [flat|nested] 37+ messages in thread
end of thread, other threads:[~2015-10-20 10:04 UTC | newest] Thread overview: 37+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2015-09-25 13:52 [PATCH RFC v3 1/2] nsfs: replace proc_ns_fget() with proc_ns_fdget() Konstantin Khlebnikov 2015-09-25 13:52 ` Konstantin Khlebnikov 2015-09-25 13:52 ` [PATCH RFC v3 2/2] pidns: introduce syscall getvpid Konstantin Khlebnikov 2015-09-25 13:52 ` Konstantin Khlebnikov 2015-09-28 4:12 ` kbuild test robot 2015-09-28 4:12 ` kbuild test robot 2015-09-28 16:22 ` Eric W. Biederman 2015-09-28 16:22 ` Eric W. Biederman 2015-09-28 16:57 ` Eric W. Biederman 2015-09-28 16:57 ` Eric W. Biederman [not found] ` <87d1x25vng.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org> 2015-10-20 10:04 ` Konstantin Khlebnikov 2015-10-20 10:04 ` Konstantin Khlebnikov 2015-10-20 10:04 ` Konstantin Khlebnikov 2015-09-25 17:56 ` [PATCH 0/1] ns: introduce proc_get_ns_by_fd() Oleg Nesterov 2015-09-25 17:56 ` Oleg Nesterov [not found] ` <20150925175654.GA12504-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> 2015-09-25 17:57 ` [PATCH 1/1] " Oleg Nesterov 2015-09-25 17:57 ` Oleg Nesterov 2015-09-28 8:21 ` [PATCH 0/1] " Konstantin Khlebnikov 2015-09-28 16:37 ` Eric W. Biederman 2015-09-28 8:21 ` Konstantin Khlebnikov 2015-09-28 8:21 ` Konstantin Khlebnikov 2015-09-28 16:37 ` Eric W. Biederman 2015-09-28 16:37 ` Eric W. Biederman 2015-09-29 16:43 ` Oleg Nesterov 2015-09-29 16:43 ` Oleg Nesterov [not found] ` <20150929164315.GA16734-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> 2015-09-29 17:30 ` Eric W. Biederman 2015-09-29 17:30 ` Eric W. Biederman 2015-09-29 17:30 ` Eric W. Biederman 2015-09-29 18:38 ` Oleg Nesterov 2015-09-29 18:38 ` Oleg Nesterov [not found] ` <20150929183833.GA21875-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> 2015-09-29 19:05 ` Eric W. Biederman 2015-09-29 19:05 ` Eric W. Biederman [not found] ` <874mid16bk.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org> 2015-09-29 18:38 ` Oleg Nesterov [not found] ` <871tdi8pqj.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org> 2015-09-29 16:43 ` Oleg Nesterov 2015-09-30 2:54 ` Chen Fan 2015-09-30 2:54 ` Chen Fan 2015-09-30 2:54 ` Chen Fan
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.