* [PATCH RFC v3 1/2] nsfs: replace proc_ns_fget() with proc_ns_fdget()
@ 2015-09-25 13:52 ` Konstantin Khlebnikov
0 siblings, 0 replies; 37+ messages in thread
From: Konstantin Khlebnikov @ 2015-09-25 13:52 UTC (permalink / raw)
To: linux-api, containers, linux-kernel
Cc: Roman Gushchin, Serge Hallyn, Oleg Nesterov, Eric W. Biederman,
Chen Fan, Andrew Morton, Linus Torvalds, Stéphane Graber
New helper function returns pointer to struct ns_common, can check
namespace type and uses struct fd for returning file reference:
this saves couple atomic operations for single-threaded applications.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
---
fs/nsfs.c | 20 +++++++++++++-------
include/linux/proc_ns.h | 4 +++-
kernel/nsproxy.c | 15 +++++----------
net/core/net_namespace.c | 18 ++++++------------
4 files changed, 27 insertions(+), 30 deletions(-)
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 8f20d6016e20..a2e803c97960 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -119,21 +119,27 @@ int ns_get_name(char *buf, size_t size, struct task_struct *task,
return res;
}
-struct file *proc_ns_fget(int fd)
+struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref)
{
- struct file *file;
+ struct ns_common *ns;
+ struct fd f;
- file = fget(fd);
- if (!file)
+ f = fdget(fd);
+ if (!f.file)
return ERR_PTR(-EBADF);
- if (file->f_op != &ns_file_operations)
+ if (f.file->f_op != &ns_file_operations)
+ goto out_invalid;
+
+ ns = get_proc_ns(file_inode(f.file));
+ if (nstype && (ns->ops->type != nstype))
goto out_invalid;
- return file;
+ *fd_ref = f;
+ return ns;
out_invalid:
- fput(file);
+ fdput(f);
return ERR_PTR(-EINVAL);
}
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 42dfc615dbf8..406ea4617606 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -9,6 +9,7 @@
struct pid_namespace;
struct nsproxy;
struct path;
+struct fd;
struct proc_ns_operations {
const char *name;
@@ -65,7 +66,8 @@ static inline int ns_alloc_inum(struct ns_common *ns)
#define ns_free_inum(ns) proc_free_inum((ns)->inum)
-extern struct file *proc_ns_fget(int fd);
+extern struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref);
+
#define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private)
extern void *ns_get_path(struct path *path, struct task_struct *task,
const struct proc_ns_operations *ns_ops);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 49746c81ad8d..689b929a0fcb 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -222,18 +222,13 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
{
struct task_struct *tsk = current;
struct nsproxy *new_nsproxy;
- struct file *file;
+ struct fd fd_ref;
struct ns_common *ns;
int err;
- file = proc_ns_fget(fd);
- if (IS_ERR(file))
- return PTR_ERR(file);
-
- err = -EINVAL;
- ns = get_proc_ns(file_inode(file));
- if (nstype && (ns->ops->type != nstype))
- goto out;
+ ns = proc_ns_fdget(fd, nstype, &fd_ref);
+ if (IS_ERR(ns))
+ return PTR_ERR(ns);
new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
if (IS_ERR(new_nsproxy)) {
@@ -248,7 +243,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
}
switch_task_namespaces(tsk, new_nsproxy);
out:
- fput(file);
+ fdput(fd_ref);
return err;
}
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 2c2eb1b629b1..ffc5e6df86dc 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -465,21 +465,15 @@ EXPORT_SYMBOL_GPL(__put_net);
struct net *get_net_ns_by_fd(int fd)
{
- struct file *file;
struct ns_common *ns;
+ struct fd fd_ref;
struct net *net;
- file = proc_ns_fget(fd);
- if (IS_ERR(file))
- return ERR_CAST(file);
-
- ns = get_proc_ns(file_inode(file));
- if (ns->ops == &netns_operations)
- net = get_net(container_of(ns, struct net, ns));
- else
- net = ERR_PTR(-EINVAL);
-
- fput(file);
+ ns = proc_ns_fdget(fd, CLONE_NEWNET, &fd_ref);
+ if (IS_ERR(ns))
+ return ERR_CAST(ns);
+ net = get_net(container_of(ns, struct net, ns));
+ fdput(fd_ref);
return net;
}
^ permalink raw reply related [flat|nested] 37+ messages in thread
* [PATCH RFC v3 1/2] nsfs: replace proc_ns_fget() with proc_ns_fdget()
@ 2015-09-25 13:52 ` Konstantin Khlebnikov
0 siblings, 0 replies; 37+ messages in thread
From: Konstantin Khlebnikov @ 2015-09-25 13:52 UTC (permalink / raw)
To: linux-api-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA
Cc: Roman Gushchin, Serge Hallyn, Oleg Nesterov, Eric W. Biederman,
Chen Fan, Andrew Morton, Linus Torvalds, Stéphane Graber
New helper function returns pointer to struct ns_common, can check
namespace type and uses struct fd for returning file reference:
this saves couple atomic operations for single-threaded applications.
Signed-off-by: Konstantin Khlebnikov <khlebnikov-XoJtRXgx1JseBXzfvpsJ4g@public.gmane.org>
---
fs/nsfs.c | 20 +++++++++++++-------
include/linux/proc_ns.h | 4 +++-
kernel/nsproxy.c | 15 +++++----------
net/core/net_namespace.c | 18 ++++++------------
4 files changed, 27 insertions(+), 30 deletions(-)
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 8f20d6016e20..a2e803c97960 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -119,21 +119,27 @@ int ns_get_name(char *buf, size_t size, struct task_struct *task,
return res;
}
-struct file *proc_ns_fget(int fd)
+struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref)
{
- struct file *file;
+ struct ns_common *ns;
+ struct fd f;
- file = fget(fd);
- if (!file)
+ f = fdget(fd);
+ if (!f.file)
return ERR_PTR(-EBADF);
- if (file->f_op != &ns_file_operations)
+ if (f.file->f_op != &ns_file_operations)
+ goto out_invalid;
+
+ ns = get_proc_ns(file_inode(f.file));
+ if (nstype && (ns->ops->type != nstype))
goto out_invalid;
- return file;
+ *fd_ref = f;
+ return ns;
out_invalid:
- fput(file);
+ fdput(f);
return ERR_PTR(-EINVAL);
}
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 42dfc615dbf8..406ea4617606 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -9,6 +9,7 @@
struct pid_namespace;
struct nsproxy;
struct path;
+struct fd;
struct proc_ns_operations {
const char *name;
@@ -65,7 +66,8 @@ static inline int ns_alloc_inum(struct ns_common *ns)
#define ns_free_inum(ns) proc_free_inum((ns)->inum)
-extern struct file *proc_ns_fget(int fd);
+extern struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref);
+
#define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private)
extern void *ns_get_path(struct path *path, struct task_struct *task,
const struct proc_ns_operations *ns_ops);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 49746c81ad8d..689b929a0fcb 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -222,18 +222,13 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
{
struct task_struct *tsk = current;
struct nsproxy *new_nsproxy;
- struct file *file;
+ struct fd fd_ref;
struct ns_common *ns;
int err;
- file = proc_ns_fget(fd);
- if (IS_ERR(file))
- return PTR_ERR(file);
-
- err = -EINVAL;
- ns = get_proc_ns(file_inode(file));
- if (nstype && (ns->ops->type != nstype))
- goto out;
+ ns = proc_ns_fdget(fd, nstype, &fd_ref);
+ if (IS_ERR(ns))
+ return PTR_ERR(ns);
new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
if (IS_ERR(new_nsproxy)) {
@@ -248,7 +243,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
}
switch_task_namespaces(tsk, new_nsproxy);
out:
- fput(file);
+ fdput(fd_ref);
return err;
}
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 2c2eb1b629b1..ffc5e6df86dc 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -465,21 +465,15 @@ EXPORT_SYMBOL_GPL(__put_net);
struct net *get_net_ns_by_fd(int fd)
{
- struct file *file;
struct ns_common *ns;
+ struct fd fd_ref;
struct net *net;
- file = proc_ns_fget(fd);
- if (IS_ERR(file))
- return ERR_CAST(file);
-
- ns = get_proc_ns(file_inode(file));
- if (ns->ops == &netns_operations)
- net = get_net(container_of(ns, struct net, ns));
- else
- net = ERR_PTR(-EINVAL);
-
- fput(file);
+ ns = proc_ns_fdget(fd, CLONE_NEWNET, &fd_ref);
+ if (IS_ERR(ns))
+ return ERR_CAST(ns);
+ net = get_net(container_of(ns, struct net, ns));
+ fdput(fd_ref);
return net;
}
^ permalink raw reply related [flat|nested] 37+ messages in thread
* [PATCH RFC v3 2/2] pidns: introduce syscall getvpid
2015-09-25 13:52 ` Konstantin Khlebnikov
@ 2015-09-25 13:52 ` Konstantin Khlebnikov
-1 siblings, 0 replies; 37+ messages in thread
From: Konstantin Khlebnikov @ 2015-09-25 13:52 UTC (permalink / raw)
To: linux-api-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA
Cc: Roman Gushchin, Serge Hallyn, Oleg Nesterov, Eric W. Biederman,
Chen Fan, Andrew Morton, Linus Torvalds
pid_t getvpid(pid_t pid, int source, int target);
This syscall converts pid from source pid-namespace into pid visible
in target pid-namespace. If pid is unreachable from target namespace
then getvpid() returns zero.
Namespaces are defined by file descriptors pointing to entries in
proc (/proc/[pid]/ns/pid). If argument is negative then current pid
namespace is used.
If pid is negative then getvpid() returns pid of parent task for -pid.
Possible error codes:
ESRCH - task not found
EBADF - closed file descriptor
EINVAL - not pid-namespace file descriptor
Such conversion is required for interaction between processes from
different pid-namespaces. For example system service at host system
who provide access to restricted set of privileged operations for
clients from containers have to convert pids back and forward.
Recent kernels expose virtual pids in /proc/[pid]/status:NSpid, but
this interface works only in one way and even that is non-trivial.
Other option is passing pids with credentials via unix socket, but
this solution requires a lot of preparation and CAP_SYS_ADMIN for
sending arbitrary pids.
This syscall works in both directions, it's fast and simple.
Examples:
getvpid(pid, ns, -1) - get pid in our pid namespace
getvpid(pid, -1, ns) - get pid in container
getvpid(pid, -1, ns) > 0 - is pid is reachable from container?
getvpid(1, ns1, ns2) > 0 - is ns1 inside ns2?
getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2?
getvpid(1, ns, -1) - get init task of pid-namespace
getvpid(-1, ns, -1) - get reaper of init task in parent pid-namespace
getvpid(-pid, -1, -1) - get ppid by pid
Signed-off-by: Konstantin Khlebnikov <khlebnikov-XoJtRXgx1JseBXzfvpsJ4g@public.gmane.org>
--
v1: https://lkml.org/lkml/2015/9/15/411
v2: https://lkml.org/lkml/2015/9/24/278
v3:
* use proc_ns_fdget()
* update description
* rebase to next-20150925
* fix conflict with mlock2
---
arch/x86/entry/syscalls/syscall_32.tbl | 1 +
arch/x86/entry/syscalls/syscall_64.tbl | 1 +
include/linux/syscalls.h | 1 +
include/uapi/asm-generic/unistd.h | 4 ++-
kernel/sys.c | 51 ++++++++++++++++++++++++++++++++
5 files changed, 57 insertions(+), 1 deletion(-)
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 143ef9f37932..c36c2c65d204 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -383,3 +383,4 @@
374 i386 userfaultfd sys_userfaultfd
375 i386 membarrier sys_membarrier
376 i386 mlock2 sys_mlock2
+377 i386 getvpid sys_getvpid
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 314a90bfc09c..90bbbc7fdbe0 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -332,6 +332,7 @@
323 common userfaultfd sys_userfaultfd
324 common membarrier sys_membarrier
325 common mlock2 sys_mlock2
+326 common getvpid sys_getvpid
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a156b82dd14c..dbb5638260b5 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -222,6 +222,7 @@ asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __us
asmlinkage long sys_alarm(unsigned int seconds);
asmlinkage long sys_getpid(void);
asmlinkage long sys_getppid(void);
+asmlinkage long sys_getvpid(pid_t pid, int source, int target);
asmlinkage long sys_getuid(void);
asmlinkage long sys_geteuid(void);
asmlinkage long sys_getgid(void);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 1324b0292ec2..2c1123130f90 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -715,9 +715,11 @@ __SYSCALL(__NR_userfaultfd, sys_userfaultfd)
__SYSCALL(__NR_membarrier, sys_membarrier)
#define __NR_mlock2 284
__SYSCALL(__NR_mlock2, sys_mlock2)
+#define __NR_mlock2 285
+__SYSCALL(__NR_getvpid, sys_getvpid)
#undef __NR_syscalls
-#define __NR_syscalls 285
+#define __NR_syscalls 286
/*
* All syscalls below here should go away really,
diff --git a/kernel/sys.c b/kernel/sys.c
index fa2f2f671a5c..1e28a36b84fa 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -46,6 +46,7 @@
#include <linux/syscalls.h>
#include <linux/kprobes.h>
#include <linux/user_namespace.h>
+#include <linux/proc_ns.h>
#include <linux/binfmts.h>
#include <linux/sched.h>
@@ -855,6 +856,56 @@ SYSCALL_DEFINE0(getppid)
return pid;
}
+SYSCALL_DEFINE3(getvpid, pid_t, pid, int, source, int, target)
+{
+ struct pid_namespace *source_ns, *target_ns;
+ struct fd source_fd = {}, target_fd = {};
+ struct pid *struct_pid;
+ struct ns_common *ns;
+ pid_t result;
+
+ if (source >= 0) {
+ ns = proc_ns_fdget(source, CLONE_NEWPID, &source_fd);
+ result = PTR_ERR(ns);
+ if (IS_ERR(ns))
+ goto out;
+ source_ns = container_of(ns, struct pid_namespace, ns);
+ } else
+ source_ns = task_active_pid_ns(current);
+
+ if (target >= 0) {
+ ns = proc_ns_fdget(target, CLONE_NEWPID, &target_fd);
+ result = PTR_ERR(ns);
+ if (IS_ERR(ns))
+ goto out;
+ target_ns = container_of(ns, struct pid_namespace, ns);
+ } else
+ target_ns = task_active_pid_ns(current);
+
+ rcu_read_lock();
+ struct_pid = find_pid_ns(abs(pid), source_ns);
+
+ if (struct_pid && pid < 0) {
+ struct task_struct *task;
+
+ task = pid_task(struct_pid, PIDTYPE_PID);
+ if (task)
+ task = rcu_dereference(task->real_parent);
+ struct_pid = task ? task_pid(task) : NULL;
+ }
+
+ if (struct_pid)
+ result = pid_nr_ns(struct_pid, target_ns);
+ else
+ result = -ESRCH;
+ rcu_read_unlock();
+
+out:
+ fdput(target_fd);
+ fdput(source_fd);
+ return result;
+}
+
SYSCALL_DEFINE0(getuid)
{
/* Only we change this so SMP safe */
^ permalink raw reply related [flat|nested] 37+ messages in thread
* [PATCH RFC v3 2/2] pidns: introduce syscall getvpid
@ 2015-09-25 13:52 ` Konstantin Khlebnikov
0 siblings, 0 replies; 37+ messages in thread
From: Konstantin Khlebnikov @ 2015-09-25 13:52 UTC (permalink / raw)
To: linux-api, containers, linux-kernel
Cc: Roman Gushchin, Serge Hallyn, Oleg Nesterov, Eric W. Biederman,
Chen Fan, Andrew Morton, Linus Torvalds, Stéphane Graber
pid_t getvpid(pid_t pid, int source, int target);
This syscall converts pid from source pid-namespace into pid visible
in target pid-namespace. If pid is unreachable from target namespace
then getvpid() returns zero.
Namespaces are defined by file descriptors pointing to entries in
proc (/proc/[pid]/ns/pid). If argument is negative then current pid
namespace is used.
If pid is negative then getvpid() returns pid of parent task for -pid.
Possible error codes:
ESRCH - task not found
EBADF - closed file descriptor
EINVAL - not pid-namespace file descriptor
Such conversion is required for interaction between processes from
different pid-namespaces. For example system service at host system
who provide access to restricted set of privileged operations for
clients from containers have to convert pids back and forward.
Recent kernels expose virtual pids in /proc/[pid]/status:NSpid, but
this interface works only in one way and even that is non-trivial.
Other option is passing pids with credentials via unix socket, but
this solution requires a lot of preparation and CAP_SYS_ADMIN for
sending arbitrary pids.
This syscall works in both directions, it's fast and simple.
Examples:
getvpid(pid, ns, -1) - get pid in our pid namespace
getvpid(pid, -1, ns) - get pid in container
getvpid(pid, -1, ns) > 0 - is pid is reachable from container?
getvpid(1, ns1, ns2) > 0 - is ns1 inside ns2?
getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2?
getvpid(1, ns, -1) - get init task of pid-namespace
getvpid(-1, ns, -1) - get reaper of init task in parent pid-namespace
getvpid(-pid, -1, -1) - get ppid by pid
Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
--
v1: https://lkml.org/lkml/2015/9/15/411
v2: https://lkml.org/lkml/2015/9/24/278
v3:
* use proc_ns_fdget()
* update description
* rebase to next-20150925
* fix conflict with mlock2
---
arch/x86/entry/syscalls/syscall_32.tbl | 1 +
arch/x86/entry/syscalls/syscall_64.tbl | 1 +
include/linux/syscalls.h | 1 +
include/uapi/asm-generic/unistd.h | 4 ++-
kernel/sys.c | 51 ++++++++++++++++++++++++++++++++
5 files changed, 57 insertions(+), 1 deletion(-)
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 143ef9f37932..c36c2c65d204 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -383,3 +383,4 @@
374 i386 userfaultfd sys_userfaultfd
375 i386 membarrier sys_membarrier
376 i386 mlock2 sys_mlock2
+377 i386 getvpid sys_getvpid
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 314a90bfc09c..90bbbc7fdbe0 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -332,6 +332,7 @@
323 common userfaultfd sys_userfaultfd
324 common membarrier sys_membarrier
325 common mlock2 sys_mlock2
+326 common getvpid sys_getvpid
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a156b82dd14c..dbb5638260b5 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -222,6 +222,7 @@ asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __us
asmlinkage long sys_alarm(unsigned int seconds);
asmlinkage long sys_getpid(void);
asmlinkage long sys_getppid(void);
+asmlinkage long sys_getvpid(pid_t pid, int source, int target);
asmlinkage long sys_getuid(void);
asmlinkage long sys_geteuid(void);
asmlinkage long sys_getgid(void);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 1324b0292ec2..2c1123130f90 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -715,9 +715,11 @@ __SYSCALL(__NR_userfaultfd, sys_userfaultfd)
__SYSCALL(__NR_membarrier, sys_membarrier)
#define __NR_mlock2 284
__SYSCALL(__NR_mlock2, sys_mlock2)
+#define __NR_mlock2 285
+__SYSCALL(__NR_getvpid, sys_getvpid)
#undef __NR_syscalls
-#define __NR_syscalls 285
+#define __NR_syscalls 286
/*
* All syscalls below here should go away really,
diff --git a/kernel/sys.c b/kernel/sys.c
index fa2f2f671a5c..1e28a36b84fa 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -46,6 +46,7 @@
#include <linux/syscalls.h>
#include <linux/kprobes.h>
#include <linux/user_namespace.h>
+#include <linux/proc_ns.h>
#include <linux/binfmts.h>
#include <linux/sched.h>
@@ -855,6 +856,56 @@ SYSCALL_DEFINE0(getppid)
return pid;
}
+SYSCALL_DEFINE3(getvpid, pid_t, pid, int, source, int, target)
+{
+ struct pid_namespace *source_ns, *target_ns;
+ struct fd source_fd = {}, target_fd = {};
+ struct pid *struct_pid;
+ struct ns_common *ns;
+ pid_t result;
+
+ if (source >= 0) {
+ ns = proc_ns_fdget(source, CLONE_NEWPID, &source_fd);
+ result = PTR_ERR(ns);
+ if (IS_ERR(ns))
+ goto out;
+ source_ns = container_of(ns, struct pid_namespace, ns);
+ } else
+ source_ns = task_active_pid_ns(current);
+
+ if (target >= 0) {
+ ns = proc_ns_fdget(target, CLONE_NEWPID, &target_fd);
+ result = PTR_ERR(ns);
+ if (IS_ERR(ns))
+ goto out;
+ target_ns = container_of(ns, struct pid_namespace, ns);
+ } else
+ target_ns = task_active_pid_ns(current);
+
+ rcu_read_lock();
+ struct_pid = find_pid_ns(abs(pid), source_ns);
+
+ if (struct_pid && pid < 0) {
+ struct task_struct *task;
+
+ task = pid_task(struct_pid, PIDTYPE_PID);
+ if (task)
+ task = rcu_dereference(task->real_parent);
+ struct_pid = task ? task_pid(task) : NULL;
+ }
+
+ if (struct_pid)
+ result = pid_nr_ns(struct_pid, target_ns);
+ else
+ result = -ESRCH;
+ rcu_read_unlock();
+
+out:
+ fdput(target_fd);
+ fdput(source_fd);
+ return result;
+}
+
SYSCALL_DEFINE0(getuid)
{
/* Only we change this so SMP safe */
^ permalink raw reply related [flat|nested] 37+ messages in thread
* [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
2015-09-25 13:52 ` Konstantin Khlebnikov
@ 2015-09-25 17:56 ` Oleg Nesterov
-1 siblings, 0 replies; 37+ messages in thread
From: Oleg Nesterov @ 2015-09-25 17:56 UTC (permalink / raw)
To: Konstantin Khlebnikov
Cc: Roman Gushchin, linux-api-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
Eric W. Biederman, Chen Fan, Andrew Morton, Linus Torvalds
On 09/25, Konstantin Khlebnikov wrote:
>
> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref)
> {
> - struct file *file;
> + struct ns_common *ns;
> + struct fd f;
>
> - file = fget(fd);
> - if (!file)
> + f = fdget(fd);
> + if (!f.file)
> return ERR_PTR(-EBADF);
>
> - if (file->f_op != &ns_file_operations)
> + if (f.file->f_op != &ns_file_operations)
> + goto out_invalid;
> +
> + ns = get_proc_ns(file_inode(f.file));
> + if (nstype && (ns->ops->type != nstype))
> goto out_invalid;
>
> - return file;
> + *fd_ref = f;
> + return ns;
>
> out_invalid:
> - fput(file);
> + fdput(f);
> return ERR_PTR(-EINVAL);
> }
Well yes, fdget() makes sense but this is minor.
Honestly, I do not really like the new helper... I understand this
is subjective, so I won't insist. But how about 1/1? We do not need
fd/file at all. With this patch your sys_getvpid() can just use
proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
Eric, what do you think?
See also "TODO" in the changelog.
Oleg.
^ permalink raw reply [flat|nested] 37+ messages in thread
* [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
@ 2015-09-25 17:56 ` Oleg Nesterov
0 siblings, 0 replies; 37+ messages in thread
From: Oleg Nesterov @ 2015-09-25 17:56 UTC (permalink / raw)
To: Konstantin Khlebnikov
Cc: linux-api, containers, linux-kernel, Roman Gushchin,
Serge Hallyn, Eric W. Biederman, Chen Fan, Andrew Morton,
Linus Torvalds, Stéphane Graber
On 09/25, Konstantin Khlebnikov wrote:
>
> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref)
> {
> - struct file *file;
> + struct ns_common *ns;
> + struct fd f;
>
> - file = fget(fd);
> - if (!file)
> + f = fdget(fd);
> + if (!f.file)
> return ERR_PTR(-EBADF);
>
> - if (file->f_op != &ns_file_operations)
> + if (f.file->f_op != &ns_file_operations)
> + goto out_invalid;
> +
> + ns = get_proc_ns(file_inode(f.file));
> + if (nstype && (ns->ops->type != nstype))
> goto out_invalid;
>
> - return file;
> + *fd_ref = f;
> + return ns;
>
> out_invalid:
> - fput(file);
> + fdput(f);
> return ERR_PTR(-EINVAL);
> }
Well yes, fdget() makes sense but this is minor.
Honestly, I do not really like the new helper... I understand this
is subjective, so I won't insist. But how about 1/1? We do not need
fd/file at all. With this patch your sys_getvpid() can just use
proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
Eric, what do you think?
See also "TODO" in the changelog.
Oleg.
^ permalink raw reply [flat|nested] 37+ messages in thread
* [PATCH 1/1] ns: introduce proc_get_ns_by_fd()
2015-09-25 17:56 ` Oleg Nesterov
@ 2015-09-25 17:57 ` Oleg Nesterov
-1 siblings, 0 replies; 37+ messages in thread
From: Oleg Nesterov @ 2015-09-25 17:57 UTC (permalink / raw)
To: Konstantin Khlebnikov
Cc: Roman Gushchin, linux-api-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
Eric W. Biederman, Chen Fan, Andrew Morton, Linus Torvalds
Introduce proc_get_ns_by_fd() so that get_net_ns_by_fd() becomes
one-liner. It will have another CLONE_NEWPID user soon.
TODO: proc_get_ns_by_fd() can share some code with proc_ns_fget().
Signed-off-by: Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
---
fs/nsfs.c | 24 ++++++++++++++++++++++++
include/linux/proc_ns.h | 3 +++
kernel/pid_namespace.c | 6 ++++++
net/core/net_namespace.c | 23 +++++++----------------
4 files changed, 40 insertions(+), 16 deletions(-)
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 99521e7..72474ca 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -136,6 +136,30 @@ out_invalid:
return ERR_PTR(-EINVAL);
}
+void *proc_get_ns_by_fd(int fd, int type)
+{
+ struct fd f;
+ struct ns_common *ns;
+ void *ret;
+
+ f = fdget(fd);
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ ret = ERR_PTR(-EINVAL);
+ if (f.file->f_op != &ns_file_operations)
+ goto put;
+
+ ns = get_proc_ns(file_inode(f.file));
+ if (ns->ops->type != type || !ns->ops->get_type)
+ goto put;
+
+ ret = ns->ops->get_type(ns);
+put:
+ fdput(f);
+ return ret;
+}
+
static const struct super_operations nsfs_ops = {
.statfs = simple_statfs,
.evict_inode = nsfs_evict,
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 42dfc61..d956f89 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -16,6 +16,7 @@ struct proc_ns_operations {
struct ns_common *(*get)(struct task_struct *task);
void (*put)(struct ns_common *ns);
int (*install)(struct nsproxy *nsproxy, struct ns_common *ns);
+ void *(*get_type)(struct ns_common *ns);
};
extern const struct proc_ns_operations netns_operations;
@@ -66,6 +67,8 @@ static inline int ns_alloc_inum(struct ns_common *ns)
#define ns_free_inum(ns) proc_free_inum((ns)->inum)
extern struct file *proc_ns_fget(int fd);
+extern void *proc_get_ns_by_fd(int fd, int type);
+
#define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private)
extern void *ns_get_path(struct path *path, struct task_struct *task,
const struct proc_ns_operations *ns_ops);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a65ba13..0c87393 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -388,12 +388,18 @@ static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
return 0;
}
+static void *pidns_get_type(struct ns_common *ns)
+{
+ return get_pid_ns(to_pid_ns(ns));
+}
+
const struct proc_ns_operations pidns_operations = {
.name = "pid",
.type = CLONE_NEWPID,
.get = pidns_get,
.put = pidns_put,
.install = pidns_install,
+ .get_type = pidns_get_type,
};
static __init int pid_namespaces_init(void)
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 572af00..6465dc0 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -420,22 +420,7 @@ EXPORT_SYMBOL_GPL(__put_net);
struct net *get_net_ns_by_fd(int fd)
{
- struct file *file;
- struct ns_common *ns;
- struct net *net;
-
- file = proc_ns_fget(fd);
- if (IS_ERR(file))
- return ERR_CAST(file);
-
- ns = get_proc_ns(file_inode(file));
- if (ns->ops == &netns_operations)
- net = get_net(container_of(ns, struct net, ns));
- else
- net = ERR_PTR(-EINVAL);
-
- fput(file);
- return net;
+ return proc_get_ns_by_fd(fd, CLONE_NEWNET);
}
#else
@@ -955,11 +940,17 @@ static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns)
return 0;
}
+static void *netns_get_type(struct ns_common *ns)
+{
+ return get_net(to_net_ns(ns));
+}
+
const struct proc_ns_operations netns_operations = {
.name = "net",
.type = CLONE_NEWNET,
.get = netns_get,
.put = netns_put,
.install = netns_install,
+ .get_type = netns_get_type,
};
#endif
--
1.5.5.1
^ permalink raw reply related [flat|nested] 37+ messages in thread
* [PATCH 1/1] ns: introduce proc_get_ns_by_fd()
@ 2015-09-25 17:57 ` Oleg Nesterov
0 siblings, 0 replies; 37+ messages in thread
From: Oleg Nesterov @ 2015-09-25 17:57 UTC (permalink / raw)
To: Konstantin Khlebnikov
Cc: linux-api, containers, linux-kernel, Roman Gushchin,
Serge Hallyn, Eric W. Biederman, Chen Fan, Andrew Morton,
Linus Torvalds, Stéphane Graber
Introduce proc_get_ns_by_fd() so that get_net_ns_by_fd() becomes
one-liner. It will have another CLONE_NEWPID user soon.
TODO: proc_get_ns_by_fd() can share some code with proc_ns_fget().
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
fs/nsfs.c | 24 ++++++++++++++++++++++++
include/linux/proc_ns.h | 3 +++
kernel/pid_namespace.c | 6 ++++++
net/core/net_namespace.c | 23 +++++++----------------
4 files changed, 40 insertions(+), 16 deletions(-)
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 99521e7..72474ca 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -136,6 +136,30 @@ out_invalid:
return ERR_PTR(-EINVAL);
}
+void *proc_get_ns_by_fd(int fd, int type)
+{
+ struct fd f;
+ struct ns_common *ns;
+ void *ret;
+
+ f = fdget(fd);
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ ret = ERR_PTR(-EINVAL);
+ if (f.file->f_op != &ns_file_operations)
+ goto put;
+
+ ns = get_proc_ns(file_inode(f.file));
+ if (ns->ops->type != type || !ns->ops->get_type)
+ goto put;
+
+ ret = ns->ops->get_type(ns);
+put:
+ fdput(f);
+ return ret;
+}
+
static const struct super_operations nsfs_ops = {
.statfs = simple_statfs,
.evict_inode = nsfs_evict,
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 42dfc61..d956f89 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -16,6 +16,7 @@ struct proc_ns_operations {
struct ns_common *(*get)(struct task_struct *task);
void (*put)(struct ns_common *ns);
int (*install)(struct nsproxy *nsproxy, struct ns_common *ns);
+ void *(*get_type)(struct ns_common *ns);
};
extern const struct proc_ns_operations netns_operations;
@@ -66,6 +67,8 @@ static inline int ns_alloc_inum(struct ns_common *ns)
#define ns_free_inum(ns) proc_free_inum((ns)->inum)
extern struct file *proc_ns_fget(int fd);
+extern void *proc_get_ns_by_fd(int fd, int type);
+
#define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private)
extern void *ns_get_path(struct path *path, struct task_struct *task,
const struct proc_ns_operations *ns_ops);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a65ba13..0c87393 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -388,12 +388,18 @@ static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
return 0;
}
+static void *pidns_get_type(struct ns_common *ns)
+{
+ return get_pid_ns(to_pid_ns(ns));
+}
+
const struct proc_ns_operations pidns_operations = {
.name = "pid",
.type = CLONE_NEWPID,
.get = pidns_get,
.put = pidns_put,
.install = pidns_install,
+ .get_type = pidns_get_type,
};
static __init int pid_namespaces_init(void)
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 572af00..6465dc0 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -420,22 +420,7 @@ EXPORT_SYMBOL_GPL(__put_net);
struct net *get_net_ns_by_fd(int fd)
{
- struct file *file;
- struct ns_common *ns;
- struct net *net;
-
- file = proc_ns_fget(fd);
- if (IS_ERR(file))
- return ERR_CAST(file);
-
- ns = get_proc_ns(file_inode(file));
- if (ns->ops == &netns_operations)
- net = get_net(container_of(ns, struct net, ns));
- else
- net = ERR_PTR(-EINVAL);
-
- fput(file);
- return net;
+ return proc_get_ns_by_fd(fd, CLONE_NEWNET);
}
#else
@@ -955,11 +940,17 @@ static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns)
return 0;
}
+static void *netns_get_type(struct ns_common *ns)
+{
+ return get_net(to_net_ns(ns));
+}
+
const struct proc_ns_operations netns_operations = {
.name = "net",
.type = CLONE_NEWNET,
.get = netns_get,
.put = netns_put,
.install = netns_install,
+ .get_type = netns_get_type,
};
#endif
--
1.5.5.1
^ permalink raw reply related [flat|nested] 37+ messages in thread
* Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid
2015-09-25 13:52 ` Konstantin Khlebnikov
@ 2015-09-28 4:12 ` kbuild test robot
-1 siblings, 0 replies; 37+ messages in thread
From: kbuild test robot @ 2015-09-28 4:12 UTC (permalink / raw)
To: Konstantin Khlebnikov
Cc: Roman Gushchin, linux-api-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Oleg Nesterov,
kbuild-all-JC7UmRfGjtg, Chen Fan, Andrew Morton, Linus Torvalds,
Eric W. Biederman
[-- Attachment #1: Type: text/plain, Size: 4263 bytes --]
Hi Konstantin,
[auto build test results on next-20150925 -- if it's inappropriate base, please ignore]
config: openrisc-or1ksim_defconfig (attached as .config)
reproduce:
wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
git checkout 7bea85b3c058d69527d79b94338025f218a0655e
# save the attached .config to linux build tree
make.cross ARCH=openrisc
All warnings (new ones prefixed by >>):
In file included from include/asm-generic/unistd.h:1:0,
from arch/openrisc/include/uapi/asm/unistd.h:26,
from <stdin>:2:
>> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined
include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition
<stdin>:1307:2: warning: #warning syscall getvpid not implemented
--
In file included from include/asm-generic/unistd.h:1:0,
from arch/openrisc/include/uapi/asm/unistd.h:26,
from include/uapi/linux/unistd.h:7,
from arch/openrisc/kernel/setup.c:25:
>> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined
include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition
--
In file included from include/asm-generic/unistd.h:1:0,
from arch/openrisc/include/uapi/asm/unistd.h:26,
from include/uapi/linux/unistd.h:7,
from include/linux/syscalls.h:78,
from arch/openrisc/kernel/sys_call_table.c:17:
>> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined
include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition
In file included from include/asm-generic/unistd.h:1:0,
from arch/openrisc/include/uapi/asm/unistd.h:26,
from arch/openrisc/kernel/sys_call_table.c:27:
include/uapi/asm-generic/unistd.h:716:0: warning: "__NR_mlock2" redefined
include/uapi/asm-generic/unistd.h:718:0: note: this is the location of the previous definition
>> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined
include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition
include/uapi/asm-generic/unistd.h:719:1: error: '__NR_getvpid' undeclared here (not in a function)
include/uapi/asm-generic/unistd.h:719:1: error: array index in initializer not of integer type
include/uapi/asm-generic/unistd.h:719:1: error: (near initialization for 'sys_call_table')
--
kernel/time/Kconfig:155:warning: range is invalid
In file included from include/asm-generic/unistd.h:1:0,
from arch/openrisc/include/uapi/asm/unistd.h:26,
from <stdin>:2:
>> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined
include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition
<stdin>:1307:2: warning: #warning syscall getvpid not implemented
vim +/__NR_mlock2 +718 include/uapi/asm-generic/unistd.h
702 #define __NR_seccomp 277
703 __SYSCALL(__NR_seccomp, sys_seccomp)
704 #define __NR_getrandom 278
705 __SYSCALL(__NR_getrandom, sys_getrandom)
706 #define __NR_memfd_create 279
707 __SYSCALL(__NR_memfd_create, sys_memfd_create)
708 #define __NR_bpf 280
709 __SYSCALL(__NR_bpf, sys_bpf)
710 #define __NR_execveat 281
711 __SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
712 #define __NR_userfaultfd 282
713 __SYSCALL(__NR_userfaultfd, sys_userfaultfd)
714 #define __NR_membarrier 283
715 __SYSCALL(__NR_membarrier, sys_membarrier)
716 #define __NR_mlock2 284
717 __SYSCALL(__NR_mlock2, sys_mlock2)
> 718 #define __NR_mlock2 285
719 __SYSCALL(__NR_getvpid, sys_getvpid)
720
721 #undef __NR_syscalls
722 #define __NR_syscalls 286
723
724 /*
725 * All syscalls below here should go away really,
726 * these are provided for both review and as a porting
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/octet-stream, Size: 6887 bytes --]
[-- Attachment #3: Type: text/plain, Size: 205 bytes --]
_______________________________________________
Containers mailing list
Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
https://lists.linuxfoundation.org/mailman/listinfo/containers
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid
@ 2015-09-28 4:12 ` kbuild test robot
0 siblings, 0 replies; 37+ messages in thread
From: kbuild test robot @ 2015-09-28 4:12 UTC (permalink / raw)
To: Konstantin Khlebnikov
Cc: kbuild-all, linux-api, containers, linux-kernel, Roman Gushchin,
Serge Hallyn, Oleg Nesterov, Eric W. Biederman, Chen Fan,
Andrew Morton, Linus Torvalds, Stéphane Graber
[-- Attachment #1: Type: text/plain, Size: 4263 bytes --]
Hi Konstantin,
[auto build test results on next-20150925 -- if it's inappropriate base, please ignore]
config: openrisc-or1ksim_defconfig (attached as .config)
reproduce:
wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
git checkout 7bea85b3c058d69527d79b94338025f218a0655e
# save the attached .config to linux build tree
make.cross ARCH=openrisc
All warnings (new ones prefixed by >>):
In file included from include/asm-generic/unistd.h:1:0,
from arch/openrisc/include/uapi/asm/unistd.h:26,
from <stdin>:2:
>> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined
include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition
<stdin>:1307:2: warning: #warning syscall getvpid not implemented
--
In file included from include/asm-generic/unistd.h:1:0,
from arch/openrisc/include/uapi/asm/unistd.h:26,
from include/uapi/linux/unistd.h:7,
from arch/openrisc/kernel/setup.c:25:
>> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined
include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition
--
In file included from include/asm-generic/unistd.h:1:0,
from arch/openrisc/include/uapi/asm/unistd.h:26,
from include/uapi/linux/unistd.h:7,
from include/linux/syscalls.h:78,
from arch/openrisc/kernel/sys_call_table.c:17:
>> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined
include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition
In file included from include/asm-generic/unistd.h:1:0,
from arch/openrisc/include/uapi/asm/unistd.h:26,
from arch/openrisc/kernel/sys_call_table.c:27:
include/uapi/asm-generic/unistd.h:716:0: warning: "__NR_mlock2" redefined
include/uapi/asm-generic/unistd.h:718:0: note: this is the location of the previous definition
>> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined
include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition
include/uapi/asm-generic/unistd.h:719:1: error: '__NR_getvpid' undeclared here (not in a function)
include/uapi/asm-generic/unistd.h:719:1: error: array index in initializer not of integer type
include/uapi/asm-generic/unistd.h:719:1: error: (near initialization for 'sys_call_table')
--
kernel/time/Kconfig:155:warning: range is invalid
In file included from include/asm-generic/unistd.h:1:0,
from arch/openrisc/include/uapi/asm/unistd.h:26,
from <stdin>:2:
>> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined
include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition
<stdin>:1307:2: warning: #warning syscall getvpid not implemented
vim +/__NR_mlock2 +718 include/uapi/asm-generic/unistd.h
702 #define __NR_seccomp 277
703 __SYSCALL(__NR_seccomp, sys_seccomp)
704 #define __NR_getrandom 278
705 __SYSCALL(__NR_getrandom, sys_getrandom)
706 #define __NR_memfd_create 279
707 __SYSCALL(__NR_memfd_create, sys_memfd_create)
708 #define __NR_bpf 280
709 __SYSCALL(__NR_bpf, sys_bpf)
710 #define __NR_execveat 281
711 __SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
712 #define __NR_userfaultfd 282
713 __SYSCALL(__NR_userfaultfd, sys_userfaultfd)
714 #define __NR_membarrier 283
715 __SYSCALL(__NR_membarrier, sys_membarrier)
716 #define __NR_mlock2 284
717 __SYSCALL(__NR_mlock2, sys_mlock2)
> 718 #define __NR_mlock2 285
719 __SYSCALL(__NR_getvpid, sys_getvpid)
720
721 #undef __NR_syscalls
722 #define __NR_syscalls 286
723
724 /*
725 * All syscalls below here should go away really,
726 * these are provided for both review and as a porting
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/octet-stream, Size: 6887 bytes --]
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
[not found] ` <20150925175654.GA12504-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2015-09-25 17:57 ` Oleg Nesterov
@ 2015-09-28 8:21 ` Konstantin Khlebnikov
2015-09-28 16:37 ` Eric W. Biederman
2 siblings, 0 replies; 37+ messages in thread
From: Konstantin Khlebnikov @ 2015-09-28 8:21 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Roman Gushchin, linux-api-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
Eric W. Biederman, Chen Fan, Andrew Morton, Linus Torvalds
On 25.09.2015 20:56, Oleg Nesterov wrote:
> On 09/25, Konstantin Khlebnikov wrote:
>>
>> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref)
>> {
>> - struct file *file;
>> + struct ns_common *ns;
>> + struct fd f;
>>
>> - file = fget(fd);
>> - if (!file)
>> + f = fdget(fd);
>> + if (!f.file)
>> return ERR_PTR(-EBADF);
>>
>> - if (file->f_op != &ns_file_operations)
>> + if (f.file->f_op != &ns_file_operations)
>> + goto out_invalid;
>> +
>> + ns = get_proc_ns(file_inode(f.file));
>> + if (nstype && (ns->ops->type != nstype))
>> goto out_invalid;
>>
>> - return file;
>> + *fd_ref = f;
>> + return ns;
>>
>> out_invalid:
>> - fput(file);
>> + fdput(f);
>> return ERR_PTR(-EINVAL);
>> }
>
> Well yes, fdget() makes sense but this is minor.
>
> Honestly, I do not really like the new helper... I understand this
> is subjective, so I won't insist. But how about 1/1? We do not need
> fd/file at all. With this patch your sys_getvpid() can just use
> proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
Hmm. My version has 0 or 2 atomic ops per get-put sequence.
Your version: 2 or 4 atomic ops. Plus even in worst case pinning by
struct fd theoretically scales better because it touches refcount at
struct file: there're might be many of them for one namespace.
>
> Eric, what do you think?
>
> See also "TODO" in the changelog.
>
> Oleg.
>
--
Konstantin
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
[not found] ` <20150925175654.GA12504-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2015-09-28 8:21 ` Konstantin Khlebnikov
2015-09-28 8:21 ` [PATCH 0/1] " Konstantin Khlebnikov
2015-09-28 16:37 ` Eric W. Biederman
2 siblings, 0 replies; 37+ messages in thread
From: Konstantin Khlebnikov @ 2015-09-28 8:21 UTC (permalink / raw)
To: Oleg Nesterov
Cc: linux-api, containers, linux-kernel, Roman Gushchin,
Serge Hallyn, Eric W. Biederman, Chen Fan, Andrew Morton,
Linus Torvalds, Stéphane Graber
On 25.09.2015 20:56, Oleg Nesterov wrote:
> On 09/25, Konstantin Khlebnikov wrote:
>>
>> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref)
>> {
>> - struct file *file;
>> + struct ns_common *ns;
>> + struct fd f;
>>
>> - file = fget(fd);
>> - if (!file)
>> + f = fdget(fd);
>> + if (!f.file)
>> return ERR_PTR(-EBADF);
>>
>> - if (file->f_op != &ns_file_operations)
>> + if (f.file->f_op != &ns_file_operations)
>> + goto out_invalid;
>> +
>> + ns = get_proc_ns(file_inode(f.file));
>> + if (nstype && (ns->ops->type != nstype))
>> goto out_invalid;
>>
>> - return file;
>> + *fd_ref = f;
>> + return ns;
>>
>> out_invalid:
>> - fput(file);
>> + fdput(f);
>> return ERR_PTR(-EINVAL);
>> }
>
> Well yes, fdget() makes sense but this is minor.
>
> Honestly, I do not really like the new helper... I understand this
> is subjective, so I won't insist. But how about 1/1? We do not need
> fd/file at all. With this patch your sys_getvpid() can just use
> proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
Hmm. My version has 0 or 2 atomic ops per get-put sequence.
Your version: 2 or 4 atomic ops. Plus even in worst case pinning by
struct fd theoretically scales better because it touches refcount at
struct file: there're might be many of them for one namespace.
>
> Eric, what do you think?
>
> See also "TODO" in the changelog.
>
> Oleg.
>
--
Konstantin
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
@ 2015-09-28 8:21 ` Konstantin Khlebnikov
0 siblings, 0 replies; 37+ messages in thread
From: Konstantin Khlebnikov @ 2015-09-28 8:21 UTC (permalink / raw)
To: Oleg Nesterov
Cc: linux-api-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA, Roman Gushchin,
Serge Hallyn, Eric W. Biederman, Chen Fan, Andrew Morton,
Linus Torvalds, Stéphane Graber
On 25.09.2015 20:56, Oleg Nesterov wrote:
> On 09/25, Konstantin Khlebnikov wrote:
>>
>> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref)
>> {
>> - struct file *file;
>> + struct ns_common *ns;
>> + struct fd f;
>>
>> - file = fget(fd);
>> - if (!file)
>> + f = fdget(fd);
>> + if (!f.file)
>> return ERR_PTR(-EBADF);
>>
>> - if (file->f_op != &ns_file_operations)
>> + if (f.file->f_op != &ns_file_operations)
>> + goto out_invalid;
>> +
>> + ns = get_proc_ns(file_inode(f.file));
>> + if (nstype && (ns->ops->type != nstype))
>> goto out_invalid;
>>
>> - return file;
>> + *fd_ref = f;
>> + return ns;
>>
>> out_invalid:
>> - fput(file);
>> + fdput(f);
>> return ERR_PTR(-EINVAL);
>> }
>
> Well yes, fdget() makes sense but this is minor.
>
> Honestly, I do not really like the new helper... I understand this
> is subjective, so I won't insist. But how about 1/1? We do not need
> fd/file at all. With this patch your sys_getvpid() can just use
> proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
Hmm. My version has 0 or 2 atomic ops per get-put sequence.
Your version: 2 or 4 atomic ops. Plus even in worst case pinning by
struct fd theoretically scales better because it touches refcount at
struct file: there're might be many of them for one namespace.
>
> Eric, what do you think?
>
> See also "TODO" in the changelog.
>
> Oleg.
>
--
Konstantin
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid
2015-09-25 13:52 ` Konstantin Khlebnikov
@ 2015-09-28 16:22 ` Eric W. Biederman
-1 siblings, 0 replies; 37+ messages in thread
From: Eric W. Biederman @ 2015-09-28 16:22 UTC (permalink / raw)
To: Konstantin Khlebnikov
Cc: Roman Gushchin, linux-api-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Oleg Nesterov,
Chen Fan, Andrew Morton, Linus Torvalds
Konstantin Khlebnikov <khlebnikov-XoJtRXgx1JseBXzfvpsJ4g@public.gmane.org> writes:
> pid_t getvpid(pid_t pid, int source, int target);
>
> This syscall converts pid from source pid-namespace into pid visible
> in target pid-namespace. If pid is unreachable from target namespace
> then getvpid() returns zero.
Two minor things.
Can we please call this translate_pid? getvpid does not really cover
what this syscall does.
Can you please split wiring up into a separate patch? You goofed it
up this round and it just adds noise in reviewing the core syscall.
> Namespaces are defined by file descriptors pointing to entries in
> proc (/proc/[pid]/ns/pid). If argument is negative then current pid
> namespace is used.
>
> If pid is negative then getvpid() returns pid of parent task for -pid.
>
> Possible error codes:
> ESRCH - task not found
> EBADF - closed file descriptor
> EINVAL - not pid-namespace file descriptor
>
> Such conversion is required for interaction between processes from
> different pid-namespaces. For example system service at host system
> who provide access to restricted set of privileged operations for
> clients from containers have to convert pids back and forward.
>
> Recent kernels expose virtual pids in /proc/[pid]/status:NSpid, but
> this interface works only in one way and even that is non-trivial.
>
> Other option is passing pids with credentials via unix socket, but
> this solution requires a lot of preparation and CAP_SYS_ADMIN for
> sending arbitrary pids.
>
> This syscall works in both directions, it's fast and simple.
>
> Examples:
> getvpid(pid, ns, -1) - get pid in our pid namespace
> getvpid(pid, -1, ns) - get pid in container
> getvpid(pid, -1, ns) > 0 - is pid is reachable from container?
> getvpid(1, ns1, ns2) > 0 - is ns1 inside ns2?
> getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2?
> getvpid(1, ns, -1) - get init task of pid-namespace
> getvpid(-1, ns, -1) - get reaper of init task in parent pid-namespace
> getvpid(-pid, -1, -1) - get ppid by pid
>
> Signed-off-by: Konstantin Khlebnikov <khlebnikov-XoJtRXgx1JseBXzfvpsJ4g@public.gmane.org>
>
> --
>
> v1: https://lkml.org/lkml/2015/9/15/411
> v2: https://lkml.org/lkml/2015/9/24/278
> v3:
> * use proc_ns_fdget()
> * update description
> * rebase to next-20150925
> * fix conflict with mlock2
> ---
> arch/x86/entry/syscalls/syscall_32.tbl | 1 +
> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
> include/linux/syscalls.h | 1 +
> include/uapi/asm-generic/unistd.h | 4 ++-
> kernel/sys.c | 51 ++++++++++++++++++++++++++++++++
> 5 files changed, 57 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 143ef9f37932..c36c2c65d204 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -383,3 +383,4 @@
> 374 i386 userfaultfd sys_userfaultfd
> 375 i386 membarrier sys_membarrier
> 376 i386 mlock2 sys_mlock2
> +377 i386 getvpid sys_getvpid
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 314a90bfc09c..90bbbc7fdbe0 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -332,6 +332,7 @@
> 323 common userfaultfd sys_userfaultfd
> 324 common membarrier sys_membarrier
> 325 common mlock2 sys_mlock2
> +326 common getvpid sys_getvpid
>
> #
> # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index a156b82dd14c..dbb5638260b5 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -222,6 +222,7 @@ asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __us
> asmlinkage long sys_alarm(unsigned int seconds);
> asmlinkage long sys_getpid(void);
> asmlinkage long sys_getppid(void);
> +asmlinkage long sys_getvpid(pid_t pid, int source, int target);
> asmlinkage long sys_getuid(void);
> asmlinkage long sys_geteuid(void);
> asmlinkage long sys_getgid(void);
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 1324b0292ec2..2c1123130f90 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -715,9 +715,11 @@ __SYSCALL(__NR_userfaultfd, sys_userfaultfd)
> __SYSCALL(__NR_membarrier, sys_membarrier)
> #define __NR_mlock2 284
> __SYSCALL(__NR_mlock2, sys_mlock2)
> +#define __NR_mlock2 285
> +__SYSCALL(__NR_getvpid, sys_getvpid)
>
> #undef __NR_syscalls
> -#define __NR_syscalls 285
> +#define __NR_syscalls 286
>
> /*
> * All syscalls below here should go away really,
> diff --git a/kernel/sys.c b/kernel/sys.c
> index fa2f2f671a5c..1e28a36b84fa 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -46,6 +46,7 @@
> #include <linux/syscalls.h>
> #include <linux/kprobes.h>
> #include <linux/user_namespace.h>
> +#include <linux/proc_ns.h>
> #include <linux/binfmts.h>
>
> #include <linux/sched.h>
> @@ -855,6 +856,56 @@ SYSCALL_DEFINE0(getppid)
> return pid;
> }
>
> +SYSCALL_DEFINE3(getvpid, pid_t, pid, int, source, int, target)
> +{
> + struct pid_namespace *source_ns, *target_ns;
> + struct fd source_fd = {}, target_fd = {};
> + struct pid *struct_pid;
> + struct ns_common *ns;
> + pid_t result;
> +
> + if (source >= 0) {
> + ns = proc_ns_fdget(source, CLONE_NEWPID, &source_fd);
> + result = PTR_ERR(ns);
> + if (IS_ERR(ns))
> + goto out;
> + source_ns = container_of(ns, struct pid_namespace, ns);
> + } else
> + source_ns = task_active_pid_ns(current);
> +
> + if (target >= 0) {
> + ns = proc_ns_fdget(target, CLONE_NEWPID, &target_fd);
> + result = PTR_ERR(ns);
> + if (IS_ERR(ns))
> + goto out;
> + target_ns = container_of(ns, struct pid_namespace, ns);
> + } else
> + target_ns = task_active_pid_ns(current);
> +
> + rcu_read_lock();
> + struct_pid = find_pid_ns(abs(pid), source_ns);
> +
> + if (struct_pid && pid < 0) {
> + struct task_struct *task;
> +
> + task = pid_task(struct_pid, PIDTYPE_PID);
> + if (task)
> + task = rcu_dereference(task->real_parent);
> + struct_pid = task ? task_pid(task) : NULL;
> + }
> +
> + if (struct_pid)
> + result = pid_nr_ns(struct_pid, target_ns);
> + else
> + result = -ESRCH;
> + rcu_read_unlock();
> +
> +out:
> + fdput(target_fd);
> + fdput(source_fd);
> + return result;
> +}
> +
> SYSCALL_DEFINE0(getuid)
> {
> /* Only we change this so SMP safe */
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid
@ 2015-09-28 16:22 ` Eric W. Biederman
0 siblings, 0 replies; 37+ messages in thread
From: Eric W. Biederman @ 2015-09-28 16:22 UTC (permalink / raw)
To: Konstantin Khlebnikov
Cc: linux-api, containers, linux-kernel, Roman Gushchin,
Serge Hallyn, Oleg Nesterov, Chen Fan, Andrew Morton,
Linus Torvalds, Stéphane Graber
Konstantin Khlebnikov <khlebnikov@yandex-team.ru> writes:
> pid_t getvpid(pid_t pid, int source, int target);
>
> This syscall converts pid from source pid-namespace into pid visible
> in target pid-namespace. If pid is unreachable from target namespace
> then getvpid() returns zero.
Two minor things.
Can we please call this translate_pid? getvpid does not really cover
what this syscall does.
Can you please split wiring up into a separate patch? You goofed it
up this round and it just adds noise in reviewing the core syscall.
> Namespaces are defined by file descriptors pointing to entries in
> proc (/proc/[pid]/ns/pid). If argument is negative then current pid
> namespace is used.
>
> If pid is negative then getvpid() returns pid of parent task for -pid.
>
> Possible error codes:
> ESRCH - task not found
> EBADF - closed file descriptor
> EINVAL - not pid-namespace file descriptor
>
> Such conversion is required for interaction between processes from
> different pid-namespaces. For example system service at host system
> who provide access to restricted set of privileged operations for
> clients from containers have to convert pids back and forward.
>
> Recent kernels expose virtual pids in /proc/[pid]/status:NSpid, but
> this interface works only in one way and even that is non-trivial.
>
> Other option is passing pids with credentials via unix socket, but
> this solution requires a lot of preparation and CAP_SYS_ADMIN for
> sending arbitrary pids.
>
> This syscall works in both directions, it's fast and simple.
>
> Examples:
> getvpid(pid, ns, -1) - get pid in our pid namespace
> getvpid(pid, -1, ns) - get pid in container
> getvpid(pid, -1, ns) > 0 - is pid is reachable from container?
> getvpid(1, ns1, ns2) > 0 - is ns1 inside ns2?
> getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2?
> getvpid(1, ns, -1) - get init task of pid-namespace
> getvpid(-1, ns, -1) - get reaper of init task in parent pid-namespace
> getvpid(-pid, -1, -1) - get ppid by pid
>
> Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
>
> --
>
> v1: https://lkml.org/lkml/2015/9/15/411
> v2: https://lkml.org/lkml/2015/9/24/278
> v3:
> * use proc_ns_fdget()
> * update description
> * rebase to next-20150925
> * fix conflict with mlock2
> ---
> arch/x86/entry/syscalls/syscall_32.tbl | 1 +
> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
> include/linux/syscalls.h | 1 +
> include/uapi/asm-generic/unistd.h | 4 ++-
> kernel/sys.c | 51 ++++++++++++++++++++++++++++++++
> 5 files changed, 57 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 143ef9f37932..c36c2c65d204 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -383,3 +383,4 @@
> 374 i386 userfaultfd sys_userfaultfd
> 375 i386 membarrier sys_membarrier
> 376 i386 mlock2 sys_mlock2
> +377 i386 getvpid sys_getvpid
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 314a90bfc09c..90bbbc7fdbe0 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -332,6 +332,7 @@
> 323 common userfaultfd sys_userfaultfd
> 324 common membarrier sys_membarrier
> 325 common mlock2 sys_mlock2
> +326 common getvpid sys_getvpid
>
> #
> # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index a156b82dd14c..dbb5638260b5 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -222,6 +222,7 @@ asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __us
> asmlinkage long sys_alarm(unsigned int seconds);
> asmlinkage long sys_getpid(void);
> asmlinkage long sys_getppid(void);
> +asmlinkage long sys_getvpid(pid_t pid, int source, int target);
> asmlinkage long sys_getuid(void);
> asmlinkage long sys_geteuid(void);
> asmlinkage long sys_getgid(void);
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 1324b0292ec2..2c1123130f90 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -715,9 +715,11 @@ __SYSCALL(__NR_userfaultfd, sys_userfaultfd)
> __SYSCALL(__NR_membarrier, sys_membarrier)
> #define __NR_mlock2 284
> __SYSCALL(__NR_mlock2, sys_mlock2)
> +#define __NR_mlock2 285
> +__SYSCALL(__NR_getvpid, sys_getvpid)
>
> #undef __NR_syscalls
> -#define __NR_syscalls 285
> +#define __NR_syscalls 286
>
> /*
> * All syscalls below here should go away really,
> diff --git a/kernel/sys.c b/kernel/sys.c
> index fa2f2f671a5c..1e28a36b84fa 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -46,6 +46,7 @@
> #include <linux/syscalls.h>
> #include <linux/kprobes.h>
> #include <linux/user_namespace.h>
> +#include <linux/proc_ns.h>
> #include <linux/binfmts.h>
>
> #include <linux/sched.h>
> @@ -855,6 +856,56 @@ SYSCALL_DEFINE0(getppid)
> return pid;
> }
>
> +SYSCALL_DEFINE3(getvpid, pid_t, pid, int, source, int, target)
> +{
> + struct pid_namespace *source_ns, *target_ns;
> + struct fd source_fd = {}, target_fd = {};
> + struct pid *struct_pid;
> + struct ns_common *ns;
> + pid_t result;
> +
> + if (source >= 0) {
> + ns = proc_ns_fdget(source, CLONE_NEWPID, &source_fd);
> + result = PTR_ERR(ns);
> + if (IS_ERR(ns))
> + goto out;
> + source_ns = container_of(ns, struct pid_namespace, ns);
> + } else
> + source_ns = task_active_pid_ns(current);
> +
> + if (target >= 0) {
> + ns = proc_ns_fdget(target, CLONE_NEWPID, &target_fd);
> + result = PTR_ERR(ns);
> + if (IS_ERR(ns))
> + goto out;
> + target_ns = container_of(ns, struct pid_namespace, ns);
> + } else
> + target_ns = task_active_pid_ns(current);
> +
> + rcu_read_lock();
> + struct_pid = find_pid_ns(abs(pid), source_ns);
> +
> + if (struct_pid && pid < 0) {
> + struct task_struct *task;
> +
> + task = pid_task(struct_pid, PIDTYPE_PID);
> + if (task)
> + task = rcu_dereference(task->real_parent);
> + struct_pid = task ? task_pid(task) : NULL;
> + }
> +
> + if (struct_pid)
> + result = pid_nr_ns(struct_pid, target_ns);
> + else
> + result = -ESRCH;
> + rcu_read_unlock();
> +
> +out:
> + fdput(target_fd);
> + fdput(source_fd);
> + return result;
> +}
> +
> SYSCALL_DEFINE0(getuid)
> {
> /* Only we change this so SMP safe */
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
[not found] ` <20150925175654.GA12504-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2015-09-25 17:57 ` Oleg Nesterov
2015-09-28 8:21 ` [PATCH 0/1] " Konstantin Khlebnikov
@ 2015-09-28 16:37 ` Eric W. Biederman
2 siblings, 0 replies; 37+ messages in thread
From: Eric W. Biederman @ 2015-09-28 16:37 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Roman Gushchin, Konstantin Khlebnikov,
linux-api-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chen Fan,
Andrew Morton, Linus Torvalds
Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:
> On 09/25, Konstantin Khlebnikov wrote:
>>
>> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref)
>> {
>> - struct file *file;
>> + struct ns_common *ns;
>> + struct fd f;
>>
>> - file = fget(fd);
>> - if (!file)
>> + f = fdget(fd);
>> + if (!f.file)
>> return ERR_PTR(-EBADF);
>>
>> - if (file->f_op != &ns_file_operations)
>> + if (f.file->f_op != &ns_file_operations)
>> + goto out_invalid;
>> +
>> + ns = get_proc_ns(file_inode(f.file));
>> + if (nstype && (ns->ops->type != nstype))
>> goto out_invalid;
>>
>> - return file;
>> + *fd_ref = f;
>> + return ns;
>>
>> out_invalid:
>> - fput(file);
>> + fdput(f);
>> return ERR_PTR(-EINVAL);
>> }
>
> Well yes, fdget() makes sense but this is minor.
>
> Honestly, I do not really like the new helper... I understand this
> is subjective, so I won't insist. But how about 1/1? We do not need
> fd/file at all. With this patch your sys_getvpid() can just use
> proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
>
> Eric, what do you think?
At some level I don't care this is not exposed to userspace.
However since we are going several rounds with this.
Of the existing uses several of them sleep, which unfortunately means we
can not use rcu locking for everything. The network namespace ones wind
up taking a reference to struct net because the have the legacy pid case
to deal with. Which makes we can not use fdget for all callers either.
For this translate_pid rcu locking is sufficient, rcu locking is easy
and doing any more than rcu locking just seems silly. So let me
respectfully suggest.
struct ns_common *ns_by_fd_rcu(int fd, int type)
{
struct files_struct *files = current->files;
struct file *file;
struct ns_common *ns;
void *ret;
file = fcheck_files(files, fd);
if (!file)
return ERR_PTR(-EBADF);
if (file->f_mode & FMODE_PATH)
return ERR_PTR(-EINVAL);
if (file->f_op != &ns_file_operations)
return ERR_PTR(-EINVAL);
ns = get_proc_ns(file_inode(file));
if (ns->ops->type != type)
return ERR_PTR(-EINVAL);
return ns;
}
struct pid_namespace *pidns_by_fd_rcu(int fd)
{
struct ns_common *ns = ns_by_fd_rcu(fd, CLONE_NEWPID);
if (IS_ERR(ns))
return ERR_CAST(ns);
return container_of(ns, struct pid_namespace, ns);
}
SYSCALL_DEFINE3(translate_pid, pid_t, pid_nr, int, sourcefd, int, targetfd)
{
struct pid_namespace *source, *target;
struct pid *pid;
pid_t result;
rcu_read_lock();
if (sourcefd >= 0)
source = pidns_by_fd_rcu(sourcefd);
else
source = task_active_pid_ns(current);
if (targetfd >= 0)
target = pidns_by_fd_rcu(targetfd);
else
target = task_active_pid_ns(current);
pid = find_pid_ns(pid_nr, source);
result = pid_nr_ns(pid, target);
if (result == 0)
result = -ESRCH;
rcu_read_unlock();
return result;
}
Eric
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
[not found] ` <20150925175654.GA12504-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2015-09-28 16:37 ` Eric W. Biederman
2015-09-28 8:21 ` [PATCH 0/1] " Konstantin Khlebnikov
2015-09-28 16:37 ` Eric W. Biederman
2 siblings, 0 replies; 37+ messages in thread
From: Eric W. Biederman @ 2015-09-28 16:37 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Konstantin Khlebnikov, linux-api, containers, linux-kernel,
Roman Gushchin, Serge Hallyn, Chen Fan, Andrew Morton,
Linus Torvalds, Stéphane Graber
Oleg Nesterov <oleg@redhat.com> writes:
> On 09/25, Konstantin Khlebnikov wrote:
>>
>> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref)
>> {
>> - struct file *file;
>> + struct ns_common *ns;
>> + struct fd f;
>>
>> - file = fget(fd);
>> - if (!file)
>> + f = fdget(fd);
>> + if (!f.file)
>> return ERR_PTR(-EBADF);
>>
>> - if (file->f_op != &ns_file_operations)
>> + if (f.file->f_op != &ns_file_operations)
>> + goto out_invalid;
>> +
>> + ns = get_proc_ns(file_inode(f.file));
>> + if (nstype && (ns->ops->type != nstype))
>> goto out_invalid;
>>
>> - return file;
>> + *fd_ref = f;
>> + return ns;
>>
>> out_invalid:
>> - fput(file);
>> + fdput(f);
>> return ERR_PTR(-EINVAL);
>> }
>
> Well yes, fdget() makes sense but this is minor.
>
> Honestly, I do not really like the new helper... I understand this
> is subjective, so I won't insist. But how about 1/1? We do not need
> fd/file at all. With this patch your sys_getvpid() can just use
> proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
>
> Eric, what do you think?
At some level I don't care this is not exposed to userspace.
However since we are going several rounds with this.
Of the existing uses several of them sleep, which unfortunately means we
can not use rcu locking for everything. The network namespace ones wind
up taking a reference to struct net because the have the legacy pid case
to deal with. Which makes we can not use fdget for all callers either.
For this translate_pid rcu locking is sufficient, rcu locking is easy
and doing any more than rcu locking just seems silly. So let me
respectfully suggest.
struct ns_common *ns_by_fd_rcu(int fd, int type)
{
struct files_struct *files = current->files;
struct file *file;
struct ns_common *ns;
void *ret;
file = fcheck_files(files, fd);
if (!file)
return ERR_PTR(-EBADF);
if (file->f_mode & FMODE_PATH)
return ERR_PTR(-EINVAL);
if (file->f_op != &ns_file_operations)
return ERR_PTR(-EINVAL);
ns = get_proc_ns(file_inode(file));
if (ns->ops->type != type)
return ERR_PTR(-EINVAL);
return ns;
}
struct pid_namespace *pidns_by_fd_rcu(int fd)
{
struct ns_common *ns = ns_by_fd_rcu(fd, CLONE_NEWPID);
if (IS_ERR(ns))
return ERR_CAST(ns);
return container_of(ns, struct pid_namespace, ns);
}
SYSCALL_DEFINE3(translate_pid, pid_t, pid_nr, int, sourcefd, int, targetfd)
{
struct pid_namespace *source, *target;
struct pid *pid;
pid_t result;
rcu_read_lock();
if (sourcefd >= 0)
source = pidns_by_fd_rcu(sourcefd);
else
source = task_active_pid_ns(current);
if (targetfd >= 0)
target = pidns_by_fd_rcu(targetfd);
else
target = task_active_pid_ns(current);
pid = find_pid_ns(pid_nr, source);
result = pid_nr_ns(pid, target);
if (result == 0)
result = -ESRCH;
rcu_read_unlock();
return result;
}
Eric
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
@ 2015-09-28 16:37 ` Eric W. Biederman
0 siblings, 0 replies; 37+ messages in thread
From: Eric W. Biederman @ 2015-09-28 16:37 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Konstantin Khlebnikov, linux-api-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA, Roman Gushchin,
Serge Hallyn, Chen Fan, Andrew Morton, Linus Torvalds,
Stéphane Graber
Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:
> On 09/25, Konstantin Khlebnikov wrote:
>>
>> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref)
>> {
>> - struct file *file;
>> + struct ns_common *ns;
>> + struct fd f;
>>
>> - file = fget(fd);
>> - if (!file)
>> + f = fdget(fd);
>> + if (!f.file)
>> return ERR_PTR(-EBADF);
>>
>> - if (file->f_op != &ns_file_operations)
>> + if (f.file->f_op != &ns_file_operations)
>> + goto out_invalid;
>> +
>> + ns = get_proc_ns(file_inode(f.file));
>> + if (nstype && (ns->ops->type != nstype))
>> goto out_invalid;
>>
>> - return file;
>> + *fd_ref = f;
>> + return ns;
>>
>> out_invalid:
>> - fput(file);
>> + fdput(f);
>> return ERR_PTR(-EINVAL);
>> }
>
> Well yes, fdget() makes sense but this is minor.
>
> Honestly, I do not really like the new helper... I understand this
> is subjective, so I won't insist. But how about 1/1? We do not need
> fd/file at all. With this patch your sys_getvpid() can just use
> proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
>
> Eric, what do you think?
At some level I don't care this is not exposed to userspace.
However since we are going several rounds with this.
Of the existing uses several of them sleep, which unfortunately means we
can not use rcu locking for everything. The network namespace ones wind
up taking a reference to struct net because the have the legacy pid case
to deal with. Which makes we can not use fdget for all callers either.
For this translate_pid rcu locking is sufficient, rcu locking is easy
and doing any more than rcu locking just seems silly. So let me
respectfully suggest.
struct ns_common *ns_by_fd_rcu(int fd, int type)
{
struct files_struct *files = current->files;
struct file *file;
struct ns_common *ns;
void *ret;
file = fcheck_files(files, fd);
if (!file)
return ERR_PTR(-EBADF);
if (file->f_mode & FMODE_PATH)
return ERR_PTR(-EINVAL);
if (file->f_op != &ns_file_operations)
return ERR_PTR(-EINVAL);
ns = get_proc_ns(file_inode(file));
if (ns->ops->type != type)
return ERR_PTR(-EINVAL);
return ns;
}
struct pid_namespace *pidns_by_fd_rcu(int fd)
{
struct ns_common *ns = ns_by_fd_rcu(fd, CLONE_NEWPID);
if (IS_ERR(ns))
return ERR_CAST(ns);
return container_of(ns, struct pid_namespace, ns);
}
SYSCALL_DEFINE3(translate_pid, pid_t, pid_nr, int, sourcefd, int, targetfd)
{
struct pid_namespace *source, *target;
struct pid *pid;
pid_t result;
rcu_read_lock();
if (sourcefd >= 0)
source = pidns_by_fd_rcu(sourcefd);
else
source = task_active_pid_ns(current);
if (targetfd >= 0)
target = pidns_by_fd_rcu(targetfd);
else
target = task_active_pid_ns(current);
pid = find_pid_ns(pid_nr, source);
result = pid_nr_ns(pid, target);
if (result == 0)
result = -ESRCH;
rcu_read_unlock();
return result;
}
Eric
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid
2015-09-25 13:52 ` Konstantin Khlebnikov
@ 2015-09-28 16:57 ` Eric W. Biederman
-1 siblings, 0 replies; 37+ messages in thread
From: Eric W. Biederman @ 2015-09-28 16:57 UTC (permalink / raw)
To: Konstantin Khlebnikov
Cc: Roman Gushchin, linux-api-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Oleg Nesterov,
Chen Fan, Andrew Morton, Linus Torvalds
Konstantin Khlebnikov <khlebnikov-XoJtRXgx1JseBXzfvpsJ4g@public.gmane.org> writes:
> If pid is negative then getvpid() returns pid of parent task for -pid.
Now that I am noticing this. I don't think I have seen any discussion
about justifying a syscall getting another processes parent pid. My
apologies if I just missed it.
Why do we want the the parent pid? We can we usefully do with it?
Is proc really that bad of an interface?
Fetching a parent pid feels like a separate logical operation
from pid translation. Which makes me a bit uneasy about this
part of the conversation.
> Examples:
> getvpid(pid, ns, -1) - get pid in our pid namespace
> getvpid(pid, -1, ns) - get pid in container
> getvpid(pid, -1, ns) > 0 - is pid is reachable from container?
> getvpid(1, ns1, ns2) > 0 - is ns1 inside ns2?
> getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2?
> getvpid(1, ns, -1) - get init task of pid-namespace
> getvpid(-1, ns, -1) - get reaper of init task in parent pid-namespace
> getvpid(-pid, -1, -1) - get ppid by pid
As I step back and pay attention to this case I am half wondering if
perhaps what would be most useful is a file descriptor that refers
to a pid and an updated set of system calls that takes pid file
descriptors instead of pids.
Something like:
getpidfd(int pidnsfd, pid_t pid);
waitfd(int pidfd, int *status, int options, struct rusage *rusage);
killfd(int pidfd, int sig);
clonefd(...);
And perhaps:
pid_nr_ns(int pidnsfd, int pidfd);
parentfd(int pidfd);
Eric
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid
@ 2015-09-28 16:57 ` Eric W. Biederman
0 siblings, 0 replies; 37+ messages in thread
From: Eric W. Biederman @ 2015-09-28 16:57 UTC (permalink / raw)
To: Konstantin Khlebnikov
Cc: linux-api, containers, linux-kernel, Roman Gushchin,
Serge Hallyn, Oleg Nesterov, Chen Fan, Andrew Morton,
Linus Torvalds, Stéphane Graber
Konstantin Khlebnikov <khlebnikov@yandex-team.ru> writes:
> If pid is negative then getvpid() returns pid of parent task for -pid.
Now that I am noticing this. I don't think I have seen any discussion
about justifying a syscall getting another processes parent pid. My
apologies if I just missed it.
Why do we want the the parent pid? We can we usefully do with it?
Is proc really that bad of an interface?
Fetching a parent pid feels like a separate logical operation
from pid translation. Which makes me a bit uneasy about this
part of the conversation.
> Examples:
> getvpid(pid, ns, -1) - get pid in our pid namespace
> getvpid(pid, -1, ns) - get pid in container
> getvpid(pid, -1, ns) > 0 - is pid is reachable from container?
> getvpid(1, ns1, ns2) > 0 - is ns1 inside ns2?
> getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2?
> getvpid(1, ns, -1) - get init task of pid-namespace
> getvpid(-1, ns, -1) - get reaper of init task in parent pid-namespace
> getvpid(-pid, -1, -1) - get ppid by pid
As I step back and pay attention to this case I am half wondering if
perhaps what would be most useful is a file descriptor that refers
to a pid and an updated set of system calls that takes pid file
descriptors instead of pids.
Something like:
getpidfd(int pidnsfd, pid_t pid);
waitfd(int pidfd, int *status, int options, struct rusage *rusage);
killfd(int pidfd, int sig);
clonefd(...);
And perhaps:
pid_nr_ns(int pidnsfd, int pidfd);
parentfd(int pidfd);
Eric
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
[not found] ` <871tdi8pqj.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
@ 2015-09-29 16:43 ` Oleg Nesterov
2015-09-30 2:54 ` Chen Fan
1 sibling, 0 replies; 37+ messages in thread
From: Oleg Nesterov @ 2015-09-29 16:43 UTC (permalink / raw)
To: Eric W. Biederman
Cc: Roman Gushchin, Konstantin Khlebnikov,
linux-api-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chen Fan,
Andrew Morton, Linus Torvalds
On 09/28, Eric W. Biederman wrote:
>
> Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:
>
> > Honestly, I do not really like the new helper... I understand this
> > is subjective, so I won't insist. But how about 1/1? We do not need
> > fd/file at all. With this patch your sys_getvpid() can just use
> > proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
> >
> > Eric, what do you think?
>
> At some level I don't care this is not exposed to userspace.
I agree, this is minor. But you know, the kernel is already complicated
too much, we should try to simplify/cleanup everything we can ;)
> Of the existing uses several of them sleep, which unfortunately means we
> can not use rcu locking for everything. The network namespace ones wind
> up taking a reference to struct net because the have the legacy pid case
> to deal with. Which makes we can not use fdget for all callers either.
And that is why 1/1 adds proc_get_ns_by_fd/get_type which can also be used
by the network namespace.
> For this translate_pid rcu locking is sufficient, rcu locking is easy
> and doing any more than rcu locking just seems silly. So let me
> respectfully suggest.
>
> struct ns_common *ns_by_fd_rcu(int fd, int type)
> {
> struct files_struct *files = current->files;
> struct file *file;
> struct ns_common *ns;
> void *ret;
>
> file = fcheck_files(files, fd);
> if (!file)
> return ERR_PTR(-EBADF);
>
> if (file->f_mode & FMODE_PATH)
> return ERR_PTR(-EINVAL);
>
> if (file->f_op != &ns_file_operations)
> return ERR_PTR(-EINVAL);
>
> ns = get_proc_ns(file_inode(file));
> if (ns->ops->type != type)
> return ERR_PTR(-EINVAL);
>
> return ns;
> }
OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref).
And in any case fcheck_files() makes more sense than fdget(), somehow I did
not think about this when I sent 1/1.
Hmm. and after the quick look at cleanup_net() I can't understand whether
get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or
not... Can it?
Oleg.
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
[not found] ` <871tdi8pqj.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
@ 2015-09-29 16:43 ` Oleg Nesterov
2015-09-30 2:54 ` Chen Fan
1 sibling, 0 replies; 37+ messages in thread
From: Oleg Nesterov @ 2015-09-29 16:43 UTC (permalink / raw)
To: Eric W. Biederman
Cc: Konstantin Khlebnikov, linux-api, containers, linux-kernel,
Roman Gushchin, Serge Hallyn, Chen Fan, Andrew Morton,
Linus Torvalds, Stéphane Graber
On 09/28, Eric W. Biederman wrote:
>
> Oleg Nesterov <oleg@redhat.com> writes:
>
> > Honestly, I do not really like the new helper... I understand this
> > is subjective, so I won't insist. But how about 1/1? We do not need
> > fd/file at all. With this patch your sys_getvpid() can just use
> > proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
> >
> > Eric, what do you think?
>
> At some level I don't care this is not exposed to userspace.
I agree, this is minor. But you know, the kernel is already complicated
too much, we should try to simplify/cleanup everything we can ;)
> Of the existing uses several of them sleep, which unfortunately means we
> can not use rcu locking for everything. The network namespace ones wind
> up taking a reference to struct net because the have the legacy pid case
> to deal with. Which makes we can not use fdget for all callers either.
And that is why 1/1 adds proc_get_ns_by_fd/get_type which can also be used
by the network namespace.
> For this translate_pid rcu locking is sufficient, rcu locking is easy
> and doing any more than rcu locking just seems silly. So let me
> respectfully suggest.
>
> struct ns_common *ns_by_fd_rcu(int fd, int type)
> {
> struct files_struct *files = current->files;
> struct file *file;
> struct ns_common *ns;
> void *ret;
>
> file = fcheck_files(files, fd);
> if (!file)
> return ERR_PTR(-EBADF);
>
> if (file->f_mode & FMODE_PATH)
> return ERR_PTR(-EINVAL);
>
> if (file->f_op != &ns_file_operations)
> return ERR_PTR(-EINVAL);
>
> ns = get_proc_ns(file_inode(file));
> if (ns->ops->type != type)
> return ERR_PTR(-EINVAL);
>
> return ns;
> }
OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref).
And in any case fcheck_files() makes more sense than fdget(), somehow I did
not think about this when I sent 1/1.
Hmm. and after the quick look at cleanup_net() I can't understand whether
get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or
not... Can it?
Oleg.
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
@ 2015-09-29 16:43 ` Oleg Nesterov
0 siblings, 0 replies; 37+ messages in thread
From: Oleg Nesterov @ 2015-09-29 16:43 UTC (permalink / raw)
To: Eric W. Biederman
Cc: Konstantin Khlebnikov, linux-api-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA, Roman Gushchin,
Serge Hallyn, Chen Fan, Andrew Morton, Linus Torvalds,
Stéphane Graber
On 09/28, Eric W. Biederman wrote:
>
> Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:
>
> > Honestly, I do not really like the new helper... I understand this
> > is subjective, so I won't insist. But how about 1/1? We do not need
> > fd/file at all. With this patch your sys_getvpid() can just use
> > proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
> >
> > Eric, what do you think?
>
> At some level I don't care this is not exposed to userspace.
I agree, this is minor. But you know, the kernel is already complicated
too much, we should try to simplify/cleanup everything we can ;)
> Of the existing uses several of them sleep, which unfortunately means we
> can not use rcu locking for everything. The network namespace ones wind
> up taking a reference to struct net because the have the legacy pid case
> to deal with. Which makes we can not use fdget for all callers either.
And that is why 1/1 adds proc_get_ns_by_fd/get_type which can also be used
by the network namespace.
> For this translate_pid rcu locking is sufficient, rcu locking is easy
> and doing any more than rcu locking just seems silly. So let me
> respectfully suggest.
>
> struct ns_common *ns_by_fd_rcu(int fd, int type)
> {
> struct files_struct *files = current->files;
> struct file *file;
> struct ns_common *ns;
> void *ret;
>
> file = fcheck_files(files, fd);
> if (!file)
> return ERR_PTR(-EBADF);
>
> if (file->f_mode & FMODE_PATH)
> return ERR_PTR(-EINVAL);
>
> if (file->f_op != &ns_file_operations)
> return ERR_PTR(-EINVAL);
>
> ns = get_proc_ns(file_inode(file));
> if (ns->ops->type != type)
> return ERR_PTR(-EINVAL);
>
> return ns;
> }
OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref).
And in any case fcheck_files() makes more sense than fdget(), somehow I did
not think about this when I sent 1/1.
Hmm. and after the quick look at cleanup_net() I can't understand whether
get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or
not... Can it?
Oleg.
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
[not found] ` <20150929164315.GA16734-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2015-09-29 17:30 ` Eric W. Biederman
0 siblings, 0 replies; 37+ messages in thread
From: Eric W. Biederman @ 2015-09-29 17:30 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Roman Gushchin, Konstantin Khlebnikov,
linux-api-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chen Fan,
Andrew Morton, Linus Torvalds
Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:
> On 09/28, Eric W. Biederman wrote:
>>
>> Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:
>>
>> > Honestly, I do not really like the new helper... I understand this
>> > is subjective, so I won't insist. But how about 1/1? We do not need
>> > fd/file at all. With this patch your sys_getvpid() can just use
>> > proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
>> >
>> > Eric, what do you think?
>>
>> At some level I don't care this is not exposed to userspace.
>
> I agree, this is minor. But you know, the kernel is already complicated
> too much, we should try to simplify/cleanup everything we can ;)
>
>> Of the existing uses several of them sleep, which unfortunately means we
>> can not use rcu locking for everything. The network namespace ones wind
>> up taking a reference to struct net because the have the legacy pid case
>> to deal with. Which makes we can not use fdget for all callers either.
>
> And that is why 1/1 adds proc_get_ns_by_fd/get_type which can also be used
> by the network namespace.
>
>> For this translate_pid rcu locking is sufficient, rcu locking is easy
>> and doing any more than rcu locking just seems silly. So let me
>> respectfully suggest.
>>
>> struct ns_common *ns_by_fd_rcu(int fd, int type)
>> {
>> struct files_struct *files = current->files;
>> struct file *file;
>> struct ns_common *ns;
>> void *ret;
>>
>> file = fcheck_files(files, fd);
>> if (!file)
>> return ERR_PTR(-EBADF);
>>
>> if (file->f_mode & FMODE_PATH)
>> return ERR_PTR(-EINVAL);
>>
>> if (file->f_op != &ns_file_operations)
>> return ERR_PTR(-EINVAL);
>>
>> ns = get_proc_ns(file_inode(file));
>> if (ns->ops->type != type)
>> return ERR_PTR(-EINVAL);
>>
>> return ns;
>> }
>
> OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref).
>
> And in any case fcheck_files() makes more sense than fdget(), somehow I did
> not think about this when I sent 1/1.
>
> Hmm. and after the quick look at cleanup_net() I can't understand whether
> get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or
> not... Can it?
Some of those places need a reference that allows them to sleep, and the
code is shared with the legacy pid case so with an addition of get_net
we can use ns_by_fd_rcu(). There are cases like setns that could
use ns_by_fd_rcu() with code reording.
We can implement get_net_ns_by_fd as:
struct net *get_net_ns_by_fd(int fd)
{
struct net *net;
rcu_read_lock();
net = net_ns_by_fd_rcu(fd);
if (!IS_ERR(net))
get_net(net);
rcu_read_unlock();
return net;
}
Which means we can achieve code sharing with the pure rcu version
as a base.
If the networking code did not have the legacy pid case to handle I
would want to do something with struct fd, as the file descriptor
already keeps the struct net reference alive and struct fd allows
for sleeping.
Eric
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
[not found] ` <20150929164315.GA16734-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2015-09-29 17:30 ` Eric W. Biederman
0 siblings, 0 replies; 37+ messages in thread
From: Eric W. Biederman @ 2015-09-29 17:30 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Konstantin Khlebnikov, linux-api, containers, linux-kernel,
Roman Gushchin, Serge Hallyn, Chen Fan, Andrew Morton,
Linus Torvalds, Stéphane Graber
Oleg Nesterov <oleg@redhat.com> writes:
> On 09/28, Eric W. Biederman wrote:
>>
>> Oleg Nesterov <oleg@redhat.com> writes:
>>
>> > Honestly, I do not really like the new helper... I understand this
>> > is subjective, so I won't insist. But how about 1/1? We do not need
>> > fd/file at all. With this patch your sys_getvpid() can just use
>> > proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
>> >
>> > Eric, what do you think?
>>
>> At some level I don't care this is not exposed to userspace.
>
> I agree, this is minor. But you know, the kernel is already complicated
> too much, we should try to simplify/cleanup everything we can ;)
>
>> Of the existing uses several of them sleep, which unfortunately means we
>> can not use rcu locking for everything. The network namespace ones wind
>> up taking a reference to struct net because the have the legacy pid case
>> to deal with. Which makes we can not use fdget for all callers either.
>
> And that is why 1/1 adds proc_get_ns_by_fd/get_type which can also be used
> by the network namespace.
>
>> For this translate_pid rcu locking is sufficient, rcu locking is easy
>> and doing any more than rcu locking just seems silly. So let me
>> respectfully suggest.
>>
>> struct ns_common *ns_by_fd_rcu(int fd, int type)
>> {
>> struct files_struct *files = current->files;
>> struct file *file;
>> struct ns_common *ns;
>> void *ret;
>>
>> file = fcheck_files(files, fd);
>> if (!file)
>> return ERR_PTR(-EBADF);
>>
>> if (file->f_mode & FMODE_PATH)
>> return ERR_PTR(-EINVAL);
>>
>> if (file->f_op != &ns_file_operations)
>> return ERR_PTR(-EINVAL);
>>
>> ns = get_proc_ns(file_inode(file));
>> if (ns->ops->type != type)
>> return ERR_PTR(-EINVAL);
>>
>> return ns;
>> }
>
> OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref).
>
> And in any case fcheck_files() makes more sense than fdget(), somehow I did
> not think about this when I sent 1/1.
>
> Hmm. and after the quick look at cleanup_net() I can't understand whether
> get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or
> not... Can it?
Some of those places need a reference that allows them to sleep, and the
code is shared with the legacy pid case so with an addition of get_net
we can use ns_by_fd_rcu(). There are cases like setns that could
use ns_by_fd_rcu() with code reording.
We can implement get_net_ns_by_fd as:
struct net *get_net_ns_by_fd(int fd)
{
struct net *net;
rcu_read_lock();
net = net_ns_by_fd_rcu(fd);
if (!IS_ERR(net))
get_net(net);
rcu_read_unlock();
return net;
}
Which means we can achieve code sharing with the pure rcu version
as a base.
If the networking code did not have the legacy pid case to handle I
would want to do something with struct fd, as the file descriptor
already keeps the struct net reference alive and struct fd allows
for sleeping.
Eric
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
@ 2015-09-29 17:30 ` Eric W. Biederman
0 siblings, 0 replies; 37+ messages in thread
From: Eric W. Biederman @ 2015-09-29 17:30 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Konstantin Khlebnikov, linux-api-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA, Roman Gushchin,
Serge Hallyn, Chen Fan, Andrew Morton, Linus Torvalds,
Stéphane Graber
Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:
> On 09/28, Eric W. Biederman wrote:
>>
>> Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:
>>
>> > Honestly, I do not really like the new helper... I understand this
>> > is subjective, so I won't insist. But how about 1/1? We do not need
>> > fd/file at all. With this patch your sys_getvpid() can just use
>> > proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
>> >
>> > Eric, what do you think?
>>
>> At some level I don't care this is not exposed to userspace.
>
> I agree, this is minor. But you know, the kernel is already complicated
> too much, we should try to simplify/cleanup everything we can ;)
>
>> Of the existing uses several of them sleep, which unfortunately means we
>> can not use rcu locking for everything. The network namespace ones wind
>> up taking a reference to struct net because the have the legacy pid case
>> to deal with. Which makes we can not use fdget for all callers either.
>
> And that is why 1/1 adds proc_get_ns_by_fd/get_type which can also be used
> by the network namespace.
>
>> For this translate_pid rcu locking is sufficient, rcu locking is easy
>> and doing any more than rcu locking just seems silly. So let me
>> respectfully suggest.
>>
>> struct ns_common *ns_by_fd_rcu(int fd, int type)
>> {
>> struct files_struct *files = current->files;
>> struct file *file;
>> struct ns_common *ns;
>> void *ret;
>>
>> file = fcheck_files(files, fd);
>> if (!file)
>> return ERR_PTR(-EBADF);
>>
>> if (file->f_mode & FMODE_PATH)
>> return ERR_PTR(-EINVAL);
>>
>> if (file->f_op != &ns_file_operations)
>> return ERR_PTR(-EINVAL);
>>
>> ns = get_proc_ns(file_inode(file));
>> if (ns->ops->type != type)
>> return ERR_PTR(-EINVAL);
>>
>> return ns;
>> }
>
> OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref).
>
> And in any case fcheck_files() makes more sense than fdget(), somehow I did
> not think about this when I sent 1/1.
>
> Hmm. and after the quick look at cleanup_net() I can't understand whether
> get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or
> not... Can it?
Some of those places need a reference that allows them to sleep, and the
code is shared with the legacy pid case so with an addition of get_net
we can use ns_by_fd_rcu(). There are cases like setns that could
use ns_by_fd_rcu() with code reording.
We can implement get_net_ns_by_fd as:
struct net *get_net_ns_by_fd(int fd)
{
struct net *net;
rcu_read_lock();
net = net_ns_by_fd_rcu(fd);
if (!IS_ERR(net))
get_net(net);
rcu_read_unlock();
return net;
}
Which means we can achieve code sharing with the pure rcu version
as a base.
If the networking code did not have the legacy pid case to handle I
would want to do something with struct fd, as the file descriptor
already keeps the struct net reference alive and struct fd allows
for sleeping.
Eric
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
[not found] ` <874mid16bk.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
@ 2015-09-29 18:38 ` Oleg Nesterov
0 siblings, 0 replies; 37+ messages in thread
From: Oleg Nesterov @ 2015-09-29 18:38 UTC (permalink / raw)
To: Eric W. Biederman
Cc: Roman Gushchin, Konstantin Khlebnikov,
linux-api-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chen Fan,
Andrew Morton, Linus Torvalds
On 09/29, Eric W. Biederman wrote:
>
> Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:
>
> > OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref).
> >
> > And in any case fcheck_files() makes more sense than fdget(), somehow I did
> > not think about this when I sent 1/1.
> >
> > Hmm. and after the quick look at cleanup_net() I can't understand whether
> > get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or
> > not... Can it?
>
> Some of those places need a reference that allows them to sleep, and the
> code is shared with the legacy pid case so with an addition of get_net
> we can use ns_by_fd_rcu(). There are cases like setns that could
> use ns_by_fd_rcu() with code reording.
>
> We can implement get_net_ns_by_fd as:
> struct net *get_net_ns_by_fd(int fd)
> {
> struct net *net;
>
> rcu_read_lock();
> net = net_ns_by_fd_rcu(fd);
> if (!IS_ERR(net))
> get_net(net);
> rcu_read_unlock();
>
> return net;
> }
>
> Which means we can achieve code sharing with the pure rcu version
> as a base.
Yes, this is what I meant... but don't we need maybe_get_net() ?
Oleg.
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
[not found] ` <874mid16bk.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
@ 2015-09-29 18:38 ` Oleg Nesterov
0 siblings, 0 replies; 37+ messages in thread
From: Oleg Nesterov @ 2015-09-29 18:38 UTC (permalink / raw)
To: Eric W. Biederman
Cc: Konstantin Khlebnikov, linux-api, containers, linux-kernel,
Roman Gushchin, Serge Hallyn, Chen Fan, Andrew Morton,
Linus Torvalds, Stéphane Graber
On 09/29, Eric W. Biederman wrote:
>
> Oleg Nesterov <oleg@redhat.com> writes:
>
> > OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref).
> >
> > And in any case fcheck_files() makes more sense than fdget(), somehow I did
> > not think about this when I sent 1/1.
> >
> > Hmm. and after the quick look at cleanup_net() I can't understand whether
> > get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or
> > not... Can it?
>
> Some of those places need a reference that allows them to sleep, and the
> code is shared with the legacy pid case so with an addition of get_net
> we can use ns_by_fd_rcu(). There are cases like setns that could
> use ns_by_fd_rcu() with code reording.
>
> We can implement get_net_ns_by_fd as:
> struct net *get_net_ns_by_fd(int fd)
> {
> struct net *net;
>
> rcu_read_lock();
> net = net_ns_by_fd_rcu(fd);
> if (!IS_ERR(net))
> get_net(net);
> rcu_read_unlock();
>
> return net;
> }
>
> Which means we can achieve code sharing with the pure rcu version
> as a base.
Yes, this is what I meant... but don't we need maybe_get_net() ?
Oleg.
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
@ 2015-09-29 18:38 ` Oleg Nesterov
0 siblings, 0 replies; 37+ messages in thread
From: Oleg Nesterov @ 2015-09-29 18:38 UTC (permalink / raw)
To: Eric W. Biederman
Cc: Konstantin Khlebnikov, linux-api-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA, Roman Gushchin,
Serge Hallyn, Chen Fan, Andrew Morton, Linus Torvalds,
Stéphane Graber
On 09/29, Eric W. Biederman wrote:
>
> Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:
>
> > OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref).
> >
> > And in any case fcheck_files() makes more sense than fdget(), somehow I did
> > not think about this when I sent 1/1.
> >
> > Hmm. and after the quick look at cleanup_net() I can't understand whether
> > get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or
> > not... Can it?
>
> Some of those places need a reference that allows them to sleep, and the
> code is shared with the legacy pid case so with an addition of get_net
> we can use ns_by_fd_rcu(). There are cases like setns that could
> use ns_by_fd_rcu() with code reording.
>
> We can implement get_net_ns_by_fd as:
> struct net *get_net_ns_by_fd(int fd)
> {
> struct net *net;
>
> rcu_read_lock();
> net = net_ns_by_fd_rcu(fd);
> if (!IS_ERR(net))
> get_net(net);
> rcu_read_unlock();
>
> return net;
> }
>
> Which means we can achieve code sharing with the pure rcu version
> as a base.
Yes, this is what I meant... but don't we need maybe_get_net() ?
Oleg.
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
2015-09-29 18:38 ` Oleg Nesterov
@ 2015-09-29 19:05 ` Eric W. Biederman
-1 siblings, 0 replies; 37+ messages in thread
From: Eric W. Biederman @ 2015-09-29 19:05 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Roman Gushchin, Konstantin Khlebnikov,
linux-api-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chen Fan,
Andrew Morton, Linus Torvalds
Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:
> On 09/29, Eric W. Biederman wrote:
>>
>> Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:
>>
>> > OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref).
>> >
>> > And in any case fcheck_files() makes more sense than fdget(), somehow I did
>> > not think about this when I sent 1/1.
>> >
>> > Hmm. and after the quick look at cleanup_net() I can't understand whether
>> > get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or
>> > not... Can it?
>>
>> Some of those places need a reference that allows them to sleep, and the
>> code is shared with the legacy pid case so with an addition of get_net
>> we can use ns_by_fd_rcu(). There are cases like setns that could
>> use ns_by_fd_rcu() with code reording.
>>
>> We can implement get_net_ns_by_fd as:
>> struct net *get_net_ns_by_fd(int fd)
>> {
>> struct net *net;
>>
>> rcu_read_lock();
>> net = net_ns_by_fd_rcu(fd);
>> if (!IS_ERR(net))
>> get_net(net);
>> rcu_read_unlock();
>>
>> return net;
>> }
>>
>> Which means we can achieve code sharing with the pure rcu version
>> as a base.
>
> Yes, this is what I meant... but don't we need maybe_get_net() ?
Let me see.
The call chain is a little interesting so let's trace it.
thread1 thread2
net = net_ns_by_fd_rcu(fd); fput(file)
get_net(net); ____fput
__fput
dput
dentry_kill
__dentry_kill
dentry_iput
iput
iput_final
evict
->evict_inode (nsfs_evict)
netns_put
put_net
__put_net
queue_work(net_cleanup_work)
...
cleanup_net
synchronize_rcu.
Given that someone in another thread can put the file descriptor and I
don't see anything explicitly waiting until after the rcu critical
section ends to put the network namespace I guess we do need
maybe_get_net.
In practice I think the delay logic in fput will act as an rcu barrier,
but that is not guaranteed so we had better not count on it. Sigh.
Anyway this is a long ways from a syscall to translate pids.
Eric
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
@ 2015-09-29 19:05 ` Eric W. Biederman
0 siblings, 0 replies; 37+ messages in thread
From: Eric W. Biederman @ 2015-09-29 19:05 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Konstantin Khlebnikov, linux-api, containers, linux-kernel,
Roman Gushchin, Serge Hallyn, Chen Fan, Andrew Morton,
Linus Torvalds, Stéphane Graber
Oleg Nesterov <oleg@redhat.com> writes:
> On 09/29, Eric W. Biederman wrote:
>>
>> Oleg Nesterov <oleg@redhat.com> writes:
>>
>> > OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref).
>> >
>> > And in any case fcheck_files() makes more sense than fdget(), somehow I did
>> > not think about this when I sent 1/1.
>> >
>> > Hmm. and after the quick look at cleanup_net() I can't understand whether
>> > get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or
>> > not... Can it?
>>
>> Some of those places need a reference that allows them to sleep, and the
>> code is shared with the legacy pid case so with an addition of get_net
>> we can use ns_by_fd_rcu(). There are cases like setns that could
>> use ns_by_fd_rcu() with code reording.
>>
>> We can implement get_net_ns_by_fd as:
>> struct net *get_net_ns_by_fd(int fd)
>> {
>> struct net *net;
>>
>> rcu_read_lock();
>> net = net_ns_by_fd_rcu(fd);
>> if (!IS_ERR(net))
>> get_net(net);
>> rcu_read_unlock();
>>
>> return net;
>> }
>>
>> Which means we can achieve code sharing with the pure rcu version
>> as a base.
>
> Yes, this is what I meant... but don't we need maybe_get_net() ?
Let me see.
The call chain is a little interesting so let's trace it.
thread1 thread2
net = net_ns_by_fd_rcu(fd); fput(file)
get_net(net); ____fput
__fput
dput
dentry_kill
__dentry_kill
dentry_iput
iput
iput_final
evict
->evict_inode (nsfs_evict)
netns_put
put_net
__put_net
queue_work(net_cleanup_work)
...
cleanup_net
synchronize_rcu.
Given that someone in another thread can put the file descriptor and I
don't see anything explicitly waiting until after the rcu critical
section ends to put the network namespace I guess we do need
maybe_get_net.
In practice I think the delay logic in fput will act as an rcu barrier,
but that is not guaranteed so we had better not count on it. Sigh.
Anyway this is a long ways from a syscall to translate pids.
Eric
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
[not found] ` <871tdi8pqj.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
2015-09-29 16:43 ` Oleg Nesterov
@ 2015-09-30 2:54 ` Chen Fan
1 sibling, 0 replies; 37+ messages in thread
From: Chen Fan @ 2015-09-30 2:54 UTC (permalink / raw)
To: Eric W. Biederman, Oleg Nesterov
Cc: Roman Gushchin, Konstantin Khlebnikov,
linux-api-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Andrew Morton,
Linus Torvalds
On 09/29/2015 12:37 AM, Eric W. Biederman wrote:
> Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:
>
>> On 09/25, Konstantin Khlebnikov wrote:
>>> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref)
>>> {
>>> - struct file *file;
>>> + struct ns_common *ns;
>>> + struct fd f;
>>>
>>> - file = fget(fd);
>>> - if (!file)
>>> + f = fdget(fd);
>>> + if (!f.file)
>>> return ERR_PTR(-EBADF);
>>>
>>> - if (file->f_op != &ns_file_operations)
>>> + if (f.file->f_op != &ns_file_operations)
>>> + goto out_invalid;
>>> +
>>> + ns = get_proc_ns(file_inode(f.file));
>>> + if (nstype && (ns->ops->type != nstype))
>>> goto out_invalid;
>>>
>>> - return file;
>>> + *fd_ref = f;
>>> + return ns;
>>>
>>> out_invalid:
>>> - fput(file);
>>> + fdput(f);
>>> return ERR_PTR(-EINVAL);
>>> }
>> Well yes, fdget() makes sense but this is minor.
>>
>> Honestly, I do not really like the new helper... I understand this
>> is subjective, so I won't insist. But how about 1/1? We do not need
>> fd/file at all. With this patch your sys_getvpid() can just use
>> proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
>>
>> Eric, what do you think?
> At some level I don't care this is not exposed to userspace.
>
> However since we are going several rounds with this.
>
> Of the existing uses several of them sleep, which unfortunately means we
> can not use rcu locking for everything. The network namespace ones wind
> up taking a reference to struct net because the have the legacy pid case
> to deal with. Which makes we can not use fdget for all callers either.
>
> For this translate_pid rcu locking is sufficient, rcu locking is easy
> and doing any more than rcu locking just seems silly. So let me
> respectfully suggest.
>
> struct ns_common *ns_by_fd_rcu(int fd, int type)
> {
> struct files_struct *files = current->files;
> struct file *file;
> struct ns_common *ns;
> void *ret;
pointer files seems no more useful. we can
use fcheck(fd) instead.
Thanks,
>
> file = fcheck_files(files, fd);
> if (!file)
> return ERR_PTR(-EBADF);
>
> if (file->f_mode & FMODE_PATH)
> return ERR_PTR(-EINVAL);
>
> if (file->f_op != &ns_file_operations)
> return ERR_PTR(-EINVAL);
>
> ns = get_proc_ns(file_inode(file));
> if (ns->ops->type != type)
> return ERR_PTR(-EINVAL);
>
> return ns;
> }
>
> struct pid_namespace *pidns_by_fd_rcu(int fd)
> {
> struct ns_common *ns = ns_by_fd_rcu(fd, CLONE_NEWPID);
> if (IS_ERR(ns))
> return ERR_CAST(ns);
> return container_of(ns, struct pid_namespace, ns);
> }
>
> SYSCALL_DEFINE3(translate_pid, pid_t, pid_nr, int, sourcefd, int, targetfd)
> {
> struct pid_namespace *source, *target;
> struct pid *pid;
> pid_t result;
>
> rcu_read_lock();
>
> if (sourcefd >= 0)
> source = pidns_by_fd_rcu(sourcefd);
> else
> source = task_active_pid_ns(current);
>
> if (targetfd >= 0)
> target = pidns_by_fd_rcu(targetfd);
> else
> target = task_active_pid_ns(current);
>
> pid = find_pid_ns(pid_nr, source);
> result = pid_nr_ns(pid, target);
> if (result == 0)
> result = -ESRCH;
>
> rcu_read_unlock();
>
> return result;
> }
>
> Eric
> .
>
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
[not found] ` <871tdi8pqj.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
@ 2015-09-30 2:54 ` Chen Fan
2015-09-30 2:54 ` Chen Fan
1 sibling, 0 replies; 37+ messages in thread
From: Chen Fan @ 2015-09-30 2:54 UTC (permalink / raw)
To: Eric W. Biederman, Oleg Nesterov
Cc: Konstantin Khlebnikov, linux-api, containers, linux-kernel,
Roman Gushchin, Serge Hallyn, Andrew Morton, Linus Torvalds,
Stéphane Graber
On 09/29/2015 12:37 AM, Eric W. Biederman wrote:
> Oleg Nesterov <oleg@redhat.com> writes:
>
>> On 09/25, Konstantin Khlebnikov wrote:
>>> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref)
>>> {
>>> - struct file *file;
>>> + struct ns_common *ns;
>>> + struct fd f;
>>>
>>> - file = fget(fd);
>>> - if (!file)
>>> + f = fdget(fd);
>>> + if (!f.file)
>>> return ERR_PTR(-EBADF);
>>>
>>> - if (file->f_op != &ns_file_operations)
>>> + if (f.file->f_op != &ns_file_operations)
>>> + goto out_invalid;
>>> +
>>> + ns = get_proc_ns(file_inode(f.file));
>>> + if (nstype && (ns->ops->type != nstype))
>>> goto out_invalid;
>>>
>>> - return file;
>>> + *fd_ref = f;
>>> + return ns;
>>>
>>> out_invalid:
>>> - fput(file);
>>> + fdput(f);
>>> return ERR_PTR(-EINVAL);
>>> }
>> Well yes, fdget() makes sense but this is minor.
>>
>> Honestly, I do not really like the new helper... I understand this
>> is subjective, so I won't insist. But how about 1/1? We do not need
>> fd/file at all. With this patch your sys_getvpid() can just use
>> proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
>>
>> Eric, what do you think?
> At some level I don't care this is not exposed to userspace.
>
> However since we are going several rounds with this.
>
> Of the existing uses several of them sleep, which unfortunately means we
> can not use rcu locking for everything. The network namespace ones wind
> up taking a reference to struct net because the have the legacy pid case
> to deal with. Which makes we can not use fdget for all callers either.
>
> For this translate_pid rcu locking is sufficient, rcu locking is easy
> and doing any more than rcu locking just seems silly. So let me
> respectfully suggest.
>
> struct ns_common *ns_by_fd_rcu(int fd, int type)
> {
> struct files_struct *files = current->files;
> struct file *file;
> struct ns_common *ns;
> void *ret;
pointer files seems no more useful. we can
use fcheck(fd) instead.
Thanks,
>
> file = fcheck_files(files, fd);
> if (!file)
> return ERR_PTR(-EBADF);
>
> if (file->f_mode & FMODE_PATH)
> return ERR_PTR(-EINVAL);
>
> if (file->f_op != &ns_file_operations)
> return ERR_PTR(-EINVAL);
>
> ns = get_proc_ns(file_inode(file));
> if (ns->ops->type != type)
> return ERR_PTR(-EINVAL);
>
> return ns;
> }
>
> struct pid_namespace *pidns_by_fd_rcu(int fd)
> {
> struct ns_common *ns = ns_by_fd_rcu(fd, CLONE_NEWPID);
> if (IS_ERR(ns))
> return ERR_CAST(ns);
> return container_of(ns, struct pid_namespace, ns);
> }
>
> SYSCALL_DEFINE3(translate_pid, pid_t, pid_nr, int, sourcefd, int, targetfd)
> {
> struct pid_namespace *source, *target;
> struct pid *pid;
> pid_t result;
>
> rcu_read_lock();
>
> if (sourcefd >= 0)
> source = pidns_by_fd_rcu(sourcefd);
> else
> source = task_active_pid_ns(current);
>
> if (targetfd >= 0)
> target = pidns_by_fd_rcu(targetfd);
> else
> target = task_active_pid_ns(current);
>
> pid = find_pid_ns(pid_nr, source);
> result = pid_nr_ns(pid, target);
> if (result == 0)
> result = -ESRCH;
>
> rcu_read_unlock();
>
> return result;
> }
>
> Eric
> .
>
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
@ 2015-09-30 2:54 ` Chen Fan
0 siblings, 0 replies; 37+ messages in thread
From: Chen Fan @ 2015-09-30 2:54 UTC (permalink / raw)
To: Eric W. Biederman, Oleg Nesterov
Cc: Konstantin Khlebnikov, linux-api-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA, Roman Gushchin,
Serge Hallyn, Andrew Morton, Linus Torvalds,
Stéphane Graber
On 09/29/2015 12:37 AM, Eric W. Biederman wrote:
> Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:
>
>> On 09/25, Konstantin Khlebnikov wrote:
>>> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref)
>>> {
>>> - struct file *file;
>>> + struct ns_common *ns;
>>> + struct fd f;
>>>
>>> - file = fget(fd);
>>> - if (!file)
>>> + f = fdget(fd);
>>> + if (!f.file)
>>> return ERR_PTR(-EBADF);
>>>
>>> - if (file->f_op != &ns_file_operations)
>>> + if (f.file->f_op != &ns_file_operations)
>>> + goto out_invalid;
>>> +
>>> + ns = get_proc_ns(file_inode(f.file));
>>> + if (nstype && (ns->ops->type != nstype))
>>> goto out_invalid;
>>>
>>> - return file;
>>> + *fd_ref = f;
>>> + return ns;
>>>
>>> out_invalid:
>>> - fput(file);
>>> + fdput(f);
>>> return ERR_PTR(-EINVAL);
>>> }
>> Well yes, fdget() makes sense but this is minor.
>>
>> Honestly, I do not really like the new helper... I understand this
>> is subjective, so I won't insist. But how about 1/1? We do not need
>> fd/file at all. With this patch your sys_getvpid() can just use
>> proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
>>
>> Eric, what do you think?
> At some level I don't care this is not exposed to userspace.
>
> However since we are going several rounds with this.
>
> Of the existing uses several of them sleep, which unfortunately means we
> can not use rcu locking for everything. The network namespace ones wind
> up taking a reference to struct net because the have the legacy pid case
> to deal with. Which makes we can not use fdget for all callers either.
>
> For this translate_pid rcu locking is sufficient, rcu locking is easy
> and doing any more than rcu locking just seems silly. So let me
> respectfully suggest.
>
> struct ns_common *ns_by_fd_rcu(int fd, int type)
> {
> struct files_struct *files = current->files;
> struct file *file;
> struct ns_common *ns;
> void *ret;
pointer files seems no more useful. we can
use fcheck(fd) instead.
Thanks,
>
> file = fcheck_files(files, fd);
> if (!file)
> return ERR_PTR(-EBADF);
>
> if (file->f_mode & FMODE_PATH)
> return ERR_PTR(-EINVAL);
>
> if (file->f_op != &ns_file_operations)
> return ERR_PTR(-EINVAL);
>
> ns = get_proc_ns(file_inode(file));
> if (ns->ops->type != type)
> return ERR_PTR(-EINVAL);
>
> return ns;
> }
>
> struct pid_namespace *pidns_by_fd_rcu(int fd)
> {
> struct ns_common *ns = ns_by_fd_rcu(fd, CLONE_NEWPID);
> if (IS_ERR(ns))
> return ERR_CAST(ns);
> return container_of(ns, struct pid_namespace, ns);
> }
>
> SYSCALL_DEFINE3(translate_pid, pid_t, pid_nr, int, sourcefd, int, targetfd)
> {
> struct pid_namespace *source, *target;
> struct pid *pid;
> pid_t result;
>
> rcu_read_lock();
>
> if (sourcefd >= 0)
> source = pidns_by_fd_rcu(sourcefd);
> else
> source = task_active_pid_ns(current);
>
> if (targetfd >= 0)
> target = pidns_by_fd_rcu(targetfd);
> else
> target = task_active_pid_ns(current);
>
> pid = find_pid_ns(pid_nr, source);
> result = pid_nr_ns(pid, target);
> if (result == 0)
> result = -ESRCH;
>
> rcu_read_unlock();
>
> return result;
> }
>
> Eric
> .
>
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid
[not found] ` <87d1x25vng.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
@ 2015-10-20 10:04 ` Konstantin Khlebnikov
0 siblings, 0 replies; 37+ messages in thread
From: Konstantin Khlebnikov @ 2015-10-20 10:04 UTC (permalink / raw)
To: Eric W. Biederman
Cc: Roman Gushchin, linux-api-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Oleg Nesterov,
Chen Fan, Andrew Morton, Linus Torvalds
On 28.09.2015 19:57, Eric W. Biederman wrote:
> Konstantin Khlebnikov <khlebnikov-XoJtRXgx1JseBXzfvpsJ4g@public.gmane.org> writes:
>
>> If pid is negative then getvpid() returns pid of parent task for -pid.
>
> Now that I am noticing this. I don't think I have seen any discussion
> about justifying a syscall getting another processes parent pid. My
> apologies if I just missed it.
>
Sorry for late response. This completely fell out of my mind after LinuxCon.
> Why do we want the the parent pid? We can we usefully do with it?
> Is proc really that bad of an interface?
>
> Fetching a parent pid feels like a separate logical operation
> from pid translation. Which makes me a bit uneasy about this
> part of the conversation.
Yep proc interface is bad. /proc/$pid/stat is almost impossible to
parse without flaws because task could set second field "comm" into
any string and fake ppid - for example ") Z 1". /proc/$pid/status
is better but it has more information and thus slower.
This trick for distant getppid looks cheap useful:
in this interface space of negative pids is free for use.
>
>> Examples:
>> getvpid(pid, ns, -1) - get pid in our pid namespace
>> getvpid(pid, -1, ns) - get pid in container
>> getvpid(pid, -1, ns) > 0 - is pid is reachable from container?
>> getvpid(1, ns1, ns2) > 0 - is ns1 inside ns2?
>> getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2?
>> getvpid(1, ns, -1) - get init task of pid-namespace
>> getvpid(-1, ns, -1) - get reaper of init task in parent pid-namespace
>> getvpid(-pid, -1, -1) - get ppid by pid
>
> As I step back and pay attention to this case I am half wondering if
> perhaps what would be most useful is a file descriptor that refers
> to a pid and an updated set of system calls that takes pid file
> descriptors instead of pids.
Fd which pins pids isn't a good idea.
I think it's better to refer (but not hold) task rather than pid.
For example inode of taskfd will hold small buffer for task exit
status: task holds reference to its own taskfd inode and populates
status when exits. Here will be no zombies and delayed reaping.
Something like:
task_fd = clonefd()
...
select(...)
exit(...)
pread(task_fd, &status_rusage_etc, sizeof, 0);
close(task_fd);
Task pid also could be part of structure in that fd. Potentially it
could provide the same information as /proc/$pid/... in effective
binary format: we can read only required fields of structure and
kernel can skip unneeded calculations.
>
> Something like:
>
> getpidfd(int pidnsfd, pid_t pid);
>
> waitfd(int pidfd, int *status, int options, struct rusage *rusage);
>
> killfd(int pidfd, int sig);
>
> clonefd(...);
>
> And perhaps:
> pid_nr_ns(int pidnsfd, int pidfd);
>
> parentfd(int pidfd);
>
> Eric
>
--
Konstantin
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid
[not found] ` <87d1x25vng.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
@ 2015-10-20 10:04 ` Konstantin Khlebnikov
0 siblings, 0 replies; 37+ messages in thread
From: Konstantin Khlebnikov @ 2015-10-20 10:04 UTC (permalink / raw)
To: Eric W. Biederman
Cc: linux-api, containers, linux-kernel, Roman Gushchin,
Serge Hallyn, Oleg Nesterov, Chen Fan, Andrew Morton,
Linus Torvalds, Stéphane Graber
On 28.09.2015 19:57, Eric W. Biederman wrote:
> Konstantin Khlebnikov <khlebnikov@yandex-team.ru> writes:
>
>> If pid is negative then getvpid() returns pid of parent task for -pid.
>
> Now that I am noticing this. I don't think I have seen any discussion
> about justifying a syscall getting another processes parent pid. My
> apologies if I just missed it.
>
Sorry for late response. This completely fell out of my mind after LinuxCon.
> Why do we want the the parent pid? We can we usefully do with it?
> Is proc really that bad of an interface?
>
> Fetching a parent pid feels like a separate logical operation
> from pid translation. Which makes me a bit uneasy about this
> part of the conversation.
Yep proc interface is bad. /proc/$pid/stat is almost impossible to
parse without flaws because task could set second field "comm" into
any string and fake ppid - for example ") Z 1". /proc/$pid/status
is better but it has more information and thus slower.
This trick for distant getppid looks cheap useful:
in this interface space of negative pids is free for use.
>
>> Examples:
>> getvpid(pid, ns, -1) - get pid in our pid namespace
>> getvpid(pid, -1, ns) - get pid in container
>> getvpid(pid, -1, ns) > 0 - is pid is reachable from container?
>> getvpid(1, ns1, ns2) > 0 - is ns1 inside ns2?
>> getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2?
>> getvpid(1, ns, -1) - get init task of pid-namespace
>> getvpid(-1, ns, -1) - get reaper of init task in parent pid-namespace
>> getvpid(-pid, -1, -1) - get ppid by pid
>
> As I step back and pay attention to this case I am half wondering if
> perhaps what would be most useful is a file descriptor that refers
> to a pid and an updated set of system calls that takes pid file
> descriptors instead of pids.
Fd which pins pids isn't a good idea.
I think it's better to refer (but not hold) task rather than pid.
For example inode of taskfd will hold small buffer for task exit
status: task holds reference to its own taskfd inode and populates
status when exits. Here will be no zombies and delayed reaping.
Something like:
task_fd = clonefd()
...
select(...)
exit(...)
pread(task_fd, &status_rusage_etc, sizeof, 0);
close(task_fd);
Task pid also could be part of structure in that fd. Potentially it
could provide the same information as /proc/$pid/... in effective
binary format: we can read only required fields of structure and
kernel can skip unneeded calculations.
>
> Something like:
>
> getpidfd(int pidnsfd, pid_t pid);
>
> waitfd(int pidfd, int *status, int options, struct rusage *rusage);
>
> killfd(int pidfd, int sig);
>
> clonefd(...);
>
> And perhaps:
> pid_nr_ns(int pidnsfd, int pidfd);
>
> parentfd(int pidfd);
>
> Eric
>
--
Konstantin
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid
@ 2015-10-20 10:04 ` Konstantin Khlebnikov
0 siblings, 0 replies; 37+ messages in thread
From: Konstantin Khlebnikov @ 2015-10-20 10:04 UTC (permalink / raw)
To: Eric W. Biederman
Cc: linux-api-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA, Roman Gushchin,
Serge Hallyn, Oleg Nesterov, Chen Fan, Andrew Morton,
Linus Torvalds, Stéphane Graber
On 28.09.2015 19:57, Eric W. Biederman wrote:
> Konstantin Khlebnikov <khlebnikov-XoJtRXgx1JseBXzfvpsJ4g@public.gmane.org> writes:
>
>> If pid is negative then getvpid() returns pid of parent task for -pid.
>
> Now that I am noticing this. I don't think I have seen any discussion
> about justifying a syscall getting another processes parent pid. My
> apologies if I just missed it.
>
Sorry for late response. This completely fell out of my mind after LinuxCon.
> Why do we want the the parent pid? We can we usefully do with it?
> Is proc really that bad of an interface?
>
> Fetching a parent pid feels like a separate logical operation
> from pid translation. Which makes me a bit uneasy about this
> part of the conversation.
Yep proc interface is bad. /proc/$pid/stat is almost impossible to
parse without flaws because task could set second field "comm" into
any string and fake ppid - for example ") Z 1". /proc/$pid/status
is better but it has more information and thus slower.
This trick for distant getppid looks cheap useful:
in this interface space of negative pids is free for use.
>
>> Examples:
>> getvpid(pid, ns, -1) - get pid in our pid namespace
>> getvpid(pid, -1, ns) - get pid in container
>> getvpid(pid, -1, ns) > 0 - is pid is reachable from container?
>> getvpid(1, ns1, ns2) > 0 - is ns1 inside ns2?
>> getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2?
>> getvpid(1, ns, -1) - get init task of pid-namespace
>> getvpid(-1, ns, -1) - get reaper of init task in parent pid-namespace
>> getvpid(-pid, -1, -1) - get ppid by pid
>
> As I step back and pay attention to this case I am half wondering if
> perhaps what would be most useful is a file descriptor that refers
> to a pid and an updated set of system calls that takes pid file
> descriptors instead of pids.
Fd which pins pids isn't a good idea.
I think it's better to refer (but not hold) task rather than pid.
For example inode of taskfd will hold small buffer for task exit
status: task holds reference to its own taskfd inode and populates
status when exits. Here will be no zombies and delayed reaping.
Something like:
task_fd = clonefd()
...
select(...)
exit(...)
pread(task_fd, &status_rusage_etc, sizeof, 0);
close(task_fd);
Task pid also could be part of structure in that fd. Potentially it
could provide the same information as /proc/$pid/... in effective
binary format: we can read only required fields of structure and
kernel can skip unneeded calculations.
>
> Something like:
>
> getpidfd(int pidnsfd, pid_t pid);
>
> waitfd(int pidfd, int *status, int options, struct rusage *rusage);
>
> killfd(int pidfd, int sig);
>
> clonefd(...);
>
> And perhaps:
> pid_nr_ns(int pidnsfd, int pidfd);
>
> parentfd(int pidfd);
>
> Eric
>
--
Konstantin
^ permalink raw reply [flat|nested] 37+ messages in thread
end of thread, other threads:[~2015-10-20 10:04 UTC | newest]
Thread overview: 37+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-09-25 13:52 [PATCH RFC v3 1/2] nsfs: replace proc_ns_fget() with proc_ns_fdget() Konstantin Khlebnikov
2015-09-25 13:52 ` Konstantin Khlebnikov
2015-09-25 13:52 ` [PATCH RFC v3 2/2] pidns: introduce syscall getvpid Konstantin Khlebnikov
2015-09-25 13:52 ` Konstantin Khlebnikov
2015-09-28 4:12 ` kbuild test robot
2015-09-28 4:12 ` kbuild test robot
2015-09-28 16:22 ` Eric W. Biederman
2015-09-28 16:22 ` Eric W. Biederman
2015-09-28 16:57 ` Eric W. Biederman
2015-09-28 16:57 ` Eric W. Biederman
[not found] ` <87d1x25vng.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
2015-10-20 10:04 ` Konstantin Khlebnikov
2015-10-20 10:04 ` Konstantin Khlebnikov
2015-10-20 10:04 ` Konstantin Khlebnikov
2015-09-25 17:56 ` [PATCH 0/1] ns: introduce proc_get_ns_by_fd() Oleg Nesterov
2015-09-25 17:56 ` Oleg Nesterov
[not found] ` <20150925175654.GA12504-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2015-09-25 17:57 ` [PATCH 1/1] " Oleg Nesterov
2015-09-25 17:57 ` Oleg Nesterov
2015-09-28 8:21 ` [PATCH 0/1] " Konstantin Khlebnikov
2015-09-28 16:37 ` Eric W. Biederman
2015-09-28 8:21 ` Konstantin Khlebnikov
2015-09-28 8:21 ` Konstantin Khlebnikov
2015-09-28 16:37 ` Eric W. Biederman
2015-09-28 16:37 ` Eric W. Biederman
2015-09-29 16:43 ` Oleg Nesterov
2015-09-29 16:43 ` Oleg Nesterov
[not found] ` <20150929164315.GA16734-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2015-09-29 17:30 ` Eric W. Biederman
2015-09-29 17:30 ` Eric W. Biederman
2015-09-29 17:30 ` Eric W. Biederman
2015-09-29 18:38 ` Oleg Nesterov
2015-09-29 18:38 ` Oleg Nesterov
[not found] ` <20150929183833.GA21875-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2015-09-29 19:05 ` Eric W. Biederman
2015-09-29 19:05 ` Eric W. Biederman
[not found] ` <874mid16bk.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
2015-09-29 18:38 ` Oleg Nesterov
[not found] ` <871tdi8pqj.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
2015-09-29 16:43 ` Oleg Nesterov
2015-09-30 2:54 ` Chen Fan
2015-09-30 2:54 ` Chen Fan
2015-09-30 2:54 ` Chen Fan
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.