All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH RFC v3 1/2] nsfs: replace proc_ns_fget() with proc_ns_fdget()
@ 2015-09-25 13:52 ` Konstantin Khlebnikov
  0 siblings, 0 replies; 37+ messages in thread
From: Konstantin Khlebnikov @ 2015-09-25 13:52 UTC (permalink / raw)
  To: linux-api, containers, linux-kernel
  Cc: Roman Gushchin, Serge Hallyn, Oleg Nesterov, Eric W. Biederman,
	Chen Fan, Andrew Morton, Linus Torvalds, Stéphane Graber

New helper function returns pointer to struct ns_common, can check
namespace type and uses struct fd for returning file reference:
this saves couple atomic operations for single-threaded applications.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
---
 fs/nsfs.c                |   20 +++++++++++++-------
 include/linux/proc_ns.h  |    4 +++-
 kernel/nsproxy.c         |   15 +++++----------
 net/core/net_namespace.c |   18 ++++++------------
 4 files changed, 27 insertions(+), 30 deletions(-)

diff --git a/fs/nsfs.c b/fs/nsfs.c
index 8f20d6016e20..a2e803c97960 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -119,21 +119,27 @@ int ns_get_name(char *buf, size_t size, struct task_struct *task,
 	return res;
 }
 
-struct file *proc_ns_fget(int fd)
+struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref)
 {
-	struct file *file;
+	struct ns_common *ns;
+	struct fd f;
 
-	file = fget(fd);
-	if (!file)
+	f = fdget(fd);
+	if (!f.file)
 		return ERR_PTR(-EBADF);
 
-	if (file->f_op != &ns_file_operations)
+	if (f.file->f_op != &ns_file_operations)
+		goto out_invalid;
+
+	ns = get_proc_ns(file_inode(f.file));
+	if (nstype && (ns->ops->type != nstype))
 		goto out_invalid;
 
-	return file;
+	*fd_ref = f;
+	return ns;
 
 out_invalid:
-	fput(file);
+	fdput(f);
 	return ERR_PTR(-EINVAL);
 }
 
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 42dfc615dbf8..406ea4617606 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -9,6 +9,7 @@
 struct pid_namespace;
 struct nsproxy;
 struct path;
+struct fd;
 
 struct proc_ns_operations {
 	const char *name;
@@ -65,7 +66,8 @@ static inline int ns_alloc_inum(struct ns_common *ns)
 
 #define ns_free_inum(ns) proc_free_inum((ns)->inum)
 
-extern struct file *proc_ns_fget(int fd);
+extern struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref);
+
 #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private)
 extern void *ns_get_path(struct path *path, struct task_struct *task,
 			const struct proc_ns_operations *ns_ops);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 49746c81ad8d..689b929a0fcb 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -222,18 +222,13 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
 {
 	struct task_struct *tsk = current;
 	struct nsproxy *new_nsproxy;
-	struct file *file;
+	struct fd fd_ref;
 	struct ns_common *ns;
 	int err;
 
-	file = proc_ns_fget(fd);
-	if (IS_ERR(file))
-		return PTR_ERR(file);
-
-	err = -EINVAL;
-	ns = get_proc_ns(file_inode(file));
-	if (nstype && (ns->ops->type != nstype))
-		goto out;
+	ns = proc_ns_fdget(fd, nstype, &fd_ref);
+	if (IS_ERR(ns))
+		return PTR_ERR(ns);
 
 	new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
 	if (IS_ERR(new_nsproxy)) {
@@ -248,7 +243,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
 	}
 	switch_task_namespaces(tsk, new_nsproxy);
 out:
-	fput(file);
+	fdput(fd_ref);
 	return err;
 }
 
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 2c2eb1b629b1..ffc5e6df86dc 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -465,21 +465,15 @@ EXPORT_SYMBOL_GPL(__put_net);
 
 struct net *get_net_ns_by_fd(int fd)
 {
-	struct file *file;
 	struct ns_common *ns;
+	struct fd fd_ref;
 	struct net *net;
 
-	file = proc_ns_fget(fd);
-	if (IS_ERR(file))
-		return ERR_CAST(file);
-
-	ns = get_proc_ns(file_inode(file));
-	if (ns->ops == &netns_operations)
-		net = get_net(container_of(ns, struct net, ns));
-	else
-		net = ERR_PTR(-EINVAL);
-
-	fput(file);
+	ns = proc_ns_fdget(fd, CLONE_NEWNET, &fd_ref);
+	if (IS_ERR(ns))
+		return ERR_CAST(ns);
+	net = get_net(container_of(ns, struct net, ns));
+	fdput(fd_ref);
 	return net;
 }
 


^ permalink raw reply related	[flat|nested] 37+ messages in thread

* [PATCH RFC v3 1/2] nsfs: replace proc_ns_fget() with proc_ns_fdget()
@ 2015-09-25 13:52 ` Konstantin Khlebnikov
  0 siblings, 0 replies; 37+ messages in thread
From: Konstantin Khlebnikov @ 2015-09-25 13:52 UTC (permalink / raw)
  To: linux-api-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
  Cc: Roman Gushchin, Serge Hallyn, Oleg Nesterov, Eric W. Biederman,
	Chen Fan, Andrew Morton, Linus Torvalds, Stéphane Graber

New helper function returns pointer to struct ns_common, can check
namespace type and uses struct fd for returning file reference:
this saves couple atomic operations for single-threaded applications.

Signed-off-by: Konstantin Khlebnikov <khlebnikov-XoJtRXgx1JseBXzfvpsJ4g@public.gmane.org>
---
 fs/nsfs.c                |   20 +++++++++++++-------
 include/linux/proc_ns.h  |    4 +++-
 kernel/nsproxy.c         |   15 +++++----------
 net/core/net_namespace.c |   18 ++++++------------
 4 files changed, 27 insertions(+), 30 deletions(-)

diff --git a/fs/nsfs.c b/fs/nsfs.c
index 8f20d6016e20..a2e803c97960 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -119,21 +119,27 @@ int ns_get_name(char *buf, size_t size, struct task_struct *task,
 	return res;
 }
 
-struct file *proc_ns_fget(int fd)
+struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref)
 {
-	struct file *file;
+	struct ns_common *ns;
+	struct fd f;
 
-	file = fget(fd);
-	if (!file)
+	f = fdget(fd);
+	if (!f.file)
 		return ERR_PTR(-EBADF);
 
-	if (file->f_op != &ns_file_operations)
+	if (f.file->f_op != &ns_file_operations)
+		goto out_invalid;
+
+	ns = get_proc_ns(file_inode(f.file));
+	if (nstype && (ns->ops->type != nstype))
 		goto out_invalid;
 
-	return file;
+	*fd_ref = f;
+	return ns;
 
 out_invalid:
-	fput(file);
+	fdput(f);
 	return ERR_PTR(-EINVAL);
 }
 
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 42dfc615dbf8..406ea4617606 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -9,6 +9,7 @@
 struct pid_namespace;
 struct nsproxy;
 struct path;
+struct fd;
 
 struct proc_ns_operations {
 	const char *name;
@@ -65,7 +66,8 @@ static inline int ns_alloc_inum(struct ns_common *ns)
 
 #define ns_free_inum(ns) proc_free_inum((ns)->inum)
 
-extern struct file *proc_ns_fget(int fd);
+extern struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref);
+
 #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private)
 extern void *ns_get_path(struct path *path, struct task_struct *task,
 			const struct proc_ns_operations *ns_ops);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 49746c81ad8d..689b929a0fcb 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -222,18 +222,13 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
 {
 	struct task_struct *tsk = current;
 	struct nsproxy *new_nsproxy;
-	struct file *file;
+	struct fd fd_ref;
 	struct ns_common *ns;
 	int err;
 
-	file = proc_ns_fget(fd);
-	if (IS_ERR(file))
-		return PTR_ERR(file);
-
-	err = -EINVAL;
-	ns = get_proc_ns(file_inode(file));
-	if (nstype && (ns->ops->type != nstype))
-		goto out;
+	ns = proc_ns_fdget(fd, nstype, &fd_ref);
+	if (IS_ERR(ns))
+		return PTR_ERR(ns);
 
 	new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
 	if (IS_ERR(new_nsproxy)) {
@@ -248,7 +243,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
 	}
 	switch_task_namespaces(tsk, new_nsproxy);
 out:
-	fput(file);
+	fdput(fd_ref);
 	return err;
 }
 
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 2c2eb1b629b1..ffc5e6df86dc 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -465,21 +465,15 @@ EXPORT_SYMBOL_GPL(__put_net);
 
 struct net *get_net_ns_by_fd(int fd)
 {
-	struct file *file;
 	struct ns_common *ns;
+	struct fd fd_ref;
 	struct net *net;
 
-	file = proc_ns_fget(fd);
-	if (IS_ERR(file))
-		return ERR_CAST(file);
-
-	ns = get_proc_ns(file_inode(file));
-	if (ns->ops == &netns_operations)
-		net = get_net(container_of(ns, struct net, ns));
-	else
-		net = ERR_PTR(-EINVAL);
-
-	fput(file);
+	ns = proc_ns_fdget(fd, CLONE_NEWNET, &fd_ref);
+	if (IS_ERR(ns))
+		return ERR_CAST(ns);
+	net = get_net(container_of(ns, struct net, ns));
+	fdput(fd_ref);
 	return net;
 }
 

^ permalink raw reply related	[flat|nested] 37+ messages in thread

* [PATCH RFC v3 2/2] pidns: introduce syscall getvpid
  2015-09-25 13:52 ` Konstantin Khlebnikov
@ 2015-09-25 13:52   ` Konstantin Khlebnikov
  -1 siblings, 0 replies; 37+ messages in thread
From: Konstantin Khlebnikov @ 2015-09-25 13:52 UTC (permalink / raw)
  To: linux-api-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
  Cc: Roman Gushchin, Serge Hallyn, Oleg Nesterov, Eric W. Biederman,
	Chen Fan, Andrew Morton, Linus Torvalds

pid_t getvpid(pid_t pid, int source, int target);

This syscall converts pid from source pid-namespace into pid visible
in target pid-namespace. If pid is unreachable from target namespace
then getvpid() returns zero.

Namespaces are defined by file descriptors pointing to entries in
proc (/proc/[pid]/ns/pid). If argument is negative then current pid
namespace is used.

If pid is negative then getvpid() returns pid of parent task for -pid.

Possible error codes:
ESRCH    - task not found
EBADF    - closed file descriptor
EINVAL   - not pid-namespace file descriptor

Such conversion is required for interaction between processes from
different pid-namespaces. For example system service at host system
who provide access to restricted set of privileged operations for
clients from containers have to convert pids back and forward.

Recent kernels expose virtual pids in /proc/[pid]/status:NSpid, but
this interface works only in one way and even that is non-trivial.

Other option is passing pids with credentials via unix socket, but
this solution requires a lot of preparation and CAP_SYS_ADMIN for
sending arbitrary pids.

This syscall works in both directions, it's fast and simple.

Examples:
getvpid(pid, ns, -1)      - get pid in our pid namespace
getvpid(pid, -1, ns)      - get pid in container
getvpid(pid, -1, ns) > 0  - is pid is reachable from container?
getvpid(1, ns1, ns2) > 0  - is ns1 inside ns2?
getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2?
getvpid(1, ns, -1)        - get init task of pid-namespace
getvpid(-1, ns, -1)       - get reaper of init task in parent pid-namespace
getvpid(-pid, -1, -1)     - get ppid by pid

Signed-off-by: Konstantin Khlebnikov <khlebnikov-XoJtRXgx1JseBXzfvpsJ4g@public.gmane.org>

--

v1: https://lkml.org/lkml/2015/9/15/411
v2: https://lkml.org/lkml/2015/9/24/278
v3:
 * use proc_ns_fdget()
 * update description
 * rebase to next-20150925
 * fix conflict with mlock2
---
 arch/x86/entry/syscalls/syscall_32.tbl |    1 +
 arch/x86/entry/syscalls/syscall_64.tbl |    1 +
 include/linux/syscalls.h               |    1 +
 include/uapi/asm-generic/unistd.h      |    4 ++-
 kernel/sys.c                           |   51 ++++++++++++++++++++++++++++++++
 5 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 143ef9f37932..c36c2c65d204 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -383,3 +383,4 @@
 374	i386	userfaultfd		sys_userfaultfd
 375	i386	membarrier		sys_membarrier
 376	i386	mlock2			sys_mlock2
+377	i386	getvpid			sys_getvpid
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 314a90bfc09c..90bbbc7fdbe0 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -332,6 +332,7 @@
 323	common	userfaultfd		sys_userfaultfd
 324	common	membarrier		sys_membarrier
 325	common	mlock2			sys_mlock2
+326	common	getvpid			sys_getvpid
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a156b82dd14c..dbb5638260b5 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -222,6 +222,7 @@ asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __us
 asmlinkage long sys_alarm(unsigned int seconds);
 asmlinkage long sys_getpid(void);
 asmlinkage long sys_getppid(void);
+asmlinkage long sys_getvpid(pid_t pid, int source, int target);
 asmlinkage long sys_getuid(void);
 asmlinkage long sys_geteuid(void);
 asmlinkage long sys_getgid(void);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 1324b0292ec2..2c1123130f90 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -715,9 +715,11 @@ __SYSCALL(__NR_userfaultfd, sys_userfaultfd)
 __SYSCALL(__NR_membarrier, sys_membarrier)
 #define __NR_mlock2 284
 __SYSCALL(__NR_mlock2, sys_mlock2)
+#define __NR_mlock2 285
+__SYSCALL(__NR_getvpid, sys_getvpid)
 
 #undef __NR_syscalls
-#define __NR_syscalls 285
+#define __NR_syscalls 286
 
 /*
  * All syscalls below here should go away really,
diff --git a/kernel/sys.c b/kernel/sys.c
index fa2f2f671a5c..1e28a36b84fa 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -46,6 +46,7 @@
 #include <linux/syscalls.h>
 #include <linux/kprobes.h>
 #include <linux/user_namespace.h>
+#include <linux/proc_ns.h>
 #include <linux/binfmts.h>
 
 #include <linux/sched.h>
@@ -855,6 +856,56 @@ SYSCALL_DEFINE0(getppid)
 	return pid;
 }
 
+SYSCALL_DEFINE3(getvpid, pid_t, pid, int, source, int, target)
+{
+	struct pid_namespace *source_ns, *target_ns;
+	struct fd source_fd = {}, target_fd = {};
+	struct pid *struct_pid;
+	struct ns_common *ns;
+	pid_t result;
+
+	if (source >= 0) {
+		ns = proc_ns_fdget(source, CLONE_NEWPID, &source_fd);
+		result = PTR_ERR(ns);
+		if (IS_ERR(ns))
+			goto out;
+		source_ns = container_of(ns, struct pid_namespace, ns);
+	} else
+		source_ns = task_active_pid_ns(current);
+
+	if (target >= 0) {
+		ns = proc_ns_fdget(target, CLONE_NEWPID, &target_fd);
+		result = PTR_ERR(ns);
+		if (IS_ERR(ns))
+			goto out;
+		target_ns = container_of(ns, struct pid_namespace, ns);
+	} else
+		target_ns = task_active_pid_ns(current);
+
+	rcu_read_lock();
+	struct_pid = find_pid_ns(abs(pid), source_ns);
+
+	if (struct_pid && pid < 0) {
+		struct task_struct *task;
+
+		task = pid_task(struct_pid, PIDTYPE_PID);
+		if (task)
+			task = rcu_dereference(task->real_parent);
+		struct_pid = task ? task_pid(task) : NULL;
+	}
+
+	if (struct_pid)
+		result = pid_nr_ns(struct_pid, target_ns);
+	else
+		result = -ESRCH;
+	rcu_read_unlock();
+
+out:
+	fdput(target_fd);
+	fdput(source_fd);
+	return result;
+}
+
 SYSCALL_DEFINE0(getuid)
 {
 	/* Only we change this so SMP safe */

^ permalink raw reply related	[flat|nested] 37+ messages in thread

* [PATCH RFC v3 2/2] pidns: introduce syscall getvpid
@ 2015-09-25 13:52   ` Konstantin Khlebnikov
  0 siblings, 0 replies; 37+ messages in thread
From: Konstantin Khlebnikov @ 2015-09-25 13:52 UTC (permalink / raw)
  To: linux-api, containers, linux-kernel
  Cc: Roman Gushchin, Serge Hallyn, Oleg Nesterov, Eric W. Biederman,
	Chen Fan, Andrew Morton, Linus Torvalds, Stéphane Graber

pid_t getvpid(pid_t pid, int source, int target);

This syscall converts pid from source pid-namespace into pid visible
in target pid-namespace. If pid is unreachable from target namespace
then getvpid() returns zero.

Namespaces are defined by file descriptors pointing to entries in
proc (/proc/[pid]/ns/pid). If argument is negative then current pid
namespace is used.

If pid is negative then getvpid() returns pid of parent task for -pid.

Possible error codes:
ESRCH    - task not found
EBADF    - closed file descriptor
EINVAL   - not pid-namespace file descriptor

Such conversion is required for interaction between processes from
different pid-namespaces. For example system service at host system
who provide access to restricted set of privileged operations for
clients from containers have to convert pids back and forward.

Recent kernels expose virtual pids in /proc/[pid]/status:NSpid, but
this interface works only in one way and even that is non-trivial.

Other option is passing pids with credentials via unix socket, but
this solution requires a lot of preparation and CAP_SYS_ADMIN for
sending arbitrary pids.

This syscall works in both directions, it's fast and simple.

Examples:
getvpid(pid, ns, -1)      - get pid in our pid namespace
getvpid(pid, -1, ns)      - get pid in container
getvpid(pid, -1, ns) > 0  - is pid is reachable from container?
getvpid(1, ns1, ns2) > 0  - is ns1 inside ns2?
getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2?
getvpid(1, ns, -1)        - get init task of pid-namespace
getvpid(-1, ns, -1)       - get reaper of init task in parent pid-namespace
getvpid(-pid, -1, -1)     - get ppid by pid

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>

--

v1: https://lkml.org/lkml/2015/9/15/411
v2: https://lkml.org/lkml/2015/9/24/278
v3:
 * use proc_ns_fdget()
 * update description
 * rebase to next-20150925
 * fix conflict with mlock2
---
 arch/x86/entry/syscalls/syscall_32.tbl |    1 +
 arch/x86/entry/syscalls/syscall_64.tbl |    1 +
 include/linux/syscalls.h               |    1 +
 include/uapi/asm-generic/unistd.h      |    4 ++-
 kernel/sys.c                           |   51 ++++++++++++++++++++++++++++++++
 5 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 143ef9f37932..c36c2c65d204 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -383,3 +383,4 @@
 374	i386	userfaultfd		sys_userfaultfd
 375	i386	membarrier		sys_membarrier
 376	i386	mlock2			sys_mlock2
+377	i386	getvpid			sys_getvpid
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 314a90bfc09c..90bbbc7fdbe0 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -332,6 +332,7 @@
 323	common	userfaultfd		sys_userfaultfd
 324	common	membarrier		sys_membarrier
 325	common	mlock2			sys_mlock2
+326	common	getvpid			sys_getvpid
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a156b82dd14c..dbb5638260b5 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -222,6 +222,7 @@ asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __us
 asmlinkage long sys_alarm(unsigned int seconds);
 asmlinkage long sys_getpid(void);
 asmlinkage long sys_getppid(void);
+asmlinkage long sys_getvpid(pid_t pid, int source, int target);
 asmlinkage long sys_getuid(void);
 asmlinkage long sys_geteuid(void);
 asmlinkage long sys_getgid(void);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 1324b0292ec2..2c1123130f90 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -715,9 +715,11 @@ __SYSCALL(__NR_userfaultfd, sys_userfaultfd)
 __SYSCALL(__NR_membarrier, sys_membarrier)
 #define __NR_mlock2 284
 __SYSCALL(__NR_mlock2, sys_mlock2)
+#define __NR_mlock2 285
+__SYSCALL(__NR_getvpid, sys_getvpid)
 
 #undef __NR_syscalls
-#define __NR_syscalls 285
+#define __NR_syscalls 286
 
 /*
  * All syscalls below here should go away really,
diff --git a/kernel/sys.c b/kernel/sys.c
index fa2f2f671a5c..1e28a36b84fa 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -46,6 +46,7 @@
 #include <linux/syscalls.h>
 #include <linux/kprobes.h>
 #include <linux/user_namespace.h>
+#include <linux/proc_ns.h>
 #include <linux/binfmts.h>
 
 #include <linux/sched.h>
@@ -855,6 +856,56 @@ SYSCALL_DEFINE0(getppid)
 	return pid;
 }
 
+SYSCALL_DEFINE3(getvpid, pid_t, pid, int, source, int, target)
+{
+	struct pid_namespace *source_ns, *target_ns;
+	struct fd source_fd = {}, target_fd = {};
+	struct pid *struct_pid;
+	struct ns_common *ns;
+	pid_t result;
+
+	if (source >= 0) {
+		ns = proc_ns_fdget(source, CLONE_NEWPID, &source_fd);
+		result = PTR_ERR(ns);
+		if (IS_ERR(ns))
+			goto out;
+		source_ns = container_of(ns, struct pid_namespace, ns);
+	} else
+		source_ns = task_active_pid_ns(current);
+
+	if (target >= 0) {
+		ns = proc_ns_fdget(target, CLONE_NEWPID, &target_fd);
+		result = PTR_ERR(ns);
+		if (IS_ERR(ns))
+			goto out;
+		target_ns = container_of(ns, struct pid_namespace, ns);
+	} else
+		target_ns = task_active_pid_ns(current);
+
+	rcu_read_lock();
+	struct_pid = find_pid_ns(abs(pid), source_ns);
+
+	if (struct_pid && pid < 0) {
+		struct task_struct *task;
+
+		task = pid_task(struct_pid, PIDTYPE_PID);
+		if (task)
+			task = rcu_dereference(task->real_parent);
+		struct_pid = task ? task_pid(task) : NULL;
+	}
+
+	if (struct_pid)
+		result = pid_nr_ns(struct_pid, target_ns);
+	else
+		result = -ESRCH;
+	rcu_read_unlock();
+
+out:
+	fdput(target_fd);
+	fdput(source_fd);
+	return result;
+}
+
 SYSCALL_DEFINE0(getuid)
 {
 	/* Only we change this so SMP safe */


^ permalink raw reply related	[flat|nested] 37+ messages in thread

* [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
  2015-09-25 13:52 ` Konstantin Khlebnikov
@ 2015-09-25 17:56   ` Oleg Nesterov
  -1 siblings, 0 replies; 37+ messages in thread
From: Oleg Nesterov @ 2015-09-25 17:56 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: Roman Gushchin, linux-api-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	Eric W. Biederman, Chen Fan, Andrew Morton, Linus Torvalds

On 09/25, Konstantin Khlebnikov wrote:
>
> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref)
>  {
> -	struct file *file;
> +	struct ns_common *ns;
> +	struct fd f;
>  
> -	file = fget(fd);
> -	if (!file)
> +	f = fdget(fd);
> +	if (!f.file)
>  		return ERR_PTR(-EBADF);
>  
> -	if (file->f_op != &ns_file_operations)
> +	if (f.file->f_op != &ns_file_operations)
> +		goto out_invalid;
> +
> +	ns = get_proc_ns(file_inode(f.file));
> +	if (nstype && (ns->ops->type != nstype))
>  		goto out_invalid;
>  
> -	return file;
> +	*fd_ref = f;
> +	return ns;
>  
>  out_invalid:
> -	fput(file);
> +	fdput(f);
>  	return ERR_PTR(-EINVAL);
>  }

Well yes, fdget() makes sense but this is minor.

Honestly, I do not really like the new helper... I understand this
is subjective, so I won't insist. But how about 1/1? We do not need
fd/file at all. With this patch your sys_getvpid() can just use
proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().

Eric, what do you think?

See also "TODO" in the changelog.

Oleg.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
@ 2015-09-25 17:56   ` Oleg Nesterov
  0 siblings, 0 replies; 37+ messages in thread
From: Oleg Nesterov @ 2015-09-25 17:56 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: linux-api, containers, linux-kernel, Roman Gushchin,
	Serge Hallyn, Eric W. Biederman, Chen Fan, Andrew Morton,
	Linus Torvalds, Stéphane Graber

On 09/25, Konstantin Khlebnikov wrote:
>
> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref)
>  {
> -	struct file *file;
> +	struct ns_common *ns;
> +	struct fd f;
>  
> -	file = fget(fd);
> -	if (!file)
> +	f = fdget(fd);
> +	if (!f.file)
>  		return ERR_PTR(-EBADF);
>  
> -	if (file->f_op != &ns_file_operations)
> +	if (f.file->f_op != &ns_file_operations)
> +		goto out_invalid;
> +
> +	ns = get_proc_ns(file_inode(f.file));
> +	if (nstype && (ns->ops->type != nstype))
>  		goto out_invalid;
>  
> -	return file;
> +	*fd_ref = f;
> +	return ns;
>  
>  out_invalid:
> -	fput(file);
> +	fdput(f);
>  	return ERR_PTR(-EINVAL);
>  }

Well yes, fdget() makes sense but this is minor.

Honestly, I do not really like the new helper... I understand this
is subjective, so I won't insist. But how about 1/1? We do not need
fd/file at all. With this patch your sys_getvpid() can just use
proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().

Eric, what do you think?

See also "TODO" in the changelog.

Oleg.


^ permalink raw reply	[flat|nested] 37+ messages in thread

* [PATCH 1/1] ns: introduce proc_get_ns_by_fd()
  2015-09-25 17:56   ` Oleg Nesterov
@ 2015-09-25 17:57       ` Oleg Nesterov
  -1 siblings, 0 replies; 37+ messages in thread
From: Oleg Nesterov @ 2015-09-25 17:57 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: Roman Gushchin, linux-api-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	Eric W. Biederman, Chen Fan, Andrew Morton, Linus Torvalds

Introduce proc_get_ns_by_fd() so that get_net_ns_by_fd() becomes
one-liner. It will have another CLONE_NEWPID user soon.

TODO: proc_get_ns_by_fd() can share some code with proc_ns_fget().

Signed-off-by: Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
---
 fs/nsfs.c                |   24 ++++++++++++++++++++++++
 include/linux/proc_ns.h  |    3 +++
 kernel/pid_namespace.c   |    6 ++++++
 net/core/net_namespace.c |   23 +++++++----------------
 4 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/fs/nsfs.c b/fs/nsfs.c
index 99521e7..72474ca 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -136,6 +136,30 @@ out_invalid:
 	return ERR_PTR(-EINVAL);
 }
 
+void *proc_get_ns_by_fd(int fd, int type)
+{
+	struct fd f;
+	struct ns_common *ns;
+	void *ret;
+
+	f = fdget(fd);
+	if (!f.file)
+		return ERR_PTR(-EBADF);
+
+	ret = ERR_PTR(-EINVAL);
+	if (f.file->f_op != &ns_file_operations)
+		goto put;
+
+	ns = get_proc_ns(file_inode(f.file));
+	if (ns->ops->type != type || !ns->ops->get_type)
+		goto put;
+
+	ret = ns->ops->get_type(ns);
+put:
+	fdput(f);
+	return ret;
+}
+
 static const struct super_operations nsfs_ops = {
 	.statfs = simple_statfs,
 	.evict_inode = nsfs_evict,
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 42dfc61..d956f89 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -16,6 +16,7 @@ struct proc_ns_operations {
 	struct ns_common *(*get)(struct task_struct *task);
 	void (*put)(struct ns_common *ns);
 	int (*install)(struct nsproxy *nsproxy, struct ns_common *ns);
+	void *(*get_type)(struct ns_common *ns);
 };
 
 extern const struct proc_ns_operations netns_operations;
@@ -66,6 +67,8 @@ static inline int ns_alloc_inum(struct ns_common *ns)
 #define ns_free_inum(ns) proc_free_inum((ns)->inum)
 
 extern struct file *proc_ns_fget(int fd);
+extern void *proc_get_ns_by_fd(int fd, int type);
+
 #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private)
 extern void *ns_get_path(struct path *path, struct task_struct *task,
 			const struct proc_ns_operations *ns_ops);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a65ba13..0c87393 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -388,12 +388,18 @@ static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
 	return 0;
 }
 
+static void *pidns_get_type(struct ns_common *ns)
+{
+	return get_pid_ns(to_pid_ns(ns));
+}
+
 const struct proc_ns_operations pidns_operations = {
 	.name		= "pid",
 	.type		= CLONE_NEWPID,
 	.get		= pidns_get,
 	.put		= pidns_put,
 	.install	= pidns_install,
+	.get_type	= pidns_get_type,
 };
 
 static __init int pid_namespaces_init(void)
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 572af00..6465dc0 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -420,22 +420,7 @@ EXPORT_SYMBOL_GPL(__put_net);
 
 struct net *get_net_ns_by_fd(int fd)
 {
-	struct file *file;
-	struct ns_common *ns;
-	struct net *net;
-
-	file = proc_ns_fget(fd);
-	if (IS_ERR(file))
-		return ERR_CAST(file);
-
-	ns = get_proc_ns(file_inode(file));
-	if (ns->ops == &netns_operations)
-		net = get_net(container_of(ns, struct net, ns));
-	else
-		net = ERR_PTR(-EINVAL);
-
-	fput(file);
-	return net;
+	return proc_get_ns_by_fd(fd, CLONE_NEWNET);
 }
 
 #else
@@ -955,11 +940,17 @@ static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns)
 	return 0;
 }
 
+static void *netns_get_type(struct ns_common *ns)
+{
+	return get_net(to_net_ns(ns));
+}
+
 const struct proc_ns_operations netns_operations = {
 	.name		= "net",
 	.type		= CLONE_NEWNET,
 	.get		= netns_get,
 	.put		= netns_put,
 	.install	= netns_install,
+	.get_type	= netns_get_type,
 };
 #endif
-- 
1.5.5.1

^ permalink raw reply related	[flat|nested] 37+ messages in thread

* [PATCH 1/1] ns: introduce proc_get_ns_by_fd()
@ 2015-09-25 17:57       ` Oleg Nesterov
  0 siblings, 0 replies; 37+ messages in thread
From: Oleg Nesterov @ 2015-09-25 17:57 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: linux-api, containers, linux-kernel, Roman Gushchin,
	Serge Hallyn, Eric W. Biederman, Chen Fan, Andrew Morton,
	Linus Torvalds, Stéphane Graber

Introduce proc_get_ns_by_fd() so that get_net_ns_by_fd() becomes
one-liner. It will have another CLONE_NEWPID user soon.

TODO: proc_get_ns_by_fd() can share some code with proc_ns_fget().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 fs/nsfs.c                |   24 ++++++++++++++++++++++++
 include/linux/proc_ns.h  |    3 +++
 kernel/pid_namespace.c   |    6 ++++++
 net/core/net_namespace.c |   23 +++++++----------------
 4 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/fs/nsfs.c b/fs/nsfs.c
index 99521e7..72474ca 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -136,6 +136,30 @@ out_invalid:
 	return ERR_PTR(-EINVAL);
 }
 
+void *proc_get_ns_by_fd(int fd, int type)
+{
+	struct fd f;
+	struct ns_common *ns;
+	void *ret;
+
+	f = fdget(fd);
+	if (!f.file)
+		return ERR_PTR(-EBADF);
+
+	ret = ERR_PTR(-EINVAL);
+	if (f.file->f_op != &ns_file_operations)
+		goto put;
+
+	ns = get_proc_ns(file_inode(f.file));
+	if (ns->ops->type != type || !ns->ops->get_type)
+		goto put;
+
+	ret = ns->ops->get_type(ns);
+put:
+	fdput(f);
+	return ret;
+}
+
 static const struct super_operations nsfs_ops = {
 	.statfs = simple_statfs,
 	.evict_inode = nsfs_evict,
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 42dfc61..d956f89 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -16,6 +16,7 @@ struct proc_ns_operations {
 	struct ns_common *(*get)(struct task_struct *task);
 	void (*put)(struct ns_common *ns);
 	int (*install)(struct nsproxy *nsproxy, struct ns_common *ns);
+	void *(*get_type)(struct ns_common *ns);
 };
 
 extern const struct proc_ns_operations netns_operations;
@@ -66,6 +67,8 @@ static inline int ns_alloc_inum(struct ns_common *ns)
 #define ns_free_inum(ns) proc_free_inum((ns)->inum)
 
 extern struct file *proc_ns_fget(int fd);
+extern void *proc_get_ns_by_fd(int fd, int type);
+
 #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private)
 extern void *ns_get_path(struct path *path, struct task_struct *task,
 			const struct proc_ns_operations *ns_ops);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a65ba13..0c87393 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -388,12 +388,18 @@ static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
 	return 0;
 }
 
+static void *pidns_get_type(struct ns_common *ns)
+{
+	return get_pid_ns(to_pid_ns(ns));
+}
+
 const struct proc_ns_operations pidns_operations = {
 	.name		= "pid",
 	.type		= CLONE_NEWPID,
 	.get		= pidns_get,
 	.put		= pidns_put,
 	.install	= pidns_install,
+	.get_type	= pidns_get_type,
 };
 
 static __init int pid_namespaces_init(void)
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 572af00..6465dc0 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -420,22 +420,7 @@ EXPORT_SYMBOL_GPL(__put_net);
 
 struct net *get_net_ns_by_fd(int fd)
 {
-	struct file *file;
-	struct ns_common *ns;
-	struct net *net;
-
-	file = proc_ns_fget(fd);
-	if (IS_ERR(file))
-		return ERR_CAST(file);
-
-	ns = get_proc_ns(file_inode(file));
-	if (ns->ops == &netns_operations)
-		net = get_net(container_of(ns, struct net, ns));
-	else
-		net = ERR_PTR(-EINVAL);
-
-	fput(file);
-	return net;
+	return proc_get_ns_by_fd(fd, CLONE_NEWNET);
 }
 
 #else
@@ -955,11 +940,17 @@ static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns)
 	return 0;
 }
 
+static void *netns_get_type(struct ns_common *ns)
+{
+	return get_net(to_net_ns(ns));
+}
+
 const struct proc_ns_operations netns_operations = {
 	.name		= "net",
 	.type		= CLONE_NEWNET,
 	.get		= netns_get,
 	.put		= netns_put,
 	.install	= netns_install,
+	.get_type	= netns_get_type,
 };
 #endif
-- 
1.5.5.1



^ permalink raw reply related	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid
  2015-09-25 13:52   ` Konstantin Khlebnikov
@ 2015-09-28  4:12     ` kbuild test robot
  -1 siblings, 0 replies; 37+ messages in thread
From: kbuild test robot @ 2015-09-28  4:12 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: Roman Gushchin, linux-api-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Oleg Nesterov,
	kbuild-all-JC7UmRfGjtg, Chen Fan, Andrew Morton, Linus Torvalds,
	Eric W. Biederman

[-- Attachment #1: Type: text/plain, Size: 4263 bytes --]

Hi Konstantin,

[auto build test results on next-20150925 -- if it's inappropriate base, please ignore]

config: openrisc-or1ksim_defconfig (attached as .config)
reproduce:
  wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
  chmod +x ~/bin/make.cross
  git checkout 7bea85b3c058d69527d79b94338025f218a0655e
  # save the attached .config to linux build tree
  make.cross ARCH=openrisc 

All warnings (new ones prefixed by >>):

   In file included from include/asm-generic/unistd.h:1:0,
                    from arch/openrisc/include/uapi/asm/unistd.h:26,
                    from <stdin>:2:
>> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined
   include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition
   <stdin>:1307:2: warning: #warning syscall getvpid not implemented
--
   In file included from include/asm-generic/unistd.h:1:0,
                    from arch/openrisc/include/uapi/asm/unistd.h:26,
                    from include/uapi/linux/unistd.h:7,
                    from arch/openrisc/kernel/setup.c:25:
>> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined
   include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition
--
   In file included from include/asm-generic/unistd.h:1:0,
                    from arch/openrisc/include/uapi/asm/unistd.h:26,
                    from include/uapi/linux/unistd.h:7,
                    from include/linux/syscalls.h:78,
                    from arch/openrisc/kernel/sys_call_table.c:17:
>> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined
   include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition
   In file included from include/asm-generic/unistd.h:1:0,
                    from arch/openrisc/include/uapi/asm/unistd.h:26,
                    from arch/openrisc/kernel/sys_call_table.c:27:
   include/uapi/asm-generic/unistd.h:716:0: warning: "__NR_mlock2" redefined
   include/uapi/asm-generic/unistd.h:718:0: note: this is the location of the previous definition
>> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined
   include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition
   include/uapi/asm-generic/unistd.h:719:1: error: '__NR_getvpid' undeclared here (not in a function)
   include/uapi/asm-generic/unistd.h:719:1: error: array index in initializer not of integer type
   include/uapi/asm-generic/unistd.h:719:1: error: (near initialization for 'sys_call_table')
--
   kernel/time/Kconfig:155:warning: range is invalid
   In file included from include/asm-generic/unistd.h:1:0,
                    from arch/openrisc/include/uapi/asm/unistd.h:26,
                    from <stdin>:2:
>> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined
   include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition
   <stdin>:1307:2: warning: #warning syscall getvpid not implemented

vim +/__NR_mlock2 +718 include/uapi/asm-generic/unistd.h

   702	#define __NR_seccomp 277
   703	__SYSCALL(__NR_seccomp, sys_seccomp)
   704	#define __NR_getrandom 278
   705	__SYSCALL(__NR_getrandom, sys_getrandom)
   706	#define __NR_memfd_create 279
   707	__SYSCALL(__NR_memfd_create, sys_memfd_create)
   708	#define __NR_bpf 280
   709	__SYSCALL(__NR_bpf, sys_bpf)
   710	#define __NR_execveat 281
   711	__SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
   712	#define __NR_userfaultfd 282
   713	__SYSCALL(__NR_userfaultfd, sys_userfaultfd)
   714	#define __NR_membarrier 283
   715	__SYSCALL(__NR_membarrier, sys_membarrier)
   716	#define __NR_mlock2 284
   717	__SYSCALL(__NR_mlock2, sys_mlock2)
 > 718	#define __NR_mlock2 285
   719	__SYSCALL(__NR_getvpid, sys_getvpid)
   720	
   721	#undef __NR_syscalls
   722	#define __NR_syscalls 286
   723	
   724	/*
   725	 * All syscalls below here should go away really,
   726	 * these are provided for both review and as a porting

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/octet-stream, Size: 6887 bytes --]

[-- Attachment #3: Type: text/plain, Size: 205 bytes --]

_______________________________________________
Containers mailing list
Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
https://lists.linuxfoundation.org/mailman/listinfo/containers

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid
@ 2015-09-28  4:12     ` kbuild test robot
  0 siblings, 0 replies; 37+ messages in thread
From: kbuild test robot @ 2015-09-28  4:12 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: kbuild-all, linux-api, containers, linux-kernel, Roman Gushchin,
	Serge Hallyn, Oleg Nesterov, Eric W. Biederman, Chen Fan,
	Andrew Morton, Linus Torvalds, Stéphane Graber

[-- Attachment #1: Type: text/plain, Size: 4263 bytes --]

Hi Konstantin,

[auto build test results on next-20150925 -- if it's inappropriate base, please ignore]

config: openrisc-or1ksim_defconfig (attached as .config)
reproduce:
  wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
  chmod +x ~/bin/make.cross
  git checkout 7bea85b3c058d69527d79b94338025f218a0655e
  # save the attached .config to linux build tree
  make.cross ARCH=openrisc 

All warnings (new ones prefixed by >>):

   In file included from include/asm-generic/unistd.h:1:0,
                    from arch/openrisc/include/uapi/asm/unistd.h:26,
                    from <stdin>:2:
>> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined
   include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition
   <stdin>:1307:2: warning: #warning syscall getvpid not implemented
--
   In file included from include/asm-generic/unistd.h:1:0,
                    from arch/openrisc/include/uapi/asm/unistd.h:26,
                    from include/uapi/linux/unistd.h:7,
                    from arch/openrisc/kernel/setup.c:25:
>> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined
   include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition
--
   In file included from include/asm-generic/unistd.h:1:0,
                    from arch/openrisc/include/uapi/asm/unistd.h:26,
                    from include/uapi/linux/unistd.h:7,
                    from include/linux/syscalls.h:78,
                    from arch/openrisc/kernel/sys_call_table.c:17:
>> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined
   include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition
   In file included from include/asm-generic/unistd.h:1:0,
                    from arch/openrisc/include/uapi/asm/unistd.h:26,
                    from arch/openrisc/kernel/sys_call_table.c:27:
   include/uapi/asm-generic/unistd.h:716:0: warning: "__NR_mlock2" redefined
   include/uapi/asm-generic/unistd.h:718:0: note: this is the location of the previous definition
>> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined
   include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition
   include/uapi/asm-generic/unistd.h:719:1: error: '__NR_getvpid' undeclared here (not in a function)
   include/uapi/asm-generic/unistd.h:719:1: error: array index in initializer not of integer type
   include/uapi/asm-generic/unistd.h:719:1: error: (near initialization for 'sys_call_table')
--
   kernel/time/Kconfig:155:warning: range is invalid
   In file included from include/asm-generic/unistd.h:1:0,
                    from arch/openrisc/include/uapi/asm/unistd.h:26,
                    from <stdin>:2:
>> include/uapi/asm-generic/unistd.h:718:0: warning: "__NR_mlock2" redefined
   include/uapi/asm-generic/unistd.h:716:0: note: this is the location of the previous definition
   <stdin>:1307:2: warning: #warning syscall getvpid not implemented

vim +/__NR_mlock2 +718 include/uapi/asm-generic/unistd.h

   702	#define __NR_seccomp 277
   703	__SYSCALL(__NR_seccomp, sys_seccomp)
   704	#define __NR_getrandom 278
   705	__SYSCALL(__NR_getrandom, sys_getrandom)
   706	#define __NR_memfd_create 279
   707	__SYSCALL(__NR_memfd_create, sys_memfd_create)
   708	#define __NR_bpf 280
   709	__SYSCALL(__NR_bpf, sys_bpf)
   710	#define __NR_execveat 281
   711	__SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
   712	#define __NR_userfaultfd 282
   713	__SYSCALL(__NR_userfaultfd, sys_userfaultfd)
   714	#define __NR_membarrier 283
   715	__SYSCALL(__NR_membarrier, sys_membarrier)
   716	#define __NR_mlock2 284
   717	__SYSCALL(__NR_mlock2, sys_mlock2)
 > 718	#define __NR_mlock2 285
   719	__SYSCALL(__NR_getvpid, sys_getvpid)
   720	
   721	#undef __NR_syscalls
   722	#define __NR_syscalls 286
   723	
   724	/*
   725	 * All syscalls below here should go away really,
   726	 * these are provided for both review and as a porting

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/octet-stream, Size: 6887 bytes --]

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
       [not found]   ` <20150925175654.GA12504-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
  2015-09-25 17:57       ` Oleg Nesterov
@ 2015-09-28  8:21     ` Konstantin Khlebnikov
  2015-09-28 16:37     ` Eric W. Biederman
  2 siblings, 0 replies; 37+ messages in thread
From: Konstantin Khlebnikov @ 2015-09-28  8:21 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Roman Gushchin, linux-api-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	Eric W. Biederman, Chen Fan, Andrew Morton, Linus Torvalds

On 25.09.2015 20:56, Oleg Nesterov wrote:
> On 09/25, Konstantin Khlebnikov wrote:
>>
>> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref)
>>   {
>> -	struct file *file;
>> +	struct ns_common *ns;
>> +	struct fd f;
>>
>> -	file = fget(fd);
>> -	if (!file)
>> +	f = fdget(fd);
>> +	if (!f.file)
>>   		return ERR_PTR(-EBADF);
>>
>> -	if (file->f_op != &ns_file_operations)
>> +	if (f.file->f_op != &ns_file_operations)
>> +		goto out_invalid;
>> +
>> +	ns = get_proc_ns(file_inode(f.file));
>> +	if (nstype && (ns->ops->type != nstype))
>>   		goto out_invalid;
>>
>> -	return file;
>> +	*fd_ref = f;
>> +	return ns;
>>
>>   out_invalid:
>> -	fput(file);
>> +	fdput(f);
>>   	return ERR_PTR(-EINVAL);
>>   }
>
> Well yes, fdget() makes sense but this is minor.
>
> Honestly, I do not really like the new helper... I understand this
> is subjective, so I won't insist. But how about 1/1? We do not need
> fd/file at all. With this patch your sys_getvpid() can just use
> proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().

Hmm. My version has 0 or 2 atomic ops per get-put sequence.
Your version: 2 or 4 atomic ops. Plus even in worst case pinning by
struct fd theoretically scales better because it touches refcount at
struct file: there're might be many of them for one namespace.

>
> Eric, what do you think?
>
> See also "TODO" in the changelog.
>
> Oleg.
>


-- 
Konstantin

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
       [not found]   ` <20150925175654.GA12504-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2015-09-28  8:21     ` Konstantin Khlebnikov
  2015-09-28  8:21     ` [PATCH 0/1] " Konstantin Khlebnikov
  2015-09-28 16:37     ` Eric W. Biederman
  2 siblings, 0 replies; 37+ messages in thread
From: Konstantin Khlebnikov @ 2015-09-28  8:21 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: linux-api, containers, linux-kernel, Roman Gushchin,
	Serge Hallyn, Eric W. Biederman, Chen Fan, Andrew Morton,
	Linus Torvalds, Stéphane Graber

On 25.09.2015 20:56, Oleg Nesterov wrote:
> On 09/25, Konstantin Khlebnikov wrote:
>>
>> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref)
>>   {
>> -	struct file *file;
>> +	struct ns_common *ns;
>> +	struct fd f;
>>
>> -	file = fget(fd);
>> -	if (!file)
>> +	f = fdget(fd);
>> +	if (!f.file)
>>   		return ERR_PTR(-EBADF);
>>
>> -	if (file->f_op != &ns_file_operations)
>> +	if (f.file->f_op != &ns_file_operations)
>> +		goto out_invalid;
>> +
>> +	ns = get_proc_ns(file_inode(f.file));
>> +	if (nstype && (ns->ops->type != nstype))
>>   		goto out_invalid;
>>
>> -	return file;
>> +	*fd_ref = f;
>> +	return ns;
>>
>>   out_invalid:
>> -	fput(file);
>> +	fdput(f);
>>   	return ERR_PTR(-EINVAL);
>>   }
>
> Well yes, fdget() makes sense but this is minor.
>
> Honestly, I do not really like the new helper... I understand this
> is subjective, so I won't insist. But how about 1/1? We do not need
> fd/file at all. With this patch your sys_getvpid() can just use
> proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().

Hmm. My version has 0 or 2 atomic ops per get-put sequence.
Your version: 2 or 4 atomic ops. Plus even in worst case pinning by
struct fd theoretically scales better because it touches refcount at
struct file: there're might be many of them for one namespace.

>
> Eric, what do you think?
>
> See also "TODO" in the changelog.
>
> Oleg.
>


-- 
Konstantin

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
@ 2015-09-28  8:21     ` Konstantin Khlebnikov
  0 siblings, 0 replies; 37+ messages in thread
From: Konstantin Khlebnikov @ 2015-09-28  8:21 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: linux-api-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Roman Gushchin,
	Serge Hallyn, Eric W. Biederman, Chen Fan, Andrew Morton,
	Linus Torvalds, Stéphane Graber

On 25.09.2015 20:56, Oleg Nesterov wrote:
> On 09/25, Konstantin Khlebnikov wrote:
>>
>> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref)
>>   {
>> -	struct file *file;
>> +	struct ns_common *ns;
>> +	struct fd f;
>>
>> -	file = fget(fd);
>> -	if (!file)
>> +	f = fdget(fd);
>> +	if (!f.file)
>>   		return ERR_PTR(-EBADF);
>>
>> -	if (file->f_op != &ns_file_operations)
>> +	if (f.file->f_op != &ns_file_operations)
>> +		goto out_invalid;
>> +
>> +	ns = get_proc_ns(file_inode(f.file));
>> +	if (nstype && (ns->ops->type != nstype))
>>   		goto out_invalid;
>>
>> -	return file;
>> +	*fd_ref = f;
>> +	return ns;
>>
>>   out_invalid:
>> -	fput(file);
>> +	fdput(f);
>>   	return ERR_PTR(-EINVAL);
>>   }
>
> Well yes, fdget() makes sense but this is minor.
>
> Honestly, I do not really like the new helper... I understand this
> is subjective, so I won't insist. But how about 1/1? We do not need
> fd/file at all. With this patch your sys_getvpid() can just use
> proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().

Hmm. My version has 0 or 2 atomic ops per get-put sequence.
Your version: 2 or 4 atomic ops. Plus even in worst case pinning by
struct fd theoretically scales better because it touches refcount at
struct file: there're might be many of them for one namespace.

>
> Eric, what do you think?
>
> See also "TODO" in the changelog.
>
> Oleg.
>


-- 
Konstantin

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid
  2015-09-25 13:52   ` Konstantin Khlebnikov
@ 2015-09-28 16:22     ` Eric W. Biederman
  -1 siblings, 0 replies; 37+ messages in thread
From: Eric W. Biederman @ 2015-09-28 16:22 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: Roman Gushchin, linux-api-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Oleg Nesterov,
	Chen Fan, Andrew Morton, Linus Torvalds

Konstantin Khlebnikov <khlebnikov-XoJtRXgx1JseBXzfvpsJ4g@public.gmane.org> writes:

> pid_t getvpid(pid_t pid, int source, int target);
>
> This syscall converts pid from source pid-namespace into pid visible
> in target pid-namespace. If pid is unreachable from target namespace
> then getvpid() returns zero.

Two minor things.

Can we please call this translate_pid? getvpid does not really cover
what this syscall does.

Can you please split wiring up into a separate patch?  You goofed it
up this round and it just adds noise in reviewing the core syscall.

> Namespaces are defined by file descriptors pointing to entries in
> proc (/proc/[pid]/ns/pid). If argument is negative then current pid
> namespace is used.
>
> If pid is negative then getvpid() returns pid of parent task for -pid.
>
> Possible error codes:
> ESRCH    - task not found
> EBADF    - closed file descriptor
> EINVAL   - not pid-namespace file descriptor
>
> Such conversion is required for interaction between processes from
> different pid-namespaces. For example system service at host system
> who provide access to restricted set of privileged operations for
> clients from containers have to convert pids back and forward.
>
> Recent kernels expose virtual pids in /proc/[pid]/status:NSpid, but
> this interface works only in one way and even that is non-trivial.
>
> Other option is passing pids with credentials via unix socket, but
> this solution requires a lot of preparation and CAP_SYS_ADMIN for
> sending arbitrary pids.
>
> This syscall works in both directions, it's fast and simple.
>
> Examples:
> getvpid(pid, ns, -1)      - get pid in our pid namespace
> getvpid(pid, -1, ns)      - get pid in container
> getvpid(pid, -1, ns) > 0  - is pid is reachable from container?
> getvpid(1, ns1, ns2) > 0  - is ns1 inside ns2?
> getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2?
> getvpid(1, ns, -1)        - get init task of pid-namespace
> getvpid(-1, ns, -1)       - get reaper of init task in parent pid-namespace
> getvpid(-pid, -1, -1)     - get ppid by pid
>
> Signed-off-by: Konstantin Khlebnikov <khlebnikov-XoJtRXgx1JseBXzfvpsJ4g@public.gmane.org>
>
> --
>
> v1: https://lkml.org/lkml/2015/9/15/411
> v2: https://lkml.org/lkml/2015/9/24/278
> v3:
>  * use proc_ns_fdget()
>  * update description
>  * rebase to next-20150925
>  * fix conflict with mlock2
> ---
>  arch/x86/entry/syscalls/syscall_32.tbl |    1 +
>  arch/x86/entry/syscalls/syscall_64.tbl |    1 +
>  include/linux/syscalls.h               |    1 +
>  include/uapi/asm-generic/unistd.h      |    4 ++-
>  kernel/sys.c                           |   51 ++++++++++++++++++++++++++++++++
>  5 files changed, 57 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 143ef9f37932..c36c2c65d204 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -383,3 +383,4 @@
>  374	i386	userfaultfd		sys_userfaultfd
>  375	i386	membarrier		sys_membarrier
>  376	i386	mlock2			sys_mlock2
> +377	i386	getvpid			sys_getvpid
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 314a90bfc09c..90bbbc7fdbe0 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -332,6 +332,7 @@
>  323	common	userfaultfd		sys_userfaultfd
>  324	common	membarrier		sys_membarrier
>  325	common	mlock2			sys_mlock2
> +326	common	getvpid			sys_getvpid
>  
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index a156b82dd14c..dbb5638260b5 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -222,6 +222,7 @@ asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __us
>  asmlinkage long sys_alarm(unsigned int seconds);
>  asmlinkage long sys_getpid(void);
>  asmlinkage long sys_getppid(void);
> +asmlinkage long sys_getvpid(pid_t pid, int source, int target);
>  asmlinkage long sys_getuid(void);
>  asmlinkage long sys_geteuid(void);
>  asmlinkage long sys_getgid(void);
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 1324b0292ec2..2c1123130f90 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -715,9 +715,11 @@ __SYSCALL(__NR_userfaultfd, sys_userfaultfd)
>  __SYSCALL(__NR_membarrier, sys_membarrier)
>  #define __NR_mlock2 284
>  __SYSCALL(__NR_mlock2, sys_mlock2)
> +#define __NR_mlock2 285
> +__SYSCALL(__NR_getvpid, sys_getvpid)
>  
>  #undef __NR_syscalls
> -#define __NR_syscalls 285
> +#define __NR_syscalls 286
>  
>  /*
>   * All syscalls below here should go away really,
> diff --git a/kernel/sys.c b/kernel/sys.c
> index fa2f2f671a5c..1e28a36b84fa 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -46,6 +46,7 @@
>  #include <linux/syscalls.h>
>  #include <linux/kprobes.h>
>  #include <linux/user_namespace.h>
> +#include <linux/proc_ns.h>
>  #include <linux/binfmts.h>
>  
>  #include <linux/sched.h>
> @@ -855,6 +856,56 @@ SYSCALL_DEFINE0(getppid)
>  	return pid;
>  }
>  
> +SYSCALL_DEFINE3(getvpid, pid_t, pid, int, source, int, target)
> +{
> +	struct pid_namespace *source_ns, *target_ns;
> +	struct fd source_fd = {}, target_fd = {};
> +	struct pid *struct_pid;
> +	struct ns_common *ns;
> +	pid_t result;
> +
> +	if (source >= 0) {
> +		ns = proc_ns_fdget(source, CLONE_NEWPID, &source_fd);
> +		result = PTR_ERR(ns);
> +		if (IS_ERR(ns))
> +			goto out;
> +		source_ns = container_of(ns, struct pid_namespace, ns);
> +	} else
> +		source_ns = task_active_pid_ns(current);
> +
> +	if (target >= 0) {
> +		ns = proc_ns_fdget(target, CLONE_NEWPID, &target_fd);
> +		result = PTR_ERR(ns);
> +		if (IS_ERR(ns))
> +			goto out;
> +		target_ns = container_of(ns, struct pid_namespace, ns);
> +	} else
> +		target_ns = task_active_pid_ns(current);
> +
> +	rcu_read_lock();
> +	struct_pid = find_pid_ns(abs(pid), source_ns);
> +
> +	if (struct_pid && pid < 0) {
> +		struct task_struct *task;
> +
> +		task = pid_task(struct_pid, PIDTYPE_PID);
> +		if (task)
> +			task = rcu_dereference(task->real_parent);
> +		struct_pid = task ? task_pid(task) : NULL;
> +	}
> +
> +	if (struct_pid)
> +		result = pid_nr_ns(struct_pid, target_ns);
> +	else
> +		result = -ESRCH;
> +	rcu_read_unlock();
> +
> +out:
> +	fdput(target_fd);
> +	fdput(source_fd);
> +	return result;
> +}
> +
>  SYSCALL_DEFINE0(getuid)
>  {
>  	/* Only we change this so SMP safe */

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid
@ 2015-09-28 16:22     ` Eric W. Biederman
  0 siblings, 0 replies; 37+ messages in thread
From: Eric W. Biederman @ 2015-09-28 16:22 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: linux-api, containers, linux-kernel, Roman Gushchin,
	Serge Hallyn, Oleg Nesterov, Chen Fan, Andrew Morton,
	Linus Torvalds, Stéphane Graber

Konstantin Khlebnikov <khlebnikov@yandex-team.ru> writes:

> pid_t getvpid(pid_t pid, int source, int target);
>
> This syscall converts pid from source pid-namespace into pid visible
> in target pid-namespace. If pid is unreachable from target namespace
> then getvpid() returns zero.

Two minor things.

Can we please call this translate_pid? getvpid does not really cover
what this syscall does.

Can you please split wiring up into a separate patch?  You goofed it
up this round and it just adds noise in reviewing the core syscall.

> Namespaces are defined by file descriptors pointing to entries in
> proc (/proc/[pid]/ns/pid). If argument is negative then current pid
> namespace is used.
>
> If pid is negative then getvpid() returns pid of parent task for -pid.
>
> Possible error codes:
> ESRCH    - task not found
> EBADF    - closed file descriptor
> EINVAL   - not pid-namespace file descriptor
>
> Such conversion is required for interaction between processes from
> different pid-namespaces. For example system service at host system
> who provide access to restricted set of privileged operations for
> clients from containers have to convert pids back and forward.
>
> Recent kernels expose virtual pids in /proc/[pid]/status:NSpid, but
> this interface works only in one way and even that is non-trivial.
>
> Other option is passing pids with credentials via unix socket, but
> this solution requires a lot of preparation and CAP_SYS_ADMIN for
> sending arbitrary pids.
>
> This syscall works in both directions, it's fast and simple.
>
> Examples:
> getvpid(pid, ns, -1)      - get pid in our pid namespace
> getvpid(pid, -1, ns)      - get pid in container
> getvpid(pid, -1, ns) > 0  - is pid is reachable from container?
> getvpid(1, ns1, ns2) > 0  - is ns1 inside ns2?
> getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2?
> getvpid(1, ns, -1)        - get init task of pid-namespace
> getvpid(-1, ns, -1)       - get reaper of init task in parent pid-namespace
> getvpid(-pid, -1, -1)     - get ppid by pid
>
> Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
>
> --
>
> v1: https://lkml.org/lkml/2015/9/15/411
> v2: https://lkml.org/lkml/2015/9/24/278
> v3:
>  * use proc_ns_fdget()
>  * update description
>  * rebase to next-20150925
>  * fix conflict with mlock2
> ---
>  arch/x86/entry/syscalls/syscall_32.tbl |    1 +
>  arch/x86/entry/syscalls/syscall_64.tbl |    1 +
>  include/linux/syscalls.h               |    1 +
>  include/uapi/asm-generic/unistd.h      |    4 ++-
>  kernel/sys.c                           |   51 ++++++++++++++++++++++++++++++++
>  5 files changed, 57 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 143ef9f37932..c36c2c65d204 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -383,3 +383,4 @@
>  374	i386	userfaultfd		sys_userfaultfd
>  375	i386	membarrier		sys_membarrier
>  376	i386	mlock2			sys_mlock2
> +377	i386	getvpid			sys_getvpid
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 314a90bfc09c..90bbbc7fdbe0 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -332,6 +332,7 @@
>  323	common	userfaultfd		sys_userfaultfd
>  324	common	membarrier		sys_membarrier
>  325	common	mlock2			sys_mlock2
> +326	common	getvpid			sys_getvpid
>  
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index a156b82dd14c..dbb5638260b5 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -222,6 +222,7 @@ asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __us
>  asmlinkage long sys_alarm(unsigned int seconds);
>  asmlinkage long sys_getpid(void);
>  asmlinkage long sys_getppid(void);
> +asmlinkage long sys_getvpid(pid_t pid, int source, int target);
>  asmlinkage long sys_getuid(void);
>  asmlinkage long sys_geteuid(void);
>  asmlinkage long sys_getgid(void);
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 1324b0292ec2..2c1123130f90 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -715,9 +715,11 @@ __SYSCALL(__NR_userfaultfd, sys_userfaultfd)
>  __SYSCALL(__NR_membarrier, sys_membarrier)
>  #define __NR_mlock2 284
>  __SYSCALL(__NR_mlock2, sys_mlock2)
> +#define __NR_mlock2 285
> +__SYSCALL(__NR_getvpid, sys_getvpid)
>  
>  #undef __NR_syscalls
> -#define __NR_syscalls 285
> +#define __NR_syscalls 286
>  
>  /*
>   * All syscalls below here should go away really,
> diff --git a/kernel/sys.c b/kernel/sys.c
> index fa2f2f671a5c..1e28a36b84fa 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -46,6 +46,7 @@
>  #include <linux/syscalls.h>
>  #include <linux/kprobes.h>
>  #include <linux/user_namespace.h>
> +#include <linux/proc_ns.h>
>  #include <linux/binfmts.h>
>  
>  #include <linux/sched.h>
> @@ -855,6 +856,56 @@ SYSCALL_DEFINE0(getppid)
>  	return pid;
>  }
>  
> +SYSCALL_DEFINE3(getvpid, pid_t, pid, int, source, int, target)
> +{
> +	struct pid_namespace *source_ns, *target_ns;
> +	struct fd source_fd = {}, target_fd = {};
> +	struct pid *struct_pid;
> +	struct ns_common *ns;
> +	pid_t result;
> +
> +	if (source >= 0) {
> +		ns = proc_ns_fdget(source, CLONE_NEWPID, &source_fd);
> +		result = PTR_ERR(ns);
> +		if (IS_ERR(ns))
> +			goto out;
> +		source_ns = container_of(ns, struct pid_namespace, ns);
> +	} else
> +		source_ns = task_active_pid_ns(current);
> +
> +	if (target >= 0) {
> +		ns = proc_ns_fdget(target, CLONE_NEWPID, &target_fd);
> +		result = PTR_ERR(ns);
> +		if (IS_ERR(ns))
> +			goto out;
> +		target_ns = container_of(ns, struct pid_namespace, ns);
> +	} else
> +		target_ns = task_active_pid_ns(current);
> +
> +	rcu_read_lock();
> +	struct_pid = find_pid_ns(abs(pid), source_ns);
> +
> +	if (struct_pid && pid < 0) {
> +		struct task_struct *task;
> +
> +		task = pid_task(struct_pid, PIDTYPE_PID);
> +		if (task)
> +			task = rcu_dereference(task->real_parent);
> +		struct_pid = task ? task_pid(task) : NULL;
> +	}
> +
> +	if (struct_pid)
> +		result = pid_nr_ns(struct_pid, target_ns);
> +	else
> +		result = -ESRCH;
> +	rcu_read_unlock();
> +
> +out:
> +	fdput(target_fd);
> +	fdput(source_fd);
> +	return result;
> +}
> +
>  SYSCALL_DEFINE0(getuid)
>  {
>  	/* Only we change this so SMP safe */

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
       [not found]   ` <20150925175654.GA12504-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
  2015-09-25 17:57       ` Oleg Nesterov
  2015-09-28  8:21     ` [PATCH 0/1] " Konstantin Khlebnikov
@ 2015-09-28 16:37     ` Eric W. Biederman
  2 siblings, 0 replies; 37+ messages in thread
From: Eric W. Biederman @ 2015-09-28 16:37 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Roman Gushchin, Konstantin Khlebnikov,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chen Fan,
	Andrew Morton, Linus Torvalds

Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:

> On 09/25, Konstantin Khlebnikov wrote:
>>
>> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref)
>>  {
>> -	struct file *file;
>> +	struct ns_common *ns;
>> +	struct fd f;
>>  
>> -	file = fget(fd);
>> -	if (!file)
>> +	f = fdget(fd);
>> +	if (!f.file)
>>  		return ERR_PTR(-EBADF);
>>  
>> -	if (file->f_op != &ns_file_operations)
>> +	if (f.file->f_op != &ns_file_operations)
>> +		goto out_invalid;
>> +
>> +	ns = get_proc_ns(file_inode(f.file));
>> +	if (nstype && (ns->ops->type != nstype))
>>  		goto out_invalid;
>>  
>> -	return file;
>> +	*fd_ref = f;
>> +	return ns;
>>  
>>  out_invalid:
>> -	fput(file);
>> +	fdput(f);
>>  	return ERR_PTR(-EINVAL);
>>  }
>
> Well yes, fdget() makes sense but this is minor.
>
> Honestly, I do not really like the new helper... I understand this
> is subjective, so I won't insist. But how about 1/1? We do not need
> fd/file at all. With this patch your sys_getvpid() can just use
> proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
>
> Eric, what do you think?

At some level I don't care this is not exposed to userspace.

However since we are going several rounds with this.

Of the existing uses several of them sleep, which unfortunately means we
can not use rcu locking for everything.  The network namespace ones wind
up taking a reference to struct net because the have the legacy pid case
to deal with.  Which makes we can not use fdget for all callers either.

For this translate_pid rcu locking is sufficient, rcu locking is easy
and doing any more than rcu locking just seems silly.  So let me
respectfully suggest.

struct ns_common *ns_by_fd_rcu(int fd, int type)
{
	struct files_struct *files = current->files;
	struct file *file;
	struct ns_common *ns;
	void *ret;

	file = fcheck_files(files, fd);
        if (!file)
		return ERR_PTR(-EBADF);

	if (file->f_mode & FMODE_PATH)
		return ERR_PTR(-EINVAL);

	if (file->f_op != &ns_file_operations)
        	return ERR_PTR(-EINVAL);

	ns = get_proc_ns(file_inode(file));
	if (ns->ops->type != type)
		return ERR_PTR(-EINVAL);

	return ns;
}

struct pid_namespace *pidns_by_fd_rcu(int fd)
{
	struct ns_common *ns = ns_by_fd_rcu(fd, CLONE_NEWPID);
        if (IS_ERR(ns))
		return ERR_CAST(ns);
	return container_of(ns, struct pid_namespace, ns);
}

SYSCALL_DEFINE3(translate_pid, pid_t, pid_nr, int, sourcefd, int, targetfd)
{
	struct pid_namespace *source, *target;
	struct pid *pid;
	pid_t result;

	rcu_read_lock();

	if (sourcefd >= 0)
        	source = pidns_by_fd_rcu(sourcefd);
	else
		source = task_active_pid_ns(current);

	if (targetfd >= 0)
        	target = pidns_by_fd_rcu(targetfd);
	else
		target = task_active_pid_ns(current);

	pid = find_pid_ns(pid_nr, source);
        result = pid_nr_ns(pid, target);
        if (result == 0)
		result = -ESRCH;          

	rcu_read_unlock();

	return result;
}

Eric

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
       [not found]   ` <20150925175654.GA12504-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2015-09-28 16:37     ` Eric W. Biederman
  2015-09-28  8:21     ` [PATCH 0/1] " Konstantin Khlebnikov
  2015-09-28 16:37     ` Eric W. Biederman
  2 siblings, 0 replies; 37+ messages in thread
From: Eric W. Biederman @ 2015-09-28 16:37 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Konstantin Khlebnikov, linux-api, containers, linux-kernel,
	Roman Gushchin, Serge Hallyn, Chen Fan, Andrew Morton,
	Linus Torvalds, Stéphane Graber

Oleg Nesterov <oleg@redhat.com> writes:

> On 09/25, Konstantin Khlebnikov wrote:
>>
>> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref)
>>  {
>> -	struct file *file;
>> +	struct ns_common *ns;
>> +	struct fd f;
>>  
>> -	file = fget(fd);
>> -	if (!file)
>> +	f = fdget(fd);
>> +	if (!f.file)
>>  		return ERR_PTR(-EBADF);
>>  
>> -	if (file->f_op != &ns_file_operations)
>> +	if (f.file->f_op != &ns_file_operations)
>> +		goto out_invalid;
>> +
>> +	ns = get_proc_ns(file_inode(f.file));
>> +	if (nstype && (ns->ops->type != nstype))
>>  		goto out_invalid;
>>  
>> -	return file;
>> +	*fd_ref = f;
>> +	return ns;
>>  
>>  out_invalid:
>> -	fput(file);
>> +	fdput(f);
>>  	return ERR_PTR(-EINVAL);
>>  }
>
> Well yes, fdget() makes sense but this is minor.
>
> Honestly, I do not really like the new helper... I understand this
> is subjective, so I won't insist. But how about 1/1? We do not need
> fd/file at all. With this patch your sys_getvpid() can just use
> proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
>
> Eric, what do you think?

At some level I don't care this is not exposed to userspace.

However since we are going several rounds with this.

Of the existing uses several of them sleep, which unfortunately means we
can not use rcu locking for everything.  The network namespace ones wind
up taking a reference to struct net because the have the legacy pid case
to deal with.  Which makes we can not use fdget for all callers either.

For this translate_pid rcu locking is sufficient, rcu locking is easy
and doing any more than rcu locking just seems silly.  So let me
respectfully suggest.

struct ns_common *ns_by_fd_rcu(int fd, int type)
{
	struct files_struct *files = current->files;
	struct file *file;
	struct ns_common *ns;
	void *ret;

	file = fcheck_files(files, fd);
        if (!file)
		return ERR_PTR(-EBADF);

	if (file->f_mode & FMODE_PATH)
		return ERR_PTR(-EINVAL);

	if (file->f_op != &ns_file_operations)
        	return ERR_PTR(-EINVAL);

	ns = get_proc_ns(file_inode(file));
	if (ns->ops->type != type)
		return ERR_PTR(-EINVAL);

	return ns;
}

struct pid_namespace *pidns_by_fd_rcu(int fd)
{
	struct ns_common *ns = ns_by_fd_rcu(fd, CLONE_NEWPID);
        if (IS_ERR(ns))
		return ERR_CAST(ns);
	return container_of(ns, struct pid_namespace, ns);
}

SYSCALL_DEFINE3(translate_pid, pid_t, pid_nr, int, sourcefd, int, targetfd)
{
	struct pid_namespace *source, *target;
	struct pid *pid;
	pid_t result;

	rcu_read_lock();

	if (sourcefd >= 0)
        	source = pidns_by_fd_rcu(sourcefd);
	else
		source = task_active_pid_ns(current);

	if (targetfd >= 0)
        	target = pidns_by_fd_rcu(targetfd);
	else
		target = task_active_pid_ns(current);

	pid = find_pid_ns(pid_nr, source);
        result = pid_nr_ns(pid, target);
        if (result == 0)
		result = -ESRCH;          

	rcu_read_unlock();

	return result;
}

Eric

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
@ 2015-09-28 16:37     ` Eric W. Biederman
  0 siblings, 0 replies; 37+ messages in thread
From: Eric W. Biederman @ 2015-09-28 16:37 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Konstantin Khlebnikov, linux-api-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Roman Gushchin,
	Serge Hallyn, Chen Fan, Andrew Morton, Linus Torvalds,
	Stéphane Graber

Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:

> On 09/25, Konstantin Khlebnikov wrote:
>>
>> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref)
>>  {
>> -	struct file *file;
>> +	struct ns_common *ns;
>> +	struct fd f;
>>  
>> -	file = fget(fd);
>> -	if (!file)
>> +	f = fdget(fd);
>> +	if (!f.file)
>>  		return ERR_PTR(-EBADF);
>>  
>> -	if (file->f_op != &ns_file_operations)
>> +	if (f.file->f_op != &ns_file_operations)
>> +		goto out_invalid;
>> +
>> +	ns = get_proc_ns(file_inode(f.file));
>> +	if (nstype && (ns->ops->type != nstype))
>>  		goto out_invalid;
>>  
>> -	return file;
>> +	*fd_ref = f;
>> +	return ns;
>>  
>>  out_invalid:
>> -	fput(file);
>> +	fdput(f);
>>  	return ERR_PTR(-EINVAL);
>>  }
>
> Well yes, fdget() makes sense but this is minor.
>
> Honestly, I do not really like the new helper... I understand this
> is subjective, so I won't insist. But how about 1/1? We do not need
> fd/file at all. With this patch your sys_getvpid() can just use
> proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
>
> Eric, what do you think?

At some level I don't care this is not exposed to userspace.

However since we are going several rounds with this.

Of the existing uses several of them sleep, which unfortunately means we
can not use rcu locking for everything.  The network namespace ones wind
up taking a reference to struct net because the have the legacy pid case
to deal with.  Which makes we can not use fdget for all callers either.

For this translate_pid rcu locking is sufficient, rcu locking is easy
and doing any more than rcu locking just seems silly.  So let me
respectfully suggest.

struct ns_common *ns_by_fd_rcu(int fd, int type)
{
	struct files_struct *files = current->files;
	struct file *file;
	struct ns_common *ns;
	void *ret;

	file = fcheck_files(files, fd);
        if (!file)
		return ERR_PTR(-EBADF);

	if (file->f_mode & FMODE_PATH)
		return ERR_PTR(-EINVAL);

	if (file->f_op != &ns_file_operations)
        	return ERR_PTR(-EINVAL);

	ns = get_proc_ns(file_inode(file));
	if (ns->ops->type != type)
		return ERR_PTR(-EINVAL);

	return ns;
}

struct pid_namespace *pidns_by_fd_rcu(int fd)
{
	struct ns_common *ns = ns_by_fd_rcu(fd, CLONE_NEWPID);
        if (IS_ERR(ns))
		return ERR_CAST(ns);
	return container_of(ns, struct pid_namespace, ns);
}

SYSCALL_DEFINE3(translate_pid, pid_t, pid_nr, int, sourcefd, int, targetfd)
{
	struct pid_namespace *source, *target;
	struct pid *pid;
	pid_t result;

	rcu_read_lock();

	if (sourcefd >= 0)
        	source = pidns_by_fd_rcu(sourcefd);
	else
		source = task_active_pid_ns(current);

	if (targetfd >= 0)
        	target = pidns_by_fd_rcu(targetfd);
	else
		target = task_active_pid_ns(current);

	pid = find_pid_ns(pid_nr, source);
        result = pid_nr_ns(pid, target);
        if (result == 0)
		result = -ESRCH;          

	rcu_read_unlock();

	return result;
}

Eric

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid
  2015-09-25 13:52   ` Konstantin Khlebnikov
@ 2015-09-28 16:57     ` Eric W. Biederman
  -1 siblings, 0 replies; 37+ messages in thread
From: Eric W. Biederman @ 2015-09-28 16:57 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: Roman Gushchin, linux-api-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Oleg Nesterov,
	Chen Fan, Andrew Morton, Linus Torvalds

Konstantin Khlebnikov <khlebnikov-XoJtRXgx1JseBXzfvpsJ4g@public.gmane.org> writes:

> If pid is negative then getvpid() returns pid of parent task for -pid.

Now that I am noticing this.  I don't think I have seen any discussion
about justifying a syscall getting another processes parent pid.  My
apologies if I just missed it.

Why do we want the the parent pid?  We can we usefully do with it?
Is proc really that bad of an interface?

Fetching a parent pid feels like a separate logical operation
from pid translation.  Which makes me a bit uneasy about this
part of the conversation.

> Examples:
> getvpid(pid, ns, -1)      - get pid in our pid namespace
> getvpid(pid, -1, ns)      - get pid in container
> getvpid(pid, -1, ns) > 0  - is pid is reachable from container?
> getvpid(1, ns1, ns2) > 0  - is ns1 inside ns2?
> getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2?
> getvpid(1, ns, -1)        - get init task of pid-namespace
> getvpid(-1, ns, -1)       - get reaper of init task in parent pid-namespace
> getvpid(-pid, -1, -1)     - get ppid by pid

As I step back and pay attention to this case I am half wondering if
perhaps what would be most useful is a file descriptor that refers
to a pid and an updated set of system calls that takes pid file
descriptors instead of pids.

Something like:

    getpidfd(int pidnsfd, pid_t pid);
    
    waitfd(int pidfd, int *status, int options, struct rusage *rusage);
    
    killfd(int pidfd, int sig);
    
    clonefd(...);
    
And perhaps:
    pid_nr_ns(int pidnsfd, int pidfd);

    parentfd(int pidfd);

Eric

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid
@ 2015-09-28 16:57     ` Eric W. Biederman
  0 siblings, 0 replies; 37+ messages in thread
From: Eric W. Biederman @ 2015-09-28 16:57 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: linux-api, containers, linux-kernel, Roman Gushchin,
	Serge Hallyn, Oleg Nesterov, Chen Fan, Andrew Morton,
	Linus Torvalds, Stéphane Graber

Konstantin Khlebnikov <khlebnikov@yandex-team.ru> writes:

> If pid is negative then getvpid() returns pid of parent task for -pid.

Now that I am noticing this.  I don't think I have seen any discussion
about justifying a syscall getting another processes parent pid.  My
apologies if I just missed it.

Why do we want the the parent pid?  We can we usefully do with it?
Is proc really that bad of an interface?

Fetching a parent pid feels like a separate logical operation
from pid translation.  Which makes me a bit uneasy about this
part of the conversation.

> Examples:
> getvpid(pid, ns, -1)      - get pid in our pid namespace
> getvpid(pid, -1, ns)      - get pid in container
> getvpid(pid, -1, ns) > 0  - is pid is reachable from container?
> getvpid(1, ns1, ns2) > 0  - is ns1 inside ns2?
> getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2?
> getvpid(1, ns, -1)        - get init task of pid-namespace
> getvpid(-1, ns, -1)       - get reaper of init task in parent pid-namespace
> getvpid(-pid, -1, -1)     - get ppid by pid

As I step back and pay attention to this case I am half wondering if
perhaps what would be most useful is a file descriptor that refers
to a pid and an updated set of system calls that takes pid file
descriptors instead of pids.

Something like:

    getpidfd(int pidnsfd, pid_t pid);
    
    waitfd(int pidfd, int *status, int options, struct rusage *rusage);
    
    killfd(int pidfd, int sig);
    
    clonefd(...);
    
And perhaps:
    pid_nr_ns(int pidnsfd, int pidfd);

    parentfd(int pidfd);

Eric

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
       [not found]     ` <871tdi8pqj.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
@ 2015-09-29 16:43       ` Oleg Nesterov
  2015-09-30  2:54       ` Chen Fan
  1 sibling, 0 replies; 37+ messages in thread
From: Oleg Nesterov @ 2015-09-29 16:43 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Roman Gushchin, Konstantin Khlebnikov,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chen Fan,
	Andrew Morton, Linus Torvalds

On 09/28, Eric W. Biederman wrote:
>
> Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:
>
> > Honestly, I do not really like the new helper... I understand this
> > is subjective, so I won't insist. But how about 1/1? We do not need
> > fd/file at all. With this patch your sys_getvpid() can just use
> > proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
> >
> > Eric, what do you think?
>
> At some level I don't care this is not exposed to userspace.

I agree, this is minor. But you know, the kernel is already complicated
too much, we should try to simplify/cleanup everything we can ;)

> Of the existing uses several of them sleep, which unfortunately means we
> can not use rcu locking for everything.  The network namespace ones wind
> up taking a reference to struct net because the have the legacy pid case
> to deal with. Which makes we can not use fdget for all callers either.

And that is why 1/1 adds proc_get_ns_by_fd/get_type which can also be used
by the network namespace.

> For this translate_pid rcu locking is sufficient, rcu locking is easy
> and doing any more than rcu locking just seems silly.  So let me
> respectfully suggest.
>
> struct ns_common *ns_by_fd_rcu(int fd, int type)
> {
> 	struct files_struct *files = current->files;
> 	struct file *file;
> 	struct ns_common *ns;
> 	void *ret;
>
> 	file = fcheck_files(files, fd);
>         if (!file)
> 		return ERR_PTR(-EBADF);
>
> 	if (file->f_mode & FMODE_PATH)
> 		return ERR_PTR(-EINVAL);
>
> 	if (file->f_op != &ns_file_operations)
>         	return ERR_PTR(-EINVAL);
>
> 	ns = get_proc_ns(file_inode(file));
> 	if (ns->ops->type != type)
> 		return ERR_PTR(-EINVAL);
>
> 	return ns;
> }

OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref).

And in any case fcheck_files() makes more sense than fdget(), somehow I did
not think about this when I sent 1/1.

Hmm. and after the quick look at cleanup_net() I can't understand whether
get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or
not... Can it?

Oleg.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
       [not found]     ` <871tdi8pqj.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
@ 2015-09-29 16:43       ` Oleg Nesterov
  2015-09-30  2:54       ` Chen Fan
  1 sibling, 0 replies; 37+ messages in thread
From: Oleg Nesterov @ 2015-09-29 16:43 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Konstantin Khlebnikov, linux-api, containers, linux-kernel,
	Roman Gushchin, Serge Hallyn, Chen Fan, Andrew Morton,
	Linus Torvalds, Stéphane Graber

On 09/28, Eric W. Biederman wrote:
>
> Oleg Nesterov <oleg@redhat.com> writes:
>
> > Honestly, I do not really like the new helper... I understand this
> > is subjective, so I won't insist. But how about 1/1? We do not need
> > fd/file at all. With this patch your sys_getvpid() can just use
> > proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
> >
> > Eric, what do you think?
>
> At some level I don't care this is not exposed to userspace.

I agree, this is minor. But you know, the kernel is already complicated
too much, we should try to simplify/cleanup everything we can ;)

> Of the existing uses several of them sleep, which unfortunately means we
> can not use rcu locking for everything.  The network namespace ones wind
> up taking a reference to struct net because the have the legacy pid case
> to deal with. Which makes we can not use fdget for all callers either.

And that is why 1/1 adds proc_get_ns_by_fd/get_type which can also be used
by the network namespace.

> For this translate_pid rcu locking is sufficient, rcu locking is easy
> and doing any more than rcu locking just seems silly.  So let me
> respectfully suggest.
>
> struct ns_common *ns_by_fd_rcu(int fd, int type)
> {
> 	struct files_struct *files = current->files;
> 	struct file *file;
> 	struct ns_common *ns;
> 	void *ret;
>
> 	file = fcheck_files(files, fd);
>         if (!file)
> 		return ERR_PTR(-EBADF);
>
> 	if (file->f_mode & FMODE_PATH)
> 		return ERR_PTR(-EINVAL);
>
> 	if (file->f_op != &ns_file_operations)
>         	return ERR_PTR(-EINVAL);
>
> 	ns = get_proc_ns(file_inode(file));
> 	if (ns->ops->type != type)
> 		return ERR_PTR(-EINVAL);
>
> 	return ns;
> }

OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref).

And in any case fcheck_files() makes more sense than fdget(), somehow I did
not think about this when I sent 1/1.

Hmm. and after the quick look at cleanup_net() I can't understand whether
get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or
not... Can it?

Oleg.


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
@ 2015-09-29 16:43       ` Oleg Nesterov
  0 siblings, 0 replies; 37+ messages in thread
From: Oleg Nesterov @ 2015-09-29 16:43 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Konstantin Khlebnikov, linux-api-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Roman Gushchin,
	Serge Hallyn, Chen Fan, Andrew Morton, Linus Torvalds,
	Stéphane Graber

On 09/28, Eric W. Biederman wrote:
>
> Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:
>
> > Honestly, I do not really like the new helper... I understand this
> > is subjective, so I won't insist. But how about 1/1? We do not need
> > fd/file at all. With this patch your sys_getvpid() can just use
> > proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
> >
> > Eric, what do you think?
>
> At some level I don't care this is not exposed to userspace.

I agree, this is minor. But you know, the kernel is already complicated
too much, we should try to simplify/cleanup everything we can ;)

> Of the existing uses several of them sleep, which unfortunately means we
> can not use rcu locking for everything.  The network namespace ones wind
> up taking a reference to struct net because the have the legacy pid case
> to deal with. Which makes we can not use fdget for all callers either.

And that is why 1/1 adds proc_get_ns_by_fd/get_type which can also be used
by the network namespace.

> For this translate_pid rcu locking is sufficient, rcu locking is easy
> and doing any more than rcu locking just seems silly.  So let me
> respectfully suggest.
>
> struct ns_common *ns_by_fd_rcu(int fd, int type)
> {
> 	struct files_struct *files = current->files;
> 	struct file *file;
> 	struct ns_common *ns;
> 	void *ret;
>
> 	file = fcheck_files(files, fd);
>         if (!file)
> 		return ERR_PTR(-EBADF);
>
> 	if (file->f_mode & FMODE_PATH)
> 		return ERR_PTR(-EINVAL);
>
> 	if (file->f_op != &ns_file_operations)
>         	return ERR_PTR(-EINVAL);
>
> 	ns = get_proc_ns(file_inode(file));
> 	if (ns->ops->type != type)
> 		return ERR_PTR(-EINVAL);
>
> 	return ns;
> }

OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref).

And in any case fcheck_files() makes more sense than fdget(), somehow I did
not think about this when I sent 1/1.

Hmm. and after the quick look at cleanup_net() I can't understand whether
get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or
not... Can it?

Oleg.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
       [not found]       ` <20150929164315.GA16734-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2015-09-29 17:30         ` Eric W. Biederman
  0 siblings, 0 replies; 37+ messages in thread
From: Eric W. Biederman @ 2015-09-29 17:30 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Roman Gushchin, Konstantin Khlebnikov,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chen Fan,
	Andrew Morton, Linus Torvalds

Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:

> On 09/28, Eric W. Biederman wrote:
>>
>> Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:
>>
>> > Honestly, I do not really like the new helper... I understand this
>> > is subjective, so I won't insist. But how about 1/1? We do not need
>> > fd/file at all. With this patch your sys_getvpid() can just use
>> > proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
>> >
>> > Eric, what do you think?
>>
>> At some level I don't care this is not exposed to userspace.
>
> I agree, this is minor. But you know, the kernel is already complicated
> too much, we should try to simplify/cleanup everything we can ;)
>
>> Of the existing uses several of them sleep, which unfortunately means we
>> can not use rcu locking for everything.  The network namespace ones wind
>> up taking a reference to struct net because the have the legacy pid case
>> to deal with. Which makes we can not use fdget for all callers either.
>
> And that is why 1/1 adds proc_get_ns_by_fd/get_type which can also be used
> by the network namespace.
>
>> For this translate_pid rcu locking is sufficient, rcu locking is easy
>> and doing any more than rcu locking just seems silly.  So let me
>> respectfully suggest.
>>
>> struct ns_common *ns_by_fd_rcu(int fd, int type)
>> {
>> 	struct files_struct *files = current->files;
>> 	struct file *file;
>> 	struct ns_common *ns;
>> 	void *ret;
>>
>> 	file = fcheck_files(files, fd);
>>         if (!file)
>> 		return ERR_PTR(-EBADF);
>>
>> 	if (file->f_mode & FMODE_PATH)
>> 		return ERR_PTR(-EINVAL);
>>
>> 	if (file->f_op != &ns_file_operations)
>>         	return ERR_PTR(-EINVAL);
>>
>> 	ns = get_proc_ns(file_inode(file));
>> 	if (ns->ops->type != type)
>> 		return ERR_PTR(-EINVAL);
>>
>> 	return ns;
>> }
>
> OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref).
>
> And in any case fcheck_files() makes more sense than fdget(), somehow I did
> not think about this when I sent 1/1.
>
> Hmm. and after the quick look at cleanup_net() I can't understand whether
> get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or
> not... Can it?

Some of those places need a reference that allows them to sleep, and the
code is shared with the legacy pid case so with an addition of get_net
we can use ns_by_fd_rcu().   There are cases like setns that could
use ns_by_fd_rcu() with code reording.

We can implement get_net_ns_by_fd as:
struct net *get_net_ns_by_fd(int fd)
{
        struct net *net;

	rcu_read_lock();
	net = net_ns_by_fd_rcu(fd);
        if (!IS_ERR(net))
        	get_net(net);
        rcu_read_unlock();

	return net;
}

Which means we can achieve code sharing with the pure rcu version
as a base.

If the networking code did not have the legacy pid case to handle I
would want to do something with struct fd, as the file descriptor
already keeps the struct net reference alive and struct fd allows
for sleeping.

Eric

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
       [not found]       ` <20150929164315.GA16734-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2015-09-29 17:30         ` Eric W. Biederman
  0 siblings, 0 replies; 37+ messages in thread
From: Eric W. Biederman @ 2015-09-29 17:30 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Konstantin Khlebnikov, linux-api, containers, linux-kernel,
	Roman Gushchin, Serge Hallyn, Chen Fan, Andrew Morton,
	Linus Torvalds, Stéphane Graber

Oleg Nesterov <oleg@redhat.com> writes:

> On 09/28, Eric W. Biederman wrote:
>>
>> Oleg Nesterov <oleg@redhat.com> writes:
>>
>> > Honestly, I do not really like the new helper... I understand this
>> > is subjective, so I won't insist. But how about 1/1? We do not need
>> > fd/file at all. With this patch your sys_getvpid() can just use
>> > proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
>> >
>> > Eric, what do you think?
>>
>> At some level I don't care this is not exposed to userspace.
>
> I agree, this is minor. But you know, the kernel is already complicated
> too much, we should try to simplify/cleanup everything we can ;)
>
>> Of the existing uses several of them sleep, which unfortunately means we
>> can not use rcu locking for everything.  The network namespace ones wind
>> up taking a reference to struct net because the have the legacy pid case
>> to deal with. Which makes we can not use fdget for all callers either.
>
> And that is why 1/1 adds proc_get_ns_by_fd/get_type which can also be used
> by the network namespace.
>
>> For this translate_pid rcu locking is sufficient, rcu locking is easy
>> and doing any more than rcu locking just seems silly.  So let me
>> respectfully suggest.
>>
>> struct ns_common *ns_by_fd_rcu(int fd, int type)
>> {
>> 	struct files_struct *files = current->files;
>> 	struct file *file;
>> 	struct ns_common *ns;
>> 	void *ret;
>>
>> 	file = fcheck_files(files, fd);
>>         if (!file)
>> 		return ERR_PTR(-EBADF);
>>
>> 	if (file->f_mode & FMODE_PATH)
>> 		return ERR_PTR(-EINVAL);
>>
>> 	if (file->f_op != &ns_file_operations)
>>         	return ERR_PTR(-EINVAL);
>>
>> 	ns = get_proc_ns(file_inode(file));
>> 	if (ns->ops->type != type)
>> 		return ERR_PTR(-EINVAL);
>>
>> 	return ns;
>> }
>
> OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref).
>
> And in any case fcheck_files() makes more sense than fdget(), somehow I did
> not think about this when I sent 1/1.
>
> Hmm. and after the quick look at cleanup_net() I can't understand whether
> get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or
> not... Can it?

Some of those places need a reference that allows them to sleep, and the
code is shared with the legacy pid case so with an addition of get_net
we can use ns_by_fd_rcu().   There are cases like setns that could
use ns_by_fd_rcu() with code reording.

We can implement get_net_ns_by_fd as:
struct net *get_net_ns_by_fd(int fd)
{
        struct net *net;

	rcu_read_lock();
	net = net_ns_by_fd_rcu(fd);
        if (!IS_ERR(net))
        	get_net(net);
        rcu_read_unlock();

	return net;
}

Which means we can achieve code sharing with the pure rcu version
as a base.

If the networking code did not have the legacy pid case to handle I
would want to do something with struct fd, as the file descriptor
already keeps the struct net reference alive and struct fd allows
for sleeping.

Eric


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
@ 2015-09-29 17:30         ` Eric W. Biederman
  0 siblings, 0 replies; 37+ messages in thread
From: Eric W. Biederman @ 2015-09-29 17:30 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Konstantin Khlebnikov, linux-api-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Roman Gushchin,
	Serge Hallyn, Chen Fan, Andrew Morton, Linus Torvalds,
	Stéphane Graber

Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:

> On 09/28, Eric W. Biederman wrote:
>>
>> Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:
>>
>> > Honestly, I do not really like the new helper... I understand this
>> > is subjective, so I won't insist. But how about 1/1? We do not need
>> > fd/file at all. With this patch your sys_getvpid() can just use
>> > proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
>> >
>> > Eric, what do you think?
>>
>> At some level I don't care this is not exposed to userspace.
>
> I agree, this is minor. But you know, the kernel is already complicated
> too much, we should try to simplify/cleanup everything we can ;)
>
>> Of the existing uses several of them sleep, which unfortunately means we
>> can not use rcu locking for everything.  The network namespace ones wind
>> up taking a reference to struct net because the have the legacy pid case
>> to deal with. Which makes we can not use fdget for all callers either.
>
> And that is why 1/1 adds proc_get_ns_by_fd/get_type which can also be used
> by the network namespace.
>
>> For this translate_pid rcu locking is sufficient, rcu locking is easy
>> and doing any more than rcu locking just seems silly.  So let me
>> respectfully suggest.
>>
>> struct ns_common *ns_by_fd_rcu(int fd, int type)
>> {
>> 	struct files_struct *files = current->files;
>> 	struct file *file;
>> 	struct ns_common *ns;
>> 	void *ret;
>>
>> 	file = fcheck_files(files, fd);
>>         if (!file)
>> 		return ERR_PTR(-EBADF);
>>
>> 	if (file->f_mode & FMODE_PATH)
>> 		return ERR_PTR(-EINVAL);
>>
>> 	if (file->f_op != &ns_file_operations)
>>         	return ERR_PTR(-EINVAL);
>>
>> 	ns = get_proc_ns(file_inode(file));
>> 	if (ns->ops->type != type)
>> 		return ERR_PTR(-EINVAL);
>>
>> 	return ns;
>> }
>
> OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref).
>
> And in any case fcheck_files() makes more sense than fdget(), somehow I did
> not think about this when I sent 1/1.
>
> Hmm. and after the quick look at cleanup_net() I can't understand whether
> get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or
> not... Can it?

Some of those places need a reference that allows them to sleep, and the
code is shared with the legacy pid case so with an addition of get_net
we can use ns_by_fd_rcu().   There are cases like setns that could
use ns_by_fd_rcu() with code reording.

We can implement get_net_ns_by_fd as:
struct net *get_net_ns_by_fd(int fd)
{
        struct net *net;

	rcu_read_lock();
	net = net_ns_by_fd_rcu(fd);
        if (!IS_ERR(net))
        	get_net(net);
        rcu_read_unlock();

	return net;
}

Which means we can achieve code sharing with the pure rcu version
as a base.

If the networking code did not have the legacy pid case to handle I
would want to do something with struct fd, as the file descriptor
already keeps the struct net reference alive and struct fd allows
for sleeping.

Eric

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
       [not found]         ` <874mid16bk.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
@ 2015-09-29 18:38           ` Oleg Nesterov
  0 siblings, 0 replies; 37+ messages in thread
From: Oleg Nesterov @ 2015-09-29 18:38 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Roman Gushchin, Konstantin Khlebnikov,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chen Fan,
	Andrew Morton, Linus Torvalds

On 09/29, Eric W. Biederman wrote:
>
> Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:
>
> > OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref).
> >
> > And in any case fcheck_files() makes more sense than fdget(), somehow I did
> > not think about this when I sent 1/1.
> >
> > Hmm. and after the quick look at cleanup_net() I can't understand whether
> > get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or
> > not... Can it?
>
> Some of those places need a reference that allows them to sleep, and the
> code is shared with the legacy pid case so with an addition of get_net
> we can use ns_by_fd_rcu().   There are cases like setns that could
> use ns_by_fd_rcu() with code reording.
>
> We can implement get_net_ns_by_fd as:
> struct net *get_net_ns_by_fd(int fd)
> {
>         struct net *net;
>
> 	rcu_read_lock();
> 	net = net_ns_by_fd_rcu(fd);
>         if (!IS_ERR(net))
>         	get_net(net);
>         rcu_read_unlock();
>
> 	return net;
> }
>
> Which means we can achieve code sharing with the pure rcu version
> as a base.

Yes, this is what I meant... but don't we need maybe_get_net() ?

Oleg.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
       [not found]         ` <874mid16bk.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
@ 2015-09-29 18:38           ` Oleg Nesterov
  0 siblings, 0 replies; 37+ messages in thread
From: Oleg Nesterov @ 2015-09-29 18:38 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Konstantin Khlebnikov, linux-api, containers, linux-kernel,
	Roman Gushchin, Serge Hallyn, Chen Fan, Andrew Morton,
	Linus Torvalds, Stéphane Graber

On 09/29, Eric W. Biederman wrote:
>
> Oleg Nesterov <oleg@redhat.com> writes:
>
> > OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref).
> >
> > And in any case fcheck_files() makes more sense than fdget(), somehow I did
> > not think about this when I sent 1/1.
> >
> > Hmm. and after the quick look at cleanup_net() I can't understand whether
> > get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or
> > not... Can it?
>
> Some of those places need a reference that allows them to sleep, and the
> code is shared with the legacy pid case so with an addition of get_net
> we can use ns_by_fd_rcu().   There are cases like setns that could
> use ns_by_fd_rcu() with code reording.
>
> We can implement get_net_ns_by_fd as:
> struct net *get_net_ns_by_fd(int fd)
> {
>         struct net *net;
>
> 	rcu_read_lock();
> 	net = net_ns_by_fd_rcu(fd);
>         if (!IS_ERR(net))
>         	get_net(net);
>         rcu_read_unlock();
>
> 	return net;
> }
>
> Which means we can achieve code sharing with the pure rcu version
> as a base.

Yes, this is what I meant... but don't we need maybe_get_net() ?

Oleg.


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
@ 2015-09-29 18:38           ` Oleg Nesterov
  0 siblings, 0 replies; 37+ messages in thread
From: Oleg Nesterov @ 2015-09-29 18:38 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Konstantin Khlebnikov, linux-api-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Roman Gushchin,
	Serge Hallyn, Chen Fan, Andrew Morton, Linus Torvalds,
	Stéphane Graber

On 09/29, Eric W. Biederman wrote:
>
> Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:
>
> > OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref).
> >
> > And in any case fcheck_files() makes more sense than fdget(), somehow I did
> > not think about this when I sent 1/1.
> >
> > Hmm. and after the quick look at cleanup_net() I can't understand whether
> > get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or
> > not... Can it?
>
> Some of those places need a reference that allows them to sleep, and the
> code is shared with the legacy pid case so with an addition of get_net
> we can use ns_by_fd_rcu().   There are cases like setns that could
> use ns_by_fd_rcu() with code reording.
>
> We can implement get_net_ns_by_fd as:
> struct net *get_net_ns_by_fd(int fd)
> {
>         struct net *net;
>
> 	rcu_read_lock();
> 	net = net_ns_by_fd_rcu(fd);
>         if (!IS_ERR(net))
>         	get_net(net);
>         rcu_read_unlock();
>
> 	return net;
> }
>
> Which means we can achieve code sharing with the pure rcu version
> as a base.

Yes, this is what I meant... but don't we need maybe_get_net() ?

Oleg.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
  2015-09-29 18:38           ` Oleg Nesterov
@ 2015-09-29 19:05               ` Eric W. Biederman
  -1 siblings, 0 replies; 37+ messages in thread
From: Eric W. Biederman @ 2015-09-29 19:05 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Roman Gushchin, Konstantin Khlebnikov,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Chen Fan,
	Andrew Morton, Linus Torvalds

Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:

> On 09/29, Eric W. Biederman wrote:
>>
>> Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:
>>
>> > OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref).
>> >
>> > And in any case fcheck_files() makes more sense than fdget(), somehow I did
>> > not think about this when I sent 1/1.
>> >
>> > Hmm. and after the quick look at cleanup_net() I can't understand whether
>> > get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or
>> > not... Can it?
>>
>> Some of those places need a reference that allows them to sleep, and the
>> code is shared with the legacy pid case so with an addition of get_net
>> we can use ns_by_fd_rcu().   There are cases like setns that could
>> use ns_by_fd_rcu() with code reording.
>>
>> We can implement get_net_ns_by_fd as:
>> struct net *get_net_ns_by_fd(int fd)
>> {
>>         struct net *net;
>>
>> 	rcu_read_lock();
>> 	net = net_ns_by_fd_rcu(fd);
>>         if (!IS_ERR(net))
>>         	get_net(net);
>>         rcu_read_unlock();
>>
>> 	return net;
>> }
>>
>> Which means we can achieve code sharing with the pure rcu version
>> as a base.
>
> Yes, this is what I meant... but don't we need maybe_get_net() ?

Let me see.

The call chain is a little interesting so let's trace it.

thread1                               thread2
  net = net_ns_by_fd_rcu(fd);           fput(file)
  get_net(net);                           ____fput
                                            __fput
                                              dput
                                                dentry_kill
                                                  __dentry_kill
                                                    dentry_iput
                                                      iput
                                                        iput_final
                                                          evict
                                                            ->evict_inode (nsfs_evict)
                                                              netns_put
                                                                 put_net
                                                                   __put_net
                                                                      queue_work(net_cleanup_work)
                                                                      ...
                                                                      cleanup_net
                                                                        synchronize_rcu.


Given that someone in another thread can put the file descriptor and I
don't see anything explicitly waiting until after the rcu critical
section ends to put the network namespace I guess we do need
maybe_get_net.

In practice I think the delay logic in fput will act as an rcu barrier,
but that is not guaranteed so we had better not count on it.  Sigh.

Anyway this is a long ways from a syscall to translate pids.

Eric

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
@ 2015-09-29 19:05               ` Eric W. Biederman
  0 siblings, 0 replies; 37+ messages in thread
From: Eric W. Biederman @ 2015-09-29 19:05 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Konstantin Khlebnikov, linux-api, containers, linux-kernel,
	Roman Gushchin, Serge Hallyn, Chen Fan, Andrew Morton,
	Linus Torvalds, Stéphane Graber

Oleg Nesterov <oleg@redhat.com> writes:

> On 09/29, Eric W. Biederman wrote:
>>
>> Oleg Nesterov <oleg@redhat.com> writes:
>>
>> > OK, I won't insist, this too looks better to me than proc_ns_fdget(&fd_ref).
>> >
>> > And in any case fcheck_files() makes more sense than fdget(), somehow I did
>> > not think about this when I sent 1/1.
>> >
>> > Hmm. and after the quick look at cleanup_net() I can't understand whether
>> > get_net_ns_by_fd() can use ns_by_fd_rcu() + maybe_get_net(to_net_ns()) or
>> > not... Can it?
>>
>> Some of those places need a reference that allows them to sleep, and the
>> code is shared with the legacy pid case so with an addition of get_net
>> we can use ns_by_fd_rcu().   There are cases like setns that could
>> use ns_by_fd_rcu() with code reording.
>>
>> We can implement get_net_ns_by_fd as:
>> struct net *get_net_ns_by_fd(int fd)
>> {
>>         struct net *net;
>>
>> 	rcu_read_lock();
>> 	net = net_ns_by_fd_rcu(fd);
>>         if (!IS_ERR(net))
>>         	get_net(net);
>>         rcu_read_unlock();
>>
>> 	return net;
>> }
>>
>> Which means we can achieve code sharing with the pure rcu version
>> as a base.
>
> Yes, this is what I meant... but don't we need maybe_get_net() ?

Let me see.

The call chain is a little interesting so let's trace it.

thread1                               thread2
  net = net_ns_by_fd_rcu(fd);           fput(file)
  get_net(net);                           ____fput
                                            __fput
                                              dput
                                                dentry_kill
                                                  __dentry_kill
                                                    dentry_iput
                                                      iput
                                                        iput_final
                                                          evict
                                                            ->evict_inode (nsfs_evict)
                                                              netns_put
                                                                 put_net
                                                                   __put_net
                                                                      queue_work(net_cleanup_work)
                                                                      ...
                                                                      cleanup_net
                                                                        synchronize_rcu.


Given that someone in another thread can put the file descriptor and I
don't see anything explicitly waiting until after the rcu critical
section ends to put the network namespace I guess we do need
maybe_get_net.

In practice I think the delay logic in fput will act as an rcu barrier,
but that is not guaranteed so we had better not count on it.  Sigh.

Anyway this is a long ways from a syscall to translate pids.

Eric


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
       [not found]     ` <871tdi8pqj.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
  2015-09-29 16:43       ` Oleg Nesterov
@ 2015-09-30  2:54       ` Chen Fan
  1 sibling, 0 replies; 37+ messages in thread
From: Chen Fan @ 2015-09-30  2:54 UTC (permalink / raw)
  To: Eric W. Biederman, Oleg Nesterov
  Cc: Roman Gushchin, Konstantin Khlebnikov,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Andrew Morton,
	Linus Torvalds


On 09/29/2015 12:37 AM, Eric W. Biederman wrote:
> Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:
>
>> On 09/25, Konstantin Khlebnikov wrote:
>>> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref)
>>>   {
>>> -	struct file *file;
>>> +	struct ns_common *ns;
>>> +	struct fd f;
>>>   
>>> -	file = fget(fd);
>>> -	if (!file)
>>> +	f = fdget(fd);
>>> +	if (!f.file)
>>>   		return ERR_PTR(-EBADF);
>>>   
>>> -	if (file->f_op != &ns_file_operations)
>>> +	if (f.file->f_op != &ns_file_operations)
>>> +		goto out_invalid;
>>> +
>>> +	ns = get_proc_ns(file_inode(f.file));
>>> +	if (nstype && (ns->ops->type != nstype))
>>>   		goto out_invalid;
>>>   
>>> -	return file;
>>> +	*fd_ref = f;
>>> +	return ns;
>>>   
>>>   out_invalid:
>>> -	fput(file);
>>> +	fdput(f);
>>>   	return ERR_PTR(-EINVAL);
>>>   }
>> Well yes, fdget() makes sense but this is minor.
>>
>> Honestly, I do not really like the new helper... I understand this
>> is subjective, so I won't insist. But how about 1/1? We do not need
>> fd/file at all. With this patch your sys_getvpid() can just use
>> proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
>>
>> Eric, what do you think?
> At some level I don't care this is not exposed to userspace.
>
> However since we are going several rounds with this.
>
> Of the existing uses several of them sleep, which unfortunately means we
> can not use rcu locking for everything.  The network namespace ones wind
> up taking a reference to struct net because the have the legacy pid case
> to deal with.  Which makes we can not use fdget for all callers either.
>
> For this translate_pid rcu locking is sufficient, rcu locking is easy
> and doing any more than rcu locking just seems silly.  So let me
> respectfully suggest.
>
> struct ns_common *ns_by_fd_rcu(int fd, int type)
> {
> 	struct files_struct *files = current->files;
> 	struct file *file;
> 	struct ns_common *ns;
> 	void *ret;
pointer files seems no more useful. we can
use fcheck(fd) instead.

Thanks,

>
> 	file = fcheck_files(files, fd);
>          if (!file)
> 		return ERR_PTR(-EBADF);
>
> 	if (file->f_mode & FMODE_PATH)
> 		return ERR_PTR(-EINVAL);
>
> 	if (file->f_op != &ns_file_operations)
>          	return ERR_PTR(-EINVAL);
>
> 	ns = get_proc_ns(file_inode(file));
> 	if (ns->ops->type != type)
> 		return ERR_PTR(-EINVAL);
>
> 	return ns;
> }
>
> struct pid_namespace *pidns_by_fd_rcu(int fd)
> {
> 	struct ns_common *ns = ns_by_fd_rcu(fd, CLONE_NEWPID);
>          if (IS_ERR(ns))
> 		return ERR_CAST(ns);
> 	return container_of(ns, struct pid_namespace, ns);
> }
>
> SYSCALL_DEFINE3(translate_pid, pid_t, pid_nr, int, sourcefd, int, targetfd)
> {
> 	struct pid_namespace *source, *target;
> 	struct pid *pid;
> 	pid_t result;
>
> 	rcu_read_lock();
>
> 	if (sourcefd >= 0)
>          	source = pidns_by_fd_rcu(sourcefd);
> 	else
> 		source = task_active_pid_ns(current);
>
> 	if (targetfd >= 0)
>          	target = pidns_by_fd_rcu(targetfd);
> 	else
> 		target = task_active_pid_ns(current);
>
> 	pid = find_pid_ns(pid_nr, source);
>          result = pid_nr_ns(pid, target);
>          if (result == 0)
> 		result = -ESRCH;
>
> 	rcu_read_unlock();
>
> 	return result;
> }
>
> Eric
> .
>

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
       [not found]     ` <871tdi8pqj.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
@ 2015-09-30  2:54       ` Chen Fan
  2015-09-30  2:54       ` Chen Fan
  1 sibling, 0 replies; 37+ messages in thread
From: Chen Fan @ 2015-09-30  2:54 UTC (permalink / raw)
  To: Eric W. Biederman, Oleg Nesterov
  Cc: Konstantin Khlebnikov, linux-api, containers, linux-kernel,
	Roman Gushchin, Serge Hallyn, Andrew Morton, Linus Torvalds,
	Stéphane Graber


On 09/29/2015 12:37 AM, Eric W. Biederman wrote:
> Oleg Nesterov <oleg@redhat.com> writes:
>
>> On 09/25, Konstantin Khlebnikov wrote:
>>> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref)
>>>   {
>>> -	struct file *file;
>>> +	struct ns_common *ns;
>>> +	struct fd f;
>>>   
>>> -	file = fget(fd);
>>> -	if (!file)
>>> +	f = fdget(fd);
>>> +	if (!f.file)
>>>   		return ERR_PTR(-EBADF);
>>>   
>>> -	if (file->f_op != &ns_file_operations)
>>> +	if (f.file->f_op != &ns_file_operations)
>>> +		goto out_invalid;
>>> +
>>> +	ns = get_proc_ns(file_inode(f.file));
>>> +	if (nstype && (ns->ops->type != nstype))
>>>   		goto out_invalid;
>>>   
>>> -	return file;
>>> +	*fd_ref = f;
>>> +	return ns;
>>>   
>>>   out_invalid:
>>> -	fput(file);
>>> +	fdput(f);
>>>   	return ERR_PTR(-EINVAL);
>>>   }
>> Well yes, fdget() makes sense but this is minor.
>>
>> Honestly, I do not really like the new helper... I understand this
>> is subjective, so I won't insist. But how about 1/1? We do not need
>> fd/file at all. With this patch your sys_getvpid() can just use
>> proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
>>
>> Eric, what do you think?
> At some level I don't care this is not exposed to userspace.
>
> However since we are going several rounds with this.
>
> Of the existing uses several of them sleep, which unfortunately means we
> can not use rcu locking for everything.  The network namespace ones wind
> up taking a reference to struct net because the have the legacy pid case
> to deal with.  Which makes we can not use fdget for all callers either.
>
> For this translate_pid rcu locking is sufficient, rcu locking is easy
> and doing any more than rcu locking just seems silly.  So let me
> respectfully suggest.
>
> struct ns_common *ns_by_fd_rcu(int fd, int type)
> {
> 	struct files_struct *files = current->files;
> 	struct file *file;
> 	struct ns_common *ns;
> 	void *ret;
pointer files seems no more useful. we can
use fcheck(fd) instead.

Thanks,

>
> 	file = fcheck_files(files, fd);
>          if (!file)
> 		return ERR_PTR(-EBADF);
>
> 	if (file->f_mode & FMODE_PATH)
> 		return ERR_PTR(-EINVAL);
>
> 	if (file->f_op != &ns_file_operations)
>          	return ERR_PTR(-EINVAL);
>
> 	ns = get_proc_ns(file_inode(file));
> 	if (ns->ops->type != type)
> 		return ERR_PTR(-EINVAL);
>
> 	return ns;
> }
>
> struct pid_namespace *pidns_by_fd_rcu(int fd)
> {
> 	struct ns_common *ns = ns_by_fd_rcu(fd, CLONE_NEWPID);
>          if (IS_ERR(ns))
> 		return ERR_CAST(ns);
> 	return container_of(ns, struct pid_namespace, ns);
> }
>
> SYSCALL_DEFINE3(translate_pid, pid_t, pid_nr, int, sourcefd, int, targetfd)
> {
> 	struct pid_namespace *source, *target;
> 	struct pid *pid;
> 	pid_t result;
>
> 	rcu_read_lock();
>
> 	if (sourcefd >= 0)
>          	source = pidns_by_fd_rcu(sourcefd);
> 	else
> 		source = task_active_pid_ns(current);
>
> 	if (targetfd >= 0)
>          	target = pidns_by_fd_rcu(targetfd);
> 	else
> 		target = task_active_pid_ns(current);
>
> 	pid = find_pid_ns(pid_nr, source);
>          result = pid_nr_ns(pid, target);
>          if (result == 0)
> 		result = -ESRCH;
>
> 	rcu_read_unlock();
>
> 	return result;
> }
>
> Eric
> .
>


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 0/1] ns: introduce proc_get_ns_by_fd()
@ 2015-09-30  2:54       ` Chen Fan
  0 siblings, 0 replies; 37+ messages in thread
From: Chen Fan @ 2015-09-30  2:54 UTC (permalink / raw)
  To: Eric W. Biederman, Oleg Nesterov
  Cc: Konstantin Khlebnikov, linux-api-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Roman Gushchin,
	Serge Hallyn, Andrew Morton, Linus Torvalds,
	Stéphane Graber


On 09/29/2015 12:37 AM, Eric W. Biederman wrote:
> Oleg Nesterov <oleg-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> writes:
>
>> On 09/25, Konstantin Khlebnikov wrote:
>>> +struct ns_common *proc_ns_fdget(int fd, int nstype, struct fd *fd_ref)
>>>   {
>>> -	struct file *file;
>>> +	struct ns_common *ns;
>>> +	struct fd f;
>>>   
>>> -	file = fget(fd);
>>> -	if (!file)
>>> +	f = fdget(fd);
>>> +	if (!f.file)
>>>   		return ERR_PTR(-EBADF);
>>>   
>>> -	if (file->f_op != &ns_file_operations)
>>> +	if (f.file->f_op != &ns_file_operations)
>>> +		goto out_invalid;
>>> +
>>> +	ns = get_proc_ns(file_inode(f.file));
>>> +	if (nstype && (ns->ops->type != nstype))
>>>   		goto out_invalid;
>>>   
>>> -	return file;
>>> +	*fd_ref = f;
>>> +	return ns;
>>>   
>>>   out_invalid:
>>> -	fput(file);
>>> +	fdput(f);
>>>   	return ERR_PTR(-EINVAL);
>>>   }
>> Well yes, fdget() makes sense but this is minor.
>>
>> Honestly, I do not really like the new helper... I understand this
>> is subjective, so I won't insist. But how about 1/1? We do not need
>> fd/file at all. With this patch your sys_getvpid() can just use
>> proc_get_ns_by_fd(fd, CLONE_NEWPID) and put_pid_ns().
>>
>> Eric, what do you think?
> At some level I don't care this is not exposed to userspace.
>
> However since we are going several rounds with this.
>
> Of the existing uses several of them sleep, which unfortunately means we
> can not use rcu locking for everything.  The network namespace ones wind
> up taking a reference to struct net because the have the legacy pid case
> to deal with.  Which makes we can not use fdget for all callers either.
>
> For this translate_pid rcu locking is sufficient, rcu locking is easy
> and doing any more than rcu locking just seems silly.  So let me
> respectfully suggest.
>
> struct ns_common *ns_by_fd_rcu(int fd, int type)
> {
> 	struct files_struct *files = current->files;
> 	struct file *file;
> 	struct ns_common *ns;
> 	void *ret;
pointer files seems no more useful. we can
use fcheck(fd) instead.

Thanks,

>
> 	file = fcheck_files(files, fd);
>          if (!file)
> 		return ERR_PTR(-EBADF);
>
> 	if (file->f_mode & FMODE_PATH)
> 		return ERR_PTR(-EINVAL);
>
> 	if (file->f_op != &ns_file_operations)
>          	return ERR_PTR(-EINVAL);
>
> 	ns = get_proc_ns(file_inode(file));
> 	if (ns->ops->type != type)
> 		return ERR_PTR(-EINVAL);
>
> 	return ns;
> }
>
> struct pid_namespace *pidns_by_fd_rcu(int fd)
> {
> 	struct ns_common *ns = ns_by_fd_rcu(fd, CLONE_NEWPID);
>          if (IS_ERR(ns))
> 		return ERR_CAST(ns);
> 	return container_of(ns, struct pid_namespace, ns);
> }
>
> SYSCALL_DEFINE3(translate_pid, pid_t, pid_nr, int, sourcefd, int, targetfd)
> {
> 	struct pid_namespace *source, *target;
> 	struct pid *pid;
> 	pid_t result;
>
> 	rcu_read_lock();
>
> 	if (sourcefd >= 0)
>          	source = pidns_by_fd_rcu(sourcefd);
> 	else
> 		source = task_active_pid_ns(current);
>
> 	if (targetfd >= 0)
>          	target = pidns_by_fd_rcu(targetfd);
> 	else
> 		target = task_active_pid_ns(current);
>
> 	pid = find_pid_ns(pid_nr, source);
>          result = pid_nr_ns(pid, target);
>          if (result == 0)
> 		result = -ESRCH;
>
> 	rcu_read_unlock();
>
> 	return result;
> }
>
> Eric
> .
>

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid
       [not found]     ` <87d1x25vng.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
@ 2015-10-20 10:04       ` Konstantin Khlebnikov
  0 siblings, 0 replies; 37+ messages in thread
From: Konstantin Khlebnikov @ 2015-10-20 10:04 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Roman Gushchin, linux-api-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	Serge Hallyn, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Oleg Nesterov,
	Chen Fan, Andrew Morton, Linus Torvalds

On 28.09.2015 19:57, Eric W. Biederman wrote:
> Konstantin Khlebnikov <khlebnikov-XoJtRXgx1JseBXzfvpsJ4g@public.gmane.org> writes:
>
>> If pid is negative then getvpid() returns pid of parent task for -pid.
>
> Now that I am noticing this.  I don't think I have seen any discussion
> about justifying a syscall getting another processes parent pid.  My
> apologies if I just missed it.
>

Sorry for late response. This completely fell out of my mind after LinuxCon.

> Why do we want the the parent pid?  We can we usefully do with it?
> Is proc really that bad of an interface?
>
> Fetching a parent pid feels like a separate logical operation
> from pid translation.  Which makes me a bit uneasy about this
> part of the conversation.

Yep proc interface is bad. /proc/$pid/stat is almost impossible to
parse without flaws because task could set second field "comm" into
any string and fake ppid  - for example ") Z 1". /proc/$pid/status
is better but it has more information and thus slower.

This trick for distant getppid looks cheap useful:
in this interface space of negative pids is free for use.

>
>> Examples:
>> getvpid(pid, ns, -1)      - get pid in our pid namespace
>> getvpid(pid, -1, ns)      - get pid in container
>> getvpid(pid, -1, ns) > 0  - is pid is reachable from container?
>> getvpid(1, ns1, ns2) > 0  - is ns1 inside ns2?
>> getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2?
>> getvpid(1, ns, -1)        - get init task of pid-namespace
>> getvpid(-1, ns, -1)       - get reaper of init task in parent pid-namespace
>> getvpid(-pid, -1, -1)     - get ppid by pid
>
> As I step back and pay attention to this case I am half wondering if
> perhaps what would be most useful is a file descriptor that refers
> to a pid and an updated set of system calls that takes pid file
> descriptors instead of pids.

Fd which pins pids isn't a good idea.

I think it's better to refer (but not hold) task rather than pid.
For example inode of taskfd will hold small buffer for task exit
status: task holds reference to its own taskfd inode and populates
status when exits. Here will be no zombies and delayed reaping.

Something like:

task_fd = clonefd()
                      ...
select(...)
                      exit(...)
pread(task_fd, &status_rusage_etc, sizeof, 0);
close(task_fd);

Task pid also could be part of structure in that fd. Potentially it
could provide the same information as /proc/$pid/... in effective
binary format: we can read only required fields of structure and
kernel can skip unneeded calculations.

>
> Something like:
>
>      getpidfd(int pidnsfd, pid_t pid);
>
>      waitfd(int pidfd, int *status, int options, struct rusage *rusage);
>
>      killfd(int pidfd, int sig);
>
>      clonefd(...);
>
> And perhaps:
>      pid_nr_ns(int pidnsfd, int pidfd);
>
>      parentfd(int pidfd);
>
> Eric
>

-- 
Konstantin

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid
       [not found]     ` <87d1x25vng.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
@ 2015-10-20 10:04       ` Konstantin Khlebnikov
  0 siblings, 0 replies; 37+ messages in thread
From: Konstantin Khlebnikov @ 2015-10-20 10:04 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: linux-api, containers, linux-kernel, Roman Gushchin,
	Serge Hallyn, Oleg Nesterov, Chen Fan, Andrew Morton,
	Linus Torvalds, Stéphane Graber

On 28.09.2015 19:57, Eric W. Biederman wrote:
> Konstantin Khlebnikov <khlebnikov@yandex-team.ru> writes:
>
>> If pid is negative then getvpid() returns pid of parent task for -pid.
>
> Now that I am noticing this.  I don't think I have seen any discussion
> about justifying a syscall getting another processes parent pid.  My
> apologies if I just missed it.
>

Sorry for late response. This completely fell out of my mind after LinuxCon.

> Why do we want the the parent pid?  We can we usefully do with it?
> Is proc really that bad of an interface?
>
> Fetching a parent pid feels like a separate logical operation
> from pid translation.  Which makes me a bit uneasy about this
> part of the conversation.

Yep proc interface is bad. /proc/$pid/stat is almost impossible to
parse without flaws because task could set second field "comm" into
any string and fake ppid  - for example ") Z 1". /proc/$pid/status
is better but it has more information and thus slower.

This trick for distant getppid looks cheap useful:
in this interface space of negative pids is free for use.

>
>> Examples:
>> getvpid(pid, ns, -1)      - get pid in our pid namespace
>> getvpid(pid, -1, ns)      - get pid in container
>> getvpid(pid, -1, ns) > 0  - is pid is reachable from container?
>> getvpid(1, ns1, ns2) > 0  - is ns1 inside ns2?
>> getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2?
>> getvpid(1, ns, -1)        - get init task of pid-namespace
>> getvpid(-1, ns, -1)       - get reaper of init task in parent pid-namespace
>> getvpid(-pid, -1, -1)     - get ppid by pid
>
> As I step back and pay attention to this case I am half wondering if
> perhaps what would be most useful is a file descriptor that refers
> to a pid and an updated set of system calls that takes pid file
> descriptors instead of pids.

Fd which pins pids isn't a good idea.

I think it's better to refer (but not hold) task rather than pid.
For example inode of taskfd will hold small buffer for task exit
status: task holds reference to its own taskfd inode and populates
status when exits. Here will be no zombies and delayed reaping.

Something like:

task_fd = clonefd()
                      ...
select(...)
                      exit(...)
pread(task_fd, &status_rusage_etc, sizeof, 0);
close(task_fd);

Task pid also could be part of structure in that fd. Potentially it
could provide the same information as /proc/$pid/... in effective
binary format: we can read only required fields of structure and
kernel can skip unneeded calculations.

>
> Something like:
>
>      getpidfd(int pidnsfd, pid_t pid);
>
>      waitfd(int pidfd, int *status, int options, struct rusage *rusage);
>
>      killfd(int pidfd, int sig);
>
>      clonefd(...);
>
> And perhaps:
>      pid_nr_ns(int pidnsfd, int pidfd);
>
>      parentfd(int pidfd);
>
> Eric
>

-- 
Konstantin

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH RFC v3 2/2] pidns: introduce syscall getvpid
@ 2015-10-20 10:04       ` Konstantin Khlebnikov
  0 siblings, 0 replies; 37+ messages in thread
From: Konstantin Khlebnikov @ 2015-10-20 10:04 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: linux-api-u79uwXL29TY76Z2rM5mHXA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Roman Gushchin,
	Serge Hallyn, Oleg Nesterov, Chen Fan, Andrew Morton,
	Linus Torvalds, Stéphane Graber

On 28.09.2015 19:57, Eric W. Biederman wrote:
> Konstantin Khlebnikov <khlebnikov-XoJtRXgx1JseBXzfvpsJ4g@public.gmane.org> writes:
>
>> If pid is negative then getvpid() returns pid of parent task for -pid.
>
> Now that I am noticing this.  I don't think I have seen any discussion
> about justifying a syscall getting another processes parent pid.  My
> apologies if I just missed it.
>

Sorry for late response. This completely fell out of my mind after LinuxCon.

> Why do we want the the parent pid?  We can we usefully do with it?
> Is proc really that bad of an interface?
>
> Fetching a parent pid feels like a separate logical operation
> from pid translation.  Which makes me a bit uneasy about this
> part of the conversation.

Yep proc interface is bad. /proc/$pid/stat is almost impossible to
parse without flaws because task could set second field "comm" into
any string and fake ppid  - for example ") Z 1". /proc/$pid/status
is better but it has more information and thus slower.

This trick for distant getppid looks cheap useful:
in this interface space of negative pids is free for use.

>
>> Examples:
>> getvpid(pid, ns, -1)      - get pid in our pid namespace
>> getvpid(pid, -1, ns)      - get pid in container
>> getvpid(pid, -1, ns) > 0  - is pid is reachable from container?
>> getvpid(1, ns1, ns2) > 0  - is ns1 inside ns2?
>> getvpid(1, ns1, ns2) == 0 - is ns1 outside ns2?
>> getvpid(1, ns, -1)        - get init task of pid-namespace
>> getvpid(-1, ns, -1)       - get reaper of init task in parent pid-namespace
>> getvpid(-pid, -1, -1)     - get ppid by pid
>
> As I step back and pay attention to this case I am half wondering if
> perhaps what would be most useful is a file descriptor that refers
> to a pid and an updated set of system calls that takes pid file
> descriptors instead of pids.

Fd which pins pids isn't a good idea.

I think it's better to refer (but not hold) task rather than pid.
For example inode of taskfd will hold small buffer for task exit
status: task holds reference to its own taskfd inode and populates
status when exits. Here will be no zombies and delayed reaping.

Something like:

task_fd = clonefd()
                      ...
select(...)
                      exit(...)
pread(task_fd, &status_rusage_etc, sizeof, 0);
close(task_fd);

Task pid also could be part of structure in that fd. Potentially it
could provide the same information as /proc/$pid/... in effective
binary format: we can read only required fields of structure and
kernel can skip unneeded calculations.

>
> Something like:
>
>      getpidfd(int pidnsfd, pid_t pid);
>
>      waitfd(int pidfd, int *status, int options, struct rusage *rusage);
>
>      killfd(int pidfd, int sig);
>
>      clonefd(...);
>
> And perhaps:
>      pid_nr_ns(int pidnsfd, int pidfd);
>
>      parentfd(int pidfd);
>
> Eric
>

-- 
Konstantin

^ permalink raw reply	[flat|nested] 37+ messages in thread

end of thread, other threads:[~2015-10-20 10:04 UTC | newest]

Thread overview: 37+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-09-25 13:52 [PATCH RFC v3 1/2] nsfs: replace proc_ns_fget() with proc_ns_fdget() Konstantin Khlebnikov
2015-09-25 13:52 ` Konstantin Khlebnikov
2015-09-25 13:52 ` [PATCH RFC v3 2/2] pidns: introduce syscall getvpid Konstantin Khlebnikov
2015-09-25 13:52   ` Konstantin Khlebnikov
2015-09-28  4:12   ` kbuild test robot
2015-09-28  4:12     ` kbuild test robot
2015-09-28 16:22   ` Eric W. Biederman
2015-09-28 16:22     ` Eric W. Biederman
2015-09-28 16:57   ` Eric W. Biederman
2015-09-28 16:57     ` Eric W. Biederman
     [not found]     ` <87d1x25vng.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
2015-10-20 10:04       ` Konstantin Khlebnikov
2015-10-20 10:04     ` Konstantin Khlebnikov
2015-10-20 10:04       ` Konstantin Khlebnikov
2015-09-25 17:56 ` [PATCH 0/1] ns: introduce proc_get_ns_by_fd() Oleg Nesterov
2015-09-25 17:56   ` Oleg Nesterov
     [not found]   ` <20150925175654.GA12504-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2015-09-25 17:57     ` [PATCH 1/1] " Oleg Nesterov
2015-09-25 17:57       ` Oleg Nesterov
2015-09-28  8:21     ` [PATCH 0/1] " Konstantin Khlebnikov
2015-09-28 16:37     ` Eric W. Biederman
2015-09-28  8:21   ` Konstantin Khlebnikov
2015-09-28  8:21     ` Konstantin Khlebnikov
2015-09-28 16:37   ` Eric W. Biederman
2015-09-28 16:37     ` Eric W. Biederman
2015-09-29 16:43     ` Oleg Nesterov
2015-09-29 16:43       ` Oleg Nesterov
     [not found]       ` <20150929164315.GA16734-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2015-09-29 17:30         ` Eric W. Biederman
2015-09-29 17:30       ` Eric W. Biederman
2015-09-29 17:30         ` Eric W. Biederman
2015-09-29 18:38         ` Oleg Nesterov
2015-09-29 18:38           ` Oleg Nesterov
     [not found]           ` <20150929183833.GA21875-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2015-09-29 19:05             ` Eric W. Biederman
2015-09-29 19:05               ` Eric W. Biederman
     [not found]         ` <874mid16bk.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
2015-09-29 18:38           ` Oleg Nesterov
     [not found]     ` <871tdi8pqj.fsf-JOvCrm2gF+uungPnsOpG7nhyD016LWXt@public.gmane.org>
2015-09-29 16:43       ` Oleg Nesterov
2015-09-30  2:54       ` Chen Fan
2015-09-30  2:54     ` Chen Fan
2015-09-30  2:54       ` Chen Fan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.