linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 1/2] fork: extend clone3() to support CLONE_SET_TID
@ 2019-07-31 16:12 Adrian Reber
  2019-07-31 16:12 ` [PATCH v2 2/2] selftests: add test for clone3() with set_tid Adrian Reber
                   ` (3 more replies)
  0 siblings, 4 replies; 16+ messages in thread
From: Adrian Reber @ 2019-07-31 16:12 UTC (permalink / raw)
  To: Christian Brauner, Eric Biederman, Pavel Emelianov, Jann Horn,
	Oleg Nesterov, Dmitry Safonov
  Cc: linux-kernel, Andrei Vagin, Mike Rapoport, Radostin Stoyanov,
	Adrian Reber

The main motivation to add CLONE_SET_TID to clone3() is CRIU.

To restore a process with the same PID/TID CRIU currently uses
/proc/sys/kernel/ns_last_pid. It writes the desired (PID - 1) to
ns_last_pid and then (quickly) does a clone(). This works most of the
time, but it is racy. It is also slow as it requires multiple syscalls.

Extending clone3() to support CLONE_SET_TID makes it possible restore a
process using CRIU without accessing /proc/sys/kernel/ns_last_pid and
race free (as long as the desired PID/TID is available).

This clone3() extension places the same restrictions (CAP_SYS_ADMIN)
on clone3() with set_tid as they are currently in place for ns_last_pid.

v2:
 - Removed (size < sizeof(struct clone_args)) as discussed with
   Christian and Dmitry
 - Added comment to ((set_tid != 1) && idr_get_cursor() <= 1) (Oleg)
 - Use idr_alloc() instead of idr_alloc_cyclic() (Oleg)

Signed-off-by: Adrian Reber <areber@redhat.com>
---
 include/linux/pid.h        |  2 +-
 include/linux/sched/task.h |  1 +
 include/uapi/linux/sched.h |  2 ++
 kernel/fork.c              | 25 ++++++++++++++++---------
 kernel/pid.c               | 30 +++++++++++++++++++++++-------
 5 files changed, 43 insertions(+), 17 deletions(-)

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 2a83e434db9d..052000db0ced 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -116,7 +116,7 @@ extern struct pid *find_vpid(int nr);
 extern struct pid *find_get_pid(int nr);
 extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
 
-extern struct pid *alloc_pid(struct pid_namespace *ns);
+extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t set_tid);
 extern void free_pid(struct pid *pid);
 extern void disable_pid_allocation(struct pid_namespace *ns);
 
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 0497091e40c1..4f2a80564332 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -26,6 +26,7 @@ struct kernel_clone_args {
 	unsigned long stack;
 	unsigned long stack_size;
 	unsigned long tls;
+	pid_t set_tid;
 };
 
 /*
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index b3105ac1381a..8c4e803e8147 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -32,6 +32,7 @@
 #define CLONE_NEWPID		0x20000000	/* New pid namespace */
 #define CLONE_NEWNET		0x40000000	/* New network namespace */
 #define CLONE_IO		0x80000000	/* Clone io context */
+#define CLONE_SET_TID		0x100000000ULL	/* set if the desired TID is set in set_tid */
 
 /*
  * Arguments for the clone3 syscall
@@ -45,6 +46,7 @@ struct clone_args {
 	__aligned_u64 stack;
 	__aligned_u64 stack_size;
 	__aligned_u64 tls;
+	__aligned_u64 set_tid;
 };
 
 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index 2852d0e76ea3..405f98ce4c83 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2031,7 +2031,7 @@ static __latent_entropy struct task_struct *copy_process(
 	stackleak_task_init(p);
 
 	if (pid != &init_struct_pid) {
-		pid = alloc_pid(p->nsproxy->pid_ns_for_children);
+		pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid);
 		if (IS_ERR(pid)) {
 			retval = PTR_ERR(pid);
 			goto bad_fork_cleanup_thread;
@@ -2530,14 +2530,12 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
 					      struct clone_args __user *uargs,
 					      size_t size)
 {
+	struct pid_namespace *pid_ns = task_active_pid_ns(current);
 	struct clone_args args;
 
 	if (unlikely(size > PAGE_SIZE))
 		return -E2BIG;
 
-	if (unlikely(size < sizeof(struct clone_args)))
-		return -EINVAL;
-
 	if (unlikely(!access_ok(uargs, size)))
 		return -EFAULT;
 
@@ -2562,6 +2560,9 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
 	if (copy_from_user(&args, uargs, size))
 		return -EFAULT;
 
+	if ((args.flags & CLONE_SET_TID) && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
+		return -EPERM;
+
 	*kargs = (struct kernel_clone_args){
 		.flags		= args.flags,
 		.pidfd		= u64_to_user_ptr(args.pidfd),
@@ -2571,6 +2572,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
 		.stack		= args.stack,
 		.stack_size	= args.stack_size,
 		.tls		= args.tls,
+		.set_tid	= args.set_tid,
 	};
 
 	return 0;
@@ -2578,11 +2580,16 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
 
 static bool clone3_args_valid(const struct kernel_clone_args *kargs)
 {
-	/*
-	 * All lower bits of the flag word are taken.
-	 * Verify that no other unknown flags are passed along.
-	 */
-	if (kargs->flags & ~CLONE_LEGACY_FLAGS)
+	/* Verify that no other unknown flags are passed along. */
+	if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_SET_TID))
+		return false;
+
+	/* Fail if set_tid is set without CLONE_SET_TID */
+	if (kargs->set_tid && !(kargs->flags & CLONE_SET_TID))
+		return false;
+
+	/* Also fail if set_tid is invalid */
+	if ((kargs->set_tid <= 0) && (kargs->flags & CLONE_SET_TID))
 		return false;
 
 	/*
diff --git a/kernel/pid.c b/kernel/pid.c
index 0a9f2e437217..977f3ac39d7f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -157,7 +157,7 @@ void free_pid(struct pid *pid)
 	call_rcu(&pid->rcu, delayed_put_pid);
 }
 
-struct pid *alloc_pid(struct pid_namespace *ns)
+struct pid *alloc_pid(struct pid_namespace *ns, int set_tid)
 {
 	struct pid *pid;
 	enum pid_type type;
@@ -186,12 +186,28 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 		if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
 			pid_min = RESERVED_PIDS;
 
-		/*
-		 * Store a null pointer so find_pid_ns does not find
-		 * a partially initialized PID (see below).
-		 */
-		nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
-				      pid_max, GFP_ATOMIC);
+		if (set_tid) {
+			/*
+			 * Also fail if a PID != 1 is requested
+			 * and no PID 1 exists.
+			 */
+			if ((set_tid >= pid_max) || ((set_tid != 1) &&
+				(idr_get_cursor(&tmp->idr) <= 1))) {
+				spin_unlock_irq(&pidmap_lock);
+				retval = -EINVAL;
+				goto out_free;
+			}
+			nr = idr_alloc(&tmp->idr, NULL, set_tid,
+				       set_tid + 1, GFP_ATOMIC);
+			set_tid = 0;
+		} else {
+			/*
+			 * Store a null pointer so find_pid_ns does not find
+			 * a partially initialized PID (see below).
+			 */
+			nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
+					      pid_max, GFP_ATOMIC);
+		}
 		spin_unlock_irq(&pidmap_lock);
 		idr_preload_end();
 
-- 
2.21.0


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2 2/2] selftests: add test for clone3() with set_tid
  2019-07-31 16:12 [PATCH v2 1/2] fork: extend clone3() to support CLONE_SET_TID Adrian Reber
@ 2019-07-31 16:12 ` Adrian Reber
  2019-07-31 16:49 ` [PATCH v2 1/2] fork: extend clone3() to support CLONE_SET_TID Dmitry Safonov
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 16+ messages in thread
From: Adrian Reber @ 2019-07-31 16:12 UTC (permalink / raw)
  To: Christian Brauner, Eric Biederman, Pavel Emelianov, Jann Horn,
	Oleg Nesterov, Dmitry Safonov
  Cc: linux-kernel, Andrei Vagin, Mike Rapoport, Radostin Stoyanov,
	Adrian Reber

This tests clone3() with set_tid to see if all desired PIDs are working
as expected. The test tries to clone3() with a set_tid of -1, 1,
pid_max, a PID which is already in use and an unused PID. The same
tests are also running in PID namespace.

Signed-off-by: Adrian Reber <areber@redhat.com>
---
 tools/testing/selftests/clone3/.gitignore     |   1 +
 tools/testing/selftests/clone3/Makefile       |  11 ++
 .../testing/selftests/clone3/clone3_set_tid.c | 148 ++++++++++++++++++
 3 files changed, 160 insertions(+)
 create mode 100644 tools/testing/selftests/clone3/.gitignore
 create mode 100644 tools/testing/selftests/clone3/Makefile
 create mode 100644 tools/testing/selftests/clone3/clone3_set_tid.c

diff --git a/tools/testing/selftests/clone3/.gitignore b/tools/testing/selftests/clone3/.gitignore
new file mode 100644
index 000000000000..09ccea33016c
--- /dev/null
+++ b/tools/testing/selftests/clone3/.gitignore
@@ -0,0 +1 @@
+clone3_set_tid
diff --git a/tools/testing/selftests/clone3/Makefile b/tools/testing/selftests/clone3/Makefile
new file mode 100644
index 000000000000..45c77b50f367
--- /dev/null
+++ b/tools/testing/selftests/clone3/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0
+uname_M := $(shell uname -m 2>/dev/null || echo not)
+ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/)
+
+CFLAGS += -I../../../../usr/include/
+
+ifeq ($(ARCH),x86_64)
+	TEST_GEN_PROGS := clone3_set_tid
+endif
+
+include ../lib.mk
diff --git a/tools/testing/selftests/clone3/clone3_set_tid.c b/tools/testing/selftests/clone3/clone3_set_tid.c
new file mode 100644
index 000000000000..1ed0845aa4c5
--- /dev/null
+++ b/tools/testing/selftests/clone3/clone3_set_tid.c
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/* Based on Christian Brauner's clone3() example */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <sched.h>
+
+#include "../kselftest.h"
+
+static pid_t raw_clone(struct clone_args *args)
+{
+	return syscall(__NR_clone3, args, sizeof(struct clone_args));
+}
+
+static int call_clone3_set_tid(int set_tid, int flags)
+{
+	struct clone_args args = {0};
+	pid_t ppid = -1;
+	pid_t pid = -1;
+	int status;
+
+	args.flags = flags | CLONE_SET_TID;
+	args.exit_signal = SIGCHLD;
+	args.set_tid = set_tid;
+
+	pid = raw_clone(&args);
+	if (pid < 0) {
+		ksft_print_msg("%s - Failed to create new process\n",
+				strerror(errno));
+		return -errno;
+	}
+
+	if (pid == 0) {
+		ksft_print_msg("I am the child, my PID is %d\n", getpid());
+		if (set_tid != getpid())
+			_exit(EXIT_FAILURE);
+		_exit(EXIT_SUCCESS);
+	}
+
+	ppid = getpid();
+	ksft_print_msg("I am the parent (%d). My child's pid is %d\n",
+			ppid, pid);
+
+	(void)wait(&status);
+	if (WEXITSTATUS(status))
+		return WEXITSTATUS(status);
+
+	return 0;
+}
+
+static int test_clone3_set_tid(int set_tid, int flags, int expected)
+{
+	int ret;
+	ksft_print_msg("[%d] Trying clone3() with CLONE_SET_TID to %d "
+		        "and 0x%x\n", getpid(), set_tid, flags);
+	ret = call_clone3_set_tid(set_tid, flags);
+	ksft_print_msg("[%d] clone3() with CLONE_SET_TID %d says :%d "
+			"- expected %d\n", getpid(), set_tid, ret, expected);
+	if (ret != expected)
+		ksft_exit_fail_msg("[%d] Result (%d) is different than "
+				    "expected (%d)\n", getpid(), ret, expected);
+	ksft_test_result_pass("[%d] Result (%d) matches expectation (%d)\n",
+			getpid(), ret, expected);
+	return 0;
+}
+int main(int argc, char *argv[])
+{
+	FILE *f;
+	int pid_max = 0;
+	pid_t pid;
+	pid_t ns_pid;
+	int ret = -1;
+
+	ksft_print_header();
+	ksft_set_plan(10);
+
+	f = fopen("/proc/sys/kernel/pid_max", "r");
+	if (f == NULL)
+		ksft_exit_fail_msg("%s - Could not open /proc/sys/kernel/pid_max\n",
+				strerror(errno));
+	fscanf(f, "%d", &pid_max);
+	fclose(f);
+	ksft_print_msg("/proc/sys/kernel/pid_max %d\n", pid_max);
+
+	/* First try with an invalid PID */
+	if (test_clone3_set_tid(-1, 0, -EINVAL))
+		goto on_error;
+	if (test_clone3_set_tid(-1, CLONE_NEWPID, -EINVAL))
+		goto on_error;
+	/* Then with PID 1 */
+	if (test_clone3_set_tid(1, 0, -EAGAIN))
+		goto on_error;
+	/* PID 1 should not fail in a PID namespace */
+	if (test_clone3_set_tid(1, CLONE_NEWPID, 0))
+		goto on_error;
+	/* pid_max should fail everywhere */
+	if (test_clone3_set_tid(pid_max, 0, -EINVAL))
+		goto on_error;
+	if (test_clone3_set_tid(pid_max, CLONE_NEWPID, -EINVAL))
+		goto on_error;
+	/* Find the current active PID */
+	pid = fork();
+	if (pid == 0) {
+		ksft_print_msg("Child has PID %d\n", getpid());
+		sleep(1);
+		_exit(EXIT_SUCCESS);
+	}
+	/* Try to create a process with that PID should fail */
+	if (test_clone3_set_tid(pid, 0, -EAGAIN))
+		goto on_error;
+	(void)wait(NULL);
+	/* After the child has finished, try again with the same PID */
+	if (test_clone3_set_tid(pid, 0, 0))
+		goto on_error;
+	/* This should fail as there is no PID 1 in that namespace */
+	if (test_clone3_set_tid(pid, CLONE_NEWPID, -EINVAL))
+		goto on_error;
+	unshare(CLONE_NEWPID);
+	/* Let's create a PID 1 */
+	ns_pid = fork();
+	if (ns_pid == 0) {
+		ksft_print_msg("Child in PID namespace has PID %d\n", getpid());
+		sleep(1);
+		_exit(EXIT_SUCCESS);
+	}
+	/*
+	 * Now, after the unshare() it should be possible to create a process
+	 * with another ID than 1 in the PID namespace.
+	 */
+	if (test_clone3_set_tid(2, 0, 0))
+		goto on_error;
+	(void)wait(NULL);
+
+	ret = 0;
+on_error:
+
+	return !ret ? ksft_exit_pass() : ksft_exit_fail();
+}
-- 
2.21.0


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 1/2] fork: extend clone3() to support CLONE_SET_TID
  2019-07-31 16:12 [PATCH v2 1/2] fork: extend clone3() to support CLONE_SET_TID Adrian Reber
  2019-07-31 16:12 ` [PATCH v2 2/2] selftests: add test for clone3() with set_tid Adrian Reber
@ 2019-07-31 16:49 ` Dmitry Safonov
  2019-07-31 16:56   ` Dmitry Safonov
  2019-07-31 17:41 ` Oleg Nesterov
  2019-08-02 13:19 ` Christian Brauner
  3 siblings, 1 reply; 16+ messages in thread
From: Dmitry Safonov @ 2019-07-31 16:49 UTC (permalink / raw)
  To: Adrian Reber, Christian Brauner, Eric Biederman, Pavel Emelianov,
	Jann Horn, Oleg Nesterov
  Cc: linux-kernel, Andrei Vagin, Mike Rapoport, Radostin Stoyanov

Hi Adrian,

On 7/31/19 5:12 PM, Adrian Reber wrote:
[..]
> @@ -2530,14 +2530,12 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
>  					      struct clone_args __user *uargs,
>  					      size_t size)
>  {
> +	struct pid_namespace *pid_ns = task_active_pid_ns(current);
>  	struct clone_args args;
>  
>  	if (unlikely(size > PAGE_SIZE))
>  		return -E2BIG;
>  
> -	if (unlikely(size < sizeof(struct clone_args)))
> -		return -EINVAL;
> -

It might be better to validate it still somehow, but I don't insist.

[..]
> @@ -2578,11 +2580,16 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
>  
>  static bool clone3_args_valid(const struct kernel_clone_args *kargs)
>  {
> -	/*
> -	 * All lower bits of the flag word are taken.
> -	 * Verify that no other unknown flags are passed along.
> -	 */
> -	if (kargs->flags & ~CLONE_LEGACY_FLAGS)
> +	/* Verify that no other unknown flags are passed along. */
> +	if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_SET_TID))
> +		return false;
> +
> +	/* Fail if set_tid is set without CLONE_SET_TID */
> +	if (kargs->set_tid && !(kargs->flags & CLONE_SET_TID))
> +		return false;
> +
> +	/* Also fail if set_tid is invalid */
> +	if ((kargs->set_tid <= 0) && (kargs->flags & CLONE_SET_TID))
>  		return false;

Sorry for not mentioning it on v1, but I've noticed it only now:
you check kargs->set_tid even with the legacy-sized kernel_clone_args,
which is probably some random value on a task's stack?

[..]

Thanks much,
          Dmitry

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 1/2] fork: extend clone3() to support CLONE_SET_TID
  2019-07-31 16:49 ` [PATCH v2 1/2] fork: extend clone3() to support CLONE_SET_TID Dmitry Safonov
@ 2019-07-31 16:56   ` Dmitry Safonov
  0 siblings, 0 replies; 16+ messages in thread
From: Dmitry Safonov @ 2019-07-31 16:56 UTC (permalink / raw)
  To: Adrian Reber, Christian Brauner, Eric Biederman, Pavel Emelianov,
	Jann Horn, Oleg Nesterov
  Cc: linux-kernel, Andrei Vagin, Mike Rapoport, Radostin Stoyanov

On 7/31/19 5:49 PM, Dmitry Safonov wrote:
> Hi Adrian,
> 
> On 7/31/19 5:12 PM, Adrian Reber wrote:
> [..]
>> @@ -2530,14 +2530,12 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
>>  					      struct clone_args __user *uargs,
>>  					      size_t size)
>>  {
>> +	struct pid_namespace *pid_ns = task_active_pid_ns(current);
>>  	struct clone_args args;
>>  
>>  	if (unlikely(size > PAGE_SIZE))
>>  		return -E2BIG;
>>  
>> -	if (unlikely(size < sizeof(struct clone_args)))
>> -		return -EINVAL;
>> -
> 
> It might be better to validate it still somehow, but I don't insist.
> 
> [..]
>> @@ -2578,11 +2580,16 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
>>  
>>  static bool clone3_args_valid(const struct kernel_clone_args *kargs)
>>  {
>> -	/*
>> -	 * All lower bits of the flag word are taken.
>> -	 * Verify that no other unknown flags are passed along.
>> -	 */
>> -	if (kargs->flags & ~CLONE_LEGACY_FLAGS)
>> +	/* Verify that no other unknown flags are passed along. */
>> +	if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_SET_TID))
>> +		return false;
>> +
>> +	/* Fail if set_tid is set without CLONE_SET_TID */
>> +	if (kargs->set_tid && !(kargs->flags & CLONE_SET_TID))
>> +		return false;
>> +
>> +	/* Also fail if set_tid is invalid */
>> +	if ((kargs->set_tid <= 0) && (kargs->flags & CLONE_SET_TID))
>>  		return false;
> 
> Sorry for not mentioning it on v1, but I've noticed it only now:
> you check kargs->set_tid even with the legacy-sized kernel_clone_args,
> which is probably some random value on a task's stack?

Self-correction: On kernel stack in copy_clone_args_from_user().
Which may probably be considered as a security leak..
Sorry again for not spotting it in v1.

Thanks,
          Dmitry

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 1/2] fork: extend clone3() to support CLONE_SET_TID
  2019-07-31 16:12 [PATCH v2 1/2] fork: extend clone3() to support CLONE_SET_TID Adrian Reber
  2019-07-31 16:12 ` [PATCH v2 2/2] selftests: add test for clone3() with set_tid Adrian Reber
  2019-07-31 16:49 ` [PATCH v2 1/2] fork: extend clone3() to support CLONE_SET_TID Dmitry Safonov
@ 2019-07-31 17:41 ` Oleg Nesterov
  2019-08-02  7:25   ` Adrian Reber
  2019-08-02 13:19 ` Christian Brauner
  3 siblings, 1 reply; 16+ messages in thread
From: Oleg Nesterov @ 2019-07-31 17:41 UTC (permalink / raw)
  To: Adrian Reber
  Cc: Christian Brauner, Eric Biederman, Pavel Emelianov, Jann Horn,
	Dmitry Safonov, linux-kernel, Andrei Vagin, Mike Rapoport,
	Radostin Stoyanov

On 07/31, Adrian Reber wrote:
>
> Extending clone3() to support CLONE_SET_TID makes it possible restore a
> process using CRIU without accessing /proc/sys/kernel/ns_last_pid and
> race free (as long as the desired PID/TID is available).

I personally like this... but please see the question below.

> +struct pid *alloc_pid(struct pid_namespace *ns, int set_tid)
>  {
>  	struct pid *pid;
>  	enum pid_type type;
> @@ -186,12 +186,28 @@ struct pid *alloc_pid(struct pid_namespace *ns)
>  		if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
>  			pid_min = RESERVED_PIDS;
>  
> -		/*
> -		 * Store a null pointer so find_pid_ns does not find
> -		 * a partially initialized PID (see below).
> -		 */
> -		nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
> -				      pid_max, GFP_ATOMIC);
> +		if (set_tid) {
> +			/*
> +			 * Also fail if a PID != 1 is requested
> +			 * and no PID 1 exists.
> +			 */
> +			if ((set_tid >= pid_max) || ((set_tid != 1) &&
> +				(idr_get_cursor(&tmp->idr) <= 1)))
                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Ah, I forgot to mention... this should work but only because
RESERVED_PIDS > 0. How about idr_is_empty() ?


But the main question is how it can really help if ns->level > 0, unlikely
CRIU will ever need to clone the process with the same pid_nr == set_tid
in the ns->parent chain.

So may be kernel_clone_args->set_tid should be pid_t __user *set_tid_array?
Or I missed something ?

Oleg.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 1/2] fork: extend clone3() to support CLONE_SET_TID
  2019-07-31 17:41 ` Oleg Nesterov
@ 2019-08-02  7:25   ` Adrian Reber
  2019-08-02 12:47     ` Oleg Nesterov
  0 siblings, 1 reply; 16+ messages in thread
From: Adrian Reber @ 2019-08-02  7:25 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Christian Brauner, Eric Biederman, Pavel Emelianov, Jann Horn,
	Dmitry Safonov, linux-kernel, Andrei Vagin, Mike Rapoport,
	Radostin Stoyanov

On Wed, Jul 31, 2019 at 07:41:36PM +0200, Oleg Nesterov wrote:
> On 07/31, Adrian Reber wrote:
> >
> > Extending clone3() to support CLONE_SET_TID makes it possible restore a
> > process using CRIU without accessing /proc/sys/kernel/ns_last_pid and
> > race free (as long as the desired PID/TID is available).
> 
> I personally like this... but please see the question below.
> 
> > +struct pid *alloc_pid(struct pid_namespace *ns, int set_tid)
> >  {
> >  	struct pid *pid;
> >  	enum pid_type type;
> > @@ -186,12 +186,28 @@ struct pid *alloc_pid(struct pid_namespace *ns)
> >  		if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
> >  			pid_min = RESERVED_PIDS;
> >  
> > -		/*
> > -		 * Store a null pointer so find_pid_ns does not find
> > -		 * a partially initialized PID (see below).
> > -		 */
> > -		nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
> > -				      pid_max, GFP_ATOMIC);
> > +		if (set_tid) {
> > +			/*
> > +			 * Also fail if a PID != 1 is requested
> > +			 * and no PID 1 exists.
> > +			 */
> > +			if ((set_tid >= pid_max) || ((set_tid != 1) &&
> > +				(idr_get_cursor(&tmp->idr) <= 1)))
>                                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> 
> Ah, I forgot to mention... this should work but only because
> RESERVED_PIDS > 0. How about idr_is_empty() ?
> 
> 
> But the main question is how it can really help if ns->level > 0, unlikely
> CRIU will ever need to clone the process with the same pid_nr == set_tid
> in the ns->parent chain.

Not sure I understand what you mean. For CRIU only the PID in the PID
namespace is relevant.

> So may be kernel_clone_args->set_tid should be pid_t __user *set_tid_array?
> Or I missed something ?

Not sure why and how an array would be needed. Could you give me some
more details why you think this is needed.

		Adrian

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 1/2] fork: extend clone3() to support CLONE_SET_TID
  2019-08-02  7:25   ` Adrian Reber
@ 2019-08-02 12:47     ` Oleg Nesterov
  2019-08-02 13:02       ` Christian Brauner
  2019-08-02 13:24       ` Oleg Nesterov
  0 siblings, 2 replies; 16+ messages in thread
From: Oleg Nesterov @ 2019-08-02 12:47 UTC (permalink / raw)
  To: Adrian Reber
  Cc: Christian Brauner, Eric Biederman, Pavel Emelianov, Jann Horn,
	Dmitry Safonov, linux-kernel, Andrei Vagin, Mike Rapoport,
	Radostin Stoyanov

On 08/02, Adrian Reber wrote:
>
> On Wed, Jul 31, 2019 at 07:41:36PM +0200, Oleg Nesterov wrote:
> > But the main question is how it can really help if ns->level > 0, unlikely
> > CRIU will ever need to clone the process with the same pid_nr == set_tid
> > in the ns->parent chain.
>
> Not sure I understand what you mean. For CRIU only the PID in the PID
> namespace is relevant.

If it runs "inside" this namespace. But in this case alloc_pid() should
use nr == set_tid only once in the main loop, when i == ns->level.

It doesn't need to have the same pid_nr in the parent pid namespace.

And in fact we should not allow criu (or anything else) to control the child's
pid_nr in the parent(s) namespace.

Right?

> > So may be kernel_clone_args->set_tid should be pid_t __user *set_tid_array?
> > Or I missed something ?
>
> Not sure why and how an array would be needed. Could you give me some
> more details why you think this is needed.

IIURC, criu can restore the process tree along with nested pid namespaces.

how can this patch help in this case?

But again, perhaps I missed something. I forgot everything about criu.

Oleg.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 1/2] fork: extend clone3() to support CLONE_SET_TID
  2019-08-02 12:47     ` Oleg Nesterov
@ 2019-08-02 13:02       ` Christian Brauner
  2019-08-02 13:24       ` Oleg Nesterov
  1 sibling, 0 replies; 16+ messages in thread
From: Christian Brauner @ 2019-08-02 13:02 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Adrian Reber, Christian Brauner, Eric Biederman, Pavel Emelianov,
	Jann Horn, Dmitry Safonov, linux-kernel, Andrei Vagin,
	Mike Rapoport, Radostin Stoyanov

On Fri, Aug 02, 2019 at 02:47:38PM +0200, Oleg Nesterov wrote:
> On 08/02, Adrian Reber wrote:
> >
> > On Wed, Jul 31, 2019 at 07:41:36PM +0200, Oleg Nesterov wrote:
> > > But the main question is how it can really help if ns->level > 0, unlikely
> > > CRIU will ever need to clone the process with the same pid_nr == set_tid
> > > in the ns->parent chain.
> >
> > Not sure I understand what you mean. For CRIU only the PID in the PID
> > namespace is relevant.
> 
> If it runs "inside" this namespace. But in this case alloc_pid() should
> use nr == set_tid only once in the main loop, when i == ns->level.
> 
> It doesn't need to have the same pid_nr in the parent pid namespace.
> 
> And in fact we should not allow criu (or anything else) to control the child's
> pid_nr in the parent(s) namespace.

This should definitely not be possible!

> 
> Right?
> 
> > > So may be kernel_clone_args->set_tid should be pid_t __user *set_tid_array?
> > > Or I missed something ?
> >
> > Not sure why and how an array would be needed. Could you give me some
> > more details why you think this is needed.
> 
> IIURC, criu can restore the process tree along with nested pid namespaces.

Hm, I'm not a fan of this array approach...

> 
> how can this patch help in this case?
> 
> But again, perhaps I missed something. I forgot everything about criu.
> 
> Oleg.
> 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 1/2] fork: extend clone3() to support CLONE_SET_TID
  2019-07-31 16:12 [PATCH v2 1/2] fork: extend clone3() to support CLONE_SET_TID Adrian Reber
                   ` (2 preceding siblings ...)
  2019-07-31 17:41 ` Oleg Nesterov
@ 2019-08-02 13:19 ` Christian Brauner
  2019-08-02 13:30   ` Oleg Nesterov
  3 siblings, 1 reply; 16+ messages in thread
From: Christian Brauner @ 2019-08-02 13:19 UTC (permalink / raw)
  To: Adrian Reber
  Cc: Christian Brauner, Eric Biederman, Pavel Emelianov, Jann Horn,
	Oleg Nesterov, Dmitry Safonov, linux-kernel, Andrei Vagin,
	Mike Rapoport, Radostin Stoyanov

On Wed, Jul 31, 2019 at 06:12:22PM +0200, Adrian Reber wrote:
> The main motivation to add CLONE_SET_TID to clone3() is CRIU.
> 
> To restore a process with the same PID/TID CRIU currently uses
> /proc/sys/kernel/ns_last_pid. It writes the desired (PID - 1) to
> ns_last_pid and then (quickly) does a clone(). This works most of the
> time, but it is racy. It is also slow as it requires multiple syscalls.

Can you elaborate how this is racy, please. Afaict, CRIU will always
usually restore in a new pid namespace that it controls, right? What is
the exact race?

> 
> Extending clone3() to support CLONE_SET_TID makes it possible restore a
> process using CRIU without accessing /proc/sys/kernel/ns_last_pid and
> race free (as long as the desired PID/TID is available).
> 
> This clone3() extension places the same restrictions (CAP_SYS_ADMIN)
> on clone3() with set_tid as they are currently in place for ns_last_pid.
> 
> v2:
>  - Removed (size < sizeof(struct clone_args)) as discussed with
>    Christian and Dmitry
>  - Added comment to ((set_tid != 1) && idr_get_cursor() <= 1) (Oleg)
>  - Use idr_alloc() instead of idr_alloc_cyclic() (Oleg)

Fwiw, the changelog should be placed after the "---" after your SOB, so
rather :):

Signed-off-by: Adrian Reber <areber@redhat.com>
---
v2:
 - Removed (size < sizeof(struct clone_args)) as discussed with
   Christian and Dmitry
 - Added comment to ((set_tid != 1) && idr_get_cursor() <= 1) (Oleg)
 - Use idr_alloc() instead of idr_alloc_cyclic() (Oleg)
---

> 
> Signed-off-by: Adrian Reber <areber@redhat.com>
> ---
>  include/linux/pid.h        |  2 +-
>  include/linux/sched/task.h |  1 +
>  include/uapi/linux/sched.h |  2 ++
>  kernel/fork.c              | 25 ++++++++++++++++---------
>  kernel/pid.c               | 30 +++++++++++++++++++++++-------
>  5 files changed, 43 insertions(+), 17 deletions(-)
> 
> diff --git a/include/linux/pid.h b/include/linux/pid.h
> index 2a83e434db9d..052000db0ced 100644
> --- a/include/linux/pid.h
> +++ b/include/linux/pid.h
> @@ -116,7 +116,7 @@ extern struct pid *find_vpid(int nr);
>  extern struct pid *find_get_pid(int nr);
>  extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
>  
> -extern struct pid *alloc_pid(struct pid_namespace *ns);
> +extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t set_tid);
>  extern void free_pid(struct pid *pid);
>  extern void disable_pid_allocation(struct pid_namespace *ns);
>  
> diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
> index 0497091e40c1..4f2a80564332 100644
> --- a/include/linux/sched/task.h
> +++ b/include/linux/sched/task.h
> @@ -26,6 +26,7 @@ struct kernel_clone_args {
>  	unsigned long stack;
>  	unsigned long stack_size;
>  	unsigned long tls;
> +	pid_t set_tid;
>  };
>  
>  /*
> diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
> index b3105ac1381a..8c4e803e8147 100644
> --- a/include/uapi/linux/sched.h
> +++ b/include/uapi/linux/sched.h
> @@ -32,6 +32,7 @@
>  #define CLONE_NEWPID		0x20000000	/* New pid namespace */
>  #define CLONE_NEWNET		0x40000000	/* New network namespace */
>  #define CLONE_IO		0x80000000	/* Clone io context */
> +#define CLONE_SET_TID		0x100000000ULL	/* set if the desired TID is set in set_tid */
>  
>  /*
>   * Arguments for the clone3 syscall
> @@ -45,6 +46,7 @@ struct clone_args {
>  	__aligned_u64 stack;
>  	__aligned_u64 stack_size;
>  	__aligned_u64 tls;
> +	__aligned_u64 set_tid;
>  };
>  
>  /*
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 2852d0e76ea3..405f98ce4c83 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -2031,7 +2031,7 @@ static __latent_entropy struct task_struct *copy_process(
>  	stackleak_task_init(p);
>  
>  	if (pid != &init_struct_pid) {
> -		pid = alloc_pid(p->nsproxy->pid_ns_for_children);
> +		pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid);
>  		if (IS_ERR(pid)) {
>  			retval = PTR_ERR(pid);
>  			goto bad_fork_cleanup_thread;
> @@ -2530,14 +2530,12 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
>  					      struct clone_args __user *uargs,
>  					      size_t size)
>  {
> +	struct pid_namespace *pid_ns = task_active_pid_ns(current);
>  	struct clone_args args;
>  
>  	if (unlikely(size > PAGE_SIZE))
>  		return -E2BIG;
>  
> -	if (unlikely(size < sizeof(struct clone_args)))
> -		return -EINVAL;
> -
>  	if (unlikely(!access_ok(uargs, size)))
>  		return -EFAULT;
>  
> @@ -2562,6 +2560,9 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
>  	if (copy_from_user(&args, uargs, size))
>  		return -EFAULT;
>  
> +	if ((args.flags & CLONE_SET_TID) && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
> +		return -EPERM;

Have you made sure that this makes sense with all flags, e.g. does this
make sense when specified with CLONE_THREAD?

> +
>  	*kargs = (struct kernel_clone_args){
>  		.flags		= args.flags,
>  		.pidfd		= u64_to_user_ptr(args.pidfd),
> @@ -2571,6 +2572,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
>  		.stack		= args.stack,
>  		.stack_size	= args.stack_size,
>  		.tls		= args.tls,
> +		.set_tid	= args.set_tid,
>  	};
>  
>  	return 0;
> @@ -2578,11 +2580,16 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
>  
>  static bool clone3_args_valid(const struct kernel_clone_args *kargs)
>  {
> -	/*
> -	 * All lower bits of the flag word are taken.
> -	 * Verify that no other unknown flags are passed along.
> -	 */
> -	if (kargs->flags & ~CLONE_LEGACY_FLAGS)
> +	/* Verify that no other unknown flags are passed along. */
> +	if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_SET_TID))
> +		return false;
> +
> +	/* Fail if set_tid is set without CLONE_SET_TID */
> +	if (kargs->set_tid && !(kargs->flags & CLONE_SET_TID))
> +		return false;
> +
> +	/* Also fail if set_tid is invalid */
> +	if ((kargs->set_tid <= 0) && (kargs->flags & CLONE_SET_TID))
>  		return false;
>  
>  	/*
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 0a9f2e437217..977f3ac39d7f 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -157,7 +157,7 @@ void free_pid(struct pid *pid)
>  	call_rcu(&pid->rcu, delayed_put_pid);
>  }
>  
> -struct pid *alloc_pid(struct pid_namespace *ns)
> +struct pid *alloc_pid(struct pid_namespace *ns, int set_tid)
>  {
>  	struct pid *pid;
>  	enum pid_type type;
> @@ -186,12 +186,28 @@ struct pid *alloc_pid(struct pid_namespace *ns)
>  		if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
>  			pid_min = RESERVED_PIDS;
>  
> -		/*
> -		 * Store a null pointer so find_pid_ns does not find
> -		 * a partially initialized PID (see below).
> -		 */
> -		nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
> -				      pid_max, GFP_ATOMIC);
> +		if (set_tid) {
> +			/*
> +			 * Also fail if a PID != 1 is requested
> +			 * and no PID 1 exists.
> +			 */
> +			if ((set_tid >= pid_max) || ((set_tid != 1) &&
> +				(idr_get_cursor(&tmp->idr) <= 1))) {
> +				spin_unlock_irq(&pidmap_lock);
> +				retval = -EINVAL;
> +				goto out_free;
> +			}
> +			nr = idr_alloc(&tmp->idr, NULL, set_tid,
> +				       set_tid + 1, GFP_ATOMIC);

Hm, feels to me that we should report a different error code than EAGAIN
when the allocation fails for set_tid. Right now you get EAGAIN for both
the non set_tid and the set_tid codepath.
But for set_tid the error that you likely should surface is EEXIST, i.e.
that pid is already taken.

> +			set_tid = 0;
> +		} else {
> +			/*
> +			 * Store a null pointer so find_pid_ns does not find
> +			 * a partially initialized PID (see below).
> +			 */
> +			nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
> +					      pid_max, GFP_ATOMIC);
> +		}
>  		spin_unlock_irq(&pidmap_lock);
>  		idr_preload_end();
>  
> -- 
> 2.21.0
> 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 1/2] fork: extend clone3() to support CLONE_SET_TID
  2019-08-02 12:47     ` Oleg Nesterov
  2019-08-02 13:02       ` Christian Brauner
@ 2019-08-02 13:24       ` Oleg Nesterov
  2019-08-02 13:46         ` Oleg Nesterov
  1 sibling, 1 reply; 16+ messages in thread
From: Oleg Nesterov @ 2019-08-02 13:24 UTC (permalink / raw)
  To: Adrian Reber
  Cc: Christian Brauner, Eric Biederman, Pavel Emelianov, Jann Horn,
	Dmitry Safonov, linux-kernel, Andrei Vagin, Mike Rapoport,
	Radostin Stoyanov

On 08/02, Oleg Nesterov wrote:
>
> On 08/02, Adrian Reber wrote:
> >
> > On Wed, Jul 31, 2019 at 07:41:36PM +0200, Oleg Nesterov wrote:
> > > But the main question is how it can really help if ns->level > 0, unlikely
> > > CRIU will ever need to clone the process with the same pid_nr == set_tid
> > > in the ns->parent chain.
> >
> > Not sure I understand what you mean. For CRIU only the PID in the PID
> > namespace is relevant.
>
> If it runs "inside" this namespace. But in this case alloc_pid() should
> use nr == set_tid only once in the main loop, when i == ns->level.

and I just noticed that your patch does exactly this, it clears set_tid
after the 1st usage when i == ns->level.

> > > So may be kernel_clone_args->set_tid should be pid_t __user *set_tid_array?
> > > Or I missed something ?
> >
> > Not sure why and how an array would be needed. Could you give me some
> > more details why you think this is needed.
>
> IIURC, criu can restore the process tree along with nested pid namespaces.
>
> how can this patch help in this case?
>
> But again, perhaps I missed something. I forgot everything about criu.

Yes... I guess in this case it can "safely" modify ns_last_pid in the child
namespaces it needs to create.

So Adrian, sorry for confusion, I think your patch is fine.

Oleg.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 1/2] fork: extend clone3() to support CLONE_SET_TID
  2019-08-02 13:19 ` Christian Brauner
@ 2019-08-02 13:30   ` Oleg Nesterov
  2019-08-02 13:50     ` Christian Brauner
  0 siblings, 1 reply; 16+ messages in thread
From: Oleg Nesterov @ 2019-08-02 13:30 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Adrian Reber, Eric Biederman, Pavel Emelianov, Jann Horn,
	Dmitry Safonov, linux-kernel, Andrei Vagin, Mike Rapoport,
	Radostin Stoyanov

On 08/02, Christian Brauner wrote:
>
> On Wed, Jul 31, 2019 at 06:12:22PM +0200, Adrian Reber wrote:
> > The main motivation to add CLONE_SET_TID to clone3() is CRIU.
> >
> > To restore a process with the same PID/TID CRIU currently uses
> > /proc/sys/kernel/ns_last_pid. It writes the desired (PID - 1) to
> > ns_last_pid and then (quickly) does a clone(). This works most of the
> > time, but it is racy. It is also slow as it requires multiple syscalls.
>
> Can you elaborate how this is racy, please. Afaict, CRIU will always
> usually restore in a new pid namespace that it controls, right?

Why? No. For example you can checkpoint (not sure this is correct word)
a single process in your namespace, then (try to restore) it. 

> What is
> the exact race?

something else in the same namespace can fork() right after criu writes
the pid-for-restore into ns_last_pid.

Oleg.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 1/2] fork: extend clone3() to support CLONE_SET_TID
  2019-08-02 13:24       ` Oleg Nesterov
@ 2019-08-02 13:46         ` Oleg Nesterov
  2019-08-02 13:52           ` Christian Brauner
  0 siblings, 1 reply; 16+ messages in thread
From: Oleg Nesterov @ 2019-08-02 13:46 UTC (permalink / raw)
  To: Adrian Reber
  Cc: Christian Brauner, Eric Biederman, Pavel Emelianov, Jann Horn,
	Dmitry Safonov, linux-kernel, Andrei Vagin, Mike Rapoport,
	Radostin Stoyanov

On 08/02, Oleg Nesterov wrote:
>
> So Adrian, sorry for confusion, I think your patch is fine.

Yes... but do we really need the new CLONE_SET_TID ?

set_tid == 0 has no effect, can't we simply check kargs->set_tid != 0
before ns_capable() ?

and to remind, I still think alloc_pid() should use idr_is_empty(), but
I won't insist.

Oleg.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 1/2] fork: extend clone3() to support CLONE_SET_TID
  2019-08-02 13:30   ` Oleg Nesterov
@ 2019-08-02 13:50     ` Christian Brauner
  2019-08-02 15:10       ` Adrian Reber
  0 siblings, 1 reply; 16+ messages in thread
From: Christian Brauner @ 2019-08-02 13:50 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Christian Brauner, Adrian Reber, Eric Biederman, Pavel Emelianov,
	Jann Horn, Dmitry Safonov, linux-kernel, Andrei Vagin,
	Mike Rapoport, Radostin Stoyanov

On Fri, Aug 02, 2019 at 03:30:01PM +0200, Oleg Nesterov wrote:
> On 08/02, Christian Brauner wrote:
> >
> > On Wed, Jul 31, 2019 at 06:12:22PM +0200, Adrian Reber wrote:
> > > The main motivation to add CLONE_SET_TID to clone3() is CRIU.
> > >
> > > To restore a process with the same PID/TID CRIU currently uses
> > > /proc/sys/kernel/ns_last_pid. It writes the desired (PID - 1) to
> > > ns_last_pid and then (quickly) does a clone(). This works most of the
> > > time, but it is racy. It is also slow as it requires multiple syscalls.
> >
> > Can you elaborate how this is racy, please. Afaict, CRIU will always
> > usually restore in a new pid namespace that it controls, right?
> 
> Why? No. For example you can checkpoint (not sure this is correct word)
> a single process in your namespace, then (try to restore) it. 
> 
> > What is
> > the exact race?
> 
> something else in the same namespace can fork() right after criu writes
> the pid-for-restore into ns_last_pid.

Ok, that makes sense. :)
My CRIU userspace knowledge is sporadic, so I'm not sure how exactly it
restores process trees in pid namespaces and what workloads this would
especially help with.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 1/2] fork: extend clone3() to support CLONE_SET_TID
  2019-08-02 13:46         ` Oleg Nesterov
@ 2019-08-02 13:52           ` Christian Brauner
  2019-08-02 16:50             ` Adrian Reber
  0 siblings, 1 reply; 16+ messages in thread
From: Christian Brauner @ 2019-08-02 13:52 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Adrian Reber, Christian Brauner, Eric Biederman, Pavel Emelianov,
	Jann Horn, Dmitry Safonov, linux-kernel, Andrei Vagin,
	Mike Rapoport, Radostin Stoyanov

On Fri, Aug 02, 2019 at 03:46:11PM +0200, Oleg Nesterov wrote:
> On 08/02, Oleg Nesterov wrote:
> >
> > So Adrian, sorry for confusion, I think your patch is fine.
> 
> Yes... but do we really need the new CLONE_SET_TID ?
> 
> set_tid == 0 has no effect, can't we simply check kargs->set_tid != 0
> before ns_capable() ?

Yeah, I agree that sounds much better and aligns with exit_signal.

Christian

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 1/2] fork: extend clone3() to support CLONE_SET_TID
  2019-08-02 13:50     ` Christian Brauner
@ 2019-08-02 15:10       ` Adrian Reber
  0 siblings, 0 replies; 16+ messages in thread
From: Adrian Reber @ 2019-08-02 15:10 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Oleg Nesterov, Eric Biederman, Pavel Emelianov, Jann Horn,
	Dmitry Safonov, linux-kernel, Andrei Vagin, Mike Rapoport,
	Radostin Stoyanov

On Fri, Aug 02, 2019 at 03:50:54PM +0200, Christian Brauner wrote:
> On Fri, Aug 02, 2019 at 03:30:01PM +0200, Oleg Nesterov wrote:
> > On 08/02, Christian Brauner wrote:
> > >
> > > On Wed, Jul 31, 2019 at 06:12:22PM +0200, Adrian Reber wrote:
> > > > The main motivation to add CLONE_SET_TID to clone3() is CRIU.
> > > >
> > > > To restore a process with the same PID/TID CRIU currently uses
> > > > /proc/sys/kernel/ns_last_pid. It writes the desired (PID - 1) to
> > > > ns_last_pid and then (quickly) does a clone(). This works most of the
> > > > time, but it is racy. It is also slow as it requires multiple syscalls.
> > >
> > > Can you elaborate how this is racy, please. Afaict, CRIU will always
> > > usually restore in a new pid namespace that it controls, right?
> > 
> > Why? No. For example you can checkpoint (not sure this is correct word)
> > a single process in your namespace, then (try to restore) it. 
> > 
> > > What is
> > > the exact race?
> > 
> > something else in the same namespace can fork() right after criu writes
> > the pid-for-restore into ns_last_pid.
> 
> Ok, that makes sense. :)
> My CRIU userspace knowledge is sporadic, so I'm not sure how exactly it
> restores process trees in pid namespaces and what workloads this would
> especially help with.

Just what Oleg said. CRIU can restore processes in a new PID namespaces
or in an existing. To restore a process into an existing PID namespace
has the possibility of a PID collision, but if the PID is not yet in use
there is no limitation from CRIU's side.

Restoring into an existing PID namespace which is used by other
processes always has the possibility that between writing to
/proc/sys/kernel/ns_last_pid and clone() something else has fork()'d and
therefore it is racy.

		Adrian

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 1/2] fork: extend clone3() to support CLONE_SET_TID
  2019-08-02 13:52           ` Christian Brauner
@ 2019-08-02 16:50             ` Adrian Reber
  0 siblings, 0 replies; 16+ messages in thread
From: Adrian Reber @ 2019-08-02 16:50 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Oleg Nesterov, Eric Biederman, Pavel Emelianov, Jann Horn,
	Dmitry Safonov, linux-kernel, Andrei Vagin, Mike Rapoport,
	Radostin Stoyanov

On Fri, Aug 02, 2019 at 03:52:49PM +0200, Christian Brauner wrote:
> On Fri, Aug 02, 2019 at 03:46:11PM +0200, Oleg Nesterov wrote:
> > On 08/02, Oleg Nesterov wrote:
> > >
> > > So Adrian, sorry for confusion, I think your patch is fine.

Good to know.

> > Yes... but do we really need the new CLONE_SET_TID ?
> > 
> > set_tid == 0 has no effect, can't we simply check kargs->set_tid != 0
> > before ns_capable() ?
> 
> Yeah, I agree that sounds much better and aligns with exit_signal.

Let me remove CLONE_SET_TID from the patch and I will try out
idr_is_empty().

I will also address Dmitry's comment about accessing smaller parameter
structs.

		Adrian

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2019-08-02 16:50 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-07-31 16:12 [PATCH v2 1/2] fork: extend clone3() to support CLONE_SET_TID Adrian Reber
2019-07-31 16:12 ` [PATCH v2 2/2] selftests: add test for clone3() with set_tid Adrian Reber
2019-07-31 16:49 ` [PATCH v2 1/2] fork: extend clone3() to support CLONE_SET_TID Dmitry Safonov
2019-07-31 16:56   ` Dmitry Safonov
2019-07-31 17:41 ` Oleg Nesterov
2019-08-02  7:25   ` Adrian Reber
2019-08-02 12:47     ` Oleg Nesterov
2019-08-02 13:02       ` Christian Brauner
2019-08-02 13:24       ` Oleg Nesterov
2019-08-02 13:46         ` Oleg Nesterov
2019-08-02 13:52           ` Christian Brauner
2019-08-02 16:50             ` Adrian Reber
2019-08-02 13:19 ` Christian Brauner
2019-08-02 13:30   ` Oleg Nesterov
2019-08-02 13:50     ` Christian Brauner
2019-08-02 15:10       ` Adrian Reber

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).