All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC][v3][PATCH 1/7] Factor out code to allocate pidmap page
@ 2009-05-30 23:57 Sukadev Bhattiprolu
       [not found] ` <20090530235714.GA4083-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 30+ messages in thread
From: Sukadev Bhattiprolu @ 2009-05-30 23:57 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Containers, David C. Hansen


From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Date: Mon, 4 May 2009 01:17:39 -0700
Subject: [RFC][v3][PATCH 1/7] Factor out code to allocate pidmap page

To implement support for clone_with_pids() system call we would
need to allocate pidmap page in more than one place. Move this
code to a new function alloc_pidmap_page().

Changelog[v2]:
	- (Matt Helsley, Dave Hansen) Have alloc_pidmap_page() return
	  -ENOMEM on error instead of -1.

Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Acked-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Reviewed-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 kernel/pid.c |   46 ++++++++++++++++++++++++++++++----------------
 1 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/kernel/pid.c b/kernel/pid.c
index b2e5f78..9ff33cc 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -122,9 +122,34 @@ static void free_pidmap(struct upid *upid)
 	atomic_inc(&map->nr_free);
 }
 
+static int alloc_pidmap_page(struct pidmap *map)
+{
+	void *page;
+
+	if (likely(map->page))
+		return 0;
+
+	page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+
+	/*
+	 * Free the page if someone raced with us installing it:
+	 */
+	spin_lock_irq(&pidmap_lock);
+	if (map->page)
+		kfree(page);
+	else
+		map->page = page;
+	spin_unlock_irq(&pidmap_lock);
+
+	if (unlikely(!map->page))
+		return -ENOMEM;
+
+	return 0;
+}
+
 static int alloc_pidmap(struct pid_namespace *pid_ns)
 {
-	int i, offset, max_scan, pid, last = pid_ns->last_pid;
+	int i, rc, offset, max_scan, pid, last = pid_ns->last_pid;
 	struct pidmap *map;
 
 	pid = last + 1;
@@ -134,21 +159,10 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
 	map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
 	max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
 	for (i = 0; i <= max_scan; ++i) {
-		if (unlikely(!map->page)) {
-			void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
-			/*
-			 * Free the page if someone raced with us
-			 * installing it:
-			 */
-			spin_lock_irq(&pidmap_lock);
-			if (map->page)
-				kfree(page);
-			else
-				map->page = page;
-			spin_unlock_irq(&pidmap_lock);
-			if (unlikely(!map->page))
-				break;
-		}
+		rc = alloc_pidmap_page(map);
+		if (rc)
+			break;
+
 		if (likely(atomic_read(&map->nr_free))) {
 			do {
 				if (!test_and_set_bit(offset, map->page)) {
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [RFC][v3][PATCH 2/7] Have alloc_pidmap() return actual error code
       [not found] ` <20090530235714.GA4083-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-05-31  0:01   ` Sukadev Bhattiprolu
       [not found]     ` <20090531000115.GA4191-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-05-31  0:01   ` [RFC][v3][PATCH 3/7] Add target_pid parameter to alloc_pidmap() Sukadev Bhattiprolu
                     ` (5 subsequent siblings)
  6 siblings, 1 reply; 30+ messages in thread
From: Sukadev Bhattiprolu @ 2009-05-31  0:01 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Containers, David C. Hansen


From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Date: Mon, 4 May 2009 01:17:40 -0700
Subject: [RFC][v3][PATCH 2/7] Have alloc_pidmap() return actual error code

alloc_pidmap() can fail either because all pid numbers are in use or
because memory allocation failed.  With support for setting a specific
pid number, alloc_pidmap() would also fail if either the given pid
number is invalid or in use.

Rather than have callers assume -ENOMEM, have alloc_pidmap() return
the actual error.

Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Acked-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Reviewed-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 kernel/fork.c |    5 +++--
 kernel/pid.c  |    9 ++++++---
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index b9e2edd..f8411a8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1119,10 +1119,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto bad_fork_cleanup_io;
 
 	if (pid != &init_struct_pid) {
-		retval = -ENOMEM;
 		pid = alloc_pid(p->nsproxy->pid_ns);
-		if (!pid)
+		if (IS_ERR(pid)) {
+			retval = PTR_ERR(pid);
 			goto bad_fork_cleanup_io;
+		}
 
 		if (clone_flags & CLONE_NEWPID) {
 			retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
diff --git a/kernel/pid.c b/kernel/pid.c
index 9ff33cc..b2d6a19 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -158,6 +158,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
 	offset = pid & BITS_PER_PAGE_MASK;
 	map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
 	max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
+	rc = -EAGAIN;
 	for (i = 0; i <= max_scan; ++i) {
 		rc = alloc_pidmap_page(map);
 		if (rc)
@@ -188,12 +189,14 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
 		} else {
 			map = &pid_ns->pidmap[0];
 			offset = RESERVED_PIDS;
-			if (unlikely(last == offset))
+			if (unlikely(last == offset)) {
+				rc = -EAGAIN;
 				break;
+			}
 		}
 		pid = mk_pid(pid_ns, map, offset);
 	}
-	return -1;
+	return rc;
 }
 
 int next_pidmap(struct pid_namespace *pid_ns, int last)
@@ -298,7 +301,7 @@ out_free:
 		free_pidmap(pid->numbers + i);
 
 	kmem_cache_free(ns->pid_cachep, pid);
-	pid = NULL;
+	pid = ERR_PTR(nr);
 	goto out;
 }
 
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [RFC][v3][PATCH 3/7] Add target_pid parameter to alloc_pidmap()
       [not found] ` <20090530235714.GA4083-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-05-31  0:01   ` [RFC][v3][PATCH 2/7] Have alloc_pidmap() return actual error code Sukadev Bhattiprolu
@ 2009-05-31  0:01   ` Sukadev Bhattiprolu
       [not found]     ` <20090531000144.GB4191-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-05-31  0:02   ` [RFC][v3][PATCH 4/7] Add target_pids parameter to alloc_pid() Sukadev Bhattiprolu
                     ` (4 subsequent siblings)
  6 siblings, 1 reply; 30+ messages in thread
From: Sukadev Bhattiprolu @ 2009-05-31  0:01 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Containers, David C. Hansen


From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Date: Mon, 4 May 2009 01:17:41 -0700
Subject: [RFC][v3][PATCH 3/7] Add target_pid parameter to alloc_pidmap()

With support for setting a specific pid number for a process,
alloc_pidmap() will need a paramter a 'target_pid' parameter.

Changelog[v2]:
	- (Serge Hallyn) Check for 'pid < 0' in set_pidmap().(Code
	  actually checks for 'pid <= 0' for completeness).

Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Acked-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Reviewed-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 kernel/pid.c |   28 ++++++++++++++++++++++++++--
 1 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/kernel/pid.c b/kernel/pid.c
index b2d6a19..b44dd21 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -147,11 +147,35 @@ static int alloc_pidmap_page(struct pidmap *map)
 	return 0;
 }
 
-static int alloc_pidmap(struct pid_namespace *pid_ns)
+static int set_pidmap(struct pid_namespace *pid_ns, int pid)
+{
+	int offset;
+	struct pidmap *map;
+
+	if (pid <= 0 || pid >= pid_max)
+		return -EINVAL;
+
+	offset = pid & BITS_PER_PAGE_MASK;
+	map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
+
+	if (alloc_pidmap_page(map))
+		return -ENOMEM;
+
+	if (test_and_set_bit(offset, map->page))
+		return -EBUSY;
+
+	atomic_dec(&map->nr_free);
+	return pid;
+}
+
+static int alloc_pidmap(struct pid_namespace *pid_ns, int target_pid)
 {
 	int i, rc, offset, max_scan, pid, last = pid_ns->last_pid;
 	struct pidmap *map;
 
+	if (target_pid)
+		return set_pidmap(pid_ns, target_pid);
+
 	pid = last + 1;
 	if (pid >= pid_max)
 		pid = RESERVED_PIDS;
@@ -270,7 +294,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 
 	tmp = ns;
 	for (i = ns->level; i >= 0; i--) {
-		nr = alloc_pidmap(tmp);
+		nr = alloc_pidmap(tmp, 0);
 		if (nr < 0)
 			goto out_free;
 
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [RFC][v3][PATCH 4/7] Add target_pids parameter to alloc_pid()
       [not found] ` <20090530235714.GA4083-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-05-31  0:01   ` [RFC][v3][PATCH 2/7] Have alloc_pidmap() return actual error code Sukadev Bhattiprolu
  2009-05-31  0:01   ` [RFC][v3][PATCH 3/7] Add target_pid parameter to alloc_pidmap() Sukadev Bhattiprolu
@ 2009-05-31  0:02   ` Sukadev Bhattiprolu
       [not found]     ` <20090531000220.GC4191-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-05-31  0:02   ` [RFC][v3][PATCH 5/7] Add target_pids parameter to copy_process() Sukadev Bhattiprolu
                     ` (3 subsequent siblings)
  6 siblings, 1 reply; 30+ messages in thread
From: Sukadev Bhattiprolu @ 2009-05-31  0:02 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Containers, David C. Hansen


From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Date: Mon, 4 May 2009 01:17:42 -0700
Subject: [RFC][v3][PATCH 4/7] Add target_pids parameter to alloc_pid()

This parameter is currently NULL, but will be used in a follow-on patch.

Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Acked-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Reviewed-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 include/linux/pid.h |    2 +-
 kernel/fork.c       |    3 ++-
 kernel/pid.c        |    9 +++++++--
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 49f1c2f..914185d 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -119,7 +119,7 @@ extern struct pid *find_get_pid(int nr);
 extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
 int next_pidmap(struct pid_namespace *pid_ns, int last);
 
-extern struct pid *alloc_pid(struct pid_namespace *ns);
+extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *target_pids);
 extern void free_pid(struct pid *pid);
 
 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index f8411a8..d2d69d3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -954,6 +954,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	int retval;
 	struct task_struct *p;
 	int cgroup_callbacks_done = 0;
+	pid_t *target_pids = NULL;
 
 	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
 		return ERR_PTR(-EINVAL);
@@ -1119,7 +1120,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto bad_fork_cleanup_io;
 
 	if (pid != &init_struct_pid) {
-		pid = alloc_pid(p->nsproxy->pid_ns);
+		pid = alloc_pid(p->nsproxy->pid_ns, target_pids);
 		if (IS_ERR(pid)) {
 			retval = PTR_ERR(pid);
 			goto bad_fork_cleanup_io;
diff --git a/kernel/pid.c b/kernel/pid.c
index b44dd21..090b221 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -280,13 +280,14 @@ void free_pid(struct pid *pid)
 	call_rcu(&pid->rcu, delayed_put_pid);
 }
 
-struct pid *alloc_pid(struct pid_namespace *ns)
+struct pid *alloc_pid(struct pid_namespace *ns, pid_t *target_pids)
 {
 	struct pid *pid;
 	enum pid_type type;
 	int i, nr;
 	struct pid_namespace *tmp;
 	struct upid *upid;
+	int tpid;
 
 	pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
 	if (!pid)
@@ -294,7 +295,11 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 
 	tmp = ns;
 	for (i = ns->level; i >= 0; i--) {
-		nr = alloc_pidmap(tmp, 0);
+		tpid = 0;
+		if (target_pids)
+			tpid = target_pids[i];
+
+		nr = alloc_pidmap(tmp, tpid);
 		if (nr < 0)
 			goto out_free;
 
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [RFC][v3][PATCH 5/7] Add target_pids parameter to copy_process()
       [not found] ` <20090530235714.GA4083-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
                     ` (2 preceding siblings ...)
  2009-05-31  0:02   ` [RFC][v3][PATCH 4/7] Add target_pids parameter to alloc_pid() Sukadev Bhattiprolu
@ 2009-05-31  0:02   ` Sukadev Bhattiprolu
       [not found]     ` <20090531000237.GD4191-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-05-31  0:02   ` [RFC][v3][PATCH 6/7] Define do_fork_with_pids() Sukadev Bhattiprolu
                     ` (2 subsequent siblings)
  6 siblings, 1 reply; 30+ messages in thread
From: Sukadev Bhattiprolu @ 2009-05-31  0:02 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Containers, David C. Hansen


From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Date: Mon, 4 May 2009 01:17:43 -0700
Subject: [RFC][v3][PATCH 5/7] Add target_pids parameter to copy_process()

The new parameter will be used in a follow-on patch when clone_with_pids()
is implemented.

Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Acked-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Reviewed-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 kernel/fork.c |    7 ++++---
 1 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index d2d69d3..373411e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -949,12 +949,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 					unsigned long stack_size,
 					int __user *child_tidptr,
 					struct pid *pid,
+					pid_t *target_pids,
 					int trace)
 {
 	int retval;
 	struct task_struct *p;
 	int cgroup_callbacks_done = 0;
-	pid_t *target_pids = NULL;
 
 	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
 		return ERR_PTR(-EINVAL);
@@ -1327,7 +1327,7 @@ struct task_struct * __cpuinit fork_idle(int cpu)
 	struct pt_regs regs;
 
 	task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
-			    &init_struct_pid, 0);
+			    &init_struct_pid, NULL, 0);
 	if (!IS_ERR(task))
 		init_idle(task, cpu);
 
@@ -1350,6 +1350,7 @@ long do_fork(unsigned long clone_flags,
 	struct task_struct *p;
 	int trace = 0;
 	long nr;
+	pid_t *target_pids = NULL;
 
 	/*
 	 * Do some preliminary argument and permissions checking before we
@@ -1390,7 +1391,7 @@ long do_fork(unsigned long clone_flags,
 		trace = tracehook_prepare_clone(clone_flags);
 
 	p = copy_process(clone_flags, stack_start, regs, stack_size,
-			 child_tidptr, NULL, trace);
+			 child_tidptr, NULL, target_pids, trace);
 	/*
 	 * Do this prior waking up the new thread - the thread pointer
 	 * might get invalid after that point, if the thread exits quickly.
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [RFC][v3][PATCH 6/7] Define do_fork_with_pids()
       [not found] ` <20090530235714.GA4083-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
                     ` (3 preceding siblings ...)
  2009-05-31  0:02   ` [RFC][v3][PATCH 5/7] Add target_pids parameter to copy_process() Sukadev Bhattiprolu
@ 2009-05-31  0:02   ` Sukadev Bhattiprolu
       [not found]     ` <20090531000255.GE4191-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-05-31  0:03   ` [RFC][v3][PATCH 7/7] Define clone_with_pids syscall Sukadev Bhattiprolu
  2009-06-01  8:16   ` [RFC][v3][PATCH 1/7] Factor out code to allocate pidmap page Amerigo Wang
  6 siblings, 1 reply; 30+ messages in thread
From: Sukadev Bhattiprolu @ 2009-05-31  0:02 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Containers, David C. Hansen


From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Date: Mon, 4 May 2009 01:17:44 -0700
Subject: [RFC][v3][PATCH 6/7] Define do_fork_with_pids()

do_fork_with_pids() is same as do_fork(), except that it takes an
additional, 'pid_set', parameter. This parameter, currently unused,
specifies the set of target pids of the process in each of its pid
namespaces.

Changelog[v3]:
	- Fix "long-line" warning from checkpatch.pl

Changelog[v2]:
	- To facilitate moving architecture-inpdendent code to kernel/fork.c
	  pass in 'struct target_pid_set __user *' to do_fork_with_pids()
	  rather than 'pid_t *' (next patch moves the arch-independent
	  code to kernel/fork.c)

Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Acked-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Reviewed-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 include/linux/sched.h |    3 +++
 include/linux/types.h |    5 +++++
 kernel/fork.c         |   16 ++++++++++++++--
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4c38bc..e3bdb86 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1995,6 +1995,9 @@ extern int disallow_signal(int);
 
 extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *);
 extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
+extern long do_fork_with_pids(unsigned long, unsigned long, struct pt_regs *,
+				unsigned long, int __user *, int __user *,
+				struct target_pid_set __user *pid_set);
 struct task_struct *fork_idle(int);
 
 extern void set_task_comm(struct task_struct *tsk, char *from);
diff --git a/include/linux/types.h b/include/linux/types.h
index 5abe354..17ec186 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -204,6 +204,11 @@ struct ustat {
 	char			f_fpack[6];
 };
 
+struct target_pid_set {
+	int num_pids;
+	pid_t *target_pids;
+};
+
 #endif	/* __KERNEL__ */
 #endif /*  __ASSEMBLY__ */
 #endif /* _LINUX_TYPES_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 373411e..a16ef7b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1340,12 +1340,13 @@ struct task_struct * __cpuinit fork_idle(int cpu)
  * It copies the process, and if successful kick-starts
  * it and waits for it to finish using the VM if required.
  */
-long do_fork(unsigned long clone_flags,
+long do_fork_with_pids(unsigned long clone_flags,
 	      unsigned long stack_start,
 	      struct pt_regs *regs,
 	      unsigned long stack_size,
 	      int __user *parent_tidptr,
-	      int __user *child_tidptr)
+	      int __user *child_tidptr,
+	      struct target_pid_set __user *pid_setp)
 {
 	struct task_struct *p;
 	int trace = 0;
@@ -1448,6 +1449,17 @@ long do_fork(unsigned long clone_flags,
 	return nr;
 }
 
+long do_fork(unsigned long clone_flags,
+	      unsigned long stack_start,
+	      struct pt_regs *regs,
+	      unsigned long stack_size,
+	      int __user *parent_tidptr,
+	      int __user *child_tidptr)
+{
+	return do_fork_with_pids(clone_flags, stack_start, regs, stack_size,
+			parent_tidptr, child_tidptr, NULL);
+}
+
 #ifndef ARCH_MIN_MMSTRUCT_ALIGN
 #define ARCH_MIN_MMSTRUCT_ALIGN 0
 #endif
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [RFC][v3][PATCH 7/7] Define clone_with_pids syscall
       [not found] ` <20090530235714.GA4083-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
                     ` (4 preceding siblings ...)
  2009-05-31  0:02   ` [RFC][v3][PATCH 6/7] Define do_fork_with_pids() Sukadev Bhattiprolu
@ 2009-05-31  0:03   ` Sukadev Bhattiprolu
       [not found]     ` <20090531000350.GF4191-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-06-01  8:16   ` [RFC][v3][PATCH 1/7] Factor out code to allocate pidmap page Amerigo Wang
  6 siblings, 1 reply; 30+ messages in thread
From: Sukadev Bhattiprolu @ 2009-05-31  0:03 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Containers, David C. Hansen


Serge,

I have removed restriction on CLONE_NEWPID and may need a new ack.

Sukadev
---
From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Date: Mon, 4 May 2009 01:17:45 -0700
Subject: [RFC][v3][PATCH 7/7] Define clone_with_pids syscall

Container restart requires that a task have the same pid it had when it was
checkpointed. When containers are nested the tasks within the containers
exist in multiple pid namespaces and hence have multiple pids to specify
during restart.

clone_with_pids(), intended for use during restart, is the same as clone(),
except that it takes a 'target_pid_set' paramter. This parameter lets caller
choose specific pid numbers for the child process, in the process's active
and ancestor pid namespaces. (Descendant pid namespaces in general don't
matter since processes don't have pids in them anyway, but see comments
in copy_target_pids() regarding CLONE_NEWPID).

Unlike clone(), clone_with_pids() needs CAP_SYS_ADMIN, at least for now, to
prevent unprivileged processes from misusing this interface.

Call clone_with_pids as follows:

	pid_t pids[] = { 0, 77, 99 };
	struct target_pid_set pid_set;

	pid_set.num_pids = sizeof(pids) / sizeof(int);
	pid_set.target_pids = &pids;

	syscall(__NR_clone_with_pids, flags, stack, NULL, NULL, NULL, &pid_set);

If a target-pid is 0, the kernel continues to assign a pid for the process in
that namespace. In the above example, pids[0] is 0, meaning the kernel will
assign next available pid to the process in init_pid_ns. But kernel will assign
pid 77 in the child pid namespace 1 and pid 99 in pid namespace 2. If either
77 or 99 are taken, the system call fails with -EBUSY.

If 'pid_set.num_pids' exceeds the current nesting level of pid namespaces,
the system call fails with -EINVAL.

Its mostly an exploratory patch seeking feedback on the interface.

NOTE:
	Compared to clone(), clone_with_pids() needs to pass in two more
	pieces of information:

		- number of pids in the set
		- user buffer containing the list of pids.

	But since clone() already takes 5 parameters, use a 'struct
	target_pid_set'.

TODO:
	- Gently tested.
	- May need additional sanity checks in do_fork_with_pids().

Changelog[v3]:
	- (Oren Laadan) Allow CLONE_NEWPID flag (by allocating an extra pid
	  in the target_pids[] list and setting it 0. See copy_target_pids()).
	- (Oren Laadan) Specified target pids should apply only to youngest
	  pid-namespaces (see copy_target_pids())
	- (Matt Helsley) Update patch description.

Changelog[v2]:
	- Remove unnecessary printk and add a note to callers of
	  copy_target_pids() to free target_pids.
	- (Serge Hallyn) Mention CAP_SYS_ADMIN restriction in patch description.
	- (Oren Laadan) Add checks for 'num_pids < 0' (return -EINVAL) and
	  'num_pids == 0' (fall back to normal clone()).
	- Move arch-independent code (sanity checks and copy-in of target-pids)
	  into kernel/fork.c and simplify sys_clone_with_pids()

Changelog[v1]:
	- Fixed some compile errors (had fixed these errors earlier in my
	  git tree but had not refreshed patches before emailing them)

Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
 arch/x86/include/asm/syscalls.h    |    1 +
 arch/x86/include/asm/unistd_32.h   |    1 +
 arch/x86/kernel/entry_32.S         |    1 +
 arch/x86/kernel/process_32.c       |   21 +++++++
 arch/x86/kernel/syscall_table_32.S |    1 +
 kernel/fork.c                      |  108 +++++++++++++++++++++++++++++++++++-
 6 files changed, 132 insertions(+), 1 deletions(-)

diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 7043408..1fdc149 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -31,6 +31,7 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *);
 /* kernel/process_32.c */
 int sys_fork(struct pt_regs *);
 int sys_clone(struct pt_regs *);
+int sys_clone_with_pids(struct pt_regs *);
 int sys_vfork(struct pt_regs *);
 int sys_execve(struct pt_regs *);
 
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 6e72d74..90f906f 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -340,6 +340,7 @@
 #define __NR_inotify_init1	332
 #define __NR_preadv		333
 #define __NR_pwritev		334
+#define __NR_clone_with_pids	335
 
 #ifdef __KERNEL__
 
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c929add..ee92b0d 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -707,6 +707,7 @@ ptregs_##name: \
 PTREGSCALL(iopl)
 PTREGSCALL(fork)
 PTREGSCALL(clone)
+PTREGSCALL(clone_with_pids)
 PTREGSCALL(vfork)
 PTREGSCALL(execve)
 PTREGSCALL(sigaltstack)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 76f8f84..1efc3de 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -445,6 +445,27 @@ int sys_clone(struct pt_regs *regs)
 	return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr);
 }
 
+int sys_clone_with_pids(struct pt_regs *regs)
+{
+	unsigned long clone_flags;
+	unsigned long newsp;
+	int __user *parent_tidptr;
+	int __user *child_tidptr;
+	void __user *upid_setp;
+
+	clone_flags = regs->bx;
+	newsp = regs->cx;
+	parent_tidptr = (int __user *)regs->dx;
+	child_tidptr = (int __user *)regs->di;
+	upid_setp = (void __user *)regs->bp;
+
+	if (!newsp)
+		newsp = regs->sp;
+
+	return do_fork_with_pids(clone_flags, newsp, regs, 0, parent_tidptr,
+			child_tidptr, upid_setp);
+}
+
 /*
  * sys_execve() executes a new program.
  */
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index ff5c873..94c1a58 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -334,3 +334,4 @@ ENTRY(sys_call_table)
 	.long sys_inotify_init1
 	.long sys_preadv
 	.long sys_pwritev
+	.long ptregs_clone_with_pids	/* 335 */
diff --git a/kernel/fork.c b/kernel/fork.c
index a16ef7b..7738aed 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1335,6 +1335,97 @@ struct task_struct * __cpuinit fork_idle(int cpu)
 }
 
 /*
+ * If user specified any 'target-pids' in @upid_setp, copy them from
+ * user and return a pointer to a local copy of the list of pids. The
+ * caller must free the list, when they are done using it.
+ *
+ * If user did not specify any target pids, return NULL (caller should
+ * treat this like normal clone).
+ *
+ * On any errors, return the error code
+ */
+static pid_t *copy_target_pids(void __user *upid_setp)
+{
+	int j;
+	int rc;
+	int size;
+	int unum_pids;		/* # of pids specified by user */
+	int knum_pids;		/* # of pids needed in kernel */
+	pid_t *target_pids;
+	struct target_pid_set pid_set;
+
+	if (!upid_setp)
+		return NULL;
+
+	rc = copy_from_user(&pid_set, upid_setp, sizeof(pid_set));
+	if (rc)
+		return ERR_PTR(-EFAULT);
+
+	unum_pids = pid_set.num_pids;
+	knum_pids = task_pid(current)->level + 1;
+
+	if (!unum_pids)
+		return NULL;
+
+	if (unum_pids < 0 || unum_pids > knum_pids)
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * To keep alloc_pid() simple, allocate an extra pid_t in target_pids[]
+	 * and set it to 0. This last entry in target_pids[] corresponds to the
+	 * (yet-to-be-created) descendant pid-namespace if CLONE_NEWPID was
+	 * specified. If CLONE_NEWPID was not specified, this last entry will
+	 * simply be ignored.
+	 */
+	target_pids = kzalloc((knum_pids + 1) * sizeof(pid_t), GFP_KERNEL);
+	if (!target_pids)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * A process running in a level 2 pid namespace has three pid namespaces
+	 * and hence three pid numbers. If this process is checkpointed,
+	 * information about these three namespaces are saved. We refer to these
+	 * namespaces as 'known namespaces'.
+	 *
+	 * If this checkpointed process is however restarted in a level 3 pid
+	 * namespace, the restarted process has an extra ancestor pid namespace
+	 * (i.e 'unknown namespace') and 'knum_pids' exceeds 'unum_pids'.
+	 *
+	 * During restart, the process requests specific pids for its 'known
+	 * namespaces' and lets kernel assign pids to its 'unknown namespaces'.
+	 *
+	 * Since the requested-pids correspond to 'known namespaces' and since
+	 * 'known-namespaces' are younger than (i.e descendants of) 'unknown-
+	 * namespaces', copy requested pids to the back-end of target_pids[]
+	 * (i.e before the last entry for CLONE_NEWPID mentioned above).
+	 * Any entries in target_pids[] not corresponding to a requested pid
+	 * will be set to zero and kernel assigns a pid in those namespaces.
+	 *
+	 * NOTE: The order of pids in target_pids[] is oldest pid namespace to
+	 * 	 youngest (target_pids[0] corresponds to init_pid_ns). i.e.
+	 * 	 the order is:
+	 *
+	 * 		- pids for 'unknown-namespaces' (if any)
+	 * 		- pids for 'known-namespaces' (requested pids)
+	 * 		- 0 in the last entry (for CLONE_NEWPID).
+	 */
+	j = knum_pids - unum_pids;
+	size = unum_pids * sizeof(pid_t);
+
+	rc = copy_from_user(&target_pids[j], pid_set.target_pids, size);
+	if (rc) {
+		rc = -EFAULT;
+		goto out_free;
+	}
+
+	return target_pids;
+
+out_free:
+	kfree(target_pids);
+	return ERR_PTR(rc);
+}
+
+/*
  *  Ok, this is the main fork-routine.
  *
  * It copies the process, and if successful kick-starts
@@ -1351,7 +1442,7 @@ long do_fork_with_pids(unsigned long clone_flags,
 	struct task_struct *p;
 	int trace = 0;
 	long nr;
-	pid_t *target_pids = NULL;
+	pid_t *target_pids;
 
 	/*
 	 * Do some preliminary argument and permissions checking before we
@@ -1385,6 +1476,17 @@ long do_fork_with_pids(unsigned long clone_flags,
 		}
 	}
 
+	target_pids = copy_target_pids(pid_setp);
+
+	if (target_pids) {
+		if (IS_ERR(target_pids))
+			return PTR_ERR(target_pids);
+
+		nr = -EPERM;
+		if (!capable(CAP_SYS_ADMIN))
+			goto out_free;
+	}
+
 	/*
 	 * When called from kernel_thread, don't do user tracing stuff.
 	 */
@@ -1446,6 +1548,10 @@ long do_fork_with_pids(unsigned long clone_flags,
 	} else {
 		nr = PTR_ERR(p);
 	}
+
+out_free:
+	kfree(target_pids);
+
 	return nr;
 }
 
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [RFC][v3][PATCH 7/7] Define clone_with_pids syscall
       [not found]     ` <20090531000350.GF4191-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-05-31 17:59       ` Oren Laadan
  2009-06-01 15:16       ` Serge E. Hallyn
  2009-06-13 18:18       ` Sukadev Bhattiprolu
  2 siblings, 0 replies; 30+ messages in thread
From: Oren Laadan @ 2009-05-31 17:59 UTC (permalink / raw)
  To: Sukadev Bhattiprolu; +Cc: Containers, David C. Hansen


the code looks good. A couple of nits about comments, below.

Reviewed-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>

Sukadev Bhattiprolu wrote:
> Serge,
> 
> I have removed restriction on CLONE_NEWPID and may need a new ack.
> 
> Sukadev
> ---
> From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
> Date: Mon, 4 May 2009 01:17:45 -0700
> Subject: [RFC][v3][PATCH 7/7] Define clone_with_pids syscall
> 
> Container restart requires that a task have the same pid it had when it was
> checkpointed. When containers are nested the tasks within the containers
> exist in multiple pid namespaces and hence have multiple pids to specify
> during restart.
> 
> clone_with_pids(), intended for use during restart, is the same as clone(),
> except that it takes a 'target_pid_set' paramter. This parameter lets caller
> choose specific pid numbers for the child process, in the process's active
> and ancestor pid namespaces. (Descendant pid namespaces in general don't
> matter since processes don't have pids in them anyway, but see comments
> in copy_target_pids() regarding CLONE_NEWPID).
> 
> Unlike clone(), clone_with_pids() needs CAP_SYS_ADMIN, at least for now, to
> prevent unprivileged processes from misusing this interface.
> 
> Call clone_with_pids as follows:
> 
> 	pid_t pids[] = { 0, 77, 99 };
> 	struct target_pid_set pid_set;
> 
> 	pid_set.num_pids = sizeof(pids) / sizeof(int);
> 	pid_set.target_pids = &pids;
> 
> 	syscall(__NR_clone_with_pids, flags, stack, NULL, NULL, NULL, &pid_set);
> 
> If a target-pid is 0, the kernel continues to assign a pid for the process in
> that namespace. In the above example, pids[0] is 0, meaning the kernel will
> assign next available pid to the process in init_pid_ns. But kernel will assign
> pid 77 in the child pid namespace 1 and pid 99 in pid namespace 2. If either
> 77 or 99 are taken, the system call fails with -EBUSY.
> 
> If 'pid_set.num_pids' exceeds the current nesting level of pid namespaces,
> the system call fails with -EINVAL.

Perhaps also add:
" If 'pid_set.num_pids' falls short of the current nesting level of
  pid namespaces, the syscall call assumes extra 0's at the head of
  the array."

> 
> Its mostly an exploratory patch seeking feedback on the interface.
> 
> NOTE:
> 	Compared to clone(), clone_with_pids() needs to pass in two more
> 	pieces of information:
> 
> 		- number of pids in the set
> 		- user buffer containing the list of pids.
> 
> 	But since clone() already takes 5 parameters, use a 'struct
> 	target_pid_set'.
> 
> TODO:
> 	- Gently tested.
> 	- May need additional sanity checks in do_fork_with_pids().
> 
> Changelog[v3]:
> 	- (Oren Laadan) Allow CLONE_NEWPID flag (by allocating an extra pid
> 	  in the target_pids[] list and setting it 0. See copy_target_pids()).
> 	- (Oren Laadan) Specified target pids should apply only to youngest
> 	  pid-namespaces (see copy_target_pids())
> 	- (Matt Helsley) Update patch description.
> 
> Changelog[v2]:
> 	- Remove unnecessary printk and add a note to callers of
> 	  copy_target_pids() to free target_pids.
> 	- (Serge Hallyn) Mention CAP_SYS_ADMIN restriction in patch description.
> 	- (Oren Laadan) Add checks for 'num_pids < 0' (return -EINVAL) and
> 	  'num_pids == 0' (fall back to normal clone()).
> 	- Move arch-independent code (sanity checks and copy-in of target-pids)
> 	  into kernel/fork.c and simplify sys_clone_with_pids()
> 
> Changelog[v1]:
> 	- Fixed some compile errors (had fixed these errors earlier in my
> 	  git tree but had not refreshed patches before emailing them)
> 
> Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
> ---
>  arch/x86/include/asm/syscalls.h    |    1 +
>  arch/x86/include/asm/unistd_32.h   |    1 +
>  arch/x86/kernel/entry_32.S         |    1 +
>  arch/x86/kernel/process_32.c       |   21 +++++++
>  arch/x86/kernel/syscall_table_32.S |    1 +
>  kernel/fork.c                      |  108 +++++++++++++++++++++++++++++++++++-
>  6 files changed, 132 insertions(+), 1 deletions(-)
> 
> diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
> index 7043408..1fdc149 100644
> --- a/arch/x86/include/asm/syscalls.h
> +++ b/arch/x86/include/asm/syscalls.h
> @@ -31,6 +31,7 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *);
>  /* kernel/process_32.c */
>  int sys_fork(struct pt_regs *);
>  int sys_clone(struct pt_regs *);
> +int sys_clone_with_pids(struct pt_regs *);
>  int sys_vfork(struct pt_regs *);
>  int sys_execve(struct pt_regs *);
>  
> diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
> index 6e72d74..90f906f 100644
> --- a/arch/x86/include/asm/unistd_32.h
> +++ b/arch/x86/include/asm/unistd_32.h
> @@ -340,6 +340,7 @@
>  #define __NR_inotify_init1	332
>  #define __NR_preadv		333
>  #define __NR_pwritev		334
> +#define __NR_clone_with_pids	335
>  
>  #ifdef __KERNEL__
>  
> diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
> index c929add..ee92b0d 100644
> --- a/arch/x86/kernel/entry_32.S
> +++ b/arch/x86/kernel/entry_32.S
> @@ -707,6 +707,7 @@ ptregs_##name: \
>  PTREGSCALL(iopl)
>  PTREGSCALL(fork)
>  PTREGSCALL(clone)
> +PTREGSCALL(clone_with_pids)
>  PTREGSCALL(vfork)
>  PTREGSCALL(execve)
>  PTREGSCALL(sigaltstack)
> diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
> index 76f8f84..1efc3de 100644
> --- a/arch/x86/kernel/process_32.c
> +++ b/arch/x86/kernel/process_32.c
> @@ -445,6 +445,27 @@ int sys_clone(struct pt_regs *regs)
>  	return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr);
>  }
>  
> +int sys_clone_with_pids(struct pt_regs *regs)
> +{
> +	unsigned long clone_flags;
> +	unsigned long newsp;
> +	int __user *parent_tidptr;
> +	int __user *child_tidptr;
> +	void __user *upid_setp;
> +
> +	clone_flags = regs->bx;
> +	newsp = regs->cx;
> +	parent_tidptr = (int __user *)regs->dx;
> +	child_tidptr = (int __user *)regs->di;
> +	upid_setp = (void __user *)regs->bp;
> +
> +	if (!newsp)
> +		newsp = regs->sp;
> +
> +	return do_fork_with_pids(clone_flags, newsp, regs, 0, parent_tidptr,
> +			child_tidptr, upid_setp);
> +}
> +
>  /*
>   * sys_execve() executes a new program.
>   */
> diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
> index ff5c873..94c1a58 100644
> --- a/arch/x86/kernel/syscall_table_32.S
> +++ b/arch/x86/kernel/syscall_table_32.S
> @@ -334,3 +334,4 @@ ENTRY(sys_call_table)
>  	.long sys_inotify_init1
>  	.long sys_preadv
>  	.long sys_pwritev
> +	.long ptregs_clone_with_pids	/* 335 */
> diff --git a/kernel/fork.c b/kernel/fork.c
> index a16ef7b..7738aed 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1335,6 +1335,97 @@ struct task_struct * __cpuinit fork_idle(int cpu)
>  }
>  
>  /*
> + * If user specified any 'target-pids' in @upid_setp, copy them from
> + * user and return a pointer to a local copy of the list of pids. The
> + * caller must free the list, when they are done using it.
> + *
> + * If user did not specify any target pids, return NULL (caller should
> + * treat this like normal clone).
> + *
> + * On any errors, return the error code
> + */
> +static pid_t *copy_target_pids(void __user *upid_setp)
> +{
> +	int j;
> +	int rc;
> +	int size;
> +	int unum_pids;		/* # of pids specified by user */
> +	int knum_pids;		/* # of pids needed in kernel */
> +	pid_t *target_pids;
> +	struct target_pid_set pid_set;
> +
> +	if (!upid_setp)
> +		return NULL;
> +
> +	rc = copy_from_user(&pid_set, upid_setp, sizeof(pid_set));
> +	if (rc)
> +		return ERR_PTR(-EFAULT);
> +
> +	unum_pids = pid_set.num_pids;
> +	knum_pids = task_pid(current)->level + 1;
> +
> +	if (!unum_pids)
> +		return NULL;
> +
> +	if (unum_pids < 0 || unum_pids > knum_pids)
> +		return ERR_PTR(-EINVAL);
> +
> +	/*
> +	 * To keep alloc_pid() simple, allocate an extra pid_t in target_pids[]
> +	 * and set it to 0. This last entry in target_pids[] corresponds to the
> +	 * (yet-to-be-created) descendant pid-namespace if CLONE_NEWPID was
> +	 * specified. If CLONE_NEWPID was not specified, this last entry will
> +	 * simply be ignored.
> +	 */
> +	target_pids = kzalloc((knum_pids + 1) * sizeof(pid_t), GFP_KERNEL);
> +	if (!target_pids)
> +		return ERR_PTR(-ENOMEM);
> +
> +	/*
> +	 * A process running in a level 2 pid namespace has three pid namespaces
> +	 * and hence three pid numbers. If this process is checkpointed,
> +	 * information about these three namespaces are saved. We refer to these
> +	 * namespaces as 'known namespaces'.

The number of levels saved at checkpoint depends not only on the
checkpointee, but also on the checkpointer's nesting level.

Perhaps:
  [...and hence three pid numbers.] The process can be checkpointed
  from (its) level 0, 1 or 2. If this process is checkpointed from
  level 1, information about two namespaces is saved. [We refer ...]

> +	 *
> +	 * If this checkpointed process is however restarted in a level 3 pid
> +	 * namespace, the restarted process has an extra ancestor pid namespace
> +	 * (i.e 'unknown namespace') and 'knum_pids' exceeds 'unum_pids'.
> +	 *
> +	 * During restart, the process requests specific pids for its 'known
> +	 * namespaces' and lets kernel assign pids to its 'unknown namespaces'.
> +	 *
> +	 * Since the requested-pids correspond to 'known namespaces' and since
> +	 * 'known-namespaces' are younger than (i.e descendants of) 'unknown-
> +	 * namespaces', copy requested pids to the back-end of target_pids[]
> +	 * (i.e before the last entry for CLONE_NEWPID mentioned above).
> +	 * Any entries in target_pids[] not corresponding to a requested pid
> +	 * will be set to zero and kernel assigns a pid in those namespaces.
> +	 *
> +	 * NOTE: The order of pids in target_pids[] is oldest pid namespace to
> +	 * 	 youngest (target_pids[0] corresponds to init_pid_ns). i.e.
> +	 * 	 the order is:
> +	 *
> +	 * 		- pids for 'unknown-namespaces' (if any)
> +	 * 		- pids for 'known-namespaces' (requested pids)
> +	 * 		- 0 in the last entry (for CLONE_NEWPID).
> +	 */
> +	j = knum_pids - unum_pids;
> +	size = unum_pids * sizeof(pid_t);
> +
> +	rc = copy_from_user(&target_pids[j], pid_set.target_pids, size);
> +	if (rc) {
> +		rc = -EFAULT;
> +		goto out_free;
> +	}
> +
> +	return target_pids;
> +
> +out_free:
> +	kfree(target_pids);
> +	return ERR_PTR(rc);
> +}
> +
> +/*
>   *  Ok, this is the main fork-routine.
>   *
>   * It copies the process, and if successful kick-starts
> @@ -1351,7 +1442,7 @@ long do_fork_with_pids(unsigned long clone_flags,
>  	struct task_struct *p;
>  	int trace = 0;
>  	long nr;
> -	pid_t *target_pids = NULL;
> +	pid_t *target_pids;
>  
>  	/*
>  	 * Do some preliminary argument and permissions checking before we
> @@ -1385,6 +1476,17 @@ long do_fork_with_pids(unsigned long clone_flags,
>  		}
>  	}
>  
> +	target_pids = copy_target_pids(pid_setp);
> +
> +	if (target_pids) {
> +		if (IS_ERR(target_pids))
> +			return PTR_ERR(target_pids);
> +
> +		nr = -EPERM;
> +		if (!capable(CAP_SYS_ADMIN))
> +			goto out_free;
> +	}
> +
>  	/*
>  	 * When called from kernel_thread, don't do user tracing stuff.
>  	 */
> @@ -1446,6 +1548,10 @@ long do_fork_with_pids(unsigned long clone_flags,
>  	} else {
>  		nr = PTR_ERR(p);
>  	}
> +
> +out_free:
> +	kfree(target_pids);
> +
>  	return nr;
>  }
>  

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC][v3][PATCH 1/7] Factor out code to allocate pidmap page
       [not found] ` <20090530235714.GA4083-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
                     ` (5 preceding siblings ...)
  2009-05-31  0:03   ` [RFC][v3][PATCH 7/7] Define clone_with_pids syscall Sukadev Bhattiprolu
@ 2009-06-01  8:16   ` Amerigo Wang
  6 siblings, 0 replies; 30+ messages in thread
From: Amerigo Wang @ 2009-06-01  8:16 UTC (permalink / raw)
  To: Sukadev Bhattiprolu; +Cc: Containers, David C. Hansen

On Sat, May 30, 2009 at 04:57:14PM -0700, Sukadev Bhattiprolu wrote:
>
>From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
>Date: Mon, 4 May 2009 01:17:39 -0700
>Subject: [RFC][v3][PATCH 1/7] Factor out code to allocate pidmap page
>
>To implement support for clone_with_pids() system call we would
>need to allocate pidmap page in more than one place. Move this
>code to a new function alloc_pidmap_page().
>
>Changelog[v2]:
>	- (Matt Helsley, Dave Hansen) Have alloc_pidmap_page() return
>	  -ENOMEM on error instead of -1.
>
>Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
>Acked-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
>Reviewed-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>

Looks good!

Reviewed-by: WANG Cong <xiyou.wangcong-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC][v3][PATCH 2/7] Have alloc_pidmap() return actual error code
       [not found]     ` <20090531000115.GA4191-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-06-01  8:17       ` Amerigo Wang
  0 siblings, 0 replies; 30+ messages in thread
From: Amerigo Wang @ 2009-06-01  8:17 UTC (permalink / raw)
  To: Sukadev Bhattiprolu; +Cc: Containers, David C. Hansen

On Sat, May 30, 2009 at 05:01:15PM -0700, Sukadev Bhattiprolu wrote:
>
>From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
>Date: Mon, 4 May 2009 01:17:40 -0700
>Subject: [RFC][v3][PATCH 2/7] Have alloc_pidmap() return actual error code
>
>alloc_pidmap() can fail either because all pid numbers are in use or
>because memory allocation failed.  With support for setting a specific
>pid number, alloc_pidmap() would also fail if either the given pid
>number is invalid or in use.
>
>Rather than have callers assume -ENOMEM, have alloc_pidmap() return
>the actual error.
>
>Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
>Acked-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
>Reviewed-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>

Reviewed-by: WANG Cong <xiyou.wangcong-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>



>---
> kernel/fork.c |    5 +++--
> kernel/pid.c  |    9 ++++++---
> 2 files changed, 9 insertions(+), 5 deletions(-)
>
>diff --git a/kernel/fork.c b/kernel/fork.c
>index b9e2edd..f8411a8 100644
>--- a/kernel/fork.c
>+++ b/kernel/fork.c
>@@ -1119,10 +1119,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
> 		goto bad_fork_cleanup_io;
> 
> 	if (pid != &init_struct_pid) {
>-		retval = -ENOMEM;
> 		pid = alloc_pid(p->nsproxy->pid_ns);
>-		if (!pid)
>+		if (IS_ERR(pid)) {
>+			retval = PTR_ERR(pid);
> 			goto bad_fork_cleanup_io;
>+		}
> 
> 		if (clone_flags & CLONE_NEWPID) {
> 			retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
>diff --git a/kernel/pid.c b/kernel/pid.c
>index 9ff33cc..b2d6a19 100644
>--- a/kernel/pid.c
>+++ b/kernel/pid.c
>@@ -158,6 +158,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
> 	offset = pid & BITS_PER_PAGE_MASK;
> 	map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
> 	max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
>+	rc = -EAGAIN;
> 	for (i = 0; i <= max_scan; ++i) {
> 		rc = alloc_pidmap_page(map);
> 		if (rc)
>@@ -188,12 +189,14 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
> 		} else {
> 			map = &pid_ns->pidmap[0];
> 			offset = RESERVED_PIDS;
>-			if (unlikely(last == offset))
>+			if (unlikely(last == offset)) {
>+				rc = -EAGAIN;
> 				break;
>+			}
> 		}
> 		pid = mk_pid(pid_ns, map, offset);
> 	}
>-	return -1;
>+	return rc;
> }
> 
> int next_pidmap(struct pid_namespace *pid_ns, int last)
>@@ -298,7 +301,7 @@ out_free:
> 		free_pidmap(pid->numbers + i);
> 
> 	kmem_cache_free(ns->pid_cachep, pid);
>-	pid = NULL;
>+	pid = ERR_PTR(nr);
> 	goto out;
> }
> 
>-- 
>1.5.2.5
>
>_______________________________________________
>Containers mailing list
>Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
>https://lists.linux-foundation.org/mailman/listinfo/containers

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC][v3][PATCH 6/7] Define do_fork_with_pids()
       [not found]     ` <20090531000255.GE4191-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-06-01  8:19       ` Amerigo Wang
       [not found]         ` <20090601081929.GC4381-+dguKlz9DXUf7BdofF/totBPR1lH4CV8@public.gmane.org>
  0 siblings, 1 reply; 30+ messages in thread
From: Amerigo Wang @ 2009-06-01  8:19 UTC (permalink / raw)
  To: Sukadev Bhattiprolu; +Cc: Containers, David C. Hansen

On Sat, May 30, 2009 at 05:02:55PM -0700, Sukadev Bhattiprolu wrote:
>
>From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
>Date: Mon, 4 May 2009 01:17:44 -0700
>Subject: [RFC][v3][PATCH 6/7] Define do_fork_with_pids()
>
>do_fork_with_pids() is same as do_fork(), except that it takes an
>additional, 'pid_set', parameter. This parameter, currently unused,
>specifies the set of target pids of the process in each of its pid
>namespaces.
>
>Changelog[v3]:
>	- Fix "long-line" warning from checkpatch.pl
>
>Changelog[v2]:
>	- To facilitate moving architecture-inpdendent code to kernel/fork.c
>	  pass in 'struct target_pid_set __user *' to do_fork_with_pids()
>	  rather than 'pid_t *' (next patch moves the arch-independent
>	  code to kernel/fork.c)
>
>Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
>Acked-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
>Reviewed-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
>---
> include/linux/sched.h |    3 +++
> include/linux/types.h |    5 +++++
> kernel/fork.c         |   16 ++++++++++++++--
> 3 files changed, 22 insertions(+), 2 deletions(-)
>
>diff --git a/include/linux/sched.h b/include/linux/sched.h
>index b4c38bc..e3bdb86 100644
>--- a/include/linux/sched.h
>+++ b/include/linux/sched.h
>@@ -1995,6 +1995,9 @@ extern int disallow_signal(int);
> 
> extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *);
> extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
>+extern long do_fork_with_pids(unsigned long, unsigned long, struct pt_regs *,
>+				unsigned long, int __user *, int __user *,
>+				struct target_pid_set __user *pid_set);
> struct task_struct *fork_idle(int);
> 
> extern void set_task_comm(struct task_struct *tsk, char *from);
>diff --git a/include/linux/types.h b/include/linux/types.h
>index 5abe354..17ec186 100644
>--- a/include/linux/types.h
>+++ b/include/linux/types.h
>@@ -204,6 +204,11 @@ struct ustat {
> 	char			f_fpack[6];
> };
> 
>+struct target_pid_set {
>+	int num_pids;
>+	pid_t *target_pids;
>+};
>+

A quick question: why not putting this definition into
include/linux/pid.h?

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC][v3][PATCH 3/7] Add target_pid parameter to alloc_pidmap()
       [not found]     ` <20090531000144.GB4191-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-06-01  8:30       ` Amerigo Wang
  0 siblings, 0 replies; 30+ messages in thread
From: Amerigo Wang @ 2009-06-01  8:30 UTC (permalink / raw)
  To: Sukadev Bhattiprolu; +Cc: Containers, David C. Hansen

On Sat, May 30, 2009 at 05:01:44PM -0700, Sukadev Bhattiprolu wrote:
>
>From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
>Date: Mon, 4 May 2009 01:17:41 -0700
>Subject: [RFC][v3][PATCH 3/7] Add target_pid parameter to alloc_pidmap()
>
>With support for setting a specific pid number for a process,
>alloc_pidmap() will need a paramter a 'target_pid' parameter.
>
>Changelog[v2]:
>	- (Serge Hallyn) Check for 'pid < 0' in set_pidmap().(Code
>	  actually checks for 'pid <= 0' for completeness).
>
>Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
>Acked-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
>Reviewed-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>

Reviewed-by: WANG Cong <xiyou.wangcong-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC][v3][PATCH 4/7] Add target_pids parameter to alloc_pid()
       [not found]     ` <20090531000220.GC4191-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-06-01  8:34       ` Amerigo Wang
       [not found]         ` <20090601083419.GE4381-+dguKlz9DXUf7BdofF/totBPR1lH4CV8@public.gmane.org>
  0 siblings, 1 reply; 30+ messages in thread
From: Amerigo Wang @ 2009-06-01  8:34 UTC (permalink / raw)
  To: Sukadev Bhattiprolu; +Cc: Containers, David C. Hansen

On Sat, May 30, 2009 at 05:02:20PM -0700, Sukadev Bhattiprolu wrote:
>
>From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
>Date: Mon, 4 May 2009 01:17:42 -0700
>Subject: [RFC][v3][PATCH 4/7] Add target_pids parameter to alloc_pid()
>
>This parameter is currently NULL, but will be used in a follow-on patch.

<snip>

>diff --git a/kernel/pid.c b/kernel/pid.c
>index b44dd21..090b221 100644
>--- a/kernel/pid.c
>+++ b/kernel/pid.c
>@@ -280,13 +280,14 @@ void free_pid(struct pid *pid)
> 	call_rcu(&pid->rcu, delayed_put_pid);
> }
> 
>-struct pid *alloc_pid(struct pid_namespace *ns)
>+struct pid *alloc_pid(struct pid_namespace *ns, pid_t *target_pids)
> {
> 	struct pid *pid;
> 	enum pid_type type;
> 	int i, nr;
> 	struct pid_namespace *tmp;
> 	struct upid *upid;
>+	int tpid;
> 
> 	pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
> 	if (!pid)
>@@ -294,7 +295,11 @@ struct pid *alloc_pid(struct pid_namespace *ns)
> 
> 	tmp = ns;
> 	for (i = ns->level; i >= 0; i--) {
>-		nr = alloc_pidmap(tmp, 0);
>+		tpid = 0;

How about initializing 'tpid' to 0 when define it?

Otherwise,

Reviewed-by: WANG Cong <xiyou.wangcong-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC][v3][PATCH 5/7] Add target_pids parameter to copy_process()
       [not found]     ` <20090531000237.GD4191-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-06-01  8:40       ` Amerigo Wang
  0 siblings, 0 replies; 30+ messages in thread
From: Amerigo Wang @ 2009-06-01  8:40 UTC (permalink / raw)
  To: Sukadev Bhattiprolu; +Cc: Containers, David C. Hansen

On Sat, May 30, 2009 at 05:02:37PM -0700, Sukadev Bhattiprolu wrote:
>
>From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
>Date: Mon, 4 May 2009 01:17:43 -0700
>Subject: [RFC][v3][PATCH 5/7] Add target_pids parameter to copy_process()
>
>The new parameter will be used in a follow-on patch when clone_with_pids()
>is implemented.
>
>Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
>Acked-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
>Reviewed-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>

Reviewed-by: WANG Cong <xiyou.wangcong-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC][v3][PATCH 7/7] Define clone_with_pids syscall
       [not found]     ` <20090531000350.GF4191-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-05-31 17:59       ` Oren Laadan
@ 2009-06-01 15:16       ` Serge E. Hallyn
       [not found]         ` <20090601151650.GA20295-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-06-13 18:18       ` Sukadev Bhattiprolu
  2 siblings, 1 reply; 30+ messages in thread
From: Serge E. Hallyn @ 2009-06-01 15:16 UTC (permalink / raw)
  To: Sukadev Bhattiprolu; +Cc: Containers, David C. Hansen

Quoting Sukadev Bhattiprolu (sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org):

Bear with me as I try to get this straight...

Let's say we have the following task being checkpointed (btw, the
actual checkpoint code hopefully will, as Oren mentioned, not
be always checkpointing all of a task's pids, but only the
pids from the checkpointer's pidns level an up - but that doesn't
matter to you):

Either:

level no    |    pid
0               1001
1                50
2                 3

or

level no    |    pid
0               1001
1                50
2                 1

Now we want to restart that inside a container.  So for the
first task we have a parent task:

level no       |     pid
0                    5009
1                    1000
2                     49
3                      2

calling clone_with_pid with upids {1001,50,3} to produce task:

level no       |     pid
0                   5010
1                   1001
2                    50
3                     3

So the numbers in your code for this case become:

> +	unum_pids = pid_set.num_pids;
        unum_pids = 3
> +	knum_pids = task_pid(current)->level + 1;
        knum_pids = 3 + 1 = 4

> +	target_pids = kzalloc((knum_pids + 1) * sizeof(pid_t), GFP_KERNEL);

XXX First problem:
        target_pids gets room for 5 pids, one more than we need.

> +	j = knum_pids - unum_pids;
        j = 1, so we copy the 3 pids in the right place.

> +	rc = copy_from_user(&target_pids[j], pid_set.target_pids, size);
> +	if (rc) {
> +		rc = -EFAULT;
> +		goto out_free;
> +	}
> +
> +	return target_pids;


For the second one, we have a parent task

level no       |     pid
0                   5009
1                   1000
2                    49

calling clone_with_pid with CLONE_NEWPID and {1001,50,1} to produce:

level no       |     pid
0                   5010
1                   1001
2                    50
3                    1

So the numbers in your code become:

> +	unum_pids = pid_set.num_pids;
        unum_pids = 3
> +	knum_pids = task_pid(current)->level + 1;
        knum_pids = 2 + 1 = 3

> +	target_pids = kzalloc((knum_pids + 1) * sizeof(pid_t), GFP_KERNEL);

        target_pids gets room for 4 pids

> +	j = knum_pids - unum_pids;

XXX Second problem:
        j = 0, which is not right, should be 1?

> +	rc = copy_from_user(&target_pids[j], pid_set.target_pids, size);
> +	if (rc) {
> +		rc = -EFAULT;
> +		goto out_free;
> +	}
> +
> +	return target_pids;

So it looks like your usage of knum_pids isn't quite right.

(right?)

Or are you expecting userspace to not specify the 1, only 2
upids?  In that case, you're requiring different behavior for
pid 1 than any others, which is kind of weird?  Or is that just
me?

-serge

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC][v3][PATCH 7/7] Define clone_with_pids syscall
       [not found]         ` <20090601151650.GA20295-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-06-01 16:30           ` Oren Laadan
       [not found]             ` <4A240213.70001-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  0 siblings, 1 reply; 30+ messages in thread
From: Oren Laadan @ 2009-06-01 16:30 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Containers, Sukadev Bhattiprolu, David C. Hansen



Serge E. Hallyn wrote:
> Quoting Sukadev Bhattiprolu (sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org):
> 
> Bear with me as I try to get this straight...
> 
> Let's say we have the following task being checkpointed (btw, the
> actual checkpoint code hopefully will, as Oren mentioned, not
> be always checkpointing all of a task's pids, but only the
> pids from the checkpointer's pidns level an up - but that doesn't
> matter to you):
> 
> Either:
> 
> level no    |    pid
> 0               1001
> 1                50
> 2                 3
> 
> or
> 
> level no    |    pid
> 0               1001
> 1                50
> 2                 1
> 
> Now we want to restart that inside a container.  So for the
> first task we have a parent task:
> 
> level no       |     pid
> 0                    5009
> 1                    1000
> 2                     49
> 3                      2
> 
> calling clone_with_pid with upids {1001,50,3} to produce task:
> 
> level no       |     pid
> 0                   5010
> 1                   1001
> 2                    50
> 3                     3
> 
> So the numbers in your code for this case become:
> 
>> +	unum_pids = pid_set.num_pids;
>         unum_pids = 3
>> +	knum_pids = task_pid(current)->level + 1;
>         knum_pids = 3 + 1 = 4
> 
>> +	target_pids = kzalloc((knum_pids + 1) * sizeof(pid_t), GFP_KERNEL);
> 
> XXX First problem:
>         target_pids gets room for 5 pids, one more than we need.

The two issues are related, and this is intentional. The idea
was that when you use CLONE_NEWPID, you imply a new nesting level
and a pid==1 there. So you are not supposed to specify the pid
of that new level.

IOW, the parent specify the pid's of the child from the _parent's_
level and up (of the desired depth). CLONE_NEWPID creates a new
pidns level below the parent's, and that is not covered in the
array of pids.

By allocation an extra slot and forcing it to be 0, we ensure that
the case of CLONE_NEWPID is covered correctly. Clearly, if this
flag isn't set, then the extra slot is redundant (but doesn't hurt).

> 
>> +	j = knum_pids - unum_pids;
>         j = 1, so we copy the 3 pids in the right place.
> 
>> +	rc = copy_from_user(&target_pids[j], pid_set.target_pids, size);
>> +	if (rc) {
>> +		rc = -EFAULT;
>> +		goto out_free;
>> +	}
>> +
>> +	return target_pids;
> 
> 
> For the second one, we have a parent task
> 
> level no       |     pid
> 0                   5009
> 1                   1000
> 2                    49
> 
> calling clone_with_pid with CLONE_NEWPID and {1001,50,1} to produce:
> 
> level no       |     pid
> 0                   5010
> 1                   1001
> 2                    50
> 3                    1
> 
> So the numbers in your code become:
> 
>> +	unum_pids = pid_set.num_pids;
>         unum_pids = 3

This is a "bug" of the parent. The parent should specify the pids
from the parent's level only and up, and not include the new level
below that will be created. (After all, it will have to be 1).

So unum_pids = 3 will not do what you want; instead it will try to
create the process:

0	1001
1	50
2	1
3	1

And will fail, of course, because pid==1 at level 2 is already
taken.

Instead, parent should say use {1001, 50}.

>> +	knum_pids = task_pid(current)->level + 1;
>         knum_pids = 2 + 1 = 3
> 
>> +	target_pids = kzalloc((knum_pids + 1) * sizeof(pid_t), GFP_KERNEL);
> 
>         target_pids gets room for 4 pids
> 
>> +	j = knum_pids - unum_pids;
> 
> XXX Second problem:
>         j = 0, which is not right, should be 1?
> 
>> +	rc = copy_from_user(&target_pids[j], pid_set.target_pids, size);
>> +	if (rc) {
>> +		rc = -EFAULT;
>> +		goto out_free;
>> +	}
>> +
>> +	return target_pids;
> 
> So it looks like your usage of knum_pids isn't quite right.
> 
> (right?)

The main difference is that in the first case the parent forks
from level 3 and in the second case is forks from level 2.  And
the syscall is relative to the parent's nesting level.

> Or are you expecting userspace to not specify the 1, only 2
> upids?  In that case, you're requiring different behavior for
> pid 1 than any others, which is kind of weird?  Or is that just
> me?

Either that, or it's just Suka and myself :p

Oren.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC][v3][PATCH 7/7] Define clone_with_pids syscall
       [not found]             ` <4A240213.70001-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-06-01 16:42               ` Serge E. Hallyn
       [not found]                 ` <20090601164232.GA23252-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-06-01 16:58               ` Oren Laadan
  1 sibling, 1 reply; 30+ messages in thread
From: Serge E. Hallyn @ 2009-06-01 16:42 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Containers, Sukadev Bhattiprolu, David C. Hansen

Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
> The two issues are related, and this is intentional. The idea
> was that when you use CLONE_NEWPID, you imply a new nesting level
> and a pid==1 there. So you are not supposed to specify the pid
> of that new level.
> 
> IOW, the parent specify the pid's of the child from the _parent's_
> level and up (of the desired depth). CLONE_NEWPID creates a new
> pidns level below the parent's, and that is not covered in the
> array of pids.
> 
> By allocation an extra slot and forcing it to be 0, we ensure that
> the case of CLONE_NEWPID is covered correctly. Clearly, if this
> flag isn't set, then the extra slot is redundant (but doesn't hurt).

Ok, I see - I guess I don't mind those semantics.  So:

> >> +	j = knum_pids - unum_pids;
> >         j = 1, so we copy the 3 pids in the right place.
> > 
> >> +	rc = copy_from_user(&target_pids[j], pid_set.target_pids, size);
> >> +	if (rc) {
> >> +		rc = -EFAULT;
> >> +		goto out_free;
> >> +	}
> >> +
> >> +	return target_pids;
> > 
> > 
> > For the second one, we have a parent task
> > 
> > level no       |     pid
> > 0                   5009
> > 1                   1000
> > 2                    49
> > 
> > calling clone_with_pid with CLONE_NEWPID and {1001,50,1} to produce:
> > 
> > level no       |     pid
> > 0                   5010
> > 1                   1001
> > 2                    50
> > 3                    1
> > 
> > So the numbers in your code become:
> > 
> >> +	unum_pids = pid_set.num_pids;
> >         unum_pids = 3
> 
> This is a "bug" of the parent. The parent should specify the pids
> from the parent's level only and up, and not include the new level
> below that will be created. (After all, it will have to be 1).
> 
> So unum_pids = 3 will not do what you want; instead it will try to
> create the process:
> 
> 0	1001
> 1	50
> 2	1
> 3	1
> 
> And will fail, of course, because pid==1 at level 2 is already
> taken.
> 
> Instead, parent should say use {1001, 50}.

Ok, but then we have the task:

level no       |     pid
0                   5009
1                   1000
2                    49

calling clone(CLONE_NEWPID) with unum_pids = 2, so

> 
> >> +	knum_pids = task_pid(current)->level + 1;
> >         knum_pids = 2 + 1 = 3
> > 
> >> +	target_pids = kzalloc((knum_pids + 1) * sizeof(pid_t), GFP_KERNEL);
> > 
> >         target_pids gets room for 4 pids
> > 
> >> +	j = knum_pids - unum_pids;

j = 3 - 2 = 1, so we copy 1001 into pid[1] and
50 into pid[2], with 0 in pid[0] and pid[3]

Looks good.  Thanks for indulging me :)

Acked-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>

One last thought - should there be an explicit check to make sure that
if CLONE_NEWPID, then at the end pid[knum_pids+1] = 0?  Or is that
there and I just missed it?

-serge

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC][v3][PATCH 7/7] Define clone_with_pids syscall
       [not found]                 ` <20090601164232.GA23252-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-06-01 16:54                   ` Oren Laadan
       [not found]                     ` <4A2407B2.8030304-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  0 siblings, 1 reply; 30+ messages in thread
From: Oren Laadan @ 2009-06-01 16:54 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Containers, Sukadev Bhattiprolu, David C. Hansen



Serge E. Hallyn wrote:
> Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>> The two issues are related, and this is intentional. The idea
>> was that when you use CLONE_NEWPID, you imply a new nesting level
>> and a pid==1 there. So you are not supposed to specify the pid
>> of that new level.
>>
>> IOW, the parent specify the pid's of the child from the _parent's_
>> level and up (of the desired depth). CLONE_NEWPID creates a new
>> pidns level below the parent's, and that is not covered in the
>> array of pids.
>>
>> By allocation an extra slot and forcing it to be 0, we ensure that
>> the case of CLONE_NEWPID is covered correctly. Clearly, if this
>> flag isn't set, then the extra slot is redundant (but doesn't hurt).
> 
> Ok, I see - I guess I don't mind those semantics.  So:
> 
>>>> +	j = knum_pids - unum_pids;
>>>         j = 1, so we copy the 3 pids in the right place.
>>>
>>>> +	rc = copy_from_user(&target_pids[j], pid_set.target_pids, size);
>>>> +	if (rc) {
>>>> +		rc = -EFAULT;
>>>> +		goto out_free;
>>>> +	}
>>>> +
>>>> +	return target_pids;
>>>
>>> For the second one, we have a parent task
>>>
>>> level no       |     pid
>>> 0                   5009
>>> 1                   1000
>>> 2                    49
>>>
>>> calling clone_with_pid with CLONE_NEWPID and {1001,50,1} to produce:
>>>
>>> level no       |     pid
>>> 0                   5010
>>> 1                   1001
>>> 2                    50
>>> 3                    1
>>>
>>> So the numbers in your code become:
>>>
>>>> +	unum_pids = pid_set.num_pids;
>>>         unum_pids = 3
>> This is a "bug" of the parent. The parent should specify the pids
>> from the parent's level only and up, and not include the new level
>> below that will be created. (After all, it will have to be 1).
>>
>> So unum_pids = 3 will not do what you want; instead it will try to
>> create the process:
>>
>> 0	1001
>> 1	50
>> 2	1
>> 3	1
>>
>> And will fail, of course, because pid==1 at level 2 is already
>> taken.
>>
>> Instead, parent should say use {1001, 50}.
> 
> Ok, but then we have the task:
> 
> level no       |     pid
> 0                   5009
> 1                   1000
> 2                    49
> 
> calling clone(CLONE_NEWPID) with unum_pids = 2, so
> 
>>>> +	knum_pids = task_pid(current)->level + 1;
>>>         knum_pids = 2 + 1 = 3
>>>
>>>> +	target_pids = kzalloc((knum_pids + 1) * sizeof(pid_t), GFP_KERNEL);
>>>         target_pids gets room for 4 pids
>>>
>>>> +	j = knum_pids - unum_pids;
> 
> j = 3 - 2 = 1, so we copy 1001 into pid[1] and
> 50 into pid[2], with 0 in pid[0] and pid[3]
> 
> Looks good.  Thanks for indulging me :)
> 
> Acked-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
> 
> One last thought - should there be an explicit check to make sure that
> if CLONE_NEWPID, then at the end pid[knum_pids+1] = 0?  Or is that
> there and I just missed it?

the wonders of kzalloc() ...

Oren.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC][v3][PATCH 7/7] Define clone_with_pids syscall
       [not found]             ` <4A240213.70001-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  2009-06-01 16:42               ` Serge E. Hallyn
@ 2009-06-01 16:58               ` Oren Laadan
  1 sibling, 0 replies; 30+ messages in thread
From: Oren Laadan @ 2009-06-01 16:58 UTC (permalink / raw)
  To: Sukadev Bhattiprolu; +Cc: Containers, David C. Hansen



Oren Laadan wrote:
> 
> Serge E. Hallyn wrote:
>> Quoting Sukadev Bhattiprolu (sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org):
>>
>> Bear with me as I try to get this straight...
>>
>> Let's say we have the following task being checkpointed (btw, the
>> actual checkpoint code hopefully will, as Oren mentioned, not
>> be always checkpointing all of a task's pids, but only the
>> pids from the checkpointer's pidns level an up - but that doesn't
>> matter to you):
>>
>> Either:
>>
>> level no    |    pid
>> 0               1001
>> 1                50
>> 2                 3
>>
>> or
>>
>> level no    |    pid
>> 0               1001
>> 1                50
>> 2                 1
>>
>> Now we want to restart that inside a container.  So for the
>> first task we have a parent task:
>>
>> level no       |     pid
>> 0                    5009
>> 1                    1000
>> 2                     49
>> 3                      2
>>
>> calling clone_with_pid with upids {1001,50,3} to produce task:
>>
>> level no       |     pid
>> 0                   5010
>> 1                   1001
>> 2                    50
>> 3                     3
>>
>> So the numbers in your code for this case become:
>>
>>> +	unum_pids = pid_set.num_pids;
>>         unum_pids = 3
>>> +	knum_pids = task_pid(current)->level + 1;
>>         knum_pids = 3 + 1 = 4
>>
>>> +	target_pids = kzalloc((knum_pids + 1) * sizeof(pid_t), GFP_KERNEL);
>> XXX First problem:
>>         target_pids gets room for 5 pids, one more than we need.
> 
> The two issues are related, and this is intentional. The idea
> was that when you use CLONE_NEWPID, you imply a new nesting level
> and a pid==1 there. So you are not supposed to specify the pid
> of that new level.
> 
> IOW, the parent specify the pid's of the child from the _parent's_
> level and up (of the desired depth). CLONE_NEWPID creates a new
> pidns level below the parent's, and that is not covered in the
> array of pids.
> 
> By allocation an extra slot and forcing it to be 0, we ensure that
> the case of CLONE_NEWPID is covered correctly. Clearly, if this
> flag isn't set, then the extra slot is redundant (but doesn't hurt).
> 
>>> +	j = knum_pids - unum_pids;
>>         j = 1, so we copy the 3 pids in the right place.
>>
>>> +	rc = copy_from_user(&target_pids[j], pid_set.target_pids, size);
>>> +	if (rc) {
>>> +		rc = -EFAULT;
>>> +		goto out_free;
>>> +	}
>>> +
>>> +	return target_pids;
>>
>> For the second one, we have a parent task
>>
>> level no       |     pid
>> 0                   5009
>> 1                   1000
>> 2                    49
>>
>> calling clone_with_pid with CLONE_NEWPID and {1001,50,1} to produce:
>>
>> level no       |     pid
>> 0                   5010
>> 1                   1001
>> 2                    50
>> 3                    1
>>
>> So the numbers in your code become:
>>
>>> +	unum_pids = pid_set.num_pids;
>>         unum_pids = 3
> 
> This is a "bug" of the parent. The parent should specify the pids
> from the parent's level only and up, and not include the new level
> below that will be created. (After all, it will have to be 1).

Suka: maybe state this explicitly,too, in the patch description
and in the comments in the code ?   Like:

"The syscall operates relative to the parent's nesting level. If
CLONE_NEWPID is used as well, it will create an additional level
below. In this case the syscall indicates the desired pids at the
nesting levels seen by the parent."

Oren.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC][v3][PATCH 7/7] Define clone_with_pids syscall
       [not found]                     ` <4A2407B2.8030304-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-06-01 17:19                       ` Serge E. Hallyn
       [not found]                         ` <20090601171943.GA23878-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 30+ messages in thread
From: Serge E. Hallyn @ 2009-06-01 17:19 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Containers, Sukadev Bhattiprolu, David C. Hansen

Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
> > One last thought - should there be an explicit check to make sure that
> > if CLONE_NEWPID, then at the end pid[knum_pids+1] = 0?  Or is that
> > there and I just missed it?
> 
> the wonders of kzalloc() ...

No.  I'm saying that I don't see anything stopping the user from
doing CLONE_NEWPID while specifying an extra pid, so that they
end up trying to create a pidns init with vpid 5

-serge

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC][v3][PATCH 7/7] Define clone_with_pids syscall
       [not found]                         ` <20090601171943.GA23878-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-06-01 19:35                           ` Oren Laadan
       [not found]                             ` <4A242D66.2070907-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  0 siblings, 1 reply; 30+ messages in thread
From: Oren Laadan @ 2009-06-01 19:35 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Containers, Sukadev Bhattiprolu, David C. Hansen



Serge E. Hallyn wrote:
> Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>>> One last thought - should there be an explicit check to make sure that
>>> if CLONE_NEWPID, then at the end pid[knum_pids+1] = 0?  Or is that
>>> there and I just missed it?
>> the wonders of kzalloc() ...
> 
> No.  I'm saying that I don't see anything stopping the user from
> doing CLONE_NEWPID while specifying an extra pid, so that they
> end up trying to create a pidns init with vpid 5

The last slot in the array is always zero. The size of the array
is parent's nesting level + 2 (one for counting from zero, and
one for the new pidns, if any).

In your example, the parent is in level 2, so the array is of
size 4. The parent should provide an array of size 2, and then
clone_with_pid() will attempt to set pids of the child at levels
1 and 2 (level 0 is set by the kernel).

If however the parent gives an array of size 3, clone_with_pid()
will try to set pids of the child at level 0, 1 and 2 instead.
level 3 is below the parent, and will always use the last slot
in the (kernel) array.

Finally, for an array of size 4, clone_with_pid() will return
an error.

Did I entirely misread your question ?

Oren.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC][v3][PATCH 7/7] Define clone_with_pids syscall
       [not found]                             ` <4A242D66.2070907-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-06-01 19:54                               ` Serge E. Hallyn
  0 siblings, 0 replies; 30+ messages in thread
From: Serge E. Hallyn @ 2009-06-01 19:54 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Containers, Sukadev Bhattiprolu, David C. Hansen

Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
> Did I entirely misread your question ?

Maybe, maybe not, but I pulled my pen+paper back out and
convinced myself it looks ok :)

So again,

Reviewed-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>

thanks,
-serge

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC][v3][PATCH 4/7] Add target_pids parameter to alloc_pid()
       [not found]         ` <20090601083419.GE4381-+dguKlz9DXUf7BdofF/totBPR1lH4CV8@public.gmane.org>
@ 2009-06-01 20:52           ` Sukadev Bhattiprolu
       [not found]             ` <20090601205233.GB1812-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 30+ messages in thread
From: Sukadev Bhattiprolu @ 2009-06-01 20:52 UTC (permalink / raw)
  To: Amerigo Wang; +Cc: Containers, David C. Hansen

Amerigo Wang [xiyou.wangcong-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org] wrote:
| >@@ -294,7 +295,11 @@ struct pid *alloc_pid(struct pid_namespace *ns)
| > 
| > 	tmp = ns;
| > 	for (i = ns->level; i >= 0; i--) {
| >-		nr = alloc_pidmap(tmp, 0);
| >+		tpid = 0;
| 
| How about initializing 'tpid' to 0 when define it?

'tpid' may contain a non-zero value from the previous iteration and
needs to be reset each time.

| 
| Otherwise,
| 
| Reviewed-by: WANG Cong <xiyou.wangcong-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

Thanks for the review.

Sukadev

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC][v3][PATCH 4/7] Add target_pids parameter to alloc_pid()
       [not found]             ` <20090601205233.GB1812-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-06-01 21:01               ` Sukadev Bhattiprolu
  0 siblings, 0 replies; 30+ messages in thread
From: Sukadev Bhattiprolu @ 2009-06-01 21:01 UTC (permalink / raw)
  To: Amerigo Wang; +Cc: Containers, David C. Hansen

Sukadev Bhattiprolu [sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org] wrote:
| Amerigo Wang [xiyou.wangcong-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org] wrote:
| | >@@ -294,7 +295,11 @@ struct pid *alloc_pid(struct pid_namespace *ns)
| | > 
| | > 	tmp = ns;
| | > 	for (i = ns->level; i >= 0; i--) {
| | >-		nr = alloc_pidmap(tmp, 0);
| | >+		tpid = 0;
| | 
| | How about initializing 'tpid' to 0 when define it?
| 
| 'tpid' may contain a non-zero value from the previous iteration and
| needs to be reset each time.

Actually, you are right. target_pids does not change, so tpid could
be set to 0 before the loop.

Sukadev

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC][v3][PATCH 6/7] Define do_fork_with_pids()
       [not found]         ` <20090601081929.GC4381-+dguKlz9DXUf7BdofF/totBPR1lH4CV8@public.gmane.org>
@ 2009-06-03  0:35           ` Sukadev Bhattiprolu
       [not found]             ` <20090603003522.GA22704-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 30+ messages in thread
From: Sukadev Bhattiprolu @ 2009-06-03  0:35 UTC (permalink / raw)
  To: Amerigo Wang; +Cc: Containers, David C. Hansen

| >diff --git a/include/linux/types.h b/include/linux/types.h
| >index 5abe354..17ec186 100644
| >--- a/include/linux/types.h
| >+++ b/include/linux/types.h
| >@@ -204,6 +204,11 @@ struct ustat {
| > 	char			f_fpack[6];
| > };
| > 
| >+struct target_pid_set {
| >+	int num_pids;
| >+	pid_t *target_pids;
| >+};
| >+
| 
| A quick question: why not putting this definition into
| include/linux/pid.h?

Well, the data structure needs to be visible to applications so I
chose types.h since its already listed in fork(2) man page and the
related type, 'pid_t' is defined there.

Isn't pid.h is a kernel-only file ?


Sukadev

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC][v3][PATCH 6/7] Define do_fork_with_pids()
       [not found]             ` <20090603003522.GA22704-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-06-04  3:07               ` Oren Laadan
       [not found]                 ` <Pine.LNX.4.64.0906032306240.25421-CXF6herHY6ykSYb+qCZC/1i27PF6R63G9nwVQlTi/Pw@public.gmane.org>
  0 siblings, 1 reply; 30+ messages in thread
From: Oren Laadan @ 2009-06-04  3:07 UTC (permalink / raw)
  To: Sukadev Bhattiprolu; +Cc: Containers, David C. Hansen

On Tue, 2 Jun 2009, Sukadev Bhattiprolu wrote:

> | >diff --git a/include/linux/types.h b/include/linux/types.h
> | >index 5abe354..17ec186 100644
> | >--- a/include/linux/types.h
> | >+++ b/include/linux/types.h
> | >@@ -204,6 +204,11 @@ struct ustat {
> | > 	char			f_fpack[6];
> | > };
> | > 
> | >+struct target_pid_set {
> | >+	int num_pids;
> | >+	pid_t *target_pids;
> | >+};
> | >+
> | 
> | A quick question: why not putting this definition into
> | include/linux/pid.h?
> 
> Well, the data structure needs to be visible to applications so I
> chose types.h since its already listed in fork(2) man page and the
> related type, 'pid_t' is defined there.

I don't recall linux/types.h mentioned in the manpage.

And, heh, it ended up within #ifdef __KERNEL__ there...
In fact, the types defined in that header are for kernel use.

Oren.

> 
> Isn't pid.h is a kernel-only file ?
> 
> 
> Sukadev
> 
> 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC][v3][PATCH 6/7] Define do_fork_with_pids()
       [not found]                 ` <Pine.LNX.4.64.0906032306240.25421-CXF6herHY6ykSYb+qCZC/1i27PF6R63G9nwVQlTi/Pw@public.gmane.org>
@ 2009-06-04  7:21                   ` Amerigo Wang
       [not found]                     ` <20090604072112.GA6856-+dguKlz9DXUf7BdofF/totBPR1lH4CV8@public.gmane.org>
  0 siblings, 1 reply; 30+ messages in thread
From: Amerigo Wang @ 2009-06-04  7:21 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Containers, Sukadev Bhattiprolu, David C. Hansen

On Wed, Jun 03, 2009 at 11:07:40PM -0400, Oren Laadan wrote:
>On Tue, 2 Jun 2009, Sukadev Bhattiprolu wrote:
>
>> | >diff --git a/include/linux/types.h b/include/linux/types.h
>> | >index 5abe354..17ec186 100644
>> | >--- a/include/linux/types.h
>> | >+++ b/include/linux/types.h
>> | >@@ -204,6 +204,11 @@ struct ustat {
>> | > 	char			f_fpack[6];
>> | > };
>> | > 
>> | >+struct target_pid_set {
>> | >+	int num_pids;
>> | >+	pid_t *target_pids;
>> | >+};
>> | >+
>> | 
>> | A quick question: why not putting this definition into
>> | include/linux/pid.h?
>> 
>> Well, the data structure needs to be visible to applications so I
>> chose types.h since its already listed in fork(2) man page and the
>> related type, 'pid_t' is defined there.
>
>I don't recall linux/types.h mentioned in the manpage.

In fact, linux/types.h for userspace only has a few types
for endians...

>
>And, heh, it ended up within #ifdef __KERNEL__ there...
>In fact, the types defined in that header are for kernel use.

Yup, I believe Sukadev put it in a wrong position.

Thanks.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC][v3][PATCH 6/7] Define do_fork_with_pids()
       [not found]                     ` <20090604072112.GA6856-+dguKlz9DXUf7BdofF/totBPR1lH4CV8@public.gmane.org>
@ 2009-06-04 15:41                       ` Sukadev Bhattiprolu
  0 siblings, 0 replies; 30+ messages in thread
From: Sukadev Bhattiprolu @ 2009-06-04 15:41 UTC (permalink / raw)
  To: Amerigo Wang; +Cc: Containers, David C. Hansen

Amerigo Wang [xiyou.wangcong-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org] wrote:
| On Wed, Jun 03, 2009 at 11:07:40PM -0400, Oren Laadan wrote:
| >On Tue, 2 Jun 2009, Sukadev Bhattiprolu wrote:
| >
| >> | >diff --git a/include/linux/types.h b/include/linux/types.h
| >> | >index 5abe354..17ec186 100644
| >> | >--- a/include/linux/types.h
| >> | >+++ b/include/linux/types.h
| >> | >@@ -204,6 +204,11 @@ struct ustat {
| >> | > 	char			f_fpack[6];
| >> | > };
| >> | > 
| >> | >+struct target_pid_set {
| >> | >+	int num_pids;
| >> | >+	pid_t *target_pids;
| >> | >+};
| >> | >+
| >> | 
| >> | A quick question: why not putting this definition into
| >> | include/linux/pid.h?
| >> 
| >> Well, the data structure needs to be visible to applications so I
| >> chose types.h since its already listed in fork(2) man page and the
| >> related type, 'pid_t' is defined there.
| >
| >I don't recall linux/types.h mentioned in the manpage.
| 
| In fact, linux/types.h for userspace only has a few types
| for endians...

Hmm, user space's <linux/types.h> does not seem to be restricted to endianess.

On both RHEL5.2 and Ubuntu 7.10, I see fd_set, dev_t, ino_t etc. Lot of these
definitions correspond to what is in the kernel source's include/linux/types.h,

One difference though is in the kernel source tree, they are under 
#ifdef __KERNEL__ while in user's linux/include, they are under 
#ifndef __KERNEL_STRICT_NAMES.

I am trying to find out how new stuff in kernel's include/linux/types.h
ends up in user's <linux/types.h>. It may require a separate patch to
the kernel headers package.

| 
| >
| >And, heh, it ended up within #ifdef __KERNEL__ there...
| >In fact, the types defined in that header are for kernel use.
| 
| Yup, I believe Sukadev put it in a wrong position.
| 
| Thanks.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC][v3][PATCH 7/7] Define clone_with_pids syscall
       [not found]     ` <20090531000350.GF4191-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-05-31 17:59       ` Oren Laadan
  2009-06-01 15:16       ` Serge E. Hallyn
@ 2009-06-13 18:18       ` Sukadev Bhattiprolu
       [not found]         ` <20090613181838.GA2775-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2 siblings, 1 reply; 30+ messages in thread
From: Sukadev Bhattiprolu @ 2009-06-13 18:18 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Containers, David C. Hansen

| Call clone_with_pids as follows:
| 
| 	pid_t pids[] = { 0, 77, 99 };
| 	struct target_pid_set pid_set;
| 
| 	pid_set.num_pids = sizeof(pids) / sizeof(int);
| 	pid_set.target_pids = &pids;
| 
| 	syscall(__NR_clone_with_pids, flags, stack, NULL, NULL, NULL, &pid_set);


Now that we need to do a copy_from_user() anyway, is it worth collapsing
one or more parameters into a structure ?

	struct clone_with_pid_args {
		u64 flags;
		void *stack;
		void *tls;
		struct target_pid_set *pid_set;
	};

	struct clone_tid_info {
		int *parent_tidptr;
		int *child_tidptr;
	}

	struct clone_with_pid_args cwp_args;
	struct clone_tid_info ctid_inf;

	...

	syscall(__NR_clone_with_pids, &cwp_args, &ctid_info);

One advantage would be that it would make room for 64 bit clone flags, so we
won't need a new flavor of clone_with_pids() when clone_flags is extended.

Sukadev

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC][v3][PATCH 7/7] Define clone_with_pids syscall
       [not found]         ` <20090613181838.GA2775-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-06-13 20:22           ` Oren Laadan
  0 siblings, 0 replies; 30+ messages in thread
From: Oren Laadan @ 2009-06-13 20:22 UTC (permalink / raw)
  To: Sukadev Bhattiprolu; +Cc: Containers, David C. Hansen



Sukadev Bhattiprolu wrote:
> | Call clone_with_pids as follows:
> | 
> | 	pid_t pids[] = { 0, 77, 99 };
> | 	struct target_pid_set pid_set;
> | 
> | 	pid_set.num_pids = sizeof(pids) / sizeof(int);
> | 	pid_set.target_pids = &pids;
> | 
> | 	syscall(__NR_clone_with_pids, flags, stack, NULL, NULL, NULL, &pid_set);
> 
> 
> Now that we need to do a copy_from_user() anyway, is it worth collapsing
> one or more parameters into a structure ?

Definitely so.

> 
> 	struct clone_with_pid_args {
> 		u64 flags;
> 		void *stack;
> 		void *tls;
> 		struct target_pid_set *pid_set;
> 	};
> 
> 	struct clone_tid_info {
> 		int *parent_tidptr;
> 		int *child_tidptr;
> 	}
> 
> 	struct clone_with_pid_args cwp_args;
> 	struct clone_tid_info ctid_inf;
> 
> 	...
> 
> 	syscall(__NR_clone_with_pids, &cwp_args, &ctid_info);
> 
> One advantage would be that it would make room for 64 bit clone flags, so we
> won't need a new flavor of clone_with_pids() when clone_flags is extended.

Because clone_with_pids() is a super-set of clone(), it is possible
that it (itself) becomes the successor that allows 64-bit clone flags.
(instead of, say, adding a clone64, clone2, clone_next_generation etc).

To allow this, we need to be careful to not copy_from_user for the
when fast-path is needed for common thread/task creation, if possible.

Oren.

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2009-06-13 20:22 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-05-30 23:57 [RFC][v3][PATCH 1/7] Factor out code to allocate pidmap page Sukadev Bhattiprolu
     [not found] ` <20090530235714.GA4083-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-05-31  0:01   ` [RFC][v3][PATCH 2/7] Have alloc_pidmap() return actual error code Sukadev Bhattiprolu
     [not found]     ` <20090531000115.GA4191-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-06-01  8:17       ` Amerigo Wang
2009-05-31  0:01   ` [RFC][v3][PATCH 3/7] Add target_pid parameter to alloc_pidmap() Sukadev Bhattiprolu
     [not found]     ` <20090531000144.GB4191-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-06-01  8:30       ` Amerigo Wang
2009-05-31  0:02   ` [RFC][v3][PATCH 4/7] Add target_pids parameter to alloc_pid() Sukadev Bhattiprolu
     [not found]     ` <20090531000220.GC4191-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-06-01  8:34       ` Amerigo Wang
     [not found]         ` <20090601083419.GE4381-+dguKlz9DXUf7BdofF/totBPR1lH4CV8@public.gmane.org>
2009-06-01 20:52           ` Sukadev Bhattiprolu
     [not found]             ` <20090601205233.GB1812-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-06-01 21:01               ` Sukadev Bhattiprolu
2009-05-31  0:02   ` [RFC][v3][PATCH 5/7] Add target_pids parameter to copy_process() Sukadev Bhattiprolu
     [not found]     ` <20090531000237.GD4191-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-06-01  8:40       ` Amerigo Wang
2009-05-31  0:02   ` [RFC][v3][PATCH 6/7] Define do_fork_with_pids() Sukadev Bhattiprolu
     [not found]     ` <20090531000255.GE4191-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-06-01  8:19       ` Amerigo Wang
     [not found]         ` <20090601081929.GC4381-+dguKlz9DXUf7BdofF/totBPR1lH4CV8@public.gmane.org>
2009-06-03  0:35           ` Sukadev Bhattiprolu
     [not found]             ` <20090603003522.GA22704-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-06-04  3:07               ` Oren Laadan
     [not found]                 ` <Pine.LNX.4.64.0906032306240.25421-CXF6herHY6ykSYb+qCZC/1i27PF6R63G9nwVQlTi/Pw@public.gmane.org>
2009-06-04  7:21                   ` Amerigo Wang
     [not found]                     ` <20090604072112.GA6856-+dguKlz9DXUf7BdofF/totBPR1lH4CV8@public.gmane.org>
2009-06-04 15:41                       ` Sukadev Bhattiprolu
2009-05-31  0:03   ` [RFC][v3][PATCH 7/7] Define clone_with_pids syscall Sukadev Bhattiprolu
     [not found]     ` <20090531000350.GF4191-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-05-31 17:59       ` Oren Laadan
2009-06-01 15:16       ` Serge E. Hallyn
     [not found]         ` <20090601151650.GA20295-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-06-01 16:30           ` Oren Laadan
     [not found]             ` <4A240213.70001-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-06-01 16:42               ` Serge E. Hallyn
     [not found]                 ` <20090601164232.GA23252-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-06-01 16:54                   ` Oren Laadan
     [not found]                     ` <4A2407B2.8030304-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-06-01 17:19                       ` Serge E. Hallyn
     [not found]                         ` <20090601171943.GA23878-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-06-01 19:35                           ` Oren Laadan
     [not found]                             ` <4A242D66.2070907-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-06-01 19:54                               ` Serge E. Hallyn
2009-06-01 16:58               ` Oren Laadan
2009-06-13 18:18       ` Sukadev Bhattiprolu
     [not found]         ` <20090613181838.GA2775-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-06-13 20:22           ` Oren Laadan
2009-06-01  8:16   ` [RFC][v3][PATCH 1/7] Factor out code to allocate pidmap page Amerigo Wang

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.