All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH linux-cr] nested pid namespaces (v2)
@ 2010-03-19 21:39 Serge E. Hallyn
  2010-03-19 21:41 ` [PATCH] user-ns: Nested pidns support (v2) Serge E. Hallyn
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Serge E. Hallyn @ 2010-03-19 21:39 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Linux Containers, lkml

Support checkpoint and restart of tasks in nested pid namespaces.  At
Oren's request here is an alternative to my previous implementation.  In
this one, we keep the original single pids_array to minimize memory
allocations.  The pids array entries are augmented with a pidns depth
(relative to the container init's pidns, and an "rpid" which is the pid
in the checkpointer's pidns (or 0 if no valid pid exists).  The rpid
will be used by userspace to gather more information (like
/proc/$$/mountinfo) after the kernel sys_checkpoint.  If any tasks are
in nested pid namespace, another single array holds all of the vpids.
At restart those are used by userspace to determine how to call
eclone().  Kernel ignores them.

All cr_tests including the new pid_ns testcase pass.

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
---
 checkpoint/checkpoint.c          |  113 ++++++++++++++++++++++++++++++++++----
 checkpoint/process.c             |   18 +++++-
 checkpoint/restart.c             |   45 ++++++++++++++-
 checkpoint/sys.c                 |    2 +
 include/linux/checkpoint.h       |    2 +-
 include/linux/checkpoint_hdr.h   |   16 +++++
 include/linux/checkpoint_types.h |    3 +
 kernel/nsproxy.c                 |    9 ++-
 8 files changed, 186 insertions(+), 22 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index f27af41..fe3546a 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -27,6 +27,7 @@
 #include <linux/deferqueue.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
+#include <linux/pid_namespace.h>
 
 /* unique checkpoint identifier (FIXME: should be per-container ?) */
 static atomic_t ctx_count = ATOMIC_INIT(0);
@@ -242,6 +243,7 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
 	struct task_struct *root = ctx->root_task;
 	struct nsproxy *nsproxy;
 	int ret = 0;
+	struct pid_namespace *pidns;
 
 	ckpt_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns));
 
@@ -293,10 +295,15 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
 		_ckpt_err(ctx, -EPERM, "%(T)Nested net_ns unsupported\n");
 		ret = -EPERM;
 	}
-	/* no support for >1 private pidns */
-	if (nsproxy->pid_ns != ctx->root_nsproxy->pid_ns) {
-		_ckpt_err(ctx, -EPERM, "%(T)Nested pid_ns unsupported\n");
-		ret = -EPERM;
+	/* pidns must be descendent of root_nsproxy */
+	pidns = nsproxy->pid_ns;
+	while (pidns != ctx->root_nsproxy->pid_ns) {
+		if (pidns == &init_pid_ns) {
+			ret = -EPERM;
+			_ckpt_err(ctx, ret, "%(T)stranger pid_ns\n");
+			break;
+		}
+		pidns = pidns->parent;
 	}
 	rcu_read_unlock();
 
@@ -305,15 +312,19 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
 
 #define CKPT_HDR_PIDS_CHUNK	256
 
+/*
+ * Write the pids in ctx->root_nsproxy->pidns.  This info is
+ * needed at restart to unambiguously dereference tasks.
+ */
 static int checkpoint_pids(struct ckpt_ctx *ctx)
 {
 	struct ckpt_pids *h;
-	struct pid_namespace *ns;
+	struct pid_namespace *root_pidns;
 	struct task_struct *task;
 	struct task_struct **tasks_arr;
 	int nr_tasks, n, pos = 0, ret = 0;
 
-	ns = ctx->root_nsproxy->pid_ns;
+	root_pidns = ctx->root_nsproxy->pid_ns;
 	tasks_arr = ctx->tasks_arr;
 	nr_tasks = ctx->nr_tasks;
 	BUG_ON(nr_tasks <= 0);
@@ -331,15 +342,21 @@ static int checkpoint_pids(struct ckpt_ctx *ctx)
 	do {
 		rcu_read_lock();
 		for (n = 0; n < min(nr_tasks, CKPT_HDR_PIDS_CHUNK); n++) {
+			struct pid_namespace *task_pidns;
 			task = tasks_arr[pos];
 
-			h[n].vpid = task_pid_nr_ns(task, ns);
-			h[n].vtgid = task_tgid_nr_ns(task, ns);
-			h[n].vpgid = task_pgrp_nr_ns(task, ns);
-			h[n].vsid = task_session_nr_ns(task, ns);
-			h[n].vppid = task_tgid_nr_ns(task->real_parent, ns);
+			h[n].vpid = task_pid_nr_ns(task, root_pidns);
+			h[n].vtgid = task_tgid_nr_ns(task, root_pidns);
+			h[n].vpgid = task_pgrp_nr_ns(task, root_pidns);
+			h[n].vsid = task_session_nr_ns(task, root_pidns);
+			h[n].vppid = task_tgid_nr_ns(task->real_parent,
+					root_pidns);
+			task_pidns = task_nsproxy(task)->pid_ns;
+			h[n].rpid = task_pid_vnr(task);
+			h[n].depth = task_pidns->level - root_pidns->level;
 			ckpt_debug("task[%d]: vpid %d vtgid %d parent %d\n",
 				   pos, h[n].vpid, h[n].vtgid, h[n].vppid);
+			ctx->nr_vpids += h[n].depth;
 			pos++;
 		}
 		rcu_read_unlock();
@@ -356,6 +373,61 @@ static int checkpoint_pids(struct ckpt_ctx *ctx)
 	return ret;
 }
 
+static int checkpoint_vpids(struct ckpt_ctx *ctx)
+{
+	struct ckpt_vpid *h;
+	struct pid_namespace *root_pidns, *task_pidns;
+	struct task_struct *task;
+	int ret, nr_tasks = ctx->nr_tasks;
+	int tidx = 0, /* index into task array */
+		hidx = 0; /* pids written into current ckpt_vpids chunk */
+
+	root_pidns = ctx->root_nsproxy->pid_ns;
+	nr_tasks = ctx->nr_tasks;
+
+	ret = ckpt_write_obj_type(ctx, NULL,
+				  sizeof(*h) * ctx->nr_vpids,
+				  CKPT_HDR_BUFFER);
+	if (ret < 0)
+		return ret;
+
+	h = ckpt_hdr_get(ctx, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
+	if (!h)
+		return -ENOMEM;
+
+	do {
+		rcu_read_lock();
+		while (tidx < nr_tasks) {
+			int vidx; /* vpid index */
+			int nsdelta;
+
+			task = ctx->tasks_arr[tidx];
+			task_pidns = task_nsproxy(task)->pid_ns;
+			nsdelta = task_pidns->level - root_pidns->level;
+			if (hidx + nsdelta >= CKPT_HDR_PIDS_CHUNK)
+				break;
+
+			for (vidx = 0; vidx < nsdelta; vidx++) {
+				h[vidx].pid = task_pid_nr_ns(task, task_pidns);
+				task_pidns = task_pidns->parent;
+			}
+
+			hidx += nsdelta;
+			tidx++;
+		}
+		rcu_read_unlock();
+
+		ret = ckpt_kwrite(ctx, h, hidx * sizeof(*h));
+		if (ret < 0)
+			break;
+
+		hidx = 0;
+	} while (tidx < nr_tasks);
+
+	_ckpt_hdr_put(ctx, h, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
+	return ret;
+}
+
 static int collect_objects(struct ckpt_ctx *ctx)
 {
 	int n, ret = 0;
@@ -466,6 +538,7 @@ static int build_tree(struct ckpt_ctx *ctx)
 static int checkpoint_tree(struct ckpt_ctx *ctx)
 {
 	struct ckpt_hdr_tree *h;
+	struct ckpt_hdr_vpids *hvpids;
 	int ret;
 
 	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TREE);
@@ -480,7 +553,23 @@ static int checkpoint_tree(struct ckpt_ctx *ctx)
 		return ret;
 
 	ret = checkpoint_pids(ctx);
-	return ret;
+	if (ret < 0)
+		return ret;
+
+	hvpids = ckpt_hdr_get_type(ctx, sizeof(*hvpids), CKPT_HDR_VPIDS);
+	if (!hvpids)
+		return -ENOMEM;
+
+	hvpids->nr_vpids = ctx->nr_vpids;
+
+	ret = ckpt_write_obj(ctx, &hvpids->h);
+	ckpt_hdr_put(ctx, hvpids);
+	if (ret < 0)
+		return ret;
+	if (ctx->nr_vpids == 0)
+		return 0;
+
+	return checkpoint_vpids(ctx);
 }
 
 static struct task_struct *get_freezer_task(struct task_struct *root_task)
diff --git a/checkpoint/process.c b/checkpoint/process.c
index f917112..5176824 100644
--- a/checkpoint/process.c
+++ b/checkpoint/process.c
@@ -22,7 +22,7 @@
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
 #include <linux/syscalls.h>
-
+#include <linux/pid_namespace.h>
 
 pid_t ckpt_pid_nr(struct ckpt_ctx *ctx, struct pid *pid)
 {
@@ -51,7 +51,7 @@ struct pid *_ckpt_find_pgrp(struct ckpt_ctx *ctx, pid_t pgid)
 		 * Find the owner process of this pgid (it must exist
 		 * if pgrp exists). It must be a thread group leader.
 		 */
-		pgrp = find_vpid(pgid);
+		pgrp = find_pid_ns(pgid, ctx->root_nsproxy->pid_ns);
 		p = pid_task(pgrp, PIDTYPE_PID);
 		if (!p || !thread_group_leader(p))
 			return NULL;
@@ -578,6 +578,13 @@ static int restore_task_ns(struct ckpt_ctx *ctx)
 	}
 
 	if (nsproxy != task_nsproxy(current)) {
+		/*
+		 * This is *kinda* shady to do without any locking.  However
+		 * it is safe because each task is restarted separately in
+		 * serial.  If that ever changes, we'll need a spinlock?
+		 */
+		if (!nsproxy->pid_ns)
+			nsproxy->pid_ns = get_pid_ns(current->nsproxy->pid_ns);
 		get_nsproxy(nsproxy);
 		switch_task_namespaces(current, nsproxy);
 	}
@@ -829,8 +836,8 @@ static int restore_task_pgid(struct ckpt_ctx *ctx)
 
 	pgid = ctx->pids_arr[ctx->active_pid].vpgid;
 
-	if (pgid == task_pgrp_vnr(task))  /* nothing to do */
-		return 0;
+	if (pgid == task_pgrp_nr_ns(task, ctx->root_nsproxy->pid_ns))
+		return 0;  /* nothing to do */
 
 	if (task->signal->leader)  /* (2) */
 		return -EINVAL;
@@ -850,6 +857,9 @@ static int restore_task_pgid(struct ckpt_ctx *ctx)
 	if (ctx->uflags & RESTART_TASKSELF)
 		ret = 0;
 
+	if (ret < 0)
+		ckpt_err(ctx, ret, "setting pgid\n");
+
 	return ret;
 }
 
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index 6a9644d..dc79da0 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -760,6 +760,39 @@ static int restore_read_tree(struct ckpt_ctx *ctx)
 	return ret;
 }
 
+/*
+ * read all the vpids - we don't actually care about them,
+ * userspace did
+ */
+static int restore_slurp_vpids(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_vpids *h;
+	int size, ret;
+	void *junk;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_VPIDS);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+	ctx->nr_vpids = h->nr_vpids;
+	ckpt_hdr_put(ctx, h);
+
+	if (!ctx->nr_vpids)
+		return 0;
+
+	size = sizeof(struct ckpt_vpid) * ctx->nr_vpids;
+	if (size < 0)		/* overflow ? */
+		return -EINVAL;
+
+	junk = kmalloc(size, GFP_KERNEL);
+	if (!junk)
+		return -ENOMEM;
+
+	ret = _ckpt_read_buffer(ctx, junk, size);
+	kfree(junk);
+
+	return ret;
+}
+
 static inline int all_tasks_activated(struct ckpt_ctx *ctx)
 {
 	return (ctx->active_pid == ctx->nr_pids);
@@ -870,7 +903,7 @@ static int restore_activate_next(struct ckpt_ctx *ctx)
 
 static int wait_task_active(struct ckpt_ctx *ctx)
 {
-	pid_t pid = task_pid_vnr(current);
+	pid_t pid = task_pid_nr_ns(current, ctx->root_nsproxy->pid_ns);
 	int ret;
 
 	ckpt_debug("pid %d waiting\n", pid);
@@ -886,7 +919,8 @@ static int wait_task_active(struct ckpt_ctx *ctx)
 
 static int wait_task_sync(struct ckpt_ctx *ctx)
 {
-	ckpt_debug("pid %d syncing\n", task_pid_vnr(current));
+	ckpt_debug("pid %d syncing\n",
+		task_pid_nr_ns(current, ctx->root_nsproxy->pid_ns));
 	wait_event_interruptible(ctx->waitq, ckpt_test_complete(ctx));
 	ckpt_debug("task sync done (errno %d)\n", ctx->errno);
 	if (ckpt_test_error(ctx))
@@ -1160,7 +1194,7 @@ static struct task_struct *choose_root_task(struct ckpt_ctx *ctx, pid_t pid)
 
 	read_lock(&tasklist_lock);
 	list_for_each_entry(task, &current->children, sibling) {
-		if (task_pid_vnr(task) == pid) {
+		if (task_pid_nr_ns(task, ctx->coord_pidns) == pid) {
 			get_task_struct(task);
 			ctx->root_task = task;
 			ctx->root_pid = pid;
@@ -1237,6 +1271,11 @@ static int do_restore_coord(struct ckpt_ctx *ctx, pid_t pid)
 	if (ret < 0)
 		return ret;
 
+	ret = restore_slurp_vpids(ctx);
+	ckpt_debug("read vpids %d\n", ret);
+	if (ret < 0)
+		return ret;
+
 	if ((ctx->uflags & RESTART_TASKSELF) && ctx->nr_pids != 1)
 		return -EINVAL;
 
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 9e9df9b..45d3e7a 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -22,6 +22,7 @@
 #include <linux/capability.h>
 #include <linux/checkpoint.h>
 #include <linux/deferqueue.h>
+#include <linux/pid_namespace.h>
 
 /*
  * ckpt_unpriv_allowed - sysctl controlled.
@@ -276,6 +277,7 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
 	ctx->uflags = uflags;
 	ctx->kflags = kflags;
 	ctx->ktime_begin = ktime_get();
+	ctx->coord_pidns = get_pid_ns(current->nsproxy->pid_ns);
 
 	atomic_set(&ctx->refcount, 0);
 	INIT_LIST_HEAD(&ctx->pgarr_list);
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 792b523..e860bf5 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -10,7 +10,7 @@
  *  distribution for more details.
  */
 
-#define CHECKPOINT_VERSION  5
+#define CHECKPOINT_VERSION  6
 
 /* checkpoint user flags */
 #define CHECKPOINT_SUBTREE	0x1
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 41412d1..1e2476a 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -117,6 +117,8 @@ enum {
 #define CKPT_HDR_GROUPINFO CKPT_HDR_GROUPINFO
 	CKPT_HDR_TASK_CREDS,
 #define CKPT_HDR_TASK_CREDS CKPT_HDR_TASK_CREDS
+	CKPT_HDR_VPIDS,
+#define CKPT_HDR_VPIDS CKPT_HDR_VPIDS
 
 	/* 201-299: reserved for arch-dependent */
 
@@ -327,11 +329,25 @@ struct ckpt_hdr_tree {
 } __attribute__((aligned(8)));
 
 struct ckpt_pids {
+	/* These pids are in the root_nsproxy's pid ns */
 	__s32 vpid;
 	__s32 vppid;
 	__s32 vtgid;
 	__s32 vpgid;
 	__s32 vsid;
+	__s32 rpid;  /* real pid - in checkpointer's pidns */
+	__s32 depth; /* pid depth */
+} __attribute__((aligned(8)));
+
+/* number of vpids */
+struct ckpt_hdr_vpids {
+	struct ckpt_hdr h;
+	__s32 nr_vpids;
+} __attribute__((aligned(8)));
+
+struct ckpt_vpid {
+	__s32 pid;
+	__s32 pad;
 } __attribute__((aligned(8)));
 
 /* pids */
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index ecd3e91..2fb79cf 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -72,6 +72,9 @@ struct ckpt_ctx {
 	struct task_struct **tasks_arr; /* array of all tasks [checkpoint] */
 	int nr_tasks;                   /* size of tasks array */
 
+	int nr_vpids;
+	struct pid_namespace *coord_pidns;	/* coordinator pid_ns */
+
 	/* [multi-process restart] */
 	struct ckpt_pids *pids_arr;	/* array of all pids [restart] */
 	int nr_pids;			/* size of pids array */
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 0da0d83..6d86240 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -364,8 +364,13 @@ static struct nsproxy *do_restore_ns(struct ckpt_ctx *ctx)
 		get_net(net_ns);
 		nsproxy->net_ns = net_ns;
 
-		get_pid_ns(current->nsproxy->pid_ns);
-		nsproxy->pid_ns = current->nsproxy->pid_ns;
+		/*
+		 * The pid_ns will get assigned the first time that we
+		 * assign the nsproxy to a task.  The task had unshared
+		 * its pid_ns in userspace before calling restart, and
+		 * we want to keep using that pid_ns.
+		 */
+		nsproxy->pid_ns = NULL;
 	}
  out:
 	if (ret < 0)
-- 
1.6.1

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH] user-ns: Nested pidns support (v2)
  2010-03-19 21:39 [PATCH linux-cr] nested pid namespaces (v2) Serge E. Hallyn
@ 2010-03-19 21:41 ` Serge E. Hallyn
       [not found] ` <20100319213955.GA17912-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2010-03-22 23:17 ` Oren Laadan
  2 siblings, 0 replies; 10+ messages in thread
From: Serge E. Hallyn @ 2010-03-19 21:41 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Linux Containers, lkml

[ this patch is against the userspace checkpoint/restart tools at
http://www.linux-cr.org/git/?p=user-cr.git;a=summary ]

Support restart of nested pid namespaces.  Parse the ckpt_vpid
array to decide the vpids to specify for each task's eclone().

Signed-off-by: Serge Hallyn <serue@us.ibm.com>
---
 include/linux/checkpoint.h     |    2 +-
 include/linux/checkpoint_hdr.h |   16 ++++
 restart.c                      |  158 ++++++++++++++++++++++++++++++++++++----
 3 files changed, 160 insertions(+), 16 deletions(-)

diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 53b8b2c..8d021b9 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -14,7 +14,7 @@
  *  distribution for more details.
  */
 
-#define CHECKPOINT_VERSION 5
+#define CHECKPOINT_VERSION 6
 
 /* checkpoint user flags */
 #define CHECKPOINT_SUBTREE 0x1
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index e8eaf23..caf16a6 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -111,6 +111,8 @@ enum {
 #define CKPT_HDR_GROUPINFO CKPT_HDR_GROUPINFO
 	CKPT_HDR_TASK_CREDS,
 #define CKPT_HDR_TASK_CREDS CKPT_HDR_TASK_CREDS
+	CKPT_HDR_VPIDS,
+#define CKPT_HDR_VPIDS CKPT_HDR_VPIDS
 
 	/* 201-299: reserved for arch-dependent */
 
@@ -321,11 +323,25 @@ struct ckpt_hdr_tree {
 } __attribute__((aligned(8)));
 
 struct ckpt_pids {
+	/* these pids are in root_nsproxy's pid ns */
 	__s32 vpid;
 	__s32 vppid;
 	__s32 vtgid;
 	__s32 vpgid;
 	__s32 vsid;
+	__s32 rsid; /* real pid - in checkpointer's pid_ns */
+	__s32 depth; /* pidns depth */
+} __attribute__((aligned(8)));
+
+/* number of vpids */
+struct ckpt_hdr_vpids {
+	struct ckpt_hdr h;
+	__s32 nr_vpids;
+} __attribute__((aligned(8)));
+
+struct ckpt_vpid {
+	__s32 pid;
+	__s32 padding;
 } __attribute__((aligned(8)));
 
 /* pids */
diff --git a/restart.c b/restart.c
index 0c74bb6..32f36f8 100644
--- a/restart.c
+++ b/restart.c
@@ -244,6 +244,9 @@ struct task {
 
 	struct task *phantom;	/* pointer to place-holdler task (if any) */
 
+	int piddepth;
+	struct ckpt_vpid *vpids;
+
 	pid_t pid;		/* process IDs, our bread-&-butter */
 	pid_t ppid;
 	pid_t tgid;
@@ -272,6 +275,7 @@ struct ckpt_ctx {
 	int pipe_in;
 	int pipe_out;
 	int pids_nr;
+	int vpids_nr;
 
 	int pipe_child[2];	/* for children to report status */
 	int pipe_feed[2];	/* for feeder to provide input */
@@ -279,6 +283,7 @@ struct ckpt_ctx {
 
 	struct ckpt_pids *pids_arr;
 	struct ckpt_pids *copy_arr;
+	struct ckpt_vpid *vpids_arr;
 
 	struct task *tasks_arr;
 	int tasks_nr;
@@ -291,6 +296,7 @@ struct ckpt_ctx {
 	char header_arch[BUFSIZE];
 	char container[BUFSIZE];
 	char tree[BUFSIZE];
+	char vpids[BUFSIZE];
 	char buf[BUFSIZE];
 	struct app_restart_args *args;
 
@@ -316,6 +322,7 @@ static int ckpt_remount_devpts(struct ckpt_ctx *ctx);
 
 static int ckpt_build_tree(struct ckpt_ctx *ctx);
 static int ckpt_init_tree(struct ckpt_ctx *ctx);
+static int assign_vpids(struct ckpt_ctx *ctx);
 static int ckpt_set_creator(struct ckpt_ctx *ctx, struct task *task);
 static int ckpt_placeholder_task(struct ckpt_ctx *ctx, struct task *task);
 static int ckpt_propagate_session(struct ckpt_ctx *ctx, struct task *session);
@@ -339,6 +346,7 @@ static int ckpt_write_header(struct ckpt_ctx *ctx);
 static int ckpt_write_header_arch(struct ckpt_ctx *ctx);
 static int ckpt_write_container(struct ckpt_ctx *ctx);
 static int ckpt_write_tree(struct ckpt_ctx *ctx);
+static int ckpt_write_vpids(struct ckpt_ctx *ctx);
 
 static int _ckpt_read(int fd, void *buf, int count);
 static int ckpt_read(int fd, void *buf, int count);
@@ -350,6 +358,7 @@ static int ckpt_read_header(struct ckpt_ctx *ctx);
 static int ckpt_read_header_arch(struct ckpt_ctx *ctx);
 static int ckpt_read_container(struct ckpt_ctx *ctx);
 static int ckpt_read_tree(struct ckpt_ctx *ctx);
+static int ckpt_read_vpids(struct ckpt_ctx *ctx);
 
 static int hash_init(struct ckpt_ctx *ctx);
 static void hash_exit(struct ckpt_ctx *ctx);
@@ -883,6 +892,12 @@ int app_restart(struct app_restart_args *args)
 		exit(1);
 	}
 
+	ret = ckpt_read_vpids(&ctx);
+	if (ret < 0) {
+		ckpt_perror("read c/r tree");
+		exit(1);
+	}
+
 	/* build creator-child-relationship tree */
 	if (hash_init(&ctx) < 0)
 		exit(1);
@@ -891,6 +906,10 @@ int app_restart(struct app_restart_args *args)
 	if (ret < 0)
 		exit(1);
 
+	ret = assign_vpids(&ctx);
+	if (ret < 0)
+		exit(1);
+
 	ret = ckpt_fork_feeder(&ctx);
 	if (ret < 0)
 		exit(1);
@@ -1218,13 +1237,13 @@ static int ckpt_coordinator_pidns(struct ckpt_ctx *ctx)
 
 	return ret;
 }
-#else
+#else /* CLONE_NEWPID */
 static int ckpt_coordinator_pidns(struct ckpt_ctx *ctx)
 {
 	ckpt_err("logical error: ckpt_coordinator_pidns unexpected\n");
 	exit(1);
 }
-#endif
+#endif /* CLONE_NEWPID */
 
 static int ckpt_coordinator(struct ckpt_ctx *ctx)
 {
@@ -2050,8 +2069,8 @@ static pid_t ckpt_fork_child(struct ckpt_ctx *ctx, struct task *child)
 	struct clone_args clone_args;
 	genstack stk;
 	unsigned long flags = SIGCHLD;
-	size_t nr_pids = 1;
 	pid_t pid = 0;
+	pid_t *pids = &pid;
 
 	ckpt_dbg("forking child vpid %d flags %#x\n", child->pid, child->flags);
 
@@ -2067,29 +2086,58 @@ static pid_t ckpt_fork_child(struct ckpt_ctx *ctx, struct task *child)
 		flags |= CLONE_PARENT;
 	}
 
+	memset(&clone_args, 0, sizeof(clone_args));
+	clone_args.nr_pids = 1;
 	/* select pid if --pids, otherwise it's 0 */
-	if (ctx->args->pids)
-		pid = child->pid;
+	if (ctx->args->pids) {
+		int i, depth = child->piddepth + 1;
 
-#ifdef CLONE_NEWPID
-	/* but for new pidns, don't specify a pid */
- 	if (child->flags & TASK_NEWPID) {
-		flags |= CLONE_NEWPID;
-		pid = 0;
+		clone_args.nr_pids = depth;
+		pids = malloc(sizeof(pid_t) * depth);
+		if (!pids) {
+			perror("ckpt_fork_child pids malloc");
+			return -1;
+		}
+
+		pids[0] = child->pid;
+		for (i = 1; i <= child->piddepth; i++)
+			pids[i] = child->vpids[i-1].pid;
+
+#ifndef CLONE_NEWPID
+		if (child->piddepth > child->creator->piddepth) {
+			ckpt_err("nested pidns but CLONE_NEWPID undefined");
+			errno = -EINVAL;
+			return -1;
+		} else if (child->flags & TASK_NEWPID) {
+			ckpt_err("TASK_NEWPID set but CLONE_NEWPID undefined");
+			errno = -EINVAL;
+			return -1;
+		}
+#else /* CLONE_NEWPID */
+		if (child->piddepth > child->creator->piddepth) {
+			child->flags |= TASK_NEWPID;
+			flags |= CLONE_NEWPID;
+		} else if (child->flags & TASK_NEWPID) {
+			/* The TASK_NEWPID could have been set for root task */
+			pids[0] = 0;
+			flags |= CLONE_NEWPID;
+		}
+		if (flags & CLONE_NEWPID)
+			clone_args.nr_pids--;
+#endif /* CLONE_NEWPID */
 	}
-#endif
 
 	if (child->flags & (TASK_SIBLING | TASK_THREAD))
 		child->real_parent = getppid();
 	else
 		child->real_parent = _getpid();
 
-	memset(&clone_args, 0, sizeof(clone_args));
 	clone_args.child_stack = (unsigned long)genstack_base(stk);
 	clone_args.child_stack_size = genstack_size(stk);
-	clone_args.nr_pids = nr_pids;
 
-	pid = eclone(ckpt_fork_stub, child, flags, &clone_args, &pid);
+	pid = eclone(ckpt_fork_stub, child, flags, &clone_args, pids);
+	if (pids != &pid)
+		free(pids);
 	if (pid < 0) {
 		ckpt_perror("eclone");
 		genstack_release(stk);
@@ -2269,6 +2317,9 @@ static int ckpt_do_feeder(void *data)
 	if (ckpt_write_tree(ctx) < 0)
 		ckpt_abort(ctx, "write c/r tree");
 
+	if (ckpt_write_vpids(ctx) < 0)
+		ckpt_abort(ctx, "write vpids");
+
 	/* read rest -> write rest */
 	if (ctx->args->inspect)
 		ckpt_read_write_inspect(ctx);
@@ -2461,6 +2512,8 @@ static int ckpt_read_obj(struct ckpt_ctx *ctx,
 		errno = EINVAL;
 		return -1;
 	}
+	if (h->len == sizeof(*h))
+	return 0;
 	return ckpt_read(STDIN_FILENO, buf, h->len - sizeof(*h));
 }
 
@@ -2609,8 +2662,64 @@ static int ckpt_read_tree(struct ckpt_ctx *ctx)
 	}
 
 	ret = ckpt_read_obj_ptr(ctx, ctx->pids_arr, len, CKPT_HDR_BUFFER);
-	if (ret < 0)
+	if (ret < 0) {
 		free(ctx->pids_arr);
+		return ret;
+	}
+
+	return ret;
+}
+
+/* set the vpids pointers in all the tasks */
+static int assign_vpids(struct ckpt_ctx *ctx)
+{
+	int d, hidx, tidx;
+
+	for (hidx = 0, tidx = 0; tidx < ctx->pids_nr; tidx++) {
+		d = ctx->tasks_arr[tidx].piddepth = ctx->pids_arr[tidx].depth;
+		if (!d) {
+			ctx->tasks_arr[tidx].vpids = NULL;
+			continue;
+		}
+		ctx->tasks_arr[tidx].vpids = &ctx->vpids_arr[hidx];
+		hidx += ctx->pids_arr[tidx].depth;
+		if (hidx > ctx->vpids_nr)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int ckpt_read_vpids(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_vpids *h;
+	int len, ret;
+
+	h = (struct ckpt_hdr_vpids *) ctx->vpids;
+	ret = ckpt_read_obj_type(ctx, h, sizeof(*h), CKPT_HDR_VPIDS);
+	if (ret < 0)
+		return ret;
+
+	ckpt_dbg("number of vpids: %d\n", h->nr_vpids);
+
+	if (h->nr_vpids < 0) {
+		ckpt_err("invalid number of vpids %d", h->nr_vpids);
+		errno = EINVAL;
+		return -1;
+	}
+	ctx->vpids_nr = h->nr_vpids;
+	if (!ctx->vpids_nr)
+		return 0;
+
+	len = sizeof(struct ckpt_vpid) * ctx->vpids_nr;
+
+	ctx->vpids_arr = malloc(len);
+	if (!ctx->pids_arr)
+		return -1;
+
+	ret = ckpt_read_obj_ptr(ctx, ctx->vpids_arr, len, CKPT_HDR_BUFFER);
+	if (ret < 0)
+		free(ctx->vpids_arr);
 
 	return ret;
 }
@@ -2685,6 +2794,25 @@ static int ckpt_write_tree(struct ckpt_ctx *ctx)
 	return 0;
 }
 
+static int ckpt_write_vpids(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_vpids *h;
+	int len;
+
+	h = (struct ckpt_hdr_vpids *) ctx->vpids;
+	if (ckpt_write_obj(ctx, (struct ckpt_hdr *) h) < 0)
+		ckpt_abort(ctx, "write vpids hdr");
+
+	if (!ctx->vpids_nr)
+		return 0;
+	len = sizeof(struct ckpt_vpid) * ctx->vpids_nr;
+	if (ckpt_write_obj_ptr(ctx, ctx->vpids_arr, len, CKPT_HDR_BUFFER) < 0)
+		ckpt_abort(ctx, "write vpids");
+	ckpt_dbg("wrote %d bytes for %d vpids\n", len, ctx->vpids_nr);
+
+	return 0;
+}
+
 /*
  * a simple hash implementation
  */
-- 
1.7.0

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH linux-cr] nested pid namespaces (v2)
  2010-03-19 21:39 [PATCH linux-cr] nested pid namespaces (v2) Serge E. Hallyn
@ 2010-03-22 10:55     ` Louis Rilling
       [not found] ` <20100319213955.GA17912-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2010-03-22 23:17 ` Oren Laadan
  2 siblings, 0 replies; 10+ messages in thread
From: Louis Rilling @ 2010-03-22 10:55 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Linux Containers, lkml


[-- Attachment #1.1: Type: text/plain, Size: 8794 bytes --]

On Fri, Mar 19, 2010 at 04:39:55PM -0500, Serge E. Hallyn wrote:
> Support checkpoint and restart of tasks in nested pid namespaces.  At
> Oren's request here is an alternative to my previous implementation.  In
> this one, we keep the original single pids_array to minimize memory
> allocations.  The pids array entries are augmented with a pidns depth
> (relative to the container init's pidns, and an "rpid" which is the pid
> in the checkpointer's pidns (or 0 if no valid pid exists).  The rpid
> will be used by userspace to gather more information (like
> /proc/$$/mountinfo) after the kernel sys_checkpoint.  If any tasks are
> in nested pid namespace, another single array holds all of the vpids.
> At restart those are used by userspace to determine how to call
> eclone().  Kernel ignores them.
> 
> All cr_tests including the new pid_ns testcase pass.
> 

IMHO this approach looks ok too. I just feel that checkpoint_vpids() could
be re-worked a bit in order to not impose an artificial limit of
CKPT_HDR_PIDS_CHUNK to the depth of pid namespaces, even if it is 256 (see
suggested changes below).

It would probably be safer too to use task_active_pid_ns() instead of
task->nsproxy->pid_ns, just in case some PID namespace unsharing like proposed
by Eric makes it to mainline.

Thanks,

Louis

> Signed-off-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
> ---
>  checkpoint/checkpoint.c          |  113 ++++++++++++++++++++++++++++++++++----
>  checkpoint/process.c             |   18 +++++-
>  checkpoint/restart.c             |   45 ++++++++++++++-
>  checkpoint/sys.c                 |    2 +
>  include/linux/checkpoint.h       |    2 +-
>  include/linux/checkpoint_hdr.h   |   16 +++++
>  include/linux/checkpoint_types.h |    3 +
>  kernel/nsproxy.c                 |    9 ++-
>  8 files changed, 186 insertions(+), 22 deletions(-)
> 
> diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
> index f27af41..fe3546a 100644
> --- a/checkpoint/checkpoint.c
> +++ b/checkpoint/checkpoint.c
> @@ -27,6 +27,7 @@
>  #include <linux/deferqueue.h>
>  #include <linux/checkpoint.h>
>  #include <linux/checkpoint_hdr.h>
> +#include <linux/pid_namespace.h>
>  
>  /* unique checkpoint identifier (FIXME: should be per-container ?) */
>  static atomic_t ctx_count = ATOMIC_INIT(0);
> @@ -242,6 +243,7 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
>  	struct task_struct *root = ctx->root_task;
>  	struct nsproxy *nsproxy;
>  	int ret = 0;
> +	struct pid_namespace *pidns;
>  
>  	ckpt_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns));
>  
> @@ -293,10 +295,15 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
>  		_ckpt_err(ctx, -EPERM, "%(T)Nested net_ns unsupported\n");
>  		ret = -EPERM;
>  	}
> -	/* no support for >1 private pidns */
> -	if (nsproxy->pid_ns != ctx->root_nsproxy->pid_ns) {
> -		_ckpt_err(ctx, -EPERM, "%(T)Nested pid_ns unsupported\n");
> -		ret = -EPERM;
> +	/* pidns must be descendent of root_nsproxy */
> +	pidns = nsproxy->pid_ns;
> +	while (pidns != ctx->root_nsproxy->pid_ns) {
> +		if (pidns == &init_pid_ns) {
> +			ret = -EPERM;
> +			_ckpt_err(ctx, ret, "%(T)stranger pid_ns\n");
> +			break;
> +		}
> +		pidns = pidns->parent;
>  	}
>  	rcu_read_unlock();
>  
> @@ -305,15 +312,19 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
>  
>  #define CKPT_HDR_PIDS_CHUNK	256
>  
> +/*
> + * Write the pids in ctx->root_nsproxy->pidns.  This info is
> + * needed at restart to unambiguously dereference tasks.
> + */
>  static int checkpoint_pids(struct ckpt_ctx *ctx)
>  {
>  	struct ckpt_pids *h;
> -	struct pid_namespace *ns;
> +	struct pid_namespace *root_pidns;
>  	struct task_struct *task;
>  	struct task_struct **tasks_arr;
>  	int nr_tasks, n, pos = 0, ret = 0;
>  
> -	ns = ctx->root_nsproxy->pid_ns;
> +	root_pidns = ctx->root_nsproxy->pid_ns;
>  	tasks_arr = ctx->tasks_arr;
>  	nr_tasks = ctx->nr_tasks;
>  	BUG_ON(nr_tasks <= 0);
> @@ -331,15 +342,21 @@ static int checkpoint_pids(struct ckpt_ctx *ctx)
>  	do {
>  		rcu_read_lock();
>  		for (n = 0; n < min(nr_tasks, CKPT_HDR_PIDS_CHUNK); n++) {
> +			struct pid_namespace *task_pidns;
>  			task = tasks_arr[pos];
>  
> -			h[n].vpid = task_pid_nr_ns(task, ns);
> -			h[n].vtgid = task_tgid_nr_ns(task, ns);
> -			h[n].vpgid = task_pgrp_nr_ns(task, ns);
> -			h[n].vsid = task_session_nr_ns(task, ns);
> -			h[n].vppid = task_tgid_nr_ns(task->real_parent, ns);
> +			h[n].vpid = task_pid_nr_ns(task, root_pidns);
> +			h[n].vtgid = task_tgid_nr_ns(task, root_pidns);
> +			h[n].vpgid = task_pgrp_nr_ns(task, root_pidns);
> +			h[n].vsid = task_session_nr_ns(task, root_pidns);
> +			h[n].vppid = task_tgid_nr_ns(task->real_parent,
> +					root_pidns);
> +			task_pidns = task_nsproxy(task)->pid_ns;
> +			h[n].rpid = task_pid_vnr(task);
> +			h[n].depth = task_pidns->level - root_pidns->level;
>  			ckpt_debug("task[%d]: vpid %d vtgid %d parent %d\n",
>  				   pos, h[n].vpid, h[n].vtgid, h[n].vppid);
> +			ctx->nr_vpids += h[n].depth;
>  			pos++;
>  		}
>  		rcu_read_unlock();
> @@ -356,6 +373,61 @@ static int checkpoint_pids(struct ckpt_ctx *ctx)
>  	return ret;
>  }
>  
> +static int checkpoint_vpids(struct ckpt_ctx *ctx)
> +{
> +	struct ckpt_vpid *h;
> +	struct pid_namespace *root_pidns, *task_pidns;
> +	struct task_struct *task;
> +	int ret, nr_tasks = ctx->nr_tasks;
> +	int tidx = 0, /* index into task array */
> +		hidx = 0; /* pids written into current ckpt_vpids chunk */
> +
> +	root_pidns = ctx->root_nsproxy->pid_ns;
> +	nr_tasks = ctx->nr_tasks;
> +
> +	ret = ckpt_write_obj_type(ctx, NULL,
> +				  sizeof(*h) * ctx->nr_vpids,
> +				  CKPT_HDR_BUFFER);
> +	if (ret < 0)
> +		return ret;
> +
> +	h = ckpt_hdr_get(ctx, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
> +	if (!h)
> +		return -ENOMEM;
> +
> +	do {
> +		rcu_read_lock();
> +		while (tidx < nr_tasks) {
> +			int vidx; /* vpid index */
> +			int nsdelta;
> +
> +			task = ctx->tasks_arr[tidx];
> +			task_pidns = task_nsproxy(task)->pid_ns;
> +			nsdelta = task_pidns->level - root_pidns->level;
> +			if (hidx + nsdelta >= CKPT_HDR_PIDS_CHUNK)

I think that (hidx + nsdelta > CKPT_HDR_PIDS_CHUNK) checks more accurately the limit.

> +				break;
> +
> +			for (vidx = 0; vidx < nsdelta; vidx++) {
> +				h[vidx].pid = task_pid_nr_ns(task, task_pidns);

Here:
				h[hidx + vidx]

> +				task_pidns = task_pidns->parent;
> +			}
> +
> +			hidx += nsdelta;
> +			tidx++;
> +		}
> +		rcu_read_unlock();
> +
> +		ret = ckpt_kwrite(ctx, h, hidx * sizeof(*h));
> +		if (ret < 0)
> +			break;
> +
> +		hidx = 0;
> +	} while (tidx < nr_tasks);
> +
> +	_ckpt_hdr_put(ctx, h, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
> +	return ret;
> +}

Maybe re-work it this way:

static int checkpoint_vpids(struct ckpt_ctx *ctx)
{
	struct ckpt_vpid *h;
	struct pid_namespace *root_pidns, *task_pidns, *active_pidns;
	struct task_struct *task;
	int ret, nr_tasks = ctx->nr_tasks;
	int tidx = 0, /* index into task array */
		hidx = 0; /* pids written into current ckpt_vpids chunk */
		vidx = 0; /* vpid index for current task */

	root_pidns = ctx->root_nsproxy->pid_ns;
	nr_tasks = ctx->nr_tasks;

	ret = ckpt_write_obj_type(ctx, NULL,
				  sizeof(*h) * ctx->nr_vpids,
				  CKPT_HDR_BUFFER);
	if (ret < 0)
		return ret;

	h = ckpt_hdr_get(ctx, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
	if (!h)
		return -ENOMEM;

	do {
		rcu_read_lock();
		while (tidx < nr_tasks && hidx < CKPT_HDR_PIDS_CHUNK) {
			int nsdelta;

			task = ctx->tasks_arr[tidx];
			active_pidns = task_active_pid_ns(task);
			nsdelta = active_pidns->level - root_pidns->level;
			if (hidx + nsdelta - vidx > CKPT_HDR_PIDS_CHUNK)
				/*
				 * We will release rcu before recording the
				 * remaining vpids, but neither task nor its
				 * pid can disappear.
				 */
				nsdelta = CKPT_HDR_PIDS_CHUNK - hidx + vidx;

			if (vidx == 0)
				task_pidns = active_pidns;
			for (; vidx < nsdelta; vidx++) {
				h[hidx].pid = task_pid_nr_ns(task, task_pidns);
				hidx++;
				task_pidns = task_pidns->parent;
			}

			if (task_pidns == root_pidns) {
				tidx++;
				vidx = 0;
			}
		}
		rcu_read_unlock();

		ret = ckpt_kwrite(ctx, h, hidx * sizeof(*h));
		if (ret < 0)
			break;

		hidx = 0;
	} while (tidx < nr_tasks);

	_ckpt_hdr_put(ctx, h, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
	return ret;
}

[...]

-- 
Dr Louis Rilling			Kerlabs
Skype: louis.rilling			Batiment Germanium
Phone: (+33|0) 6 80 89 08 23		80 avenue des Buttes de Coesmes
http://www.kerlabs.com/			35700 Rennes

[-- Attachment #1.2: Digital signature --]
[-- Type: application/pgp-signature, Size: 197 bytes --]

[-- Attachment #2: Type: text/plain, Size: 206 bytes --]

_______________________________________________
Containers mailing list
Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
https://lists.linux-foundation.org/mailman/listinfo/containers

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH linux-cr] nested pid namespaces (v2)
@ 2010-03-22 10:55     ` Louis Rilling
  0 siblings, 0 replies; 10+ messages in thread
From: Louis Rilling @ 2010-03-22 10:55 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Oren Laadan, Linux Containers, lkml

[-- Attachment #1: Type: text/plain, Size: 8765 bytes --]

On Fri, Mar 19, 2010 at 04:39:55PM -0500, Serge E. Hallyn wrote:
> Support checkpoint and restart of tasks in nested pid namespaces.  At
> Oren's request here is an alternative to my previous implementation.  In
> this one, we keep the original single pids_array to minimize memory
> allocations.  The pids array entries are augmented with a pidns depth
> (relative to the container init's pidns, and an "rpid" which is the pid
> in the checkpointer's pidns (or 0 if no valid pid exists).  The rpid
> will be used by userspace to gather more information (like
> /proc/$$/mountinfo) after the kernel sys_checkpoint.  If any tasks are
> in nested pid namespace, another single array holds all of the vpids.
> At restart those are used by userspace to determine how to call
> eclone().  Kernel ignores them.
> 
> All cr_tests including the new pid_ns testcase pass.
> 

IMHO this approach looks ok too. I just feel that checkpoint_vpids() could
be re-worked a bit in order to not impose an artificial limit of
CKPT_HDR_PIDS_CHUNK to the depth of pid namespaces, even if it is 256 (see
suggested changes below).

It would probably be safer too to use task_active_pid_ns() instead of
task->nsproxy->pid_ns, just in case some PID namespace unsharing like proposed
by Eric makes it to mainline.

Thanks,

Louis

> Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
> ---
>  checkpoint/checkpoint.c          |  113 ++++++++++++++++++++++++++++++++++----
>  checkpoint/process.c             |   18 +++++-
>  checkpoint/restart.c             |   45 ++++++++++++++-
>  checkpoint/sys.c                 |    2 +
>  include/linux/checkpoint.h       |    2 +-
>  include/linux/checkpoint_hdr.h   |   16 +++++
>  include/linux/checkpoint_types.h |    3 +
>  kernel/nsproxy.c                 |    9 ++-
>  8 files changed, 186 insertions(+), 22 deletions(-)
> 
> diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
> index f27af41..fe3546a 100644
> --- a/checkpoint/checkpoint.c
> +++ b/checkpoint/checkpoint.c
> @@ -27,6 +27,7 @@
>  #include <linux/deferqueue.h>
>  #include <linux/checkpoint.h>
>  #include <linux/checkpoint_hdr.h>
> +#include <linux/pid_namespace.h>
>  
>  /* unique checkpoint identifier (FIXME: should be per-container ?) */
>  static atomic_t ctx_count = ATOMIC_INIT(0);
> @@ -242,6 +243,7 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
>  	struct task_struct *root = ctx->root_task;
>  	struct nsproxy *nsproxy;
>  	int ret = 0;
> +	struct pid_namespace *pidns;
>  
>  	ckpt_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns));
>  
> @@ -293,10 +295,15 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
>  		_ckpt_err(ctx, -EPERM, "%(T)Nested net_ns unsupported\n");
>  		ret = -EPERM;
>  	}
> -	/* no support for >1 private pidns */
> -	if (nsproxy->pid_ns != ctx->root_nsproxy->pid_ns) {
> -		_ckpt_err(ctx, -EPERM, "%(T)Nested pid_ns unsupported\n");
> -		ret = -EPERM;
> +	/* pidns must be descendent of root_nsproxy */
> +	pidns = nsproxy->pid_ns;
> +	while (pidns != ctx->root_nsproxy->pid_ns) {
> +		if (pidns == &init_pid_ns) {
> +			ret = -EPERM;
> +			_ckpt_err(ctx, ret, "%(T)stranger pid_ns\n");
> +			break;
> +		}
> +		pidns = pidns->parent;
>  	}
>  	rcu_read_unlock();
>  
> @@ -305,15 +312,19 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
>  
>  #define CKPT_HDR_PIDS_CHUNK	256
>  
> +/*
> + * Write the pids in ctx->root_nsproxy->pidns.  This info is
> + * needed at restart to unambiguously dereference tasks.
> + */
>  static int checkpoint_pids(struct ckpt_ctx *ctx)
>  {
>  	struct ckpt_pids *h;
> -	struct pid_namespace *ns;
> +	struct pid_namespace *root_pidns;
>  	struct task_struct *task;
>  	struct task_struct **tasks_arr;
>  	int nr_tasks, n, pos = 0, ret = 0;
>  
> -	ns = ctx->root_nsproxy->pid_ns;
> +	root_pidns = ctx->root_nsproxy->pid_ns;
>  	tasks_arr = ctx->tasks_arr;
>  	nr_tasks = ctx->nr_tasks;
>  	BUG_ON(nr_tasks <= 0);
> @@ -331,15 +342,21 @@ static int checkpoint_pids(struct ckpt_ctx *ctx)
>  	do {
>  		rcu_read_lock();
>  		for (n = 0; n < min(nr_tasks, CKPT_HDR_PIDS_CHUNK); n++) {
> +			struct pid_namespace *task_pidns;
>  			task = tasks_arr[pos];
>  
> -			h[n].vpid = task_pid_nr_ns(task, ns);
> -			h[n].vtgid = task_tgid_nr_ns(task, ns);
> -			h[n].vpgid = task_pgrp_nr_ns(task, ns);
> -			h[n].vsid = task_session_nr_ns(task, ns);
> -			h[n].vppid = task_tgid_nr_ns(task->real_parent, ns);
> +			h[n].vpid = task_pid_nr_ns(task, root_pidns);
> +			h[n].vtgid = task_tgid_nr_ns(task, root_pidns);
> +			h[n].vpgid = task_pgrp_nr_ns(task, root_pidns);
> +			h[n].vsid = task_session_nr_ns(task, root_pidns);
> +			h[n].vppid = task_tgid_nr_ns(task->real_parent,
> +					root_pidns);
> +			task_pidns = task_nsproxy(task)->pid_ns;
> +			h[n].rpid = task_pid_vnr(task);
> +			h[n].depth = task_pidns->level - root_pidns->level;
>  			ckpt_debug("task[%d]: vpid %d vtgid %d parent %d\n",
>  				   pos, h[n].vpid, h[n].vtgid, h[n].vppid);
> +			ctx->nr_vpids += h[n].depth;
>  			pos++;
>  		}
>  		rcu_read_unlock();
> @@ -356,6 +373,61 @@ static int checkpoint_pids(struct ckpt_ctx *ctx)
>  	return ret;
>  }
>  
> +static int checkpoint_vpids(struct ckpt_ctx *ctx)
> +{
> +	struct ckpt_vpid *h;
> +	struct pid_namespace *root_pidns, *task_pidns;
> +	struct task_struct *task;
> +	int ret, nr_tasks = ctx->nr_tasks;
> +	int tidx = 0, /* index into task array */
> +		hidx = 0; /* pids written into current ckpt_vpids chunk */
> +
> +	root_pidns = ctx->root_nsproxy->pid_ns;
> +	nr_tasks = ctx->nr_tasks;
> +
> +	ret = ckpt_write_obj_type(ctx, NULL,
> +				  sizeof(*h) * ctx->nr_vpids,
> +				  CKPT_HDR_BUFFER);
> +	if (ret < 0)
> +		return ret;
> +
> +	h = ckpt_hdr_get(ctx, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
> +	if (!h)
> +		return -ENOMEM;
> +
> +	do {
> +		rcu_read_lock();
> +		while (tidx < nr_tasks) {
> +			int vidx; /* vpid index */
> +			int nsdelta;
> +
> +			task = ctx->tasks_arr[tidx];
> +			task_pidns = task_nsproxy(task)->pid_ns;
> +			nsdelta = task_pidns->level - root_pidns->level;
> +			if (hidx + nsdelta >= CKPT_HDR_PIDS_CHUNK)

I think that (hidx + nsdelta > CKPT_HDR_PIDS_CHUNK) checks more accurately the limit.

> +				break;
> +
> +			for (vidx = 0; vidx < nsdelta; vidx++) {
> +				h[vidx].pid = task_pid_nr_ns(task, task_pidns);

Here:
				h[hidx + vidx]

> +				task_pidns = task_pidns->parent;
> +			}
> +
> +			hidx += nsdelta;
> +			tidx++;
> +		}
> +		rcu_read_unlock();
> +
> +		ret = ckpt_kwrite(ctx, h, hidx * sizeof(*h));
> +		if (ret < 0)
> +			break;
> +
> +		hidx = 0;
> +	} while (tidx < nr_tasks);
> +
> +	_ckpt_hdr_put(ctx, h, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
> +	return ret;
> +}

Maybe re-work it this way:

static int checkpoint_vpids(struct ckpt_ctx *ctx)
{
	struct ckpt_vpid *h;
	struct pid_namespace *root_pidns, *task_pidns, *active_pidns;
	struct task_struct *task;
	int ret, nr_tasks = ctx->nr_tasks;
	int tidx = 0, /* index into task array */
		hidx = 0; /* pids written into current ckpt_vpids chunk */
		vidx = 0; /* vpid index for current task */

	root_pidns = ctx->root_nsproxy->pid_ns;
	nr_tasks = ctx->nr_tasks;

	ret = ckpt_write_obj_type(ctx, NULL,
				  sizeof(*h) * ctx->nr_vpids,
				  CKPT_HDR_BUFFER);
	if (ret < 0)
		return ret;

	h = ckpt_hdr_get(ctx, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
	if (!h)
		return -ENOMEM;

	do {
		rcu_read_lock();
		while (tidx < nr_tasks && hidx < CKPT_HDR_PIDS_CHUNK) {
			int nsdelta;

			task = ctx->tasks_arr[tidx];
			active_pidns = task_active_pid_ns(task);
			nsdelta = active_pidns->level - root_pidns->level;
			if (hidx + nsdelta - vidx > CKPT_HDR_PIDS_CHUNK)
				/*
				 * We will release rcu before recording the
				 * remaining vpids, but neither task nor its
				 * pid can disappear.
				 */
				nsdelta = CKPT_HDR_PIDS_CHUNK - hidx + vidx;

			if (vidx == 0)
				task_pidns = active_pidns;
			for (; vidx < nsdelta; vidx++) {
				h[hidx].pid = task_pid_nr_ns(task, task_pidns);
				hidx++;
				task_pidns = task_pidns->parent;
			}

			if (task_pidns == root_pidns) {
				tidx++;
				vidx = 0;
			}
		}
		rcu_read_unlock();

		ret = ckpt_kwrite(ctx, h, hidx * sizeof(*h));
		if (ret < 0)
			break;

		hidx = 0;
	} while (tidx < nr_tasks);

	_ckpt_hdr_put(ctx, h, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
	return ret;
}

[...]

-- 
Dr Louis Rilling			Kerlabs
Skype: louis.rilling			Batiment Germanium
Phone: (+33|0) 6 80 89 08 23		80 avenue des Buttes de Coesmes
http://www.kerlabs.com/			35700 Rennes

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 197 bytes --]

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH linux-cr] nested pid namespaces (v2)
  2010-03-22 10:55     ` Louis Rilling
  (?)
@ 2010-03-22 14:38     ` Serge E. Hallyn
  2010-03-22 19:57       ` Matt Helsley
  -1 siblings, 1 reply; 10+ messages in thread
From: Serge E. Hallyn @ 2010-03-22 14:38 UTC (permalink / raw)
  To: Louis Rilling; +Cc: Oren Laadan, Linux Containers, lkml

Quoting Louis Rilling (Louis.Rilling@kerlabs.com):
> On Fri, Mar 19, 2010 at 04:39:55PM -0500, Serge E. Hallyn wrote:
> > Support checkpoint and restart of tasks in nested pid namespaces.  At
> > Oren's request here is an alternative to my previous implementation.  In
> > this one, we keep the original single pids_array to minimize memory
> > allocations.  The pids array entries are augmented with a pidns depth
> > (relative to the container init's pidns, and an "rpid" which is the pid
> > in the checkpointer's pidns (or 0 if no valid pid exists).  The rpid
> > will be used by userspace to gather more information (like
> > /proc/$$/mountinfo) after the kernel sys_checkpoint.  If any tasks are
> > in nested pid namespace, another single array holds all of the vpids.
> > At restart those are used by userspace to determine how to call
> > eclone().  Kernel ignores them.
> > 
> > All cr_tests including the new pid_ns testcase pass.
> > 
> 
> IMHO this approach looks ok too. I just feel that checkpoint_vpids() could
> be re-worked a bit in order to not impose an artificial limit of
> CKPT_HDR_PIDS_CHUNK to the depth of pid namespaces, even if it is 256 (see
> suggested changes below).

Ah, took me a second to see what you were saying.  Good point.

> It would probably be safer too to use task_active_pid_ns() instead of
> task->nsproxy->pid_ns, just in case some PID namespace unsharing like proposed
> by Eric makes it to mainline.

The task is frozen though so it shouldn't be able to unshare while being
checkpointed, right?  But it's probably better code anyway.

> Thanks,
> 
> Louis
> 
> > Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
> > ---
> >  checkpoint/checkpoint.c          |  113 ++++++++++++++++++++++++++++++++++----
> >  checkpoint/process.c             |   18 +++++-
> >  checkpoint/restart.c             |   45 ++++++++++++++-
> >  checkpoint/sys.c                 |    2 +
> >  include/linux/checkpoint.h       |    2 +-
> >  include/linux/checkpoint_hdr.h   |   16 +++++
> >  include/linux/checkpoint_types.h |    3 +
> >  kernel/nsproxy.c                 |    9 ++-
> >  8 files changed, 186 insertions(+), 22 deletions(-)
> > 
> > diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
> > index f27af41..fe3546a 100644
> > --- a/checkpoint/checkpoint.c
> > +++ b/checkpoint/checkpoint.c
> > @@ -27,6 +27,7 @@
> >  #include <linux/deferqueue.h>
> >  #include <linux/checkpoint.h>
> >  #include <linux/checkpoint_hdr.h>
> > +#include <linux/pid_namespace.h>
> >  
> >  /* unique checkpoint identifier (FIXME: should be per-container ?) */
> >  static atomic_t ctx_count = ATOMIC_INIT(0);
> > @@ -242,6 +243,7 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
> >  	struct task_struct *root = ctx->root_task;
> >  	struct nsproxy *nsproxy;
> >  	int ret = 0;
> > +	struct pid_namespace *pidns;
> >  
> >  	ckpt_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns));
> >  
> > @@ -293,10 +295,15 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
> >  		_ckpt_err(ctx, -EPERM, "%(T)Nested net_ns unsupported\n");
> >  		ret = -EPERM;
> >  	}
> > -	/* no support for >1 private pidns */
> > -	if (nsproxy->pid_ns != ctx->root_nsproxy->pid_ns) {
> > -		_ckpt_err(ctx, -EPERM, "%(T)Nested pid_ns unsupported\n");
> > -		ret = -EPERM;
> > +	/* pidns must be descendent of root_nsproxy */
> > +	pidns = nsproxy->pid_ns;
> > +	while (pidns != ctx->root_nsproxy->pid_ns) {
> > +		if (pidns == &init_pid_ns) {
> > +			ret = -EPERM;
> > +			_ckpt_err(ctx, ret, "%(T)stranger pid_ns\n");
> > +			break;
> > +		}
> > +		pidns = pidns->parent;
> >  	}
> >  	rcu_read_unlock();
> >  
> > @@ -305,15 +312,19 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
> >  
> >  #define CKPT_HDR_PIDS_CHUNK	256
> >  
> > +/*
> > + * Write the pids in ctx->root_nsproxy->pidns.  This info is
> > + * needed at restart to unambiguously dereference tasks.
> > + */
> >  static int checkpoint_pids(struct ckpt_ctx *ctx)
> >  {
> >  	struct ckpt_pids *h;
> > -	struct pid_namespace *ns;
> > +	struct pid_namespace *root_pidns;
> >  	struct task_struct *task;
> >  	struct task_struct **tasks_arr;
> >  	int nr_tasks, n, pos = 0, ret = 0;
> >  
> > -	ns = ctx->root_nsproxy->pid_ns;
> > +	root_pidns = ctx->root_nsproxy->pid_ns;
> >  	tasks_arr = ctx->tasks_arr;
> >  	nr_tasks = ctx->nr_tasks;
> >  	BUG_ON(nr_tasks <= 0);
> > @@ -331,15 +342,21 @@ static int checkpoint_pids(struct ckpt_ctx *ctx)
> >  	do {
> >  		rcu_read_lock();
> >  		for (n = 0; n < min(nr_tasks, CKPT_HDR_PIDS_CHUNK); n++) {
> > +			struct pid_namespace *task_pidns;
> >  			task = tasks_arr[pos];
> >  
> > -			h[n].vpid = task_pid_nr_ns(task, ns);
> > -			h[n].vtgid = task_tgid_nr_ns(task, ns);
> > -			h[n].vpgid = task_pgrp_nr_ns(task, ns);
> > -			h[n].vsid = task_session_nr_ns(task, ns);
> > -			h[n].vppid = task_tgid_nr_ns(task->real_parent, ns);
> > +			h[n].vpid = task_pid_nr_ns(task, root_pidns);
> > +			h[n].vtgid = task_tgid_nr_ns(task, root_pidns);
> > +			h[n].vpgid = task_pgrp_nr_ns(task, root_pidns);
> > +			h[n].vsid = task_session_nr_ns(task, root_pidns);
> > +			h[n].vppid = task_tgid_nr_ns(task->real_parent,
> > +					root_pidns);
> > +			task_pidns = task_nsproxy(task)->pid_ns;
> > +			h[n].rpid = task_pid_vnr(task);
> > +			h[n].depth = task_pidns->level - root_pidns->level;
> >  			ckpt_debug("task[%d]: vpid %d vtgid %d parent %d\n",
> >  				   pos, h[n].vpid, h[n].vtgid, h[n].vppid);
> > +			ctx->nr_vpids += h[n].depth;
> >  			pos++;
> >  		}
> >  		rcu_read_unlock();
> > @@ -356,6 +373,61 @@ static int checkpoint_pids(struct ckpt_ctx *ctx)
> >  	return ret;
> >  }
> >  
> > +static int checkpoint_vpids(struct ckpt_ctx *ctx)
> > +{
> > +	struct ckpt_vpid *h;
> > +	struct pid_namespace *root_pidns, *task_pidns;
> > +	struct task_struct *task;
> > +	int ret, nr_tasks = ctx->nr_tasks;
> > +	int tidx = 0, /* index into task array */
> > +		hidx = 0; /* pids written into current ckpt_vpids chunk */
> > +
> > +	root_pidns = ctx->root_nsproxy->pid_ns;
> > +	nr_tasks = ctx->nr_tasks;
> > +
> > +	ret = ckpt_write_obj_type(ctx, NULL,
> > +				  sizeof(*h) * ctx->nr_vpids,
> > +				  CKPT_HDR_BUFFER);
> > +	if (ret < 0)
> > +		return ret;
> > +
> > +	h = ckpt_hdr_get(ctx, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
> > +	if (!h)
> > +		return -ENOMEM;
> > +
> > +	do {
> > +		rcu_read_lock();
> > +		while (tidx < nr_tasks) {
> > +			int vidx; /* vpid index */
> > +			int nsdelta;
> > +
> > +			task = ctx->tasks_arr[tidx];
> > +			task_pidns = task_nsproxy(task)->pid_ns;
> > +			nsdelta = task_pidns->level - root_pidns->level;
> > +			if (hidx + nsdelta >= CKPT_HDR_PIDS_CHUNK)
> 
> I think that (hidx + nsdelta > CKPT_HDR_PIDS_CHUNK) checks more accurately the limit.
> 
> > +				break;
> > +
> > +			for (vidx = 0; vidx < nsdelta; vidx++) {
> > +				h[vidx].pid = task_pid_nr_ns(task, task_pidns);
> 
> Here:
> 				h[hidx + vidx]
> 
> > +				task_pidns = task_pidns->parent;
> > +			}
> > +
> > +			hidx += nsdelta;
> > +			tidx++;
> > +		}
> > +		rcu_read_unlock();
> > +
> > +		ret = ckpt_kwrite(ctx, h, hidx * sizeof(*h));
> > +		if (ret < 0)
> > +			break;
> > +
> > +		hidx = 0;
> > +	} while (tidx < nr_tasks);
> > +
> > +	_ckpt_hdr_put(ctx, h, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
> > +	return ret;
> > +}
> 
> Maybe re-work it this way:
> 
> static int checkpoint_vpids(struct ckpt_ctx *ctx)
> {
> 	struct ckpt_vpid *h;
> 	struct pid_namespace *root_pidns, *task_pidns, *active_pidns;
> 	struct task_struct *task;
> 	int ret, nr_tasks = ctx->nr_tasks;
> 	int tidx = 0, /* index into task array */
> 		hidx = 0; /* pids written into current ckpt_vpids chunk */
> 		vidx = 0; /* vpid index for current task */
> 
> 	root_pidns = ctx->root_nsproxy->pid_ns;
> 	nr_tasks = ctx->nr_tasks;
> 
> 	ret = ckpt_write_obj_type(ctx, NULL,
> 				  sizeof(*h) * ctx->nr_vpids,
> 				  CKPT_HDR_BUFFER);
> 	if (ret < 0)
> 		return ret;
> 
> 	h = ckpt_hdr_get(ctx, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
> 	if (!h)
> 		return -ENOMEM;
> 
> 	do {
> 		rcu_read_lock();
> 		while (tidx < nr_tasks && hidx < CKPT_HDR_PIDS_CHUNK) {
> 			int nsdelta;
> 
> 			task = ctx->tasks_arr[tidx];
> 			active_pidns = task_active_pid_ns(task);
> 			nsdelta = active_pidns->level - root_pidns->level;
> 			if (hidx + nsdelta - vidx > CKPT_HDR_PIDS_CHUNK)
> 				/*
> 				 * We will release rcu before recording the
> 				 * remaining vpids, but neither task nor its
> 				 * pid can disappear.
> 				 */
> 				nsdelta = CKPT_HDR_PIDS_CHUNK - hidx + vidx;

This looks good, thanks.

> 			if (vidx == 0)
> 				task_pidns = active_pidns;
> 			for (; vidx < nsdelta; vidx++) {
> 				h[hidx].pid = task_pid_nr_ns(task, task_pidns);
> 				hidx++;
> 				task_pidns = task_pidns->parent;
> 			}
> 
> 			if (task_pidns == root_pidns) {
> 				tidx++;
> 				vidx = 0;
> 			}
> 		}
> 		rcu_read_unlock();
> 
> 		ret = ckpt_kwrite(ctx, h, hidx * sizeof(*h));
> 		if (ret < 0)
> 			break;
> 
> 		hidx = 0;
> 	} while (tidx < nr_tasks);
> 
> 	_ckpt_hdr_put(ctx, h, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
> 	return ret;
> }
> 
> [...]
> 
> -- 
> Dr Louis Rilling			Kerlabs
> Skype: louis.rilling			Batiment Germanium
> Phone: (+33|0) 6 80 89 08 23		80 avenue des Buttes de Coesmes
> http://www.kerlabs.com/			35700 Rennes

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH linux-cr] nested pid namespaces (v2)
  2010-03-22 14:38     ` Serge E. Hallyn
@ 2010-03-22 19:57       ` Matt Helsley
  2010-03-23  6:41         ` Louis Rilling
  0 siblings, 1 reply; 10+ messages in thread
From: Matt Helsley @ 2010-03-22 19:57 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Louis Rilling, Linux Containers, lkml

On Mon, Mar 22, 2010 at 09:38:00AM -0500, Serge E. Hallyn wrote:
> Quoting Louis Rilling (Louis.Rilling@kerlabs.com):
> > On Fri, Mar 19, 2010 at 04:39:55PM -0500, Serge E. Hallyn wrote:
> > > Support checkpoint and restart of tasks in nested pid namespaces.  At

<snip>

> > It would probably be safer too to use task_active_pid_ns() instead of
> > task->nsproxy->pid_ns, just in case some PID namespace unsharing like proposed
> > by Eric makes it to mainline.
> 
> The task is frozen though so it shouldn't be able to unshare while being
> checkpointed, right?  But it's probably better code anyway.

By the time it reaches checkpoint a frozen task is in the refrigerator
-- most often in the signal delivery portion of syscall return. So it can't
be making any new unshare/setns syscalls and any changes to the namespaces
should be visible.

Cheers,
	-Matt Helsley

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH linux-cr] nested pid namespaces (v2)
  2010-03-19 21:39 [PATCH linux-cr] nested pid namespaces (v2) Serge E. Hallyn
  2010-03-19 21:41 ` [PATCH] user-ns: Nested pidns support (v2) Serge E. Hallyn
       [not found] ` <20100319213955.GA17912-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2010-03-22 23:17 ` Oren Laadan
  2010-03-23  4:26   ` Serge E. Hallyn
  2 siblings, 1 reply; 10+ messages in thread
From: Oren Laadan @ 2010-03-22 23:17 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Linux Containers, lkml



Serge E. Hallyn wrote:
> Support checkpoint and restart of tasks in nested pid namespaces.  At
> Oren's request here is an alternative to my previous implementation.  In
> this one, we keep the original single pids_array to minimize memory
> allocations.  The pids array entries are augmented with a pidns depth

Thanks for adapting the patch.

FWIW, not only minimize memory allocations, but also permit a more
regular structure of the image data (array of fixed size elements
followed by an array of vpids), which simplifies the code that needs
to read/write/access this data.

> (relative to the container init's pidns, and an "rpid" which is the pid
> in the checkpointer's pidns (or 0 if no valid pid exists).  The rpid
> will be used by userspace to gather more information (like
> /proc/$$/mountinfo) after the kernel sys_checkpoint.  If any tasks are
> in nested pid namespace, another single array holds all of the vpids.
> At restart those are used by userspace to determine how to call
> eclone().  Kernel ignores them.
> 
> All cr_tests including the new pid_ns testcase pass.
> 
> Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
> ---

[...]

> @@ -293,10 +295,15 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
>  		_ckpt_err(ctx, -EPERM, "%(T)Nested net_ns unsupported\n");
>  		ret = -EPERM;
>  	}
> -	/* no support for >1 private pidns */
> -	if (nsproxy->pid_ns != ctx->root_nsproxy->pid_ns) {
> -		_ckpt_err(ctx, -EPERM, "%(T)Nested pid_ns unsupported\n");
> -		ret = -EPERM;
> +	/* pidns must be descendent of root_nsproxy */
> +	pidns = nsproxy->pid_ns;
> +	while (pidns != ctx->root_nsproxy->pid_ns) {
> +		if (pidns == &init_pid_ns) {
> +			ret = -EPERM;
> +			_ckpt_err(ctx, ret, "%(T)stranger pid_ns\n");
> +			break;
> +		}
> +		pidns = pidns->parent;

Currently we do this while() loop twice - once here and once when
we collect the vpids. While I doubt if this has any performance
impact, is there an advantage to doing it also here ?  (a violation
will be observed there too).

[...]

>  
>  	if (nsproxy != task_nsproxy(current)) {
> +		/*
> +		 * This is *kinda* shady to do without any locking.  However
> +		 * it is safe because each task is restarted separately in
> +		 * serial.  If that ever changes, we'll need a spinlock?
> +		 */

Maybe add the lock/rcu already, so it is never forgotten later ?

> +		if (!nsproxy->pid_ns)
> +			nsproxy->pid_ns = get_pid_ns(current->nsproxy->pid_ns);
>  		get_nsproxy(nsproxy);
>  		switch_task_namespaces(current, nsproxy);
>  	}

[...]

> +/*
> + * read all the vpids - we don't actually care about them,
> + * userspace did
> + */

How about ckpt_read_consume() for this ?

> +static int restore_slurp_vpids(struct ckpt_ctx *ctx)
> +{
> +	struct ckpt_hdr_vpids *h;
> +	int size, ret;
> +	void *junk;
> +
> +	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_VPIDS);
> +	if (IS_ERR(h))
> +		return PTR_ERR(h);
> +	ctx->nr_vpids = h->nr_vpids;
> +	ckpt_hdr_put(ctx, h);
> +
> +	if (!ctx->nr_vpids)
> +		return 0;
> +
> +	size = sizeof(struct ckpt_vpid) * ctx->nr_vpids;
> +	if (size < 0)		/* overflow ? */
> +		return -EINVAL;
> +
> +	junk = kmalloc(size, GFP_KERNEL);
> +	if (!junk)
> +		return -ENOMEM;
> +
> +	ret = _ckpt_read_buffer(ctx, junk, size);
> +	kfree(junk);
> +
> +	return ret;
> +}
> +

[...]

> @@ -1237,6 +1271,11 @@ static int do_restore_coord(struct ckpt_ctx *ctx, pid_t pid)
>  	if (ret < 0)
>  		return ret;
>  
> +	ret = restore_slurp_vpids(ctx);

instead:
	ret = ckpt_read_consume(ctx, ..., ...);

[...]

>  struct ckpt_pids {
> +	/* These pids are in the root_nsproxy's pid ns */
>  	__s32 vpid;
>  	__s32 vppid;
>  	__s32 vtgid;
>  	__s32 vpgid;
>  	__s32 vsid;
> +	__s32 rpid;  /* real pid - in checkpointer's pidns */

This comes from an unrelated patch/purpose, right - maybe mention
in the patch description ?

> +	__s32 depth; /* pid depth */
> +} __attribute__((aligned(8)));
> +
> +/* number of vpids */
> +struct ckpt_hdr_vpids {
> +	struct ckpt_hdr h;
> +	__s32 nr_vpids;
> +} __attribute__((aligned(8)));
> +
> +struct ckpt_vpid {
> +	__s32 pid;
> +	__s32 pad;

@pad is redundant as last element.

>  } __attribute__((aligned(8)));
>  
>  /* pids */
> diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
> index ecd3e91..2fb79cf 100644
> --- a/include/linux/checkpoint_types.h
> +++ b/include/linux/checkpoint_types.h
> @@ -72,6 +72,9 @@ struct ckpt_ctx {
>  	struct task_struct **tasks_arr; /* array of all tasks [checkpoint] */
>  	int nr_tasks;                   /* size of tasks array */
>  
> +	int nr_vpids;
> +	struct pid_namespace *coord_pidns;	/* coordinator pid_ns */
> +
>  	/* [multi-process restart] */
>  	struct ckpt_pids *pids_arr;	/* array of all pids [restart] */
>  	int nr_pids;			/* size of pids array */
> diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> index 0da0d83..6d86240 100644
> --- a/kernel/nsproxy.c
> +++ b/kernel/nsproxy.c
> @@ -364,8 +364,13 @@ static struct nsproxy *do_restore_ns(struct ckpt_ctx *ctx)
>  		get_net(net_ns);
>  		nsproxy->net_ns = net_ns;
>  
> -		get_pid_ns(current->nsproxy->pid_ns);
> -		nsproxy->pid_ns = current->nsproxy->pid_ns;
> +		/*
> +		 * The pid_ns will get assigned the first time that we
> +		 * assign the nsproxy to a task.  The task had unshared
> +		 * its pid_ns in userspace before calling restart, and
> +		 * we want to keep using that pid_ns.
> +		 */
> +		nsproxy->pid_ns = NULL;

This doesn't look healthy.

If it is (or will be) possible for another process to look at the
restarting process, not having a pid-ns may confuse other code in
the kernel ?

>  	}
>   out:
>  	if (ret < 0)

Oren.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH linux-cr] nested pid namespaces (v2)
  2010-03-22 23:17 ` Oren Laadan
@ 2010-03-23  4:26   ` Serge E. Hallyn
  2010-03-23  5:39     ` Oren Laadan
  0 siblings, 1 reply; 10+ messages in thread
From: Serge E. Hallyn @ 2010-03-23  4:26 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Linux Containers, lkml

Quoting Oren Laadan (orenl@cs.columbia.edu):
> 
> 
> Serge E. Hallyn wrote:
> >Support checkpoint and restart of tasks in nested pid namespaces.  At
> >Oren's request here is an alternative to my previous implementation.  In
> >this one, we keep the original single pids_array to minimize memory
> >allocations.  The pids array entries are augmented with a pidns depth
> 
> Thanks for adapting the patch.
> 
> FWIW, not only minimize memory allocations, but also permit a more
> regular structure of the image data (array of fixed size elements
> followed by an array of vpids), which simplifies the code that needs
> to read/write/access this data.
> 
> >(relative to the container init's pidns, and an "rpid" which is the pid
> >in the checkpointer's pidns (or 0 if no valid pid exists).  The rpid
> >will be used by userspace to gather more information (like
> >/proc/$$/mountinfo) after the kernel sys_checkpoint.  If any tasks are
> >in nested pid namespace, another single array holds all of the vpids.
> >At restart those are used by userspace to determine how to call
> >eclone().  Kernel ignores them.
> >
> >All cr_tests including the new pid_ns testcase pass.
> >
> >Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
> >---
> 
> [...]

Thanks, Oren - all other input is taken into what I'm about to post,
except:

> >@@ -293,10 +295,15 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
> > 		_ckpt_err(ctx, -EPERM, "%(T)Nested net_ns unsupported\n");
> > 		ret = -EPERM;
> > 	}
> >-	/* no support for >1 private pidns */
> >-	if (nsproxy->pid_ns != ctx->root_nsproxy->pid_ns) {
> >-		_ckpt_err(ctx, -EPERM, "%(T)Nested pid_ns unsupported\n");
> >-		ret = -EPERM;
> >+	/* pidns must be descendent of root_nsproxy */
> >+	pidns = nsproxy->pid_ns;
> >+	while (pidns != ctx->root_nsproxy->pid_ns) {
> >+		if (pidns == &init_pid_ns) {
> >+			ret = -EPERM;
> >+			_ckpt_err(ctx, ret, "%(T)stranger pid_ns\n");
> >+			break;
> >+		}
> >+		pidns = pidns->parent;
> 
> Currently we do this while() loop twice - once here and once when
> we collect the vpids. While I doubt if this has any performance
> impact, is there an advantage to doing it also here ?  (a violation
> will be observed there too).

With the new logic (ripped verbatim from Louis' email) such a move
would make the checkpoint_vpids() code a bit uglier.  I'm about to
resend, please let me know if you still want the code moved.

...

> >diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> >index 0da0d83..6d86240 100644
> >--- a/kernel/nsproxy.c
> >+++ b/kernel/nsproxy.c
> >@@ -364,8 +364,13 @@ static struct nsproxy *do_restore_ns(struct ckpt_ctx *ctx)
> > 		get_net(net_ns);
> > 		nsproxy->net_ns = net_ns;
> >-		get_pid_ns(current->nsproxy->pid_ns);
> >-		nsproxy->pid_ns = current->nsproxy->pid_ns;
> >+		/*
> >+		 * The pid_ns will get assigned the first time that we
> >+		 * assign the nsproxy to a task.  The task had unshared
> >+		 * its pid_ns in userspace before calling restart, and
> >+		 * we want to keep using that pid_ns.
> >+		 */
> >+		nsproxy->pid_ns = NULL;
> 
> This doesn't look healthy.
> 
> If it is (or will be) possible for another process to look at the
> restarting process, not having a pid-ns may confuse other code in
> the kernel ?

No task will have this nproxy attached before we assign a valid
pid_ns.  The NULL pid_ns is only while it is in the objhash but
not attached to a task.

thanks,
-serge

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH linux-cr] nested pid namespaces (v2)
  2010-03-23  4:26   ` Serge E. Hallyn
@ 2010-03-23  5:39     ` Oren Laadan
  0 siblings, 0 replies; 10+ messages in thread
From: Oren Laadan @ 2010-03-23  5:39 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Linux Containers, lkml



Serge E. Hallyn wrote:
> Quoting Oren Laadan (orenl@cs.columbia.edu):
>>
>> Serge E. Hallyn wrote:
>>> Support checkpoint and restart of tasks in nested pid namespaces.  At
>>> Oren's request here is an alternative to my previous implementation.  In
>>> this one, we keep the original single pids_array to minimize memory
>>> allocations.  The pids array entries are augmented with a pidns depth
>> Thanks for adapting the patch.
>>
>> FWIW, not only minimize memory allocations, but also permit a more
>> regular structure of the image data (array of fixed size elements
>> followed by an array of vpids), which simplifies the code that needs
>> to read/write/access this data.
>>
>>> (relative to the container init's pidns, and an "rpid" which is the pid
>>> in the checkpointer's pidns (or 0 if no valid pid exists).  The rpid
>>> will be used by userspace to gather more information (like
>>> /proc/$$/mountinfo) after the kernel sys_checkpoint.  If any tasks are
>>> in nested pid namespace, another single array holds all of the vpids.
>>> At restart those are used by userspace to determine how to call
>>> eclone().  Kernel ignores them.
>>>
>>> All cr_tests including the new pid_ns testcase pass.
>>>
>>> Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
>>> ---
>> [...]
> 
> Thanks, Oren - all other input is taken into what I'm about to post,
> except:
> 
>>> @@ -293,10 +295,15 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
>>> 		_ckpt_err(ctx, -EPERM, "%(T)Nested net_ns unsupported\n");
>>> 		ret = -EPERM;
>>> 	}
>>> -	/* no support for >1 private pidns */
>>> -	if (nsproxy->pid_ns != ctx->root_nsproxy->pid_ns) {
>>> -		_ckpt_err(ctx, -EPERM, "%(T)Nested pid_ns unsupported\n");
>>> -		ret = -EPERM;
>>> +	/* pidns must be descendent of root_nsproxy */
>>> +	pidns = nsproxy->pid_ns;
>>> +	while (pidns != ctx->root_nsproxy->pid_ns) {
>>> +		if (pidns == &init_pid_ns) {
>>> +			ret = -EPERM;
>>> +			_ckpt_err(ctx, ret, "%(T)stranger pid_ns\n");
>>> +			break;
>>> +		}
>>> +		pidns = pidns->parent;
>> Currently we do this while() loop twice - once here and once when
>> we collect the vpids. While I doubt if this has any performance
>> impact, is there an advantage to doing it also here ?  (a violation
>> will be observed there too).
> 
> With the new logic (ripped verbatim from Louis' email) such a move
> would make the checkpoint_vpids() code a bit uglier.  I'm about to
> resend, please let me know if you still want the code moved.
> 

If you think it's simpler this way, then so be it.

> ...
> 
>>> diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
>>> index 0da0d83..6d86240 100644
>>> --- a/kernel/nsproxy.c
>>> +++ b/kernel/nsproxy.c
>>> @@ -364,8 +364,13 @@ static struct nsproxy *do_restore_ns(struct ckpt_ctx *ctx)
>>> 		get_net(net_ns);
>>> 		nsproxy->net_ns = net_ns;
>>> -		get_pid_ns(current->nsproxy->pid_ns);
>>> -		nsproxy->pid_ns = current->nsproxy->pid_ns;
>>> +		/*
>>> +		 * The pid_ns will get assigned the first time that we
>>> +		 * assign the nsproxy to a task.  The task had unshared
>>> +		 * its pid_ns in userspace before calling restart, and
>>> +		 * we want to keep using that pid_ns.
>>> +		 */
>>> +		nsproxy->pid_ns = NULL;
>> This doesn't look healthy.
>>
>> If it is (or will be) possible for another process to look at the
>> restarting process, not having a pid-ns may confuse other code in
>> the kernel ?
> 
> No task will have this nproxy attached before we assign a valid
> pid_ns.  The NULL pid_ns is only while it is in the objhash but
> not attached to a task.

Ahh .. I see, ok then.

Oren.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH linux-cr] nested pid namespaces (v2)
  2010-03-22 19:57       ` Matt Helsley
@ 2010-03-23  6:41         ` Louis Rilling
  0 siblings, 0 replies; 10+ messages in thread
From: Louis Rilling @ 2010-03-23  6:41 UTC (permalink / raw)
  To: Matt Helsley; +Cc: Serge E. Hallyn, Linux Containers, lkml

[-- Attachment #1: Type: text/plain, Size: 1420 bytes --]

On Mon, Mar 22, 2010 at 12:57:35PM -0700, Matt Helsley wrote:
> On Mon, Mar 22, 2010 at 09:38:00AM -0500, Serge E. Hallyn wrote:
> > Quoting Louis Rilling (Louis.Rilling@kerlabs.com):
> > > On Fri, Mar 19, 2010 at 04:39:55PM -0500, Serge E. Hallyn wrote:
> > > > Support checkpoint and restart of tasks in nested pid namespaces.  At
> 
> <snip>
> 
> > > It would probably be safer too to use task_active_pid_ns() instead of
> > > task->nsproxy->pid_ns, just in case some PID namespace unsharing like proposed
> > > by Eric makes it to mainline.
> > 
> > The task is frozen though so it shouldn't be able to unshare while being
> > checkpointed, right?  But it's probably better code anyway.
> 
> By the time it reaches checkpoint a frozen task is in the refrigerator
> -- most often in the signal delivery portion of syscall return. So it can't
> be making any new unshare/setns syscalls and any changes to the namespaces
> should be visible.

And what about a task having already unshared its pid namespace (from within
the container)? In this case, all pid-related stuff should be based on
task_active_pid_ns(), and ->nsproxy->pid_ns should be recorded too to correctly
restore pid unsharing.

Thanks,

Louis

-- 
Dr Louis Rilling			Kerlabs
Skype: louis.rilling			Batiment Germanium
Phone: (+33|0) 6 80 89 08 23		80 avenue des Buttes de Coesmes
http://www.kerlabs.com/			35700 Rennes

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 197 bytes --]

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2010-03-23  6:41 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-03-19 21:39 [PATCH linux-cr] nested pid namespaces (v2) Serge E. Hallyn
2010-03-19 21:41 ` [PATCH] user-ns: Nested pidns support (v2) Serge E. Hallyn
     [not found] ` <20100319213955.GA17912-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2010-03-22 10:55   ` [PATCH linux-cr] nested pid namespaces (v2) Louis Rilling
2010-03-22 10:55     ` Louis Rilling
2010-03-22 14:38     ` Serge E. Hallyn
2010-03-22 19:57       ` Matt Helsley
2010-03-23  6:41         ` Louis Rilling
2010-03-22 23:17 ` Oren Laadan
2010-03-23  4:26   ` Serge E. Hallyn
2010-03-23  5:39     ` Oren Laadan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.