All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/9] user-cr: support for pids as shared objects
@ 2011-01-26 16:18 Oren Laadan
       [not found] ` <1296058748-21418-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  0 siblings, 1 reply; 11+ messages in thread
From: Oren Laadan @ 2011-01-26 16:18 UTC (permalink / raw)
  To: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA

Hi,

This patchset adds the necessary support in user-cr related to
handling of pids as proper shared objects. You must use this if you use
the corresponding kernel-cr patchset recently posted.

Thanks,

Oren.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 1/9] Initialize of args->{outfd, logfd, infd} in main c/r programs
       [not found] ` <1296058748-21418-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2011-01-26 16:19   ` Oren Laadan
       [not found]     ` <1296058748-21418-2-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  2011-01-26 16:19   ` [PATCH 2/9] Introduce ctx->error to improve error reporting Oren Laadan
                     ` (7 subsequent siblings)
  8 siblings, 1 reply; 11+ messages in thread
From: Oren Laadan @ 2011-01-26 16:19 UTC (permalink / raw)
  To: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA

The initialization belongs in the callers of the cr-library - i.e.
restart-main and checkpoint-main - and not in the cr-library itself.

Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 checkpoint-main.c |    4 ++--
 checkpoint.c      |    8 --------
 restart-main.c    |    4 ++--
 3 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/checkpoint-main.c b/checkpoint-main.c
index a2a7d94..b2ec9c8 100644
--- a/checkpoint-main.c
+++ b/checkpoint-main.c
@@ -70,8 +70,8 @@ static void parse_args(struct cr_checkpoint_args *args, int argc, char *argv[])
 	char *logfile;
 
 	/* defaults */
-	args->outfd = -1;
-	args->logfd = -1;
+	args->outfd = fileno(stdout);
+	args->logfd = CHECKPOINT_FD_NONE;
 	args->uerrfd = fileno(stderr);
 	output = NULL;
 	logfile = NULL;
diff --git a/checkpoint.c b/checkpoint.c
index cce3d9d..f9b0b3b 100644
--- a/checkpoint.c
+++ b/checkpoint.c
@@ -37,14 +37,6 @@ int cr_checkpoint(int pid, struct cr_checkpoint_args *args)
 
 	global_uerrfd = args->uerrfd;
 
-	/* output file descriptor (default: stdout) */
-	if (args->outfd < 0)
-		args->outfd = STDOUT_FILENO;
-
-	/* output file descriptor (default: none) */
-	if (args->logfd < 0)
-		args->logfd = CHECKPOINT_FD_NONE;
-
 	if (!args->container)
 		args->flags |= CHECKPOINT_SUBTREE;
 
diff --git a/restart-main.c b/restart-main.c
index 6eed101..efa6a8f 100644
--- a/restart-main.c
+++ b/restart-main.c
@@ -146,10 +146,10 @@ static void parse_args(struct cr_restart_args *args, int argc, char *argv[])
 	/* defaults */
 	memset(args, 0, sizeof(*args));
 	args->wait = 1;
-	args->infd = -1;
-	args->klogfd = -1;
+	args->infd = fileno(stdin);
 	args->ulogfd = fileno(stdout);
 	args->uerrfd = fileno(stderr);
+	args->klogfd = CHECKPOINT_FD_NONE;
 	args->warn = CKPT_COND_WARN;
 	args->fail = CKPT_COND_FAIL;
 	no_pidns = 0;
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 2/9] Introduce ctx->error to improve error reporting
       [not found] ` <1296058748-21418-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  2011-01-26 16:19   ` [PATCH 1/9] Initialize of args->{outfd, logfd, infd} in main c/r programs Oren Laadan
@ 2011-01-26 16:19   ` Oren Laadan
  2011-01-26 16:19   ` [PATCH 3/9] restart: cleanup setup/cleanup of freezer cgroups Oren Laadan
                     ` (6 subsequent siblings)
  8 siblings, 0 replies; 11+ messages in thread
From: Oren Laadan @ 2011-01-26 16:19 UTC (permalink / raw)
  To: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA

If ctx->error isn't already set, then:
- ctx_set_errno() saves errno in ctx->error (and returns -1)
- ctx_ret_errno() sets ctx->error to the given value (and returns -1)

Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 common.h  |    3 ++
 restart.c |   99 ++++++++++++++++++++++++++++++++++++------------------------
 2 files changed, 62 insertions(+), 40 deletions(-)

diff --git a/common.h b/common.h
index b4736bb..66193d3 100644
--- a/common.h
+++ b/common.h
@@ -7,6 +7,7 @@ static inline void ckpt_msg(int fd, char *format, ...)
 {
 	char buf[BUFSIZE];
 	va_list ap;
+	int err;
 
 	if (fd < 0)
 		return;
@@ -15,7 +16,9 @@ static inline void ckpt_msg(int fd, char *format, ...)
 	vsnprintf(buf, BUFSIZE, format, ap);
 	va_end(ap);
 
+	err = errno;
 	write(fd, buf, strlen(buf));
+	errno = err;
 }
 
 static void inline _strerror(int errnum, char *buf, size_t buflen)
diff --git a/restart.c b/restart.c
index 78d21c0..8106fd6 100644
--- a/restart.c
+++ b/restart.c
@@ -124,6 +124,8 @@ struct ckpt_ctx {
 		CTX_RESTART,
 	} whoami;
 
+	int error;
+
 	pid_t root_pid;
 	int pipe_in;
 	int pipe_out;
@@ -257,6 +259,20 @@ static inline int ckpt_cond_fail(struct ckpt_ctx *ctx, long mask)
 	return (ctx->args->fail & mask);
 }
 
+static inline int ctx_set_errno(struct ckpt_ctx *ctx)
+{
+	if (!ctx->error)
+		ctx->error = errno;
+	return -1;
+}
+
+static inline int ctx_ret_errno(struct ckpt_ctx *ctx, int err)
+{
+	if (!ctx->error)
+		ctx->error = err;
+	return -1;
+}
+
 static void report_exit_status(int status, char *str, int debug)
 {
 	char msg[64];
@@ -426,6 +442,7 @@ int process_args(struct cr_restart_args *args)
 	if (args->pidns) {
 		ckpt_err("This version of restart was compiled without "
 		       "support for --pidns.\n");
+		errno = ENOSYS;
 		return -1;
 	}
 #endif
@@ -434,6 +451,7 @@ int process_args(struct cr_restart_args *args)
 	if (global_debug) {
 		ckpt_err("This version of restart was compiled without "
 		       "support for --debug.\n");
+		errno = ENOSYS;
 		return -1;
 	}
 #endif
@@ -446,6 +464,7 @@ int process_args(struct cr_restart_args *args)
 	if (args->pids) {
 		ckpt_err("This version of restart was compiled without "
 		       "support for --pids.\n");
+		errno = ENOSYS;
 		return -1;
 	}
 #endif
@@ -455,6 +474,7 @@ int process_args(struct cr_restart_args *args)
 	    (args->pids || args->pidns || args->show_status ||
 	     args->copy_status || args->freezer)) {
 		ckpt_err("Invalid mix of --self with multiprocess options\n");
+		errno = EINVAL;
 		return -1;
 	}
 
@@ -703,7 +723,7 @@ static int ckpt_collect_child(struct ckpt_ctx *ctx)
 		status = global_child_status;
 	} else if (pid < 0) {
 		ckpt_perror("WEIRD: collect child task");
-		return -1;
+		return ctx_set_errno(ctx);
 	}
 
 	return ckpt_parse_status(status, mimic, verbose);
@@ -716,18 +736,18 @@ static int ckpt_remount_devpts(struct ckpt_ctx *ctx)
 	/* make sure /dev/ptmx is a link else we'll just break */
 	if (lstat("/dev/ptmx", &ptystat) < 0) {
 		ckpt_perror("stat /dev/ptmx");
-		return -1;
+		return ctx_set_errno(ctx);
 	}
 	if ((ptystat.st_mode & S_IFMT) != S_IFLNK) {
-		ckpt_err("Error: /dev/ptmx must be a link to /dev/pts/ptmx\n");
-		return -1;
+		ckpt_err("[err] /dev/ptmx must be a link to /dev/pts/ptmx\n");
+		return ctx_ret_errno(ctx, ENODEV);
 	}
 
 	/* this is unlikely, but maybe we don't want to fail */
 	if (umount2("/dev/pts", MNT_DETACH) < 0) {
 		if (ckpt_cond_fail(ctx, CKPT_COND_MNTPTY)) {
 			ckpt_perror("umount -l /dev/pts");
-			return -1;
+			return ctx_set_errno(ctx);
 		}
 		if (ckpt_cond_warn(ctx, CKPT_COND_MNTPTY))
 			ckpt_err("[warn] failed to un-mount old /dev/pts\n");
@@ -735,7 +755,7 @@ static int ckpt_remount_devpts(struct ckpt_ctx *ctx)
 	if (mount("pts", "/dev/pts", "devpts", 0,
 		  "ptmxmode=666,newinstance") < 0) {
 		ckpt_perror("mount -t devpts -o newinstance");
-		return -1;
+		return ctx_set_errno(ctx);
 	}
 
 	return 0;
@@ -802,6 +822,7 @@ static int ckpt_probe_child(pid_t pid, char *str)
 	ret = waitpid(pid, &status, WNOHANG);
 	if (ret == pid) {
 		report_exit_status(status, str, 0);
+		errno = ECHILD;
 		return -1;
 	} else if (ret < 0 && errno == ECHILD) {
 		ckpt_err("WEIRD: %s exited without trace (%s)\n",
@@ -809,6 +830,8 @@ static int ckpt_probe_child(pid_t pid, char *str)
 		return -1;
 	} else if (ret != 0) {
 		ckpt_err("waitpid for %s (%s)", str, strerror(errno));
+		if (ret > 0)
+			errno = ECHILD;
 		return -1;
 	}
 	return 0;
@@ -824,14 +847,14 @@ static int ckpt_remount_proc(struct ckpt_ctx *ctx)
 	if (umount2("/proc", MNT_DETACH) < 0) {
 		if (ckpt_cond_fail(ctx, CKPT_COND_MNTPROC)) {
 			ckpt_perror("umount -l /proc");
-			return -1;
+			return ctx_set_errno(ctx);
 		}
 		if (ckpt_cond_warn(ctx, CKPT_COND_MNTPROC))
 			ckpt_err("[warn] failed to un-mount old /proc\n");
 	}
 	if (mount("proc", "/proc", "proc", 0, NULL) < 0) {
 		ckpt_perror("mount -t proc");
-		return -1;
+		return ctx_set_errno(ctx);
 	}
 
 	return 0;
@@ -903,13 +926,13 @@ static int ckpt_coordinator_pidns(struct ckpt_ctx *ctx)
 	 */
 	if (pipe(ctx->pipe_coord) < 0) {
 		ckpt_perror("pipe");
-		return -1;
+		return ctx_set_errno(ctx);
 	}
 
 	stk = genstack_alloc(PTHREAD_STACK_MIN);
 	if (!stk) {
 		ckpt_perror("coordinator genstack_alloc");
-		return -1;
+		return ctx_set_errno(ctx);
 	}
 	sp = genstack_sp(stk);
 
@@ -937,7 +960,7 @@ static int ckpt_coordinator_pidns(struct ckpt_ctx *ctx)
 	 * signal handler was plugged; verify that it's still there.
 	 */
 	if (ckpt_probe_child(coord_pid, "coordinator") < 0)
-		return -1;
+		return ctx_set_errno(ctx);
 
 	ctx->args->copy_status = copy;
 
@@ -977,7 +1000,7 @@ static int ckpt_coordinator(struct ckpt_ctx *ctx)
 	 * signal handler was plugged; verify that it's still there.
 	 */
 	if (ckpt_probe_child(root_pid, "root task") < 0)
-		return -1;
+		return ctx_set_errno(ctx);
 
 	if (ctx->args->keep_frozen)
 		flags |= RESTART_FROZEN;
@@ -991,7 +1014,7 @@ static int ckpt_coordinator(struct ckpt_ctx *ctx)
 		ckpt_perror("restart failed");
 		ckpt_verbose("Failed\n");
 		ckpt_dbg("restart failed ?\n");
-		return -1;
+		return ret;
 	}
 
 	ckpt_verbose("Success\n");
@@ -1003,7 +1026,7 @@ static int ckpt_coordinator(struct ckpt_ctx *ctx)
 		/* Report success/failure to the parent */
 		if (write(ctx->pipe_coord[1], &ret, sizeof(ret)) < 0) {
 			ckpt_perror("failed to report status");
-			return -1;
+			return ctx_set_errno(ctx);
 		}
 
 		/*
@@ -1145,12 +1168,14 @@ static int ckpt_valid_pid(struct ckpt_ctx *ctx, pid_t pid, char *which, int i)
 {
 	if (pid < 0) {
 		ckpt_err("Invalid %s %d (for task#%d)\n", which, pid, i);
+		errno = EINVAL;
 		return 0;
 	}
 	if (!ctx->args->pidns && pid == 0) {
 		if (ckpt_cond_fail(ctx, CKPT_COND_PIDZERO)) {
 			ckpt_err("[err] task # %d with %s zero"
 				 " (requires --pidns)\n", i + 1, which);
+			errno = EINVAL;
 			return 0;
 		} else if (ckpt_cond_warn(ctx, CKPT_COND_PIDZERO)) {
 			ckpt_err("[warn] task # %d with %s zero"
@@ -1727,13 +1752,13 @@ int ckpt_fork_stub(void *data)
 
 	/* chroot ? */
 	if ((task->flags & TASK_NEWROOT) && chroot(ctx->args->root) < 0)
-		return -1;
+		return ctx_set_errno(ctx);
 	/* tasks with new pid-ns need new /proc mount */
 	if ((task->flags & TASK_NEWPID) && ckpt_remount_proc(ctx) < 0)
-		return -1;
+		return ctx_set_errno(ctx);
 	/* remount /dev/pts ? */
 	if ((task->flags & TASK_NEWPTS) && ckpt_remount_devpts(ctx) < 0)
-		return -1;
+		return ctx_set_errno(ctx);
 
 	/*
 	 * In restart into a new pid namespace (--pidns), coordinator
@@ -1755,19 +1780,21 @@ int ckpt_fork_stub(void *data)
 	if (!ctx->args->pidns) {
 		if (prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0) < 0) {
 			ckpt_perror("prctl");
-			return -1;
+			return ctx_set_errno(ctx);
 		}
 		if (getppid() != task->real_parent) {
 			ckpt_err("[%d]: parent is MIA (%d != %d)\n",
 				 _getpid(), getppid(), task->real_parent);
-			return -1;
+			if (errno == 0)
+				errno = ECHILD;
+			return ctx_set_errno(ctx);
 		}
 	}
 
 	/* if user requested freeze at end - add ourself to cgroup */
 	if (ctx->args->freezer && freezer_register(ctx, _getpid())) {
 		ckpt_err("[%d]: failed add to freezer cgroup\n", _getpid());
-		return -1;
+		return ctx_set_errno(ctx);
 	}
 
 	/* root has some extra work */
@@ -1820,12 +1847,10 @@ static pid_t ckpt_fork_child(struct ckpt_ctx *ctx, struct task *child)
 #ifndef CLONE_NEWPID
 		if (child->piddepth > child->creator->piddepth) {
 			ckpt_err("nested pidns but CLONE_NEWPID undefined");
-			errno = -EINVAL;
-			return -1;
+			ctx_ret_errno(ctx, ENOSYS);
 		} else if (child->flags & TASK_NEWPID) {
 			ckpt_err("TASK_NEWPID set but CLONE_NEWPID undefined");
-			errno = -EINVAL;
-			return -1;
+			ctx_ret_errno(ctx, ENOSYS);
 		}
 #else /* CLONE_NEWPID */
 		if (child->piddepth > child->creator->piddepth) {
@@ -2234,6 +2259,8 @@ static int _ckpt_read(int fd, void *buf, int count)
 			continue;
 		if (nread == 0 && nleft == count)
 			return 0;
+		if (nread == 0)
+			errno = EIO;
 		if (nread <= 0)
 			return -1;
 		buf += nread;
@@ -2262,10 +2289,8 @@ static int ckpt_read_obj(struct ckpt_ctx *ctx,
 	ret = ckpt_read(fd, h, sizeof(*h));
 	if (ret < 0)
 		return ret;
-	if (h->len < sizeof(*h) || h->len > n) {
-		errno = EINVAL;
-		return -1;
-	}
+	if (h->len < sizeof(*h) || h->len > n)
+		return ctx_ret_errno(ctx, EINVAL);
 	if (h->len == sizeof(*h))
 		return 0;
 	return ckpt_read(fd, buf, h->len - sizeof(*h));
@@ -2279,10 +2304,8 @@ static int ckpt_read_obj_type(struct ckpt_ctx *ctx, void *buf, int n, int type)
 	ret = ckpt_read_obj(ctx, h, (void *) (h + 1), n);
 	if (ret < 0)
 		return ret;
-	if (h->type != type) {
-		errno = EINVAL;
-		return -1;
-	}
+	if (h->type != type)
+		return ctx_ret_errno(ctx, EINVAL);
 	return 0;
 }
 
@@ -2294,10 +2317,8 @@ static int ckpt_read_obj_ptr(struct ckpt_ctx *ctx, void *buf, int n, int type)
 	ret = ckpt_read_obj(ctx, &h, buf, n + sizeof(h));
 	if (ret < 0)
 		return ret;
-	if (h.type != type) {
-		errno = EINVAL;
-		return -1;
-	}
+	if (h.type != type)
+		return ctx_ret_errno(ctx, EINVAL);
 	return 0;
 }
 
@@ -2323,10 +2344,8 @@ static int ckpt_read_header(struct ckpt_ctx *ctx)
 
 	if (h->constants.uts_release_len > BUFSIZE / 4 ||
 	    h->constants.uts_version_len > BUFSIZE / 4 ||
-	    h->constants.uts_machine_len > BUFSIZE / 4) {
-		errno = EINVAL;
-		return -1;
-	}
+	    h->constants.uts_machine_len > BUFSIZE / 4)
+		return ctx_ret_errno(ctx, EINVAL);
 
 	ptr = (char *) h;
 
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 3/9] restart: cleanup setup/cleanup of freezer cgroups
       [not found] ` <1296058748-21418-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  2011-01-26 16:19   ` [PATCH 1/9] Initialize of args->{outfd, logfd, infd} in main c/r programs Oren Laadan
  2011-01-26 16:19   ` [PATCH 2/9] Introduce ctx->error to improve error reporting Oren Laadan
@ 2011-01-26 16:19   ` Oren Laadan
  2011-01-26 16:19   ` [PATCH 4/9] restart: make feeder a proper child instead of a thread Oren Laadan
                     ` (5 subsequent siblings)
  8 siblings, 0 replies; 11+ messages in thread
From: Oren Laadan @ 2011-01-26 16:19 UTC (permalink / raw)
  To: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA

Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 restart.c |   22 ++++++++++------------
 1 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/restart.c b/restart.c
index 8106fd6..195a892 100644
--- a/restart.c
+++ b/restart.c
@@ -359,35 +359,32 @@ static void sigint_handler(int sig)
 
 static int freezer_prepare(struct ckpt_ctx *ctx)
 {
-	char *freezer;
 	int fd, ret;
 
 #define FREEZER_THAWED  "THAWED"
 
-	freezer = malloc(strlen(ctx->args->freezer) + 32);
-	if (!freezer) {
+	ctx->freezer = malloc(strlen(ctx->args->freezer) + 32);
+	if (!ctx->freezer) {
 		ckpt_perror("malloc freezer buf");
-		return -1;
+		return ctx_set_errno(ctx);
 	}
 
-	sprintf(freezer, "%s/freezer.state", ctx->args->freezer);
+	sprintf(ctx->freezer, "%s/freezer.state", ctx->args->freezer);
 
-	fd = open(freezer, O_WRONLY, 0);
+	fd = open(ctx->freezer, O_WRONLY, 0);
 	if (fd < 0) {
 		ckpt_perror("freezer path");
-		free(freezer);
-		return -1;
+		return ctx_set_errno(ctx);
 	}
 	ret = write(fd, FREEZER_THAWED, sizeof(FREEZER_THAWED)); 
 	if (ret != sizeof(FREEZER_THAWED)) {
 		ckpt_perror("thawing freezer");
-		free(freezer);
+		ctx_set_errno(ctx);
 		close(fd);
 		return -1;
 	}
 
-	sprintf(freezer, "%s/tasks", ctx->args->freezer);
-	ctx->freezer = freezer;
+	sprintf(ctx->freezer, "%s/tasks", ctx->args->freezer);
 	close(fd);
 	return 0;
 }
@@ -400,13 +397,14 @@ static int freezer_register(struct ckpt_ctx *ctx, pid_t pid)
 	fd = open(ctx->freezer, O_WRONLY, 0);
 	if (fd < 0) {
 		ckpt_perror("freezer path");
-		return -1;
+		return ctx_set_errno(ctx);
 	}
 
 	n = sprintf(pidstr, "%d", pid);
 	ret = write(fd, pidstr, n);
 	if (ret != n) {
 		ckpt_perror("adding pid %d to freezer");
+		ctx_set_errno(ctx);
 		close(fd);
 		return -1;
 	}
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 4/9] restart: make feeder a proper child instead of a thread
       [not found] ` <1296058748-21418-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
                     ` (2 preceding siblings ...)
  2011-01-26 16:19   ` [PATCH 3/9] restart: cleanup setup/cleanup of freezer cgroups Oren Laadan
@ 2011-01-26 16:19   ` Oren Laadan
  2011-01-26 16:19   ` [PATCH 5/9] restart: obtain pid_max from /proc/sys/kernel/pid_max Oren Laadan
                     ` (4 subsequent siblings)
  8 siblings, 0 replies; 11+ messages in thread
From: Oren Laadan @ 2011-01-26 16:19 UTC (permalink / raw)
  To: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
  Cc: Sukadev Bhattiprolu

As pointed out by Sukadev Bhattiprolu in this post:
http://www.spinics.net/lists/linux-containers/msg22411.html
it's not a good idea to have the coordinator and feeder share the same
memory address space.

The original idea was to prevent the feeder from generating a SIGCHLD
prematurely that will interrupt the restart. So we could use regular
clone() without SIGCHLD. But then, if the feeder exits last then it
will spit an aesthetic message _after_ the "success" message from the
coordinator.

This patch makes the feeder a proper child, but also makes the feeder
wait for the coordinator before terminating, and makes the coordinator
collect the feeder.

Cc: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 restart.c |   86 ++++++++++++++++++++++++++++++++++--------------------------
 1 files changed, 49 insertions(+), 37 deletions(-)

diff --git a/restart.c b/restart.c
index 195a892..9535543 100644
--- a/restart.c
+++ b/restart.c
@@ -174,6 +174,7 @@ static int global_ulogfd;
 static int global_uerrfd;
 static int global_debug;
 static int global_verbose;
+static pid_t global_feeder_pid;
 static pid_t global_child_pid;
 static int global_child_status;
 static int global_child_collected;
@@ -205,7 +206,7 @@ static pid_t ckpt_fork_child(struct ckpt_ctx *ctx, struct task *child);
 static int ckpt_adjust_pids(struct ckpt_ctx *ctx);
 
 static void ckpt_abort(struct ckpt_ctx *ctx, char *str);
-static int ckpt_do_feeder(void *data);
+static int ckpt_do_feeder(struct ckpt_ctx *ctx);
 static int ckpt_fork_feeder(struct ckpt_ctx *ctx);
 
 static int ckpt_write(int fd, void *buf, int count);
@@ -313,11 +314,17 @@ static void sigchld_handler(int sig)
 			ckpt_dbg("SIGCHLD: child not ready\n");
 			break;
 		} else if (pid > 0) {
-			/* inform collection coordinator or root-task */
+			/* inform collection of coordinator or root-task */
 			if (pid == global_child_pid) {
 				global_child_status = status;
 				global_child_collected = 1;
-				report_exit_status(status, "SIGCHLD: ", 1);
+				ckpt_dbg("collected coord/root task\n");
+				report_exit_status(status, "SIGCHLD:", 1);
+			}
+			/* collect the feeder child */
+			if (pid == global_feeder_pid) {
+				ckpt_dbg("collected feeder process\n");
+				report_exit_status(status, "SIGCHLD:", 1);
 			}
 			ckpt_dbg("SIGCHLD: collected child %d\n", pid);
 			collected = 1;
@@ -521,7 +528,7 @@ static void exit_ctx(struct ckpt_ctx *ctx)
 int cr_restart(struct cr_restart_args *args)
 {
 	struct ckpt_ctx ctx;
-	int ret;
+	int status, ret;
 
 	init_ctx(&ctx);
 
@@ -649,17 +656,30 @@ int cr_restart(struct cr_restart_args *args)
 		ret = ckpt_coordinator(&ctx);
 	}
 
-	/*
-	 * On success, return pid of root of the restart process tree.
-	 */
-
 	if (ret < 0)
 		goto cleanup;
 
+	/* success: return pid of root of the restart process tree */
 	ret = global_child_pid;
 
+	/* time to release feeder so he can peacefully retire now */
+	status = 0;
+	if (write(ctx.pipe_out, &status, sizeof(status)) != sizeof(status))
+		ret = -1;
+
  cleanup:
 	exit_ctx(&ctx);
+
+	/* feeder doesn't exit - to avoid SIGCHILD to coordinator */
+	if (ret < 0 && global_feeder_pid)
+		kill(global_feeder_pid, SIGKILL);
+	/* wait for feeder child to terminate (ok of already gone) */
+	if (global_feeder_pid)
+		waitpid(global_feeder_pid, NULL, 0);
+
+	if (ret < 0)
+		errno = ctx.error;
+
 	return ret;
 }
 
@@ -1906,38 +1926,30 @@ static pid_t ckpt_fork_child(struct ckpt_ctx *ctx, struct task *child)
  */
 static int ckpt_fork_feeder(struct ckpt_ctx *ctx)
 {
-	genstack stk;
 	pid_t pid;
+	int ret;
 
 	if (pipe(ctx->pipe_feed)) {
 		ckpt_perror("pipe");
-		return -1;
+		return ctx_set_errno(ctx);
 	}
 
 	if (pipe(ctx->pipe_child) < 0) {
 		ckpt_perror("pipe");
-		return -1;
-	}
-
-	/*
-	 * Use clone() without SIGCHLD so that the when the feeder
-	 * terminates it does not notify the parent (coordinator), as
-	 * this may interfere with the restart.
-	 */
-
-	stk = genstack_alloc(PTHREAD_STACK_MIN);
-	if (!stk) {
-		ckpt_perror("ckpt_fork_feeder genstack_alloc");
-		return -1;
+		return ctx_set_errno(ctx);
 	}
 
-	pid = clone(ckpt_do_feeder, genstack_sp(stk),
-		    CLONE_THREAD | CLONE_SIGHAND | CLONE_VM, ctx);
+	pid = fork();
 	if (pid < 0) {
 		ckpt_perror("feeder thread");
-		return -1;
+		return ctx_set_errno(ctx);
+	} else if (pid == 0) {
+		ret = ckpt_do_feeder(ctx);
+		exit(ret);
 	}
 
+	global_feeder_pid = pid;
+
 	/* children pipe: used for status reports from children */
 	close(ctx->pipe_child[0]);
 	ctx->pipe_out = ctx->pipe_child[1];
@@ -2045,12 +2057,15 @@ ckpt_dbg("write len %d (%d)\n", len, ret);
  * In '--no-pids' mode, transform the pids array (struct ckpt_pids)
  * on the fly and feed the result to the "init" task of the restart
  */
-static int ckpt_do_feeder(void *data)
+static int ckpt_do_feeder(struct ckpt_ctx *ctx)
 {
-	struct ckpt_ctx *ctx = (struct ckpt_ctx *) data;
+	int status;
 
 	ctx->whoami = CTX_FEEDER;  /* for sanity checks */
 
+	if (prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0) < 0)
+		ckpt_abort(ctx, "prctl");
+
 	/*
 	 * feeder has a separate file descriptor table, so
 	 * close/dup/open etc do not affect original caller
@@ -2096,16 +2111,13 @@ static int ckpt_do_feeder(void *data)
 	else
 		ckpt_read_write_blind(ctx);
 
-	/* All is well: feeder thread is done.  However, we must
-	 * invoke the exit system call directly. Otherwise, upon
-	 * return from this function, glibc's clone wrapper will call
-	 * _exit, which calls exit_group, which will terminate the
-	 * whole process, which is not what we want.
-	 */
-	syscall(SYS_exit, 0);
+	/* wait for parent (coordinator) to confirm, to avoid
+	   prematurely interrupting the restart with SIGCHLD */
+	if (read(ctx->pipe_in, &status, sizeof(status)) != sizeof(status))
+		ckpt_abort(ctx, "read coord status");
 
-	/* not reached */
-	return 0;
+	close(ctx->pipe_in);  /* no need to mark unused */
+	return status;
 }
 
 /*
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 5/9] restart: obtain pid_max from /proc/sys/kernel/pid_max
       [not found] ` <1296058748-21418-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
                     ` (3 preceding siblings ...)
  2011-01-26 16:19   ` [PATCH 4/9] restart: make feeder a proper child instead of a thread Oren Laadan
@ 2011-01-26 16:19   ` Oren Laadan
  2011-01-26 16:19   ` [PATCH 6/9] restart: rename 'ctx->tasks_arr' to 'ctx->tasks' Oren Laadan
                     ` (3 subsequent siblings)
  8 siblings, 0 replies; 11+ messages in thread
From: Oren Laadan @ 2011-01-26 16:19 UTC (permalink / raw)
  To: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA

Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 restart.c |   19 +++++++++++++++++--
 1 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/restart.c b/restart.c
index 9535543..d998834 100644
--- a/restart.c
+++ b/restart.c
@@ -157,6 +157,9 @@ struct ckpt_ctx {
 	struct cr_restart_args *args;
 
 	char *freezer;
+
+	/* system limits */
+	pid_t pid_max;
 };
 
 struct pid_swap {
@@ -488,6 +491,9 @@ int process_args(struct cr_restart_args *args)
 
 static void init_ctx(struct ckpt_ctx *ctx)
 {
+	FILE *file;
+	char buf[1024];
+
 	memset(ctx, 0, sizeof(*ctx));
 
 	/* mark all fds as unused */
@@ -499,6 +505,15 @@ static void init_ctx(struct ckpt_ctx *ctx)
 	ctx->pipe_feed[1] = -1;
 	ctx->pipe_coord[0] = -1;
 	ctx->pipe_coord[1] = -1;
+
+	/* system limits */
+	ctx->pid_max = SHRT_MAX;  /* default */
+	file = fopen("/proc/sys/kernel/pid_max", "r");
+	if (file) {
+		if (fgets(buf, 1024, file))
+			ctx->pid_max = atoi(buf);
+		fclose(file);
+	}
 }
 
 static void exit_ctx(struct ckpt_ctx *ctx)
@@ -1212,12 +1227,12 @@ static int ckpt_alloc_pid(struct ckpt_ctx *ctx)
 	 * (this will become inefficient if pid-space is exhausted)
 	 */
 	do {
-		if (ctx->tasks_pid == INT_MAX)
+		if (ctx->tasks_pid == ctx->pid_max)
 			ctx->tasks_pid = CKPT_RESERVED_PIDS;
 		else
 			ctx->tasks_pid++;
 
-		if (n++ == INT_MAX) {	/* ohhh... */
+		if (n++ == ctx->pid_max) {	/* ohhh... */
 			ckpt_err("pid namsepace exhausted");
 			return -1;
 		}
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 6/9] restart: rename 'ctx->tasks_arr' to 'ctx->tasks'
       [not found] ` <1296058748-21418-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
                     ` (4 preceding siblings ...)
  2011-01-26 16:19   ` [PATCH 5/9] restart: obtain pid_max from /proc/sys/kernel/pid_max Oren Laadan
@ 2011-01-26 16:19   ` Oren Laadan
  2011-01-26 16:19   ` [PATCH 7/9] udpate kernel headers: support for pids objects Oren Laadan
                     ` (2 subsequent siblings)
  8 siblings, 0 replies; 11+ messages in thread
From: Oren Laadan @ 2011-01-26 16:19 UTC (permalink / raw)
  To: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA

In preparation for next (super)-patch.

Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 restart.c |   44 ++++++++++++++++++++++----------------------
 1 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/restart.c b/restart.c
index d998834..ca02383 100644
--- a/restart.c
+++ b/restart.c
@@ -140,7 +140,7 @@ struct ckpt_ctx {
 	struct ckpt_pids *copy_arr;
 	__s32 *vpids_arr;
 
-	struct task *tasks_arr;
+	struct task *tasks;
 	int tasks_nr;
 	int tasks_max;
 	int tasks_pid;
@@ -520,8 +520,8 @@ static void exit_ctx(struct ckpt_ctx *ctx)
 {
 	if (ctx->freezer)
 		free(ctx->freezer);
-	if (ctx->tasks_arr)
-		free(ctx->tasks_arr);
+	if (ctx->tasks)
+		free(ctx->tasks);
 	if (ctx->pids_arr)
 		free(ctx->pids_arr);
 	if (ctx->copy_arr)
@@ -640,13 +640,13 @@ int cr_restart(struct cr_restart_args *args)
 	 * setup devpts, root-dir and /proc if necessary, ...
 	 */
 	if (ctx.args->mnt_pty)
-		ctx.tasks_arr[0].flags |= TASK_NEWPTS;
+		ctx.tasks[0].flags |= TASK_NEWPTS;
 	if (ctx.args->mntns)
-		ctx.tasks_arr[0].flags |= TASK_NEWNS;
+		ctx.tasks[0].flags |= TASK_NEWNS;
 	if (ctx.args->root)
-		ctx.tasks_arr[0].flags |= TASK_NEWROOT;
+		ctx.tasks[0].flags |= TASK_NEWROOT;
 
-	if (ctx.args->pidns && ctx.tasks_arr[0].pid != 1) {
+	if (ctx.args->pidns && ctx.tasks[0].pid != 1) {
 		ckpt_dbg("new pidns without init\n");
 		if (global_send_sigint == -1)
 			global_send_sigint = SIGINT;
@@ -655,12 +655,12 @@ int cr_restart(struct cr_restart_args *args)
 		 * the coordinator should set up the filesystems and
 		 * not the first process in the application process tree.
 		 */
-		ctx.tasks_arr[0].flags &=
-			~(TASK_NEWPTS | TASK_NEWROOT |TASK_NEWNS);
+		ctx.tasks[0].flags &=
+			~(TASK_NEWPTS | TASK_NEWROOT | TASK_NEWNS);
 		ret = ckpt_coordinator_pidns(&ctx);
 	} else if (ctx.args->pidns) {
 		ckpt_dbg("new pidns with init\n");
-		ctx.tasks_arr[0].flags |= TASK_NEWPID | TASK_NEWNS;
+		ctx.tasks[0].flags |= TASK_NEWPID | TASK_NEWNS;
 		if (global_send_sigint == -1)
 			global_send_sigint = SIGKILL;
 		ret = ckpt_coordinator(&ctx);
@@ -1018,7 +1018,7 @@ static int ckpt_coordinator(struct ckpt_ctx *ctx)
 	pid_t root_pid;
 	int ret;
 
-	root_pid = ckpt_fork_child(ctx, &ctx->tasks_arr[0]);
+	root_pid = ckpt_fork_child(ctx, &ctx->tasks[0]);
 	if (root_pid < 0)
 		return -1;
 	global_child_pid = root_pid;
@@ -1055,7 +1055,7 @@ static int ckpt_coordinator(struct ckpt_ctx *ctx)
 
 	ret = 0;
 
-	if (ctx->args->pidns && ctx->tasks_arr[0].pid != 1) {
+	if (ctx->args->pidns && ctx->tasks[0].pid != 1) {
 		/* Report success/failure to the parent */
 		if (write(ctx->pipe_coord[1], &ret, sizeof(ret)) < 0) {
 			ckpt_perror("failed to report status");
@@ -1093,7 +1093,7 @@ static int ckpt_coordinator(struct ckpt_ctx *ctx)
 
 static inline struct task *ckpt_init_task(struct ckpt_ctx *ctx)
 {
-	return (&ctx->tasks_arr[0]);
+	return (&ctx->tasks[0]);
 }
 
 /*
@@ -1112,8 +1112,8 @@ static int ckpt_build_tree(struct ckpt_ctx *ctx)
 	 * placeholder tasks (each session id may have at most one)
 	 */
 	ctx->tasks_max = ctx->pids_nr * 4;
-	ctx->tasks_arr = malloc(sizeof(*ctx->tasks_arr) * ctx->tasks_max);
-	if (!ctx->tasks_arr) {
+	ctx->tasks = malloc(sizeof(*ctx->tasks) * ctx->tasks_max);
+	if (!ctx->tasks) {
 		ckpt_perror("malloc tasks array");
 		return -1;
 	}
@@ -1124,7 +1124,7 @@ static int ckpt_build_tree(struct ckpt_ctx *ctx)
 
 	/* assign a creator to each task */
 	for (i = 0; i < ctx->tasks_nr; i++) {
-		task = &ctx->tasks_arr[i];
+		task = &ctx->tasks[i];
 		if (task->creator)
 			continue;
 		if (ckpt_set_creator(ctx, task) < 0)
@@ -1134,7 +1134,7 @@ static int ckpt_build_tree(struct ckpt_ctx *ctx)
 #ifdef CHECKPOINT_DEBUG
 	ckpt_dbg("====== TASKS\n");
 	for (i = 0; i < ctx->tasks_nr; i++) {
-		task = &ctx->tasks_arr[i];
+		task = &ctx->tasks[i];
 		ckpt_dbg("\t[%d] pid %d ppid %d sid %d creator %d",
 			 i, task->pid, task->ppid, task->sid,
 			 task->creator->pid);
@@ -1169,7 +1169,7 @@ static int ckpt_setup_task(struct ckpt_ctx *ctx, pid_t pid, pid_t ppid)
 	if (hash_lookup(ctx, pid))  /* already handled */
 		return 0;
 
-	task = &ctx->tasks_arr[ctx->tasks_nr++];
+	task = &ctx->tasks[ctx->tasks_nr++];
 
 	task->flags = TASK_GHOST;
 
@@ -1294,7 +1294,7 @@ static int ckpt_init_tree(struct ckpt_ctx *ctx)
 
 	/* populate with known tasks */
 	for (i = 0; i < pids_nr; i++) {
-		task = &ctx->tasks_arr[i];
+		task = &ctx->tasks[i];
 
 		task->flags = 0;
 
@@ -1571,7 +1571,7 @@ static int ckpt_set_creator(struct ckpt_ctx *ctx, struct task *task)
 static int ckpt_placeholder_task(struct ckpt_ctx *ctx, struct task *task)
 {
 	struct task *session = hash_lookup(ctx, task->sid);
-	struct task *holder = &ctx->tasks_arr[ctx->tasks_nr++];
+	struct task *holder = &ctx->tasks[ctx->tasks_nr++];
 	pid_t pid;
 
 	if (ctx->tasks_nr > ctx->tasks_max) {
@@ -2481,7 +2481,7 @@ static int assign_vpids(struct ckpt_ctx *ctx)
 	}
 
 	for (tidx = 0, hidx = 0, vidx = 0; tidx < ctx->pids_nr; tidx++) {
-		task = &ctx->tasks_arr[tidx];
+		task = &ctx->tasks[tidx];
 		depth = ctx->pids_arr[tidx].depth;
 
 		task->vidx = vidx;
@@ -2521,7 +2521,7 @@ static int ckpt_read_vpids(struct ckpt_ctx *ctx)
 		if (ctx->pids_arr[i].depth < 0) {
 			ckpt_err("Invalid depth %d for pid %d",
 				 ctx->pids_arr[i].depth,
-				 ctx->tasks_arr[i].pid);
+				 ctx->tasks[i].pid);
 			errno = -EINVAL;
 			return -1;
 		}
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 7/9] udpate kernel headers: support for pids objects
       [not found] ` <1296058748-21418-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
                     ` (5 preceding siblings ...)
  2011-01-26 16:19   ` [PATCH 6/9] restart: rename 'ctx->tasks_arr' to 'ctx->tasks' Oren Laadan
@ 2011-01-26 16:19   ` Oren Laadan
  2011-01-26 16:19   ` [PATCH 8/9] ckptinfo: s/ckpt_pids/ckpt_task_pids/ after kerenl header update Oren Laadan
  2011-01-26 16:19   ` [PATCH 9/9] restart: fix support for nested pid namespaces Oren Laadan
  8 siblings, 0 replies; 11+ messages in thread
From: Oren Laadan @ 2011-01-26 16:19 UTC (permalink / raw)
  To: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA

---
 include/linux/checkpoint_hdr.h |   28 +++++++++++++++++++++++-----
 1 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index f7c4d9a..8087250 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -94,7 +94,9 @@ enum {
 	CKPT_HDR_SECURITY,
 #define CKPT_HDR_SECURITY CKPT_HDR_SECURITY
 
-	CKPT_HDR_TREE = 101,
+	CKPT_HDR_PIDS = 101,
+#define CKPT_HDR_PIDS CKPT_HDR_PIDS
+	CKPT_HDR_TREE,
 #define CKPT_HDR_TREE CKPT_HDR_TREE
 	CKPT_HDR_TASK,
 #define CKPT_HDR_TASK CKPT_HDR_TASK
@@ -232,6 +234,8 @@ struct ckpt_hdr_objref {
 enum obj_type {
 	CKPT_OBJ_IGNORE = 0,
 #define CKPT_OBJ_IGNORE CKPT_OBJ_IGNORE
+	CKPT_OBJ_PID,
+#define CKPT_OBJ_PID CKPT_OBJ_PID
 	CKPT_OBJ_INODE,
 #define CKPT_OBJ_INODE CKPT_OBJ_INODE
 	CKPT_OBJ_FILE_TABLE,
@@ -343,24 +347,38 @@ struct ckpt_hdr_container {
 	 */
 } __attribute__((aligned(8)));;
 
+/* pids array */
+struct ckpt_hdr_pids {
+	struct ckpt_hdr h;
+	__u32 nr_pids;
+	__u32 nr_vpids;
+} __attribute__((aligned(8)));
+
+struct ckpt_pids {
+	__u32 depth;
+	__s32 numbers[1];
+} __attribute__((aligned(8)));
+
 /* task tree */
 struct ckpt_hdr_tree {
 	struct ckpt_hdr h;
-	__s32 nr_tasks;
+	__u32 nr_tasks;
 } __attribute__((aligned(8)));
 
-struct ckpt_pids {
+struct ckpt_task_pids {
 	/* These pids are in the root_nsproxy's pid ns */
 	__s32 vpid;
 	__s32 vppid;
 	__s32 vtgid;
 	__s32 vpgid;
 	__s32 vsid;
-	__s32 depth; /* pid namespace depth relative to container init */
+	__u32 depth;
 } __attribute__((aligned(8)));
 
 /* pids */
-#define CKPT_PID_NULL -1
+/* (negative but not valid error) */
+#define CKPT_PID_NULL (-4096) /* null pid pointer */
+#define CKPT_PID_ROOT (-4097) /* pid same as root task */
 
 /* task data */
 struct ckpt_hdr_task {
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 8/9] ckptinfo: s/ckpt_pids/ckpt_task_pids/ after kerenl header update
       [not found] ` <1296058748-21418-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
                     ` (6 preceding siblings ...)
  2011-01-26 16:19   ` [PATCH 7/9] udpate kernel headers: support for pids objects Oren Laadan
@ 2011-01-26 16:19   ` Oren Laadan
  2011-01-26 16:19   ` [PATCH 9/9] restart: fix support for nested pid namespaces Oren Laadan
  8 siblings, 0 replies; 11+ messages in thread
From: Oren Laadan @ 2011-01-26 16:19 UTC (permalink / raw)
  To: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA

Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 ckptinfo.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ckptinfo.c b/ckptinfo.c
index d73b38c..1361c21 100644
--- a/ckptinfo.c
+++ b/ckptinfo.c
@@ -254,7 +254,7 @@ static int image_parse(int fd, struct args *args)
 static int image_parse_tree(struct ckpt_hdr *h, int fd, struct args *args)
 {
 	struct ckpt_hdr_tree *hh;
-	struct ckpt_pids *pp;
+	struct ckpt_task_pids *pp;
 	int nr_tasks;
 	int i, ret;
 
@@ -268,7 +268,7 @@ static int image_parse_tree(struct ckpt_hdr *h, int fd, struct args *args)
 	if (ret <= 0)
 		return -1;
 
-	pp =  (struct ckpt_pids *) h;
+	pp =  (struct ckpt_task_pids *) h;
 
 	if (args->show_task_tree) {
 		for (i = 0; i < nr_tasks; i++) {
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 9/9] restart: fix support for nested pid namespaces
       [not found] ` <1296058748-21418-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
                     ` (7 preceding siblings ...)
  2011-01-26 16:19   ` [PATCH 8/9] ckptinfo: s/ckpt_pids/ckpt_task_pids/ after kerenl header update Oren Laadan
@ 2011-01-26 16:19   ` Oren Laadan
  8 siblings, 0 replies; 11+ messages in thread
From: Oren Laadan @ 2011-01-26 16:19 UTC (permalink / raw)
  To: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
  Cc: Sukadev Bhattiprolu

Adapt restart code to the new pids handling in kernel-cr that handles
pids as a proper shared object.

DISCLAIMER: this patch is big and intrusive ... Here is a
summary of the changes that it makes:

1) The main change is that we read the 'ckpt_pids' that hold the
 actual pid numbers, and then everything else uses tags that refer to
 these objects. Since the ctx->pids_arr is an array of variable-length
 entries, it is inconvenient to point to it with an index. So we use
 another array, ctx->pids_index, that maps from a linear index to the
 offset in the ctx->pids_arr where the data is found.

2) Now that all pids other than those in 'ckpt_pids' are indices into
 that array (more precisely, into the ctx->pids_index array), the
 variables have a "_ind" suffix, e.g. "pid_ind" instead of "pid".
 There are helpers to translate from index to pids structure.

3) Document the data structures used to track pids and tasks within
the restart code.

4) To support (linearly) nested-pids, the pids hash table was extended
to have depth, so that if we need to allocate a new (dummy) pid, we
can choose unique pids at all pid-ns levels, not just the top.

5) Accordingly, dummy pid allocation is done at all possible depths in
 the hash.

6) Throw away ckpt_{read/write/assign}_vpids - they are no longer needed.
Instead, the sequence of calls is now:
 ckpt_read_pids()
 ckpt_read_tree()
 ckpt_build_pids()
 ckpt_build_tree()

7) Disallow restart with --no-pids if there are nested pid-ns,
 because it is quite complicated to find out the pids of all tasks at
 all nested levels from userspace.

8) If the root task is not a session leader (must be from a subtree
 checkpoint), then it should now inherit its sid from the coordinator.
 Furthermore, other tasks with sid/pgid inherited from above the root
 task should also do the same. For this to work we use a special value
 for their {sid,pgid}_ind: we can't use 0, because that already means
 a pid from an ancestor pid-ns; instead we mark it with CKPT_PID_ROOT,
 and the kernel code knows how to handle it.

 NOTE: this is only necessary when the root task is not a session
 leader. Otherwise, we can just add a placeholder task to accomplish
 the same effect (recall it's a subtree). But a placeholder cannot be
 placed above the root task...

 NOTE2: by doing this, we squash all the sids/pgids from above the
 root task into a single common value at restart, even though they
 may have been distinct at checkpoint. This is considered a feature
 until someone really needs this to behave differently ...

9) Fix a subtle bug in the session-propagation logic: we don't
 need a placeholder if we reach the root task _and_ we are a child of
 the root task (because we will inherit the sid from the root task).

10) In ckpt_fork_child() we can use the 'ckpt_pids' structure for the
 pids rather than manually build one.

11) In adjust_pids() and --no-pids we only try to update the numbers[0]
 of the pid; we don't support nested pid-ns for --no-pids.

Cc: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 restart.c | 1133 +++++++++++++++++++++++++++++++++++++++----------------------
 1 files changed, 733 insertions(+), 400 deletions(-)

diff --git a/restart.c b/restart.c
index ca02383..dc2e49a 100644
--- a/restart.c
+++ b/restart.c
@@ -45,6 +45,12 @@
 #include "common.h"
 
 /*
+ * To re-create the tasks tree in user space, 'restart' reads the
+ * header and tree data from the checkpoint image tree. It makes up
+ * for the data that was consumed by using a helper process that
+ * provides the data back to the restart syscall, followed by the rest
+ * of the checkpoint image stream.
+ *
  * By default, 'restart' creates a new pid namespace in which the
  * restart takes place, using the original pids from the time of the
  * checkpoint. This requires that CLONE_NEWPID and eclone() be enabled.
@@ -54,19 +60,15 @@
  * by default, 'restart' creates an equivalen tree without restoring
  * the original pids, assuming that the application can tolerate this.
  * For this, the 'ckpt_pids' array is transformed on-the-fly before it
- * is fed to the kernel.
+ * is fed to the kernel. This mode of operation is permitted only if
+ * all the restarting tasks belong to a single pid-namespace (i.e. no
+ * pid-namespace nesting).
  *
- * By default, "--pids" implied "--pidns" and vice-versa. The user can
+ * By default, "--pids" implies "--pidns" and vice-versa. The user can
  * use "--pids --no-pidns" for a restart in the currnet namespace -
  * 'restart' will attempt to create the new tree with the original pids
  * from the time of the checkpoint, if possible. This requires that
  * eclone() be enabled.
- *
- * To re-create the tasks tree in user space, 'restart' reads the
- * header and tree data from the checkpoint image tree. It makes up
- * for the data that was consumed by using a helper process that
- * provides the data back to the restart syscall, followed by the rest
- * of the checkpoint image stream.
  */
 
 struct hashent {
@@ -78,6 +80,75 @@ struct hashent {
 struct task;
 struct ckpt_ctx;
 
+/*
+ * The following data structures are used to track pids:
+ *
+ * ctx->pids_arr[]:
+ *   Array of (variable sized) 'struct ckpt_pids' from the checkpoint
+ *   image, each entry indicates the level (depth) relative to the
+ *   root task, and the pids at each level. NOTE: the order of pids
+ *   matches order of adding them to the objhash during checkpoint
+ *   (hence their tags).
+ *
+ * ctx->pids_copy[]:
+ *   Array used to hold a copy of pids_arr[] during --no-pids restart
+ *   when converting the task's pids from the original values from
+ *   the checkpoint image, to the real pids produced by forks.
+ *
+ * ctx->pids_new[]:
+ *   Array of (variable sized) 'struct ckpt_pids' to hold new pids
+ *   objects allocated by the MakeForest algorithm for the restart.
+ *
+ * ctx->pids_index[]:
+ *   Array of integers that provides mapping from a pid object (tag)
+ *   to the byte offset inside ctx->pids_arr where that pid object
+ *   is. It is useful since the entries in the latter are of variable
+ *   size.
+ *
+ * ctx->tasks_arr[]:
+ *   Array of 'struct ckpt_task_pids' from the checkpoint image, each
+ *   entry indicates a task's pids (pid,tgid,pgid,sid,ppid) and the
+ *   pid-namespace nesting level. NOTE: the pids store the tags of the
+ *   corresponding pid objects (and thus their order in ctx->pids_arr)
+ *   rather than the pid values themselves.
+ *
+ * ctx->tasks[]:
+ *   Array of 'struct task' that holds information about all the tasks
+ *   needed to be created in userspace (the input and output of the
+ *   DumpForest and CreateForest algorithms). NOTE: the pids here also
+ *   store the tags of the corresponding pid objects.
+ *
+ * When the restart algorithm needs to create dead tasks or produce dummy
+ * tasks, it stores new 'ckpt_pids' objects in ctx->pids_new[], and
+ * extends ctx->pids_index[] and ctx->tasks[] to store index to new pids
+ * and new tasks, respectively.
+ *
+ * ctx->pids_nr:  (original) size of ctx->pids_arr
+ * ctx->pids_cnt: current size of ctx->pids_index
+ * ctx->pids_max: maximum size of ctx->pids_index
+ * ctx->pids_off: current offset in ctx->pids_new[]
+ * ctx->pids_len: maximum offset in ctx->pids_new[]
+ *
+ * ctx->tasks_nr:  (original) size of ctx->tasks_arr
+ * ctx->tasks_cnt: current size of ctx->tasks
+ * ctx->tasks_max: maximum size of ctx->tasks
+ *
+ * Given a byte offset in ctx->pids_arr, to get the 'ckpt_pids':
+ *   pids = pids_at_offset(ctx, @offset)
+ *
+ * Given a pid-index from ctx->tasks/ctx->tasks_arr, to get the byte
+ * offset of the matching 'ckpt_pids' in ctx->pids_arr:
+ *   ctx->pids_index[@index]
+ *
+ * And to get the 'ckpt_pids' from an index:
+ *   pids = pids_of_index(ctx, @index)
+ *
+ *
+ * ctx->hash_last_pid[]:
+ *   Array of pid values indicating the next hint for pid allocation
+ *   at each nesting level of pid-namespace.
+ */
+
 struct task {
 	int flags;		/* state and (later) actions */
 
@@ -91,15 +162,15 @@ struct task {
 	int vidx;		/* index into vpid array, -1 if none */
 	int piddepth;
 
-	pid_t pid;		/* process IDs, our bread-&-butter */
-	pid_t ppid;
-	pid_t tgid;
-	pid_t sid;
+	/* Following are INDEX values into ctx->pids_index */
+	int pid_ind;		/* process IDs, our bread-&-butter */
+	int ppid_ind;
+	int tgid_ind;
+	int sid_ind;
 	
-	pid_t rpid;		/* [restart without vpids] actual (real) pid */
-
 	struct ckpt_ctx *ctx;	/* points back to the c/r context */
 
+	pid_t real_pid;		/* [restart without vpids] actual (real) pid */
 	pid_t real_parent;	/* pid of task's real parent */
 };
 
@@ -126,32 +197,44 @@ struct ckpt_ctx {
 
 	int error;
 
-	pid_t root_pid;
 	int pipe_in;
 	int pipe_out;
-	int pids_nr;
-	int vpids_nr;
 
 	int pipe_child[2];	/* for children to report status */
 	int pipe_feed[2];	/* for feeder to provide input */
 	int pipe_coord[2];	/* for coord to report status (if needed) */
 
+	int root_pid;
+
 	struct ckpt_pids *pids_arr;
-	struct ckpt_pids *copy_arr;
-	__s32 *vpids_arr;
+	struct ckpt_pids *pids_new;
+	struct ckpt_pids *pids_copy;
+	int *pids_index;
+
+	int pids_nr;
+	int vpids_nr;
+	int pids_cnt;
+	int pids_max;
+	int pids_off;
+	int pids_len;
 
+	struct ckpt_task_pids *tasks_arr;
 	struct task *tasks;
+
 	int tasks_nr;
+	int tasks_cnt;
 	int tasks_max;
-	int tasks_pid;
 
-	struct hashent **hash_arr;
+	/* an array of pid hash-tables: one hash-table per pidns level */
+	struct hashent ***hash_arr;
+	int *hash_last_pid;
+	int hash_depth;
 	
 	char header[BUFSIZE];
 	char header_arch[BUFSIZE];
 	char container[BUFSIZE];
 	char tree[BUFSIZE];
-	char vpids[BUFSIZE];
+	char pids[BUFSIZE];
 	char buf[BUFSIZE];
 
 	struct cr_restart_args *args;
@@ -193,9 +276,9 @@ int global_send_sigint = -1;
 static int ckpt_remount_proc(struct ckpt_ctx *ctx);
 static int ckpt_remount_devpts(struct ckpt_ctx *ctx);
 
+static int ckpt_build_pids(struct ckpt_ctx *ctx);
 static int ckpt_build_tree(struct ckpt_ctx *ctx);
 static int ckpt_init_tree(struct ckpt_ctx *ctx);
-static int assign_vpids(struct ckpt_ctx *ctx);
 static int ckpt_set_creator(struct ckpt_ctx *ctx, struct task *task);
 static int ckpt_placeholder_task(struct ckpt_ctx *ctx, struct task *task);
 static int ckpt_propagate_session(struct ckpt_ctx *ctx, struct task *session);
@@ -218,8 +301,8 @@ static int ckpt_write_obj(struct ckpt_ctx *ctx, struct ckpt_hdr *h);
 static int ckpt_write_header(struct ckpt_ctx *ctx);
 static int ckpt_write_header_arch(struct ckpt_ctx *ctx);
 static int ckpt_write_container(struct ckpt_ctx *ctx);
+static int ckpt_write_pids(struct ckpt_ctx *ctx);
 static int ckpt_write_tree(struct ckpt_ctx *ctx);
-static int ckpt_write_vpids(struct ckpt_ctx *ctx);
 
 static int _ckpt_read(int fd, void *buf, int count);
 static int ckpt_read(int fd, void *buf, int count);
@@ -231,12 +314,14 @@ static int ckpt_read_header(struct ckpt_ctx *ctx);
 static int ckpt_read_header_arch(struct ckpt_ctx *ctx);
 static int ckpt_read_container(struct ckpt_ctx *ctx);
 static int ckpt_read_tree(struct ckpt_ctx *ctx);
-static int ckpt_read_vpids(struct ckpt_ctx *ctx);
+static int ckpt_read_pids(struct ckpt_ctx *ctx);
 
 static int hash_init(struct ckpt_ctx *ctx);
 static void hash_exit(struct ckpt_ctx *ctx);
-static int hash_insert(struct ckpt_ctx *ctx, long key, void *data);
 static void *hash_lookup(struct ckpt_ctx *ctx, long key);
+static void *hash_lookup_level(struct ckpt_ctx *ctx, long key, int level);
+static void *hash_lookup_ind(struct ckpt_ctx *ctx, int n);
+static int hash_insert(struct ckpt_ctx *ctx, long key, void *data, int level);
 
 static inline pid_t _gettid(void)
 {
@@ -520,14 +605,20 @@ static void exit_ctx(struct ckpt_ctx *ctx)
 {
 	if (ctx->freezer)
 		free(ctx->freezer);
+
 	if (ctx->tasks)
 		free(ctx->tasks);
+	if (ctx->tasks_arr)
+		free(ctx->tasks_arr);
+
+	if (ctx->pids_index)
+		free(ctx->pids_index);
 	if (ctx->pids_arr)
 		free(ctx->pids_arr);
-	if (ctx->copy_arr)
-		free(ctx->copy_arr);
-	if (ctx->vpids_arr)
-		free(ctx->vpids_arr);
+	if (ctx->pids_new)
+		free(ctx->pids_new);
+	if (ctx->pids_copy)
+		free(ctx->pids_copy);
 
 	/* unused fd will be silently ignored */
 	close(ctx->pipe_in);
@@ -549,11 +640,10 @@ int cr_restart(struct cr_restart_args *args)
 
 	ctx.args = args;
 	ctx.whoami = CTX_RESTART;  /* for sanity checked */
-	ctx.tasks_pid = CKPT_RESERVED_PIDS;
 
 	ret = process_args(args);
 	if (ret < 0)
-		return -1;
+		return ret;
 
 	/* freezer preparation */
 	if (args->freezer && freezer_prepare(&ctx) < 0)
@@ -607,13 +697,13 @@ int cr_restart(struct cr_restart_args *args)
 		goto cleanup;
 	}
 
-	ret = ckpt_read_tree(&ctx);
+	ret = ckpt_read_pids(&ctx);
 	if (ret < 0) {
-		ckpt_perror("read c/r tree");
+		ckpt_perror("read c/r pids");
 		goto cleanup;
 	}
 
-	ret = ckpt_read_vpids(&ctx);
+	ret = ckpt_read_tree(&ctx);
 	if (ret < 0) {
 		ckpt_perror("read c/r tree");
 		goto cleanup;
@@ -622,12 +712,12 @@ int cr_restart(struct cr_restart_args *args)
 	/* build creator-child-relationship tree */
 	if (hash_init(&ctx) < 0)
 		goto cleanup;
-	ret = ckpt_build_tree(&ctx);
-	hash_exit(&ctx);
+
+	ret = ckpt_build_pids(&ctx);
 	if (ret < 0)
 		goto cleanup;
 
-	ret = assign_vpids(&ctx);
+	ret = ckpt_build_tree(&ctx);
 	if (ret < 0)
 		goto cleanup;
 
@@ -646,7 +736,7 @@ int cr_restart(struct cr_restart_args *args)
 	if (ctx.args->root)
 		ctx.tasks[0].flags |= TASK_NEWROOT;
 
-	if (ctx.args->pidns && ctx.tasks[0].pid != 1) {
+	if (ctx.args->pidns && ctx.tasks[0].pid_ind != 1) {
 		ckpt_dbg("new pidns without init\n");
 		if (global_send_sigint == -1)
 			global_send_sigint = SIGINT;
@@ -683,6 +773,7 @@ int cr_restart(struct cr_restart_args *args)
 		ret = -1;
 
  cleanup:
+	hash_exit(&ctx);
 	exit_ctx(&ctx);
 
 	/* feeder doesn't exit - to avoid SIGCHILD to coordinator */
@@ -1055,7 +1146,7 @@ static int ckpt_coordinator(struct ckpt_ctx *ctx)
 
 	ret = 0;
 
-	if (ctx->args->pidns && ctx->tasks[0].pid != 1) {
+	if (ctx->args->pidns && ctx->tasks[0].pid_ind != 1) {
 		/* Report success/failure to the parent */
 		if (write(ctx->pipe_coord[1], &ret, sizeof(ret)) < 0) {
 			ckpt_perror("failed to report status");
@@ -1093,7 +1184,127 @@ static int ckpt_coordinator(struct ckpt_ctx *ctx)
 
 static inline struct task *ckpt_init_task(struct ckpt_ctx *ctx)
 {
-	return (&ctx->tasks[0]);
+	return &ctx->tasks[0];
+}
+
+static inline struct ckpt_pids *pids_at_offset(struct ckpt_ctx *ctx, int n)
+{
+	return (struct ckpt_pids *)(((char *) ctx->pids_arr) + n);
+}
+
+static inline struct ckpt_pids *pids_copy_at_offset(struct ckpt_ctx *ctx, int n)
+{
+	return (struct ckpt_pids *)(((char *) ctx->pids_copy) + n);
+}
+
+static inline struct ckpt_pids *pids_new_at_offset(struct ckpt_ctx *ctx, int n)
+{
+	return (struct ckpt_pids *)(((char *) ctx->pids_new) + n);
+}
+
+struct ckpt_pids pids_zero = {
+	.depth = 1,
+	.numbers = {0},
+};
+
+struct ckpt_pids pids_root = {
+	.depth = 1,
+	.numbers = {-1},
+};
+
+static inline struct ckpt_pids *pids_of_index(struct ckpt_ctx *ctx, int n)
+{
+	if (n == 0)
+		return &pids_zero;
+	else if (n == CKPT_PID_ROOT)
+		return &pids_root;
+	if (n < 0 || n > ctx->pids_cnt) {
+		errno = EINVAL;
+		return NULL;
+	}
+	if (n <= ctx->pids_nr)
+		return pids_at_offset(ctx, ctx->pids_index[n]);
+	else
+		return pids_new_at_offset(ctx, ctx->pids_index[n]);
+}
+
+static inline pid_t pid_at_index(struct ckpt_ctx *ctx, int n)
+{
+	return pids_of_index(ctx, n)->numbers[0];
+}
+
+/*
+ * ckpt_build_pids - prepare pids array: this array will index into
+ * the pids_arr array pointing at the beginning of the individual
+ * 'struct ckpt_pids' elements there (as they are of variable size)
+ */
+static int ckpt_build_pids(struct ckpt_ctx *ctx)
+{
+	struct ckpt_pids *pids;
+	int i = 0, n = 0;
+	int depth = 0;
+	int s, len;
+
+	/*
+	 * Allow for additional pids to be added on demand for
+	 * placeholder tasks (each session leader may have at most
+	 * one) Added +1 because index count starts from 1.
+	 */
+	ctx->pids_max = ctx->pids_nr * 2;
+	ctx->pids_index = malloc(sizeof(int) * (ctx->pids_max + 1));
+	if (!ctx->pids_index) {
+		ckpt_perror("malloc pids index");
+		return ctx_set_errno(ctx);
+	}
+
+	len = ctx->pids_nr * sizeof(*pids) + ctx->vpids_nr * sizeof(__s32);
+	if (len <= 0)   /* overflow ? */
+		return ctx_ret_errno(ctx, EOVERFLOW);
+
+	while (n < ctx->pids_nr && len > 0) {
+		s = sizeof(*pids);
+
+		pids = pids_at_offset(ctx, i);
+		if (pids->depth < 0)
+			return ctx_ret_errno(ctx, EINVAL);
+		s += pids->depth * sizeof(__s32);
+		if (s > len)
+			return ctx_ret_errno(ctx, EINVAL);
+
+		depth += pids->depth;
+		if (depth > ctx->vpids_nr)
+			return ctx_ret_errno(ctx, EINVAL);
+
+		ctx->pids_index[n + 1] = i;
+
+		len -= s;
+		i += s;
+		n++;
+	}
+
+	if (n != ctx->pids_nr || depth != ctx->vpids_nr || len != 0)
+		return ctx_ret_errno(ctx, EINVAL);
+
+	ctx->pids_cnt = ctx->pids_nr;
+
+	if (!ctx->args->pidns && depth > 0) {
+		ckpt_err("need --pidns for nested pidns container");
+		return ctx_ret_errno(ctx, EINVAL);
+	}
+
+#ifdef CHECKPOINT_DEBUG
+	ckpt_dbg("====== PIDS\n");
+	for (n = 1; n <= ctx->pids_nr; n++) {
+		pids = pids_of_index(ctx, n);
+		ckpt_dbg("\t[%d] depth %d pids", n, pids->depth);
+		for (i = 0; i <= pids->depth; i++)
+			ckpt_dbg_cont(" %d", pids->numbers[i]);
+		ckpt_dbg_cont("\n");
+	}
+	ckpt_dbg("...........\n");
+#endif
+
+	return 0;
 }
 
 /*
@@ -1107,15 +1318,13 @@ static int ckpt_build_tree(struct ckpt_ctx *ctx)
 
 	/*
 	 * Allow for additional tasks to be added on demand for
-	 * referenced pids of dead tasks (each task can introduce at
-	 * most two: session and process group IDs), as well as for
-	 * placeholder tasks (each session id may have at most one)
+	 * placeholder tasks (tgid/sid/pgid ids may each add one)
 	 */
 	ctx->tasks_max = ctx->pids_nr * 4;
 	ctx->tasks = malloc(sizeof(*ctx->tasks) * ctx->tasks_max);
 	if (!ctx->tasks) {
 		ckpt_perror("malloc tasks array");
-		return -1;
+		return ctx_set_errno(ctx);
 	}
 
 	/* initialize tree */
@@ -1123,7 +1332,7 @@ static int ckpt_build_tree(struct ckpt_ctx *ctx)
 		return -1;
 
 	/* assign a creator to each task */
-	for (i = 0; i < ctx->tasks_nr; i++) {
+	for (i = 0; i < ctx->tasks_cnt; i++) {
 		task = &ctx->tasks[i];
 		if (task->creator)
 			continue;
@@ -1133,17 +1342,26 @@ static int ckpt_build_tree(struct ckpt_ctx *ctx)
 
 #ifdef CHECKPOINT_DEBUG
 	ckpt_dbg("====== TASKS\n");
-	for (i = 0; i < ctx->tasks_nr; i++) {
+	for (i = 0; i < ctx->tasks_cnt; i++) {
 		task = &ctx->tasks[i];
-		ckpt_dbg("\t[%d] pid %d ppid %d sid %d creator %d",
-			 i, task->pid, task->ppid, task->sid,
-			 task->creator->pid);
+		ckpt_dbg("\t[%3d] pid %5d(%3d) (tgid %3d) ppid %5d(%3d)"
+			 "  (pgid %3d) sid %5d(%3d) creator %5d(%3d)",
+			 i, pid_at_index(ctx, task->pid_ind), task->pid_ind,
+			 i < ctx->tasks_nr ? ctx->tasks_arr[i].vtgid : -1,
+			 pid_at_index(ctx, task->ppid_ind), task->ppid_ind,
+			 i < ctx->tasks_nr ? ctx->tasks_arr[i].vpgid : -1,
+			 task->sid_ind >= 0 ?
+			 pid_at_index(ctx, task->sid_ind) : -1, task->sid_ind,
+			 pid_at_index(ctx, task->creator->pid_ind));
 		if (task->next_sib)
-			ckpt_dbg_cont(" next %d", task->next_sib->pid);
+			ckpt_dbg_cont(" next %3d",
+				      pid_at_index(ctx, task->next_sib->pid_ind));
 		if (task->prev_sib)
-			ckpt_dbg_cont(" prev %d", task->prev_sib->pid);
+			ckpt_dbg_cont(" prev %3d",
+				      pid_at_index(ctx, task->prev_sib->pid_ind));
 		if (task->phantom)
-			ckpt_dbg_cont(" placeholder %d", task->phantom->pid);
+			ckpt_dbg_cont(" placeholder %3d",
+				      pid_at_index(ctx, task->phantom->pid_ind));
 		ckpt_dbg_cont(" %c%c%c%c%c%c",
 		       (task->flags & TASK_THREAD) ? 'T' : ' ',
 		       (task->flags & TASK_SIBLING) ? 'P' : ' ',
@@ -1159,24 +1377,30 @@ static int ckpt_build_tree(struct ckpt_ctx *ctx)
 	return 0;
 }		
 
-static int ckpt_setup_task(struct ckpt_ctx *ctx, pid_t pid, pid_t ppid)
+static int ckpt_setup_task(struct ckpt_ctx *ctx, int pid_ind, int ppid_ind)
 {
 	struct task *task;
+	struct ckpt_pids *pids;
+	int j;
 
-	if (pid == 0)  /* ignore if outside namespace */
+	/* ignore if outside namespace */
+	if (pid_ind == 0 || pid_ind == CKPT_PID_ROOT)
 		return 0;
 
-	if (hash_lookup(ctx, pid))  /* already handled */
+	pids = pids_of_index(ctx, pid_ind);
+
+	/* skip if already handled */
+	if (hash_lookup(ctx, pids->numbers[0]))
 		return 0;
 
-	task = &ctx->tasks[ctx->tasks_nr++];
+	task = &ctx->tasks[ctx->tasks_cnt++];
 
 	task->flags = TASK_GHOST;
 
-	task->pid = pid;
-	task->ppid = ppid;
-	task->tgid = pid;
-	task->sid = ppid;
+	task->pid_ind = pid_ind;
+	task->ppid_ind = ppid_ind;
+	task->tgid_ind = pid_ind;
+	task->sid_ind = ppid_ind;
 
 	task->children = NULL;
 	task->next_sib = NULL;
@@ -1184,20 +1408,21 @@ static int ckpt_setup_task(struct ckpt_ctx *ctx, pid_t pid, pid_t ppid)
 	task->creator = NULL;
 	task->phantom = NULL;
 
-	task->rpid = -1;
 	task->ctx = ctx;
+	task->real_pid = -1;
 
-	if (hash_insert(ctx, pid, task) < 0)
-		return -1;
-
-	/* remember the max pid seen */
-	if (task->pid > ctx->tasks_pid)
-		ctx->tasks_pid = task->pid;
+	for (j = 0; j <= pids->depth; j++) {
+		if (hash_insert(ctx, pids->numbers[j], task, j) < 0)
+			return -1;
+		/* remember the max pid seen */
+		if (pids->numbers[j] > ctx->hash_last_pid[j])
+			ctx->hash_last_pid[j] = pids->numbers[j];
+	}
 
 	return 0;
 }
 
-static int ckpt_valid_pid(struct ckpt_ctx *ctx, pid_t pid, char *which, int i)
+static int _ckpt_valid_pid(struct ckpt_ctx *ctx, pid_t pid, char *which, int i)
 {
 	if (pid < 0) {
 		ckpt_err("Invalid %s %d (for task#%d)\n", which, pid, i);
@@ -1218,104 +1443,155 @@ static int ckpt_valid_pid(struct ckpt_ctx *ctx, pid_t pid, char *which, int i)
 	return 1;
 }
 
-static int ckpt_alloc_pid(struct ckpt_ctx *ctx)
+static int ckpt_valid_pid(struct ckpt_ctx *ctx, int index, char *which, int i)
+{
+	struct ckpt_pids *pids;
+	int j;
+
+	pids = pids_of_index(ctx, index);
+	for (j = 0; j <= pids->depth; j++) {
+		if (!_ckpt_valid_pid(ctx, pids->numbers[j], which, i))
+			return 0;
+	}
+	return 1;
+}
+
+static int ckpt_alloc_pid(struct ckpt_ctx *ctx, int depth)
 {
-	int n = 0;
+	struct ckpt_pids *pids;
+	int j, n, last, len;
+	int pid_ind;
+
+	len = sizeof(*pids) + depth * sizeof(__s32);
+
+	/* need to expand the ctx->pids_new[] array ? */
+	if (ctx->pids_off + len > ctx->pids_len) {
+		pids = realloc(ctx->pids_new, ctx->pids_len * 3 / 2);
+		if (!pids) {
+			ckpt_perror("allocate new pids table");
+			return ctx_set_errno(ctx);
+		}
+		ctx->pids_new = pids;
+	}
+
+	/* need to expand the ctx->pids_index[] array ? */
+	if (ctx->pids_cnt >= ctx->pids_max)  {
+		/* shouldn't happen, beacuse we prepared enough */
+		ckpt_err("out of space in task table !");
+		return ctx_ret_errno(ctx, EOVERFLOW);
+	}
+
+	ctx->pids_cnt += 1;
+	pid_ind = ctx->pids_cnt;
+
+	ctx->pids_index[pid_ind] = ctx->pids_off;
+	ctx->pids_off += len;
 
 	/*
-	 * allocate an unused pid for the placeholder
+	 * allocate an unused pid for the placeholder in each pid-namespace
 	 * (this will become inefficient if pid-space is exhausted)
 	 */
-	do {
-		if (ctx->tasks_pid == ctx->pid_max)
-			ctx->tasks_pid = CKPT_RESERVED_PIDS;
-		else
-			ctx->tasks_pid++;
 
-		if (n++ == ctx->pid_max) {	/* ohhh... */
-			ckpt_err("pid namsepace exhausted");
-			return -1;
-		}
-	} while (hash_lookup(ctx, ctx->tasks_pid));
+	pids = pids_of_index(ctx, pid_ind);
 
-	return ctx->tasks_pid;
-}
+	for (j = 0; j <= depth; j++) {
+		n = 0;
+		last = ctx->hash_last_pid[j];
 
-static int ckpt_zero_pid(struct ckpt_ctx *ctx)
-{
-	pid_t pid;
+		do {
+			if (last >= ctx->pid_max)
+				last = CKPT_RESERVED_PIDS;
+			else
+				last++;
 
-	pid = ckpt_alloc_pid(ctx);
-	if (pid < 0)
-		return -1;
-	if (ckpt_setup_task(ctx, pid, ctx->pids_arr[0].vpid) < 0)
-		return -1;
-	return pid;
+			if (n++ == ctx->pid_max) {  /* ohhh... */
+				ckpt_err("pid namsepace exhausted");
+				return ctx_ret_errno(ctx, EOVERFLOW);
+			}
+		} while (hash_lookup_level(ctx, last, j));
+
+		ctx->hash_last_pid[j] = last;
+		pids->numbers[j] = last;
+	}
+
+	return pid_ind;
 }
 
 static int ckpt_init_tree(struct ckpt_ctx *ctx)
 {
-	struct ckpt_pids *pids_arr = ctx->pids_arr;
-	int pids_nr = ctx->pids_nr;
+	struct ckpt_task_pids *tasks_arr;
+	struct ckpt_pids *pids;
 	struct task *task;
-	pid_t root_pid;
-	pid_t root_sid;
-	pid_t zero_pid = 0;
-	int i;
+	int root_pid_ind;
+	int root_sid_ind;
+	int ppid_ind;
+	int i, j;
 
-	root_pid = pids_arr[0].vpid;
-	root_sid = pids_arr[0].vsid;
+	tasks_arr = ctx->tasks_arr;
+
+	root_pid_ind = tasks_arr[0].vpid;
+	root_sid_ind = tasks_arr[0].vsid;
 
 	/*
+	 * Any zero value (tgid/pgid/sid) means that at checkpoint the
+	 * original pid came from an ancestor pid-ns. If we find any,
+	 * the caller must also have requested --pidns, otherwise fail.
+	 * This is done in _ckpt_valid_pid(), but users can choose to
+	 * only issue a warning, and we'll convert them to inherit the
+	 * sid of the root task instead.
+	 *
 	 * The case where root_sid != root_pid is special. It must be
 	 * from a subtree checkpoint (in container, root_sid is either
 	 * same as root_pid or 0), and root_sid was inherited from an
 	 * ancestor of that subtree.
 	 *
-	 * If we restart with --pidns, make the root-task also inherit
-	 * sid from its ancestor (== coordinator), whatever 'restart'
-	 * task currently has.  For that, we force the root-task's sid
-	 * and all references to it from other tasks (via sid and
-	 * pgid), to 0. Later, the feeder will substitute the
-	 * cooridnator's sid for them.
+	 * When creating the tasks tree, the root task will inherit
+	 * its sid from its ancestor (usually from the coordinator;
+	 * however, for --pidns and non-init root task it will be the
+	 * stub init task inserted by us). This sid will be whatever
+	 * 'restart' process (or our caller) current has.
+	 *
+	 * For that, we force the root-task's sid and all references
+	 * to it to be CKPT_PID_ROOT. This tells restart to treat them
+	 * as such, and ensures that we don't call setsid() on the
+	 * root task (because sid != pid). CKPT_PID_ROOT is gracefully
+	 * handled both by ckpt_set_creator() when tracking the sid
+	 * heritage, and by the kernel when restoring a task's pgid.
 	 *
 	 * (Note that this still works even if the coordinator's sid
 	 * is "used" by a restarting task: a new-pidns restart will
 	 * fail because the pid is in use, and in an old-pidns restart
 	 * the task will be assigned a new pid anyway).
-	 *
-	 * If we restart with --no-pidns, we'll add a ghost task below
-	 * whose pid will be used instead of these zeroed entried.
 	 */
 
 	/* forcing root_sid to -1, will make comparisons below fail */
-	if (root_sid == root_pid)
-		root_sid = -1;
+	if (root_sid_ind == root_pid_ind)
+		root_sid_ind = -1;
 
 	/* populate with known tasks */
-	for (i = 0; i < pids_nr; i++) {
+	for (i = 0; i < ctx->tasks_nr; i++) {
 		task = &ctx->tasks[i];
 
 		task->flags = 0;
 
-		if (!ckpt_valid_pid(ctx, pids_arr[i].vpid, "pid", i))
+		if (!ckpt_valid_pid(ctx, tasks_arr[i].vpid, "pid", i))
 			return -1;
-		else if (!ckpt_valid_pid(ctx, pids_arr[i].vtgid, "tgid", i))
+		else if (!ckpt_valid_pid(ctx, tasks_arr[i].vtgid, "tgid", i))
 			return -1;
-		else if (!ckpt_valid_pid(ctx, pids_arr[i].vsid, "sid", i))
+		else if (!ckpt_valid_pid(ctx, tasks_arr[i].vsid, "sid", i))
 			return -1;
-		else if (!ckpt_valid_pid(ctx, pids_arr[i].vpgid, "pgid", i))
+		else if (!ckpt_valid_pid(ctx, tasks_arr[i].vpgid, "pgid", i))
 			return -1;
 
-		if (pids_arr[i].vsid == root_sid)
-			pids_arr[i].vsid = 0;
-		if (pids_arr[i].vpgid == root_sid)
-			pids_arr[i].vpgid = 0;
+		if (tasks_arr[i].vsid == root_sid_ind)
+			tasks_arr[i].vsid = CKPT_PID_ROOT;
+		if (tasks_arr[i].vpgid == root_sid_ind)
+			tasks_arr[i].vpgid = CKPT_PID_ROOT;
 
-		task->pid = pids_arr[i].vpid;
-		task->ppid = pids_arr[i].vppid;
-		task->tgid = pids_arr[i].vtgid;
-		task->sid = pids_arr[i].vsid;
+		task->pid_ind = tasks_arr[i].vpid;
+		task->ppid_ind = tasks_arr[i].vppid;
+		task->tgid_ind = tasks_arr[i].vtgid;
+		task->sid_ind = tasks_arr[i].vsid;
 
 		task->children = NULL;
 		task->next_sib = NULL;
@@ -1323,30 +1599,30 @@ static int ckpt_init_tree(struct ckpt_ctx *ctx)
 		task->creator = NULL;
 		task->phantom = NULL;
 
-		task->rpid = -1;
 		task->ctx = ctx;
+		task->real_pid = -1;
 
-		if (hash_insert(ctx, task->pid, task) < 0)
-			return -1;
+		pids = pids_of_index(ctx, tasks_arr[i].vpid);
+		for (j = 0; j <= pids->depth; j++) {
+			if (hash_insert(ctx, pids->numbers[j], task, j) < 0)
+				return -1;
+			/* remember the max pid seen */
+			if (pids->numbers[j] > ctx->hash_last_pid[j])
+				ctx->hash_last_pid[j] = pids->numbers[j];
+		}
 	}
 
-	ctx->tasks_nr = pids_nr;
+	ctx->tasks_cnt = ctx->tasks_nr;
 
 	/* add pids unaccounted for (no tasks) */
-	for (i = 0; i < pids_nr; i++) {
-		pid_t sid;
-
-		sid = pids_arr[i].vsid;
+	for (i = 0; i < ctx->tasks_nr; i++) {
 
-		/* Remember if we find any vsid/vpgid - see below */
-		if (pids_arr[i].vsid == 0 || pids_arr[i].vpgid == 0)
-			zero_pid = 1;
 		/*
 		 * An unaccounted-for sid belongs to a task that was a
 		 * session leader and died. We can safe set its parent
 		 * (and creator) to be the root task.
 		 */
-		if (ckpt_setup_task(ctx, sid, root_pid) < 0)
+		if (ckpt_setup_task(ctx, tasks_arr[i].vsid, root_pid_ind) < 0)
 			return -1;
 
 		/*
@@ -1354,15 +1630,17 @@ static int ckpt_init_tree(struct ckpt_ctx *ctx)
 		 * ancestor of root_task, and more specifically, via
 		 * root_task itself: make root_task our parent.
 		 */
-		if (sid == 0)
-			sid = root_pid;
+
+		ppid_ind = tasks_arr[i].vsid;
+		if (ppid_ind == 0 || ppid_ind == CKPT_PID_ROOT)
+			ppid_ind = root_pid_ind;
 
 		/*
 		 * If a pid belongs to a dead thread group leader, we
 		 * need to add it with the same sid as current (and
 		 * other) threads.
 		 */
-		if (ckpt_setup_task(ctx, pids_arr[i].vtgid, sid) < 0)
+		if (ckpt_setup_task(ctx, tasks_arr[i].vtgid, ppid_ind) < 0)
 			return -1;
 
 		/*
@@ -1373,36 +1651,16 @@ static int ckpt_init_tree(struct ckpt_ctx *ctx)
 		 * same sid as us: all tasks with same pgrp must have
 		 * their sid matching.
 		 */
-		if (ckpt_setup_task(ctx, pids_arr[i].vpgid, sid) < 0)
-			return -1;
-	}
-
-	/*
-	 * Zero sid/pgid is disallowed in --no-pidns mode. If there
-	 * were any, we invent a new ghost-zero task and substitute
-	 * its pid for those any sid/pgid.
-	 */
-	if (zero_pid && !ctx->args->pidns) {
-		zero_pid = ckpt_zero_pid(ctx);
-		if (zero_pid < 0)
+		if (ckpt_setup_task(ctx, tasks_arr[i].vpgid, ppid_ind) < 0)
 			return -1;
-		for (i = 0; i < pids_nr; i++) {
-			if (pids_arr[i].vsid == 0) {
-				pids_arr[i].vsid = zero_pid;
-				pids_arr[i].vppid = zero_pid;
-			}
-			if (pids_arr[i].vpgid == 0) {
-				pids_arr[i].vpgid = zero_pid;
-				pids_arr[i].vppid = zero_pid;
-			}
-		}
 	}
 
 	/* mark root task(s), and set its "creator" to be zero_task */
 	ckpt_init_task(ctx)->flags |= TASK_ROOT;
 	ckpt_init_task(ctx)->creator = &zero_task;
 
-	ckpt_dbg("total tasks (including ghosts): %d\n", ctx->tasks_nr);
+	ckpt_dbg("total tasks (excluding ghosts): %d\n", ctx->tasks_nr);
+	ckpt_dbg("total tasks (including ghosts): %d\n", ctx->tasks_cnt);
 	return 0;
 }
 
@@ -1471,50 +1729,67 @@ static int ckpt_init_tree(struct ckpt_ctx *ctx)
  * leader.  This is done using a placeholder in a manner similar to
  * how we handle orphans that are not session leaders.
  */
+
 static int ckpt_set_creator(struct ckpt_ctx *ctx, struct task *task)
 {
-	struct task *session = hash_lookup(ctx, task->sid);
-	struct task *parent = hash_lookup(ctx, task->ppid);
+	struct task *session;
+	struct task *parent;
 	struct task *creator;
 
+	session = hash_lookup_ind(ctx, task->sid_ind);
+	parent = hash_lookup_ind(ctx, task->ppid_ind);
+
 	if (task == ckpt_init_task(ctx)) {
-		ckpt_err("pid %d: logical error\n", ckpt_init_task(ctx)->pid);
-		return -1;
+		ckpt_err("pid %d: logical error\n",
+			 pid_at_index(ctx, ckpt_init_task(ctx)->pid_ind));
+		return ctx_ret_errno(ctx, EINVAL);
 	}
 
 	/* sid == 0 must have been inherited from outside the container */
-	if (task->sid == 0)
-		session = ckpt_init_task(ctx);
+	if (task->sid_ind == 0 || task->sid_ind == CKPT_PID_ROOT)
+		task->flags |= TASK_SESSION;
 
-	if (task->tgid != task->pid) {
+	if (task->tgid_ind != task->pid_ind) {
 		/* thread: creator is thread-group-leader */
-		ckpt_dbg("pid %d: thread tgid %d\n", task->pid, task->tgid);
-		creator = hash_lookup(ctx, task->tgid);
+		ckpt_dbg("pid %d: thread tgid %d\n",
+			 pid_at_index(ctx, task->pid_ind),
+			 pid_at_index(ctx, task->tgid_ind));
+		creator = hash_lookup_ind(ctx, task->tgid_ind);
 		if (!creator) {
 			/* oops... thread group leader MIA */
-			ckpt_err("pid %d: no leader %d\n", task->pid, task->tgid);
-			return -1;
+			ckpt_err("pid %d: no leader %d\n",
+				 pid_at_index(ctx, task->pid_ind),
+				 pid_at_index(ctx, task->tgid_ind));
+			return ctx_ret_errno(ctx, EINVAL);
 		}
 		task->flags |= TASK_THREAD;
-	} else if (task->ppid == 0 || !parent) {
+	} else if (task->ppid_ind == 0 || !parent) {
 		/* only root_task can have ppid == 0, parent must always exist */
-		ckpt_err("pid %d: invalid ppid %d\n", task->pid, task->ppid);
-		return -1;
-	} else if (task->pid == task->sid) {
+		ckpt_err("pid %d: invalid ppid %d\n",
+			 pid_at_index(ctx, task->pid_ind),
+			 pid_at_index(ctx, task->ppid_ind));
+		return ctx_ret_errno(ctx, EINVAL);
+	} else if (task->pid_ind == task->sid_ind) {
 		/* session leader: creator is parent */
-		ckpt_dbg("pid %d: session leader\n", task->pid);
+		ckpt_dbg("pid %d: session leader\n",
+			 pid_at_index(ctx, task->pid_ind));
 		creator = parent;
 	} else if (task->flags & TASK_DEAD) {
 		/* dead: creator is session leader */
-		ckpt_dbg("pid %d: task is dead\n", task->pid);
+		ckpt_dbg("pid %d: task is dead\n",
+			 pid_at_index(ctx, task->pid_ind));
 		creator = session;
-	} else if (task->sid == parent->sid) {
+	} else if (task->sid_ind == parent->sid_ind) {
 		/* (non-session-leader) inherit: creator is parent */
-		ckpt_dbg("pid %d: inherit sid %d\n", task->pid, task->sid);
+		ckpt_dbg("pid %d: inherit sid %d\n",
+			 pid_at_index(ctx, task->pid_ind),
+			 pid_at_index(ctx, task->sid_ind));
 		creator = parent;
-	} else if (task->ppid == 1) {
+	} else if (task->ppid_ind == 1) {
 		/* (non-session-leader) orphan: creator is dummy */
-		ckpt_dbg("pid %d: orphan session %d\n", task->pid, task->sid);
+		ckpt_dbg("pid %d: orphan session %d\n",
+			 pid_at_index(ctx, task->pid_ind),
+			 pid_at_index(ctx, task->sid_ind));
 		if (!session->phantom)
 			if (ckpt_placeholder_task(ctx, task) < 0)
 				return -1;
@@ -1524,27 +1799,31 @@ static int ckpt_set_creator(struct ckpt_ctx *ctx, struct task *task)
 		if (!session->creator) {
 			/* (non-session-leader) recursive: session's creator */
 			ckpt_dbg("pid %d: recursive session creator %d\n",
-			       task->pid, task->sid);
+				 pid_at_index(ctx, task->pid_ind),
+				 pid_at_index(ctx, task->sid_ind));
 			if (ckpt_set_creator(ctx, session) < 0)
 				return -1;
 		}
 		/* then use it to decide what to do */
-		if (session->creator->pid == task->ppid) {
+		if (session->creator->pid_ind == task->ppid_ind) {
 			/* init must not be sibling creator (CLONE_PARENT) */
 			if (session == ckpt_init_task(ctx)) {
 				ckpt_err("pid %d: sibling session prohibited"
-				       " with init as creator\n", task->pid);
-				return -1;
+					 " with init as creator\n",
+					 pid_at_index(ctx, task->pid_ind));
+				return ctx_ret_errno(ctx, EINVAL);
 			}
 			/* (non-session-leader) sibling: creator is sibling */
 			ckpt_dbg("pid %d: sibling session %d\n",
-			       task->pid, task->sid);
+				 pid_at_index(ctx, task->pid_ind),
+				 pid_at_index(ctx, task->sid_ind));
 			creator = session;
 			task->flags |= TASK_SIBLING;
 		} else {
 			/* (non-session-leader) session: fork before setsid */
 			ckpt_dbg("pid %d: propagate session %d\n",
-			       task->pid, task->sid);
+				 pid_at_index(ctx, task->pid_ind),
+				 pid_at_index(ctx, task->sid_ind));
 			creator = parent;
 			task->flags |= TASK_SESSION;
 		}
@@ -1557,7 +1836,9 @@ static int ckpt_set_creator(struct ckpt_ctx *ctx, struct task *task)
 		next->prev_sib = task;
 	}
 
-	ckpt_dbg("pid %d: creator set to %d\n", task->pid, creator->pid);
+	ckpt_dbg("pid %d: creator set to %d\n",
+		 pid_at_index(ctx, task->pid_ind),
+		 pid_at_index(ctx, creator->pid_ind));
 	task->creator = creator;
 	creator->children = task;
 
@@ -1570,26 +1851,29 @@ static int ckpt_set_creator(struct ckpt_ctx *ctx, struct task *task)
 
 static int ckpt_placeholder_task(struct ckpt_ctx *ctx, struct task *task)
 {
-	struct task *session = hash_lookup(ctx, task->sid);
-	struct task *holder = &ctx->tasks[ctx->tasks_nr++];
-	pid_t pid;
+	struct task *session;
+	struct task *holder;
+	int pid_ind;
 
-	if (ctx->tasks_nr > ctx->tasks_max) {
+	session = hash_lookup_ind(ctx, task->sid_ind);
+	holder = &ctx->tasks[ctx->tasks_cnt++];
+
+	if (ctx->tasks_cnt > ctx->tasks_max) {
 		/* shouldn't happen, beacuse we prepared enough */
 		ckpt_err("out of space in task table !");
-		return -1;
+		return ctx_ret_errno(ctx, EOVERFLOW);
 	}
 
-	pid = ckpt_alloc_pid(ctx);
-	if (pid < 0)
+	pid_ind = ckpt_alloc_pid(ctx, pids_of_index(ctx, task->pid_ind)->depth);
+	if (pid_ind < 0)
 		return -1;
 
 	holder->flags = TASK_DEAD;
 
-	holder->pid = pid;
-	holder->ppid = ckpt_init_task(ctx)->pid;
-	holder->tgid = pid;
-	holder->sid = task->sid;
+	holder->pid_ind = pid_ind;
+	holder->ppid_ind = ckpt_init_task(ctx)->pid_ind;
+	holder->tgid_ind = pid_ind;
+	holder->sid_ind = task->sid_ind;
 
 	holder->children = NULL;
 	holder->next_sib = NULL;
@@ -1597,8 +1881,8 @@ static int ckpt_placeholder_task(struct ckpt_ctx *ctx, struct task *task)
 	holder->creator = NULL;
 	holder->phantom = NULL;
 
-	holder->rpid = -1;
 	holder->ctx = ctx;
+	holder->real_pid = -1;
 
 	holder->creator = session;
 	if (session->children) {
@@ -1625,28 +1909,57 @@ static int ckpt_placeholder_task(struct ckpt_ctx *ctx, struct task *task)
 
 static int ckpt_propagate_session(struct ckpt_ctx *ctx, struct task *task)
 {
-	struct task *session = hash_lookup(ctx, task->sid);
+	struct task *session;
 	struct task *creator;
-	pid_t sid = task->sid;
+
+	session = hash_lookup_ind(ctx, task->sid_ind);
+
+	/*
+	 * propagate the TASK_SESSION up the ancestry until we reach
+	 * our session owner, so that all of them pass the sid we want
+	 * down before (possibly) changing their own.
+	 */
 
 	do {
-		ckpt_dbg("pid %d: set session\n", task->pid);
+		ckpt_dbg("pid %d: set session\n",
+			 pid_at_index(ctx, task->pid_ind));
 		task->flags |= TASK_SESSION;
 
 		creator = task->creator;
-		if (creator->pid == 1) {
+		/*
+		 * If we reached the root task, and the root task is
+		 * not our real parent, then we add a placeholder here
+		 * as a child of the root task and our creator. The
+		 * placeholder will inherit the session, and pass it
+		 * to us (and then terminate).
+		 */
+		if (creator == ckpt_init_task(ctx) &&
+		    creator != hash_lookup_ind(ctx, task->ppid_ind)) {
 			if (ckpt_placeholder_task(ctx, task) < 0)
 				return -1;
 		}
 
-		ckpt_dbg("pid %d: moving up to %d\n", task->pid, creator->pid);
+		ckpt_dbg("pid %d: moving up to %d\n",
+			 pid_at_index(ctx, task->pid_ind),
+			 pid_at_index(ctx, creator->pid_ind));
 		task = creator;
 
 		if(!task->creator) {
 			if (ckpt_set_creator(ctx, task) < 0)
 				return -1;
 		}
-	} while (task->sid != sid &&
+
+		/*
+		 * (Note that now @task already points to our creator!)
+		 * We don't propagate anymore if:
+		 *
+		 * (a) our creator has the same session as us
+		 * (b) our creator is the root task of the restart
+		 * (c) our creator already has TASK_SESSION,
+		 * (d) our creator's creator is our session (leader)
+		 */
+
+	} while (hash_lookup_ind(ctx, task->sid_ind) != session &&
 		 task != ckpt_init_task(ctx) &&
 		 !(task->flags & TASK_SESSION) &&
 		 task->creator != session);
@@ -1693,26 +2006,28 @@ static int ckpt_make_tree(struct ckpt_ctx *ctx, struct task *task)
 	int ret;
 
 	ckpt_dbg("pid %d: pid %d sid %d parent %d\n",
-	       task->pid, _gettid(), getsid(0), getppid());
+		 pid_at_index(ctx, task->pid_ind),
+		 _gettid(), getsid(0), getppid());
 
 	/* 1st pass: fork children that inherit our old session-id */
 	for (child = task->children; child; child = child->next_sib) {
 		if (child->flags & TASK_SESSION) {
 			ckpt_dbg("pid %d: fork child %d with session\n",
-			       task->pid, child->pid);
+				 pid_at_index(ctx, task->pid_ind),
+				 pid_at_index(ctx, child->pid_ind));
 			newpid = ckpt_fork_child(ctx, child);
 			if (newpid < 0)
-				return -1;
-			child->rpid = newpid;
+				return ctx_set_errno(ctx);
+			child->real_pid = newpid;
 		}
 	}
 
 	/* change session id, if necessary */
-	if (task->pid == task->sid) {
+	if (task->pid_ind == task->sid_ind) {
 		ret = setsid();
-		if (ret < 0 && task != ckpt_init_task(ctx)) {
+		if (ret < 0) {
 			ckpt_perror("setsid");
-			return -1;
+			return ctx_set_errno(ctx);
 		}
 	}
 
@@ -1720,11 +2035,12 @@ static int ckpt_make_tree(struct ckpt_ctx *ctx, struct task *task)
 	for (child = task->children; child; child = child->next_sib) {
 		if (!(child->flags & TASK_SESSION)) {
 			ckpt_dbg("pid %d: fork child %d without session\n",
-			       task->pid, child->pid);
+				 pid_at_index(ctx, task->pid_ind),
+				 pid_at_index(ctx, child->pid_ind));
 			newpid = ckpt_fork_child(ctx, child);
 			if (newpid < 0)
-				return -1;
-			child->rpid = newpid;
+				return ctx_set_errno(ctx);
+			child->real_pid = newpid;
 		}
 	}
 	
@@ -1740,15 +2056,13 @@ static int ckpt_make_tree(struct ckpt_ctx *ctx, struct task *task)
 	 */
 
 	/* communicate via pipe that all is well */
-	swap.old = task->pid;
+	swap.old = pid_at_index(ctx, task->pid_ind);
 	swap.new = _gettid();
 	ret = write(ctx->pipe_out, &swap, sizeof(swap));
 	if (ret != sizeof(swap)) {
 		ckpt_perror("write swap");
-		return -1;
+		return ctx_ret_errno(ctx, EIO);
 	}
-	close(ctx->pipe_out);
-	ctx->pipe_out = -1;  /* mark unused */
 
 	/*
 	 * At this point restart may have already begun in the kernel.
@@ -1842,18 +2156,20 @@ int ckpt_fork_stub(void *data)
 static pid_t ckpt_fork_child(struct ckpt_ctx *ctx, struct task *child)
 {
 	struct clone_args clone_args;
-	genstack stk;
+	struct ckpt_pids *pids, *ppids;
 	unsigned long flags = SIGCHLD;
+	pid_t *numbers;
 	pid_t pid = 0;
-	pid_t *pids = &pid;
-	int i, depth;
+	genstack stk;
+	int j;
 
-	ckpt_dbg("fork child vpid %d flags %#x\n", child->pid, child->flags);
+	ckpt_dbg("fork child vpid %d flags %#x\n",
+		 pid_at_index(ctx, child->pid_ind), child->flags);
 
 	stk = genstack_alloc(PTHREAD_STACK_MIN);
 	if (!stk) {
 		ckpt_perror("ckpt_fork_child genstack_alloc");
-		return -1;
+		return ctx_set_errno(ctx);
 	}
 
 	if (child->flags & TASK_THREAD)
@@ -1870,13 +2186,14 @@ static pid_t ckpt_fork_child(struct ckpt_ctx *ctx, struct task *child)
 
 	memset(&clone_args, 0, sizeof(clone_args));
 	clone_args.nr_pids = 1;
-	/* select pid if --pids, otherwise it's 0 */
-	if (ctx->args->pids) {
-		depth = child->piddepth + 1;
-		clone_args.nr_pids = depth;
 
-		pids = &ctx->vpids_arr[child->vidx];
+	pids = pids_of_index(ctx, child->pid_ind);
+	ppids = pids_of_index(ctx, child->creator->pid_ind);
+	numbers = pids_zero.numbers;
 
+	/* select pid if --pids, otherwise it's 0 */
+	if (ctx->args->pids) {
+		clone_args.nr_pids = pids->depth;
 #ifndef CLONE_NEWPID
 		if (child->piddepth > child->creator->piddepth) {
 			ckpt_err("nested pidns but CLONE_NEWPID undefined");
@@ -1886,20 +2203,24 @@ static pid_t ckpt_fork_child(struct ckpt_ctx *ctx, struct task *child)
 			ctx_ret_errno(ctx, ENOSYS);
 		}
 #else /* CLONE_NEWPID */
-		if (child->piddepth > child->creator->piddepth) {
+		if (pids->depth > ppids->depth + 1) {
+			ckpt_err("unsupported form of pidns nesting");
+			ctx_ret_errno(ctx, ENOSYS);
+		}
+		if (pids->depth > ppids->depth) {
 			child->flags |= TASK_NEWPID;
 			flags |= CLONE_NEWPID;
 			clone_args.nr_pids--;
+			numbers = pids->numbers;
 		} else if (child->flags & TASK_NEWPID) {
-			/* The TASK_NEWPID could have been set for root task */
-			pids[0] = 0;
+			/*
+			 * This happens for a restart with --pidns in which
+			 * the root task is init in its namespace (the flag
+			 * TASK_NEWPID was set for this root task).
+			 */
+			assert(pids->depth == 0);
 			flags |= CLONE_NEWPID;
 		}
-		if (flags & CLONE_NEWPID && !ctx->args->pidns) {
-			ckpt_err("need --pidns for nested pidns container");
-			errno = -EINVAL;
-			return -1;
-		}
 #endif /* CLONE_NEWPID */
 	}
 
@@ -1915,18 +2236,23 @@ static pid_t ckpt_fork_child(struct ckpt_ctx *ctx, struct task *child)
 	clone_args.child_stack_size = genstack_size(stk);
 
 	ckpt_dbg("task %d forking with flags %lx numpids %d\n",
-		child->pid, flags, clone_args.nr_pids);
-	for (i = 0; i < clone_args.nr_pids; i++)
-		ckpt_dbg("task %d pid[%d]=%d\n", child->pid, i, pids[i]);
+		 pid_at_index(ctx, child->pid_ind), flags, clone_args.nr_pids);
+	ckpt_dbg("task %d pids:", pid_at_index(ctx, child->pid_ind));
+	for (j = 0; j < clone_args.nr_pids; j++)
+		ckpt_dbg_cont("%d\n", numbers[j]);
+	ckpt_dbg("...\n");
 
-	pid = eclone(ckpt_fork_stub, child, flags, &clone_args, pids);
-	if (pid < 0)
+	pid = eclone(ckpt_fork_stub, child, flags, &clone_args, numbers);
+	if (pid < 0) {
 		ckpt_perror("eclone");
+		ctx_set_errno(ctx);
+	}
 
 	if (pid < 0 || !(child->flags & TASK_THREAD))
 		genstack_release(stk);
 
-	ckpt_dbg("forked child vpid %d (asked %d)\n", pid, child->pid);
+	ckpt_dbg("forked child vpid %d (asked %d)\n",
+		 pid, pid_at_index(ctx, child->pid_ind));
 	return pid;
 }
 
@@ -2024,7 +2350,6 @@ static void ckpt_read_write_inspect(struct ckpt_ctx *ctx)
 
 	while (1) {
 		ret = _ckpt_read(STDIN_FILENO, &h, sizeof(h));
-ckpt_dbg("ret %d len %d type %d\n", ret, h.len, h.type);
 		if (ret == 0)
 			break;
 		if (ret < 0)
@@ -2060,7 +2385,6 @@ ckpt_dbg("ret %d len %d type %d\n", ret, h.len, h.type);
 
 			h.len -= ret;
 			ret = ckpt_write(STDOUT_FILENO, ctx->buf, ret);
-ckpt_dbg("write len %d (%d)\n", len, ret);
 			if (ret < 0)
 				ckpt_abort(ctx, "write output");
 		}
@@ -2114,12 +2438,12 @@ static int ckpt_do_feeder(struct ckpt_ctx *ctx)
 	if (ckpt_write_container(ctx) < 0)
 		ckpt_abort(ctx, "write container section");
 
+	if (ckpt_write_pids(ctx) < 0)
+		ckpt_abort(ctx, "write c/r pids");
+
 	if (ckpt_write_tree(ctx) < 0)
 		ckpt_abort(ctx, "write c/r tree");
 
-	if (ckpt_write_vpids(ctx) < 0)
-		ckpt_abort(ctx, "write vpids");
-
 	/* read rest -> write rest */
 	if (ctx->args->inspect)
 		ckpt_read_write_inspect(ctx);
@@ -2145,8 +2469,9 @@ static int ckpt_do_feeder(struct ckpt_ctx *ctx)
  */
 static int ckpt_adjust_pids(struct ckpt_ctx *ctx)
 {
+	struct ckpt_pids *pids, *copy;
 	struct pid_swap swap;
-	int n, m, len, ret;
+	int n, m, off, len, ret;
 	pid_t coord_sid;
 
 	coord_sid = getsid(0);
@@ -2161,23 +2486,22 @@ static int ckpt_adjust_pids(struct ckpt_ctx *ctx)
 	 *    but correct should be: [][][B][][A][]...
 	 */
 
-	len = sizeof(struct ckpt_pids) * ctx->pids_nr;
+	len = ctx->pids_nr * sizeof(*pids) + ctx->vpids_nr * sizeof(__s32);
 
 #ifdef CHECKPOINT_DEBUG
 	ckpt_dbg("====== PIDS ARRAY\n");
 	for (m = 0; m < ctx->pids_nr; m++) {
-		struct ckpt_pids *p;
-		p = &ctx->pids_arr[m];
-		ckpt_dbg("[%d] pid %d ppid %d sid %d pgid %d\n",
-			 m, p->vpid, p->vppid, p->vsid, p->vpgid);
+		pids = pids_of_index(ctx, m + 1);
+		ckpt_dbg("[%d] pid %d depth %d\n",
+			 m, pids->numbers[0], pids->depth);
 	}
 	ckpt_dbg("............\n");
 #endif
 
-	memcpy(ctx->copy_arr, ctx->pids_arr, len);
+	memcpy(ctx->pids_copy, ctx->pids_arr, len);
 
-	/* read in 'pid_swap' data and adjust ctx->pids_arr */
-	for (n = 0; n < ctx->tasks_nr; n++) {
+	/* read in 'pid_swap' data and adjust ctx->pids */
+	for (n = 0; n < ctx->tasks_cnt; n++) {
 		/* get pid info from next task */
 		ret = read(ctx->pipe_in, &swap, sizeof(swap));
 		if (ret < 0)
@@ -2189,33 +2513,30 @@ static int ckpt_adjust_pids(struct ckpt_ctx *ctx)
 
 		ckpt_dbg("c/r swap old %d new %d\n", swap.old, swap.new);
 		for (m = 0; m < ctx->pids_nr; m++) {
-			if (ctx->pids_arr[m].vpid == swap.old)
-				ctx->copy_arr[m].vpid = swap.new;
-			if (ctx->pids_arr[m].vtgid == swap.old)
-				ctx->copy_arr[m].vtgid = swap.new;
-			if (ctx->pids_arr[m].vsid == swap.old)
-				ctx->copy_arr[m].vsid = swap.new;
-			if (ctx->pids_arr[m].vpgid == swap.old)
-				ctx->copy_arr[m].vpgid = swap.new;
+			off = ctx->pids_index[m + 1];
+			pids = pids_at_offset(ctx, off);
+			copy = pids_copy_at_offset(ctx, off);
+			if (pids->numbers[0] == swap.old)
+				copy->numbers[0] = swap.new;
 		}
 	}
 
-	memcpy(ctx->pids_arr, ctx->copy_arr, len);
+	free(ctx->pids_arr);
+	ctx->pids_arr = ctx->pids_copy;
+	ctx->pids_copy = NULL;
 
 #ifdef CHECKPOINT_DEBUG
 	if (!ctx->args->pids) {
 		ckpt_dbg("====== PIDS ARRAY (swaped)\n");
 		for (m = 0; m < ctx->pids_nr; m++) {
-			struct ckpt_pids *p;
-			p = &ctx->pids_arr[m];
-			ckpt_dbg("[%d] pid %d ppid %d sid %d pgid %d\n",
-				 m, p->vpid, p->vppid, p->vsid, p->vpgid);
+			pids = pids_of_index(ctx, m + 1);
+			ckpt_dbg("[%d] pid %d depth %d\n",
+				 m, pids->numbers[0], pids->depth);
 		}
 		ckpt_dbg("............\n");
 	}
 #endif
 
-	close(ctx->pipe_in);  /* called by feeder, no need to mark */
 	return 0;
 }
 
@@ -2426,35 +2747,41 @@ static int ckpt_read_container(struct ckpt_ctx *ctx)
 	return ckpt_read_obj_type(ctx, ptr, 200, CKPT_HDR_LSM_INFO);
 }
 
-static int ckpt_read_tree(struct ckpt_ctx *ctx)
+static int ckpt_read_pids(struct ckpt_ctx *ctx)
 {
-	struct ckpt_hdr_tree *h;
+	struct ckpt_hdr_pids *h;
 	int len, ret;
 
-	h = (struct ckpt_hdr_tree *) ctx->tree;
-	ret = ckpt_read_obj_type(ctx, h, sizeof(*h), CKPT_HDR_TREE);
+	h = (struct ckpt_hdr_pids *) ctx->pids;
+	ret = ckpt_read_obj_type(ctx, h, sizeof(*h), CKPT_HDR_PIDS);
 	if (ret < 0)
 		return ret;
 
-	ckpt_dbg("number of tasks: %d\n", h->nr_tasks);
+	ckpt_dbg("number of pids %d, vpids %d\n", h->nr_pids, h->nr_vpids);
 
-	if (h->nr_tasks <= 0) {
-		ckpt_err("invalid number of tasks %d", h->nr_tasks);
-		errno = EINVAL;
-		return -1;
+	if (h->nr_pids <= 0) {
+		ckpt_err("invalid number of pids %d", h->nr_pids);
+		return ctx_ret_errno(ctx, EINVAL);
+	}
+	if (h->nr_vpids < 0) {
+		ckpt_err("invalid number of vpids %d", h->nr_vpids);
+		return ctx_ret_errno(ctx, EINVAL);
 	}
 
-	/* get a working a copy of header */
-	memcpy(ctx->buf, ctx->tree, BUFSIZE);
-
-	ctx->pids_nr = h->nr_tasks;
+	ctx->pids_nr = h->nr_pids;
+	ctx->vpids_nr = h->nr_vpids;
 
-	len = sizeof(struct ckpt_pids) * ctx->pids_nr;
+        len = ctx->pids_nr * sizeof(struct ckpt_pids) +
+		ctx->vpids_nr * sizeof(__s32);
 
 	ctx->pids_arr = malloc(len);
-	ctx->copy_arr = malloc(len);
-	if (!ctx->pids_arr || !ctx->copy_arr)
-		return -1;
+	ctx->pids_copy = malloc(len);
+	ctx->pids_new = malloc(len);
+	if (!ctx->pids_arr || !ctx->pids_copy || !ctx->pids_new)
+		return ctx_ret_errno(ctx, EINVAL);
+
+	ctx->pids_off = 0;
+	ctx->pids_len = len;
 
 	ret = ckpt_read_obj_ptr(ctx, ctx->pids_arr, len, CKPT_HDR_BUFFER);
 	if (ret < 0)
@@ -2463,95 +2790,36 @@ static int ckpt_read_tree(struct ckpt_ctx *ctx)
 	return ret;
 }
 
-/*
- * transform vpids arrays to the format convenient for eclone:
- * prefix the level 0 pid to every sequence of nested pids.
- * also,  set the vpids pointers in all the tasks.
- */
-static int assign_vpids(struct ckpt_ctx *ctx)
+static int ckpt_read_tree(struct ckpt_ctx *ctx)
 {
-	__s32 *vpids_arr;
-	int depth, hidx, vidx, tidx;
-	struct task *task;
-
-	vpids_arr = malloc(sizeof(__s32) * (ctx->vpids_nr + ctx->pids_nr));
-	if (vpids_arr == NULL) {
-		perror("assign_vpids malloc");
-		return -1;
-	}
-
-	for (tidx = 0, hidx = 0, vidx = 0; tidx < ctx->pids_nr; tidx++) {
-		task = &ctx->tasks[tidx];
-		depth = ctx->pids_arr[tidx].depth;
-
-		task->vidx = vidx;
-		task->piddepth = depth;
+	struct ckpt_hdr_tree *h;
+	int len, ret;
 
-		/* set task's and top level pid */
-		vpids_arr[vidx++] = task->pid;
-		/* copy task's nested pids */
-		memcpy(&vpids_arr[vidx], &ctx->vpids_arr[hidx],
-		       sizeof(__s32) * depth);
+	h = (struct ckpt_hdr_tree *) ctx->tree;
+	ret = ckpt_read_obj_type(ctx, h, sizeof(*h), CKPT_HDR_TREE);
+	if (ret < 0)
+		return ret;
 
-		vidx += depth;
-		hidx += depth;
+	ckpt_dbg("number of tasks: %d\n", h->nr_tasks);
 
-#ifdef CHECKPOINT_DEBUG
-		ckpt_dbg("task[%d].vidx = %d (depth %d, rpid %d)\n",
-			tidx, vidx, depth, ctx->pids_arr[tidx].vpid);
-		while (depth-- > 0)  {
-			ckpt_dbg("task[%d].vpid[%d] = %d\n", tidx,
-				 depth, vpids_arr[hidx - depth - 1]);
-		}
-#endif
+	if (h->nr_tasks <= 0) {
+		ckpt_err("invalid number of tasks %d", h->nr_tasks);
+		return ctx_ret_errno(ctx, EINVAL);
 	}
 
-	/* relpace "raw" vpids_arr with this one */
-	free(ctx->vpids_arr);
-	ctx->vpids_arr = vpids_arr;
-
-	return 0;
-}
-
-static int ckpt_read_vpids(struct ckpt_ctx *ctx)
-{
-	int i, len, ret;
-
-	for (i = 0; i < ctx->pids_nr; i++) {
-		if (ctx->pids_arr[i].depth < 0) {
-			ckpt_err("Invalid depth %d for pid %d",
-				 ctx->pids_arr[i].depth,
-				 ctx->tasks[i].pid);
-			errno = -EINVAL;
-			return -1;
-		}
-
-		ctx->vpids_nr += ctx->pids_arr[i].depth;
-
-		if(ctx->vpids_nr < 0) {
-			ckpt_err("Number of vpids overflowed");
-			errno = -E2BIG;
-			return -1;
-		}
-	}
+	ctx->tasks_nr = h->nr_tasks;
 
-	ckpt_dbg("number of vpids: %d\n", ctx->vpids_nr);
+	len = sizeof(struct ckpt_task_pids) * ctx->tasks_nr;
 
-	if (!ctx->vpids_nr)
-		return 0;
+	ctx->tasks_arr = malloc(len);
+	if (!ctx->tasks_arr)
+		return ctx_ret_errno(ctx, EINVAL);
 
-	len = sizeof(__s32) * ctx->vpids_nr;
-	if (len < 0) {
-		ckpt_err("Length of vpids array overflowed");
-		errno = -EINVAL;
-		return -1;
-	}
+	ret = ckpt_read_obj_ptr(ctx, ctx->tasks_arr, len, CKPT_HDR_BUFFER);
 
-	ctx->vpids_arr = malloc(len);
-	if (!ctx->pids_arr)
-		return -1;
+	if (ret < 0)
+		return ret;
 
-	ret = ckpt_read_obj_ptr(ctx, ctx->vpids_arr, len, CKPT_HDR_BUFFER);
 	return ret;
 }
 
@@ -2609,32 +2877,43 @@ static int ckpt_write_container(struct ckpt_ctx *ctx)
 	return ckpt_write_obj(ctx, (struct ckpt_hdr *) ptr);
 }
 
-static int ckpt_write_tree(struct ckpt_ctx *ctx)
+static int ckpt_write_pids(struct ckpt_ctx *ctx)
 {
-	struct ckpt_hdr_tree *h;
+	struct ckpt_hdr_pids *h;
 	int len;
 
-	h = (struct ckpt_hdr_tree *) ctx->tree;
+	h = (struct ckpt_hdr_pids *) ctx->pids;
 	if (ckpt_write_obj(ctx, (struct ckpt_hdr *) h) < 0)
-		ckpt_abort(ctx, "write tree");
+		ckpt_abort(ctx, "write pids");
 
-	len = sizeof(struct ckpt_pids) * ctx->pids_nr;
+        len = ctx->pids_nr * sizeof(struct ckpt_pids) +
+		ctx->vpids_nr * sizeof(__s32);
 	if (ckpt_write_obj_ptr(ctx, ctx->pids_arr, len, CKPT_HDR_BUFFER) < 0)
 		ckpt_abort(ctx, "write pids");
 
 	return 0;
 }
 
-static int ckpt_write_vpids(struct ckpt_ctx *ctx)
+static int ckpt_write_tree(struct ckpt_ctx *ctx)
 {
+	struct ckpt_hdr_tree *h;
 	int len;
 
-	if (!ctx->vpids_nr)
-		return 0;
-	len = sizeof(__s32) * ctx->vpids_nr;
-	if (ckpt_write_obj_ptr(ctx, ctx->vpids_arr, len, CKPT_HDR_BUFFER) < 0)
-		ckpt_abort(ctx, "write vpids");
-	ckpt_dbg("wrote %d bytes for %d vpids\n", len, ctx->vpids_nr);
+	h = (struct ckpt_hdr_tree *) ctx->tree;
+	if (ckpt_write_obj(ctx, (struct ckpt_hdr *) h) < 0)
+		ckpt_abort(ctx, "write tree");
+
+	len = sizeof(struct ckpt_task_pids) * ctx->tasks_nr;
+	ckpt_dbg("len = %d\n", len);	/* byte size of the tasks array */
+	if (ckpt_write_obj_ptr(ctx, ctx->tasks_arr, len, CKPT_HDR_BUFFER) < 0)
+		ckpt_abort(ctx, "write pids");
+
+	for (len = 0; len < ctx->tasks_nr; len++) {	/* len reused as index */
+		struct ckpt_task_pids *task;
+		task = &ctx->tasks_arr[len];
+		ckpt_dbg("\t[%d] pid %d tgid %d pgid %d sid %d\n", len,
+			 task->vpid, task->vtgid, task->vpgid, task->vsid);
+	}
 
 	return 0;
 }
@@ -2646,31 +2925,65 @@ static int ckpt_write_vpids(struct ckpt_ctx *ctx)
 #define HASH_BITS	11
 #define HASH_BUCKETS	(2 << (HASH_BITS - 1))
 
-static int hash_init(struct ckpt_ctx *ctx)
+static int hash_expand(struct ckpt_ctx *ctx, int depth)
 {
-	struct hashent **hash;
+	struct hashent ***hash;
+	int *hash_last_pid;
 
-	ctx->hash_arr = malloc(sizeof(*hash) * HASH_BUCKETS);
-	if (!ctx->hash_arr) {
-		ckpt_perror("malloc hash table");
-		return -1;
+	hash = ctx->hash_arr;
+	hash = realloc(hash, sizeof(*hash) * depth);
+	if (!hash) {
+		ckpt_perror("allocate hash table");
+		return ctx_set_errno(ctx);
+	} else
+		ctx->hash_arr = hash;
+
+	hash_last_pid = ctx->hash_last_pid;
+	hash_last_pid =	realloc(hash_last_pid, sizeof(*hash_last_pid) * depth);
+	if (!hash_last_pid) {
+		ckpt_perror("allocate hash table");
+		return ctx_set_errno(ctx);
+	} else
+		ctx->hash_last_pid = hash_last_pid;
+
+	while (ctx->hash_depth < depth) {
+		hash[ctx->hash_depth] = malloc(sizeof(**hash) * HASH_BUCKETS);
+		if (!hash[ctx->hash_depth]) {
+			ckpt_perror("allocate hash table");
+			return ctx_set_errno(ctx);
+		}
+		memset(hash[ctx->hash_depth], 0, sizeof(**hash) * HASH_BUCKETS);
+		hash_last_pid[ctx->hash_depth] = CKPT_RESERVED_PIDS;
+		ctx->hash_depth++;
 	}
-	memset(ctx->hash_arr, 0, sizeof(*hash) * HASH_BUCKETS);
+
 	return 0;
 }
 
+static int hash_init(struct ckpt_ctx *ctx)
+{
+	return hash_expand(ctx, 1);
+}
+
 static void hash_exit(struct ckpt_ctx *ctx)
 {
 	struct hashent *hash, *next;
-	int i;
+	int i, j;	/* i: hash level, j: bucket within that level */
 
-	for (i = 0; i < HASH_BUCKETS; i++) {
-		for (hash = ctx->hash_arr[i]; hash; hash = next) {
-			next = hash->next;
-			free(hash);
+	if (!ctx->hash_arr)
+		return;
+
+	for (i = 0; i < ctx->hash_depth; i++) {
+		for (j = 0; j < HASH_BUCKETS; j++) {
+			for (hash = ctx->hash_arr[i][j]; hash; hash = next) {
+				next = hash->next;
+				free(hash);
+			}
 		}
+		free(ctx->hash_arr[i]);	/* bucket array of level i */
 	}
 
+	free(ctx->hash_last_pid);
 	free(ctx->hash_arr);
 }
 
@@ -2685,35 +2998,55 @@ static inline int hash_func(long key)
 	return (hash >> (sizeof(key)*8 - HASH_BITS));
 }
 
-static int hash_insert(struct ckpt_ctx *ctx, long key, void *data)
+static int hash_insert(struct ckpt_ctx *ctx, long key, void *data, int level)
 {
 	struct hashent *hash;
 	int bucket;
 
+	if (level >= ctx->hash_depth)	/* table for this level not there yet */
+		if (hash_expand(ctx, level + 1) < 0)	/* takes a depth (count) */
+			return ctx_set_errno(ctx);
+
 	hash = malloc(sizeof(*hash));
 	if (!hash) {
 		ckpt_perror("malloc hash");
-		return -1;
+		return ctx_set_errno(ctx);
 	}
 	hash->key = key;
 	hash->data = data;
 
 	bucket = hash_func(key);
-	hash->next = ctx->hash_arr[bucket];
-	ctx->hash_arr[bucket] = hash;
+	hash->next = ctx->hash_arr[level][bucket];
+	ctx->hash_arr[level][bucket] = hash;
 
 	return 0;
 }
 
-static void *hash_lookup(struct ckpt_ctx *ctx, long key)
+static void *hash_lookup_level(struct ckpt_ctx *ctx, long key, int level)
 {
 	struct hashent *hash;
 	int bucket;
 
+	if (level >= ctx->hash_depth)	/* valid levels: 0..hash_depth-1 */
+		return NULL;
+
 	bucket = hash_func(key);
-	for (hash = ctx->hash_arr[bucket]; hash; hash = hash->next) {
+	for (hash = ctx->hash_arr[level][bucket]; hash; hash = hash->next) {
 		if (hash->key == key)
 			return hash->data;
 	}
 	return NULL;
 }
+
+static void *hash_lookup(struct ckpt_ctx *ctx, long key)
+{
+	return hash_lookup_level(ctx, key, 0);
+}
+
+static void *hash_lookup_ind(struct ckpt_ctx *ctx, int n)
+{
+	if (n == 0 || n == CKPT_PID_ROOT)
+		return hash_lookup_level(ctx, pid_at_index(ctx, 1), 0);
+	else
+		return hash_lookup_level(ctx, pid_at_index(ctx, n), 0);
+}
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 0/9] user-cr: support for pids as shared objects
       [not found]     ` <1296058748-21418-2-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2011-01-26 16:24       ` Oren Laadan
  0 siblings, 0 replies; 11+ messages in thread
From: Oren Laadan @ 2011-01-26 16:24 UTC (permalink / raw)
  To: Oren Laadan; +Cc: containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA


[Hmm.. here too the cover was lost .. need to investigate]

Hi,   
      
This patchset adds the necessary support in user-cr related to
handling of pids as proper shared objects. You must use this if you use
the corresponding kernel-cr patchset recently posted.

Thanks,

Oren.


On 01/26/2011 11:19 AM, Oren Laadan wrote:
> The initialization belongs to the caller of the cr-library - i.e.
> restart-main and checkpoint-main, and not in the cr-library.
> 
> Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
> ---
>  checkpoint-main.c |    4 ++--
>  checkpoint.c      |    8 --------
>  restart-main.c    |    4 ++--
>  3 files changed, 4 insertions(+), 12 deletions(-)
> 
> diff --git a/checkpoint-main.c b/checkpoint-main.c
> index a2a7d94..b2ec9c8 100644
> --- a/checkpoint-main.c
> +++ b/checkpoint-main.c
> @@ -70,8 +70,8 @@ static void parse_args(struct cr_checkpoint_args *args, int argc, char *argv[])
>  	char *logfile;
>  
>  	/* defaults */
> -	args->outfd = -1;
> -	args->logfd = -1;
> +	args->outfd = fileno(stdout);
> +	args->logfd = CHECKPOINT_FD_NONE;
>  	args->uerrfd = fileno(stderr);
>  	output = NULL;
>  	logfile = NULL;
> diff --git a/checkpoint.c b/checkpoint.c
> index cce3d9d..f9b0b3b 100644
> --- a/checkpoint.c
> +++ b/checkpoint.c
> @@ -37,14 +37,6 @@ int cr_checkpoint(int pid, struct cr_checkpoint_args *args)
>  
>  	global_uerrfd = args->uerrfd;
>  
> -	/* output file descriptor (default: stdout) */
> -	if (args->outfd < 0)
> -		args->outfd = STDOUT_FILENO;
> -
> -	/* output file descriptor (default: none) */
> -	if (args->logfd < 0)
> -		args->logfd = CHECKPOINT_FD_NONE;
> -
>  	if (!args->container)
>  		args->flags |= CHECKPOINT_SUBTREE;
>  
> diff --git a/restart-main.c b/restart-main.c
> index 6eed101..efa6a8f 100644
> --- a/restart-main.c
> +++ b/restart-main.c
> @@ -146,10 +146,10 @@ static void parse_args(struct cr_restart_args *args, int argc, char *argv[])
>  	/* defaults */
>  	memset(args, 0, sizeof(*args));
>  	args->wait = 1;
> -	args->infd = -1;
> -	args->klogfd = -1;
> +	args->infd = fileno(stdin);
>  	args->ulogfd = fileno(stdout);
>  	args->uerrfd = fileno(stderr);
> +	args->klogfd = CHECKPOINT_FD_NONE;
>  	args->warn = CKPT_COND_WARN;
>  	args->fail = CKPT_COND_FAIL;
>  	no_pidns = 0;

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2011-01-26 16:24 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-01-26 16:18 [PATCH 0/9] user-cr: support for pids as shared objects Oren Laadan
     [not found] ` <1296058748-21418-1-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2011-01-26 16:19   ` [PATCH 1/9] Initialize of args->{outfd, logfd, infd} in main c/r programs Oren Laadan
     [not found]     ` <1296058748-21418-2-git-send-email-orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2011-01-26 16:24       ` [PATCH 0/9] user-cr: support for pids as shared objects Oren Laadan
2011-01-26 16:19   ` [PATCH 2/9] Introduce ctx->error to improve error reporting Oren Laadan
2011-01-26 16:19   ` [PATCH 3/9] restart: cleanup setup/cleanup of freezer cgroups Oren Laadan
2011-01-26 16:19   ` [PATCH 4/9] restart: make feeder a proper child instead of a thread Oren Laadan
2011-01-26 16:19   ` [PATCH 5/9] restart: obtain pid_max from /proc/sys/kernel/pid_max Oren Laadan
2011-01-26 16:19   ` [PATCH 6/9] restart: rename 'ctx->tasks_arr' to 'ctx->tasks' Oren Laadan
2011-01-26 16:19   ` [PATCH 7/9] udpate kernel headers: support for pids objects Oren Laadan
2011-01-26 16:19   ` [PATCH 8/9] ckptinfo: s/ckpt_pids/ckpt_task_pids/ after kerenl header update Oren Laadan
2011-01-26 16:19   ` [PATCH 9/9] restart: fix support for nested pid namespaces Oren Laadan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.