All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/6] /proc/pid/checkpointable
@ 2009-03-17 17:43 Sukadev Bhattiprolu
       [not found] ` <20090317174359.GA10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-17 17:43 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers


Started out implementing '/proc/pid/checkpointable' which reports 0 or 1
depending on whether a process is checkpointable atm. But realized that
some patches that were discussed earlier are missing from Dave Hansen's
tree.

This patchset includes those missing patches plus two new patches that
create /proc/pid/checkpointable.

        [PATCH 1/6] Checkpoint multiple processes
        [PATCH 2/6] Restart multiple processes
        [PATCH 3/6] Check 'may_checkpoint()' early
        [PATCH 4/6] Deny external checkpoint unless task is frozen
        [PATCH 5/6] Define and use proc_pid_checkpointable()
        [PATCH 6/6] Explain reason for task being uncheckpointable

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 1/6] Checkpoint multiple processes
       [not found] ` <20090317174359.GA10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-17 17:49   ` Sukadev Bhattiprolu
  2009-03-17 17:49   ` [PATCH 2/6] Restart " Sukadev Bhattiprolu
                     ` (5 subsequent siblings)
  6 siblings, 0 replies; 12+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-17 17:49 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers


From: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
Date: Thu, 12 Mar 2009 14:16:05 -0700
Subject: [PATCH 1/6] Checkpoint multiple processes

Checkpointing of multiple processes works by recording the tasks tree
structure below a given task (usually this task is the container init).

For a given task, do a DFS scan of the tasks tree and collect them
into an array (keeping a reference to each task). Using DFS simplifies
the recreation of tasks either in user space or kernel space. For each
task collected, test if it can be checkpointed, and save its pid, tgid,
and ppid.

The actual work is divided into two passes: a first scan counts the
tasks, then memory is allocated and a second scan fills the array.

The logic is suitable for creation of processes during restart either
in userspace or by the kernel.

Currently we ignore threads and zombies, as well as session ids.

Changelog[v12]:
  - Replace obsolete cr_debug() with pr_debug()

Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
Acked-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 checkpoint/checkpoint.c        |  217 ++++++++++++++++++++++++++++++++++++++--
 checkpoint/sys.c               |   16 +++
 include/linux/checkpoint.h     |    3 +
 include/linux/checkpoint_hdr.h |   13 ++-
 4 files changed, 238 insertions(+), 11 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index e0af8a2..ed4fd3d 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -226,13 +226,6 @@ static int cr_write_task(struct cr_ctx *ctx, struct task_struct *t)
 {
 	int ret;
 
-	/* TODO: verity that the task is frozen (unless self) */
-
-	if (t->state == TASK_DEAD) {
-		pr_warning("c/r: task may not be in state TASK_DEAD\n");
-		return -EAGAIN;
-	}
-
 	ret = cr_write_task_struct(ctx, t);
 	pr_debug("task_struct: ret %d\n", ret);
 	if (ret < 0)
@@ -255,6 +248,200 @@ static int cr_write_task(struct cr_ctx *ctx, struct task_struct *t)
 	return ret;
 }
 
+/* dump all tasks in ctx->tasks_arr[] */
+static int cr_write_all_tasks(struct cr_ctx *ctx)
+{
+	int n, ret = 0;
+
+	for (n = 0; n < ctx->tasks_nr; n++) {
+		pr_debug("dumping task #%d\n", n);
+		ret = cr_write_task(ctx, ctx->tasks_arr[n]);
+		if (ret < 0)
+			break;
+	}
+
+	return ret;
+}
+
+static int cr_may_checkpoint_task(struct task_struct *t, struct cr_ctx *ctx)
+{
+	pr_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns));
+
+	if (t->state == TASK_DEAD) {
+		pr_warning("c/r: task %d is TASK_DEAD\n", task_pid_vnr(t));
+		return -EAGAIN;
+	}
+
+	if (!ptrace_may_access(t, PTRACE_MODE_READ))
+		return -EPERM;
+
+	/* FIXME: verify that the task is frozen (unless self) */
+
+	/* FIXME: change this for nested containers */
+	if (task_nsproxy(t) != ctx->root_nsproxy)
+		return -EPERM;
+
+	return 0;
+}
+
+#define CR_HDR_PIDS_CHUNK	256
+
+static int cr_write_pids(struct cr_ctx *ctx)
+{
+	struct cr_hdr_pids *hh;
+	struct pid_namespace *ns;
+	struct task_struct *task;
+	struct task_struct **tasks_arr;
+	int tasks_nr, n, ret = 0, pos = 0;
+
+	ns = ctx->root_nsproxy->pid_ns;
+	tasks_arr = ctx->tasks_arr;
+	tasks_nr = ctx->tasks_nr;
+
+	hh = cr_hbuf_get(ctx, sizeof(*hh) * CR_HDR_PIDS_CHUNK);
+
+	while (tasks_nr > 0) {
+		rcu_read_lock();
+		for (n = min(tasks_nr, CR_HDR_PIDS_CHUNK); n; n--) {
+			task = tasks_arr[pos];
+
+			/* is this task cool ? */
+			ret = cr_may_checkpoint_task(task, ctx);
+			if (ret < 0) {
+				rcu_read_unlock();
+				goto out;
+			}
+			hh[pos].vpid = task_pid_nr_ns(task, ns);
+			hh[pos].vtgid = task_tgid_nr_ns(task, ns);
+			hh[pos].vppid = task_tgid_nr_ns(task->real_parent, ns);
+			pr_debug("task[%d]: vpid %d vtgid %d parent %d\n", pos,
+				 hh[pos].vpid, hh[pos].vtgid, hh[pos].vppid);
+			pos++;
+		}
+		rcu_read_unlock();
+
+		n = min(tasks_nr, CR_HDR_PIDS_CHUNK);
+		ret = cr_kwrite(ctx, hh, n * sizeof(*hh));
+		if (ret < 0)
+			break;
+
+		tasks_nr -= n;
+	}
+ out:
+	cr_hbuf_put(ctx, sizeof(*hh));
+	return ret;
+}
+
+/* count number of tasks in tree (and optionally fill pid's in array) */
+static int cr_tree_count_tasks(struct cr_ctx *ctx)
+{
+	struct task_struct *root = ctx->root_task;
+	struct task_struct *task = root;
+	struct task_struct *parent = NULL;
+	struct task_struct **tasks_arr = ctx->tasks_arr;
+	int tasks_nr = ctx->tasks_nr;
+	int nr = 0;
+
+	read_lock(&tasklist_lock);
+
+	/* count tasks via DFS scan of the tree */
+	while (1) {
+		if (tasks_arr) {
+			/* unlikely, but ... */
+			if (nr == tasks_nr)
+				return -EBUSY;	/* cleanup in cr_ctx_free() */
+			tasks_arr[nr] = task;
+			get_task_struct(task);
+		}
+
+		nr++;
+
+		/* if has children - proceed with child */
+		if (!list_empty(&task->children)) {
+			parent = task;
+			task = list_entry(task->children.next,
+					  struct task_struct, sibling);
+			continue;
+		}
+
+		while (task != root) {
+			/* if has sibling - proceed with sibling */
+			if (!list_is_last(&task->sibling, &parent->children)) {
+				task = list_entry(task->sibling.next,
+						  struct task_struct, sibling);
+				break;
+			}
+
+			/* else, trace back to parent and proceed */
+			task = parent;
+			parent = parent->real_parent;
+		}
+
+		if (task == root)
+			break;
+	}
+
+	read_unlock(&tasklist_lock);
+	return nr;
+}
+
+/*
+ * cr_build_tree - scan the tasks tree in DFS order and fill in array
+ * @ctx: checkpoint context
+ *
+ * Using DFS order simplifies the restart logic to re-create the tasks.
+ *
+ * On success, ctx->tasks_arr will be allocated and populated with all
+ * tasks (reference taken), and ctx->tasks_nr will hold the total count.
+ * The array is cleaned up by cr_ctx_free().
+ */
+static int cr_build_tree(struct cr_ctx *ctx)
+{
+	int n, m;
+
+	/* count tasks (no side effects) */
+	n = cr_tree_count_tasks(ctx);
+
+	ctx->tasks_nr = n;
+	ctx->tasks_arr = kzalloc(n * sizeof(*ctx->tasks_arr), GFP_KERNEL);
+	if (!ctx->tasks_arr)
+		return -ENOMEM;
+
+	/* count again (now will fill array) */
+	m = cr_tree_count_tasks(ctx);
+
+	/* unlikely, but ... (cleanup in cr_ctx_free) */
+	if (m < 0)
+		return m;
+	else if (m != n)
+		return -EBUSY;
+
+	return 0;
+}
+
+/* dump the array that describes the tasks tree */
+static int cr_write_tree(struct cr_ctx *ctx)
+{
+	struct cr_hdr h;
+	struct cr_hdr_tree *hh;
+	int ret;
+
+	h.type = CR_HDR_TREE;
+	h.len = sizeof(*hh);
+	h.parent = 0;
+
+	hh = cr_hbuf_get(ctx, sizeof(*hh));
+	hh->tasks_nr = ctx->tasks_nr;
+
+	ret = cr_write_obj(ctx, &h, hh);
+	cr_hbuf_put(ctx, sizeof(*hh));
+	if (ret < 0)
+		return ret;
+
+	ret = cr_write_pids(ctx);
+	return ret;
+}
+
 static int cr_get_container(struct cr_ctx *ctx, pid_t pid)
 {
 	struct task_struct *task = NULL;
@@ -272,7 +459,7 @@ static int cr_get_container(struct cr_ctx *ctx, pid_t pid)
 	if (!task)
 		goto out;
 
-#if 0	/* enable to use containers */
+#if 0	/* enable with containers */
 	if (!is_container_init(task)) {
 		err = -EINVAL;
 		goto out;
@@ -294,7 +481,7 @@ static int cr_get_container(struct cr_ctx *ctx, pid_t pid)
 	if (!nsproxy)
 		goto out;
 
-	/* TODO: verify that the container is frozen */
+	/* FIXME: verify that the container is frozen */
 
 	ctx->root_task = task;
 	ctx->root_nsproxy = nsproxy;
@@ -342,12 +529,22 @@ int do_checkpoint(struct cr_ctx *ctx, pid_t pid)
 	ret = cr_ctx_checkpoint(ctx, pid);
 	if (ret < 0)
 		goto out;
+
+	ret = cr_build_tree(ctx);
+	if (ret < 0)
+		goto out;
+
 	ret = cr_write_head(ctx);
 	if (ret < 0)
 		goto out;
-	ret = cr_write_task(ctx, ctx->root_task);
+	ret = cr_write_tree(ctx);
 	if (ret < 0)
 		goto out;
+
+	ret = cr_write_all_tasks(ctx);
+	if (ret < 0)
+		goto out;
+
 	ret = cr_write_tail(ctx);
 	if (ret < 0)
 		goto out;
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 4a51ed3..0436ef3 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -152,6 +152,19 @@ void cr_hbuf_put(struct cr_ctx *ctx, int n)
  * restart operation, and persists until the operation is completed.
  */
 
+static void cr_task_arr_free(struct cr_ctx *ctx)
+{
+	int n;
+
+	for (n = 0; n < ctx->tasks_nr; n++) {
+		if (ctx->tasks_arr[n]) {
+			put_task_struct(ctx->tasks_arr[n]);
+			ctx->tasks_arr[n] = NULL;
+		}
+	}
+	kfree(ctx->tasks_arr);
+}
+
 static void cr_ctx_free(struct cr_ctx *ctx)
 {
 	if (ctx->file)
@@ -164,6 +177,9 @@ static void cr_ctx_free(struct cr_ctx *ctx)
 	cr_pgarr_free(ctx);
 	cr_objhash_free(ctx);
 
+	if (ctx->tasks_arr)
+		cr_task_arr_free(ctx);
+
 	if (ctx->root_nsproxy)
 		put_nsproxy(ctx->root_nsproxy);
 	if (ctx->root_task)
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index e77393f..6307511 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -34,6 +34,9 @@ struct cr_ctx {
 	void *hbuf;		/* temporary buffer for headers */
 	int hpos;		/* position in headers buffer */
 
+	struct task_struct **tasks_arr;	/* array of all tasks in container */
+	int tasks_nr;			/* size of tasks array */
+
 	struct cr_objhash *objhash;	/* hash for shared objects */
 
 	struct list_head pgarr_list;	/* page array to dump VMA contents */
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 66313bd..0acf7c0 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -45,7 +45,8 @@ enum {
 	CR_HDR_STRING,
 	CR_HDR_FNAME,
 
-	CR_HDR_TASK = 101,
+	CR_HDR_TREE = 101,
+	CR_HDR_TASK,
 	CR_HDR_THREAD,
 	CR_HDR_CPU,
 
@@ -81,6 +82,16 @@ struct cr_hdr_tail {
 	__u64 magic;
 } __attribute__((aligned(8)));
 
+struct cr_hdr_tree {
+	__u32 tasks_nr;
+} __attribute__((aligned(8)));
+
+struct cr_hdr_pids {
+	__s32 vpid;
+	__s32 vtgid;
+	__s32 vppid;
+} __attribute__((aligned(8)));
+
 struct cr_hdr_task {
 	__u32 state;
 	__u32 exit_state;
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 2/6] Restart multiple processes
       [not found] ` <20090317174359.GA10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-03-17 17:49   ` [PATCH 1/6] Checkpoint multiple processes Sukadev Bhattiprolu
@ 2009-03-17 17:49   ` Sukadev Bhattiprolu
  2009-03-17 17:50   ` [PATCH 3/6] Check 'may_checkpoint()' early Sukadev Bhattiprolu
                     ` (4 subsequent siblings)
  6 siblings, 0 replies; 12+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-17 17:49 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers


From: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
Date: Thu, 12 Mar 2009 14:19:41 -0700
Subject: [PATCH 2/6] Restart multiple processes

Restarting of multiple processes expects all restarting tasks to call
sys_restart(). Once inside the system call, each task will restart
itself at the same order that they were saved. The internals of the
syscall will take care of in-kernel synchronization bewteen tasks.

This patch does _not_ create the task tree in the kernel. Instead it
assumes that all tasks are created in some way and then invoke the
restart syscall. You can use the userspace mktree.c program to do
that.

The init task (*) has a special role: it allocates the restart context
(ctx), and coordinates the operation. In particular, it first waits
until all participating tasks enter the kernel, and provides them the
common restart context. Once everyone in ready, it begins to restart
itself.

In contrast, the other tasks enter the kernel, locate the init task (*)
and grab its restart context, and then wait for their turn to restore.

When a task (init or not) completes its restart, it hands the control
over to the next in line, by waking that task.

An array of pids (the one saved during the checkpoint) is used to
synchronize the operation. The first task in the array is the init
task (*). The restart context (ctx) maintain a "current position" in
the array, which indicates which task is currently active. Once the
currently active task completes its own restart, it increments that
position and wakes up the next task.

Restart assumes that userspace provides meaningful data, otherwise
it's garbage-in-garbage-out. In this case, the syscall may block
indefinitely, but in TASK_INTERRUPTIBLE, so the user can ctrl-c or
otherwise kill the stray restarting tasks.

In terms of security, restart runs as the user the invokes it, so it
will not allow a user to do more than is otherwise permitted by the
usual system semantics and policy.

Currently we ignore threads and zombies, as well as session ids.
Add support for multiple processes

(*) For containers, restart should be called inside a fresh container
by the init task of that container. However, it is also possible to
restart applications not necessarily inside a container, and without
restoring the original pids of the processes (that is, provided that
the application can tolerate such behavior). This is useful to allow
multi-process restart of tasks not isolated inside a container, and
also for debugging.

Changelog[v12]:
  - Replace obsolete cr_debug() with pr_debug()

Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 checkpoint/restart.c       |  214 +++++++++++++++++++++++++++++++++++++++++++-
 checkpoint/sys.c           |   34 ++++++--
 include/linux/checkpoint.h |   25 +++++-
 include/linux/sched.h      |    3 +
 4 files changed, 262 insertions(+), 14 deletions(-)

diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index 0c46abf..6b4cd75 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -10,6 +10,7 @@
 
 #include <linux/version.h>
 #include <linux/sched.h>
+#include <linux/wait.h>
 #include <linux/file.h>
 #include <linux/magic.h>
 #include <linux/checkpoint.h>
@@ -276,30 +277,235 @@ static int cr_read_task(struct cr_ctx *ctx)
 	return ret;
 }
 
+/* cr_read_tree - read the tasks tree into the checkpoint context */
+static int cr_read_tree(struct cr_ctx *ctx)
+{
+	struct cr_hdr_tree *hh = cr_hbuf_get(ctx, sizeof(*hh));
+	int parent, size, ret = -EINVAL;
+
+	parent = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_TREE);
+	if (parent < 0) {
+		ret = parent;
+		goto out;
+	} else if (parent != 0)
+		goto out;
+
+	if (hh->tasks_nr < 0)
+		goto out;
+
+	ctx->pids_nr = hh->tasks_nr;
+	size = sizeof(*ctx->pids_arr) * ctx->pids_nr;
+	if (size < 0)		/* overflow ? */
+		goto out;
+
+	ctx->pids_arr = kmalloc(size, GFP_KERNEL);
+	if (!ctx->pids_arr) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	ret = cr_kread(ctx, ctx->pids_arr, size);
+ out:
+	cr_hbuf_put(ctx, sizeof(*hh));
+	return ret;
+}
+
+static int cr_wait_task(struct cr_ctx *ctx)
+{
+	pid_t pid = task_pid_vnr(current);
+
+	pr_debug("pid %d waiting\n", pid);
+	return wait_event_interruptible(ctx->waitq, ctx->pids_active == pid);
+}
+
+static int cr_next_task(struct cr_ctx *ctx)
+{
+	struct task_struct *tsk;
+
+	ctx->pids_pos++;
+
+	pr_debug("pids_pos %d %d\n", ctx->pids_pos, ctx->pids_nr);
+	if (ctx->pids_pos == ctx->pids_nr) {
+		complete(&ctx->complete);
+		return 0;
+	}
+
+	ctx->pids_active = ctx->pids_arr[ctx->pids_pos].vpid;
+
+	pr_debug("pids_next %d\n", ctx->pids_active);
+
+	rcu_read_lock();
+	tsk = find_task_by_pid_ns(ctx->pids_active, ctx->root_nsproxy->pid_ns);
+	if (tsk)
+		wake_up_process(tsk);
+	rcu_read_unlock();
+
+	if (!tsk) {
+		ctx->pids_err = -ESRCH;
+		complete(&ctx->complete);
+		return -ESRCH;
+	}
+
+	return 0;
+}
+
+/* FIXME: this should be per container */
+DECLARE_WAIT_QUEUE_HEAD(cr_restart_waitq);
+
+static int do_restart_task(struct cr_ctx *ctx, pid_t pid)
+{
+	struct task_struct *root_task;
+	int ret;
+
+	rcu_read_lock();
+	root_task = find_task_by_pid_ns(pid, current->nsproxy->pid_ns);
+	if (root_task)
+		get_task_struct(root_task);
+	rcu_read_unlock();
+
+	if (!root_task)
+		return -EINVAL;
+
+	/*
+	 * wait for container init to initialize the restart context, then
+	 * grab a reference to that context, and if we're the last task to
+	 * do it, notify the container init.
+	 */
+	ret = wait_event_interruptible(cr_restart_waitq,
+				       root_task->checkpoint_ctx);
+	if (ret < 0)
+		goto out;
+
+	task_lock(root_task);
+	ctx = root_task->checkpoint_ctx;
+	if (ctx)
+		cr_ctx_get(ctx);
+	task_unlock(root_task);
+
+	if (!ctx) {
+		ret = -EAGAIN;
+		goto out;
+	}
+
+	if (atomic_dec_and_test(&ctx->tasks_count))
+		complete(&ctx->complete);
+
+	/* wait for our turn, do the restore, and tell next task in line */
+	ret = cr_wait_task(ctx);
+	if (ret < 0)
+		goto out;
+	ret = cr_read_task(ctx);
+	if (ret < 0)
+		goto out;
+	ret = cr_next_task(ctx);
+
+ out:
+	cr_ctx_put(ctx);
+	put_task_struct(root_task);
+	return ret;
+}
+
+static int cr_wait_all_tasks_start(struct cr_ctx *ctx)
+{
+	int ret;
+
+	if (ctx->pids_nr == 1)
+		return 0;
+
+	init_completion(&ctx->complete);
+	current->checkpoint_ctx = ctx;
+
+	wake_up_all(&cr_restart_waitq);
+
+	ret = wait_for_completion_interruptible(&ctx->complete);
+	if (ret < 0)
+		return ret;
+
+	task_lock(current);
+	current->checkpoint_ctx = NULL;
+	task_unlock(current);
+
+	return 0;
+}
+
+static int cr_wait_all_tasks_finish(struct cr_ctx *ctx)
+{
+	int ret;
+
+	if (ctx->pids_nr == 1)
+		return 0;
+
+	init_completion(&ctx->complete);
+
+	ret = cr_next_task(ctx);
+	if (ret < 0)
+		return ret;
+
+	ret = wait_for_completion_interruptible(&ctx->complete);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
 /* setup restart-specific parts of ctx */
 static int cr_ctx_restart(struct cr_ctx *ctx, pid_t pid)
 {
+	ctx->root_pid = pid;
+	ctx->root_task = current;
+	ctx->root_nsproxy = current->nsproxy;
+
+	get_task_struct(ctx->root_task);
+	get_nsproxy(ctx->root_nsproxy);
+
+	atomic_set(&ctx->tasks_count, ctx->pids_nr - 1);
+
 	return 0;
 }
 
-int do_restart(struct cr_ctx *ctx, pid_t pid)
+static int do_restart_root(struct cr_ctx *ctx, pid_t pid)
 {
 	int ret;
 
+	ret = cr_read_head(ctx);
+	if (ret < 0)
+		goto out;
+	ret = cr_read_tree(ctx);
+	if (ret < 0)
+		goto out;
+
 	ret = cr_ctx_restart(ctx, pid);
 	if (ret < 0)
 		goto out;
-	ret = cr_read_head(ctx);
+
+	/* wait for all other tasks to enter do_restart_task() */
+	ret = cr_wait_all_tasks_start(ctx);
 	if (ret < 0)
 		goto out;
+
 	ret = cr_read_task(ctx);
 	if (ret < 0)
 		goto out;
-	ret = cr_read_tail(ctx);
+
+	/* wait for all other tasks to complete do_restart_task() */
+	ret = cr_wait_all_tasks_finish(ctx);
 	if (ret < 0)
 		goto out;
 
-	/* on success, adjust the return value if needed [TODO] */
+	ret = cr_read_tail(ctx);
+
  out:
 	return ret;
 }
+
+int do_restart(struct cr_ctx *ctx, pid_t pid)
+{
+	int ret;
+
+	if (ctx)
+		ret = do_restart_root(ctx, pid);
+	else
+		ret = do_restart_task(ctx, pid);
+
+	/* on success, adjust the return value if needed [TODO] */
+	return ret;
+}
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 0436ef3..f26b0c6 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -167,6 +167,8 @@ static void cr_task_arr_free(struct cr_ctx *ctx)
 
 static void cr_ctx_free(struct cr_ctx *ctx)
 {
+	BUG_ON(atomic_read(&ctx->refcount));
+
 	if (ctx->file)
 		fput(ctx->file);
 
@@ -185,6 +187,8 @@ static void cr_ctx_free(struct cr_ctx *ctx)
 	if (ctx->root_task)
 		put_task_struct(ctx->root_task);
 
+	kfree(ctx->pids_arr);
+
 	kfree(ctx);
 }
 
@@ -199,8 +203,10 @@ static struct cr_ctx *cr_ctx_alloc(int fd, unsigned long flags)
 
 	ctx->flags = flags;
 
+	atomic_set(&ctx->refcount, 0);
 	INIT_LIST_HEAD(&ctx->pgarr_list);
 	INIT_LIST_HEAD(&ctx->pgarr_pool);
+	init_waitqueue_head(&ctx->waitq);
 
 	err = -EBADF;
 	ctx->file = fget(fd);
@@ -215,6 +221,7 @@ static struct cr_ctx *cr_ctx_alloc(int fd, unsigned long flags)
 	if (cr_objhash_alloc(ctx) < 0)
 		goto err;
 
+	atomic_inc(&ctx->refcount);
 	return ctx;
 
  err:
@@ -222,6 +229,17 @@ static struct cr_ctx *cr_ctx_alloc(int fd, unsigned long flags)
 	return ERR_PTR(err);
 }
 
+void cr_ctx_get(struct cr_ctx *ctx)
+{
+	atomic_inc(&ctx->refcount);
+}
+
+void cr_ctx_put(struct cr_ctx *ctx)
+{
+	if (ctx && atomic_dec_and_test(&ctx->refcount))
+		cr_ctx_free(ctx);
+}
+
 /**
  * sys_checkpoint - checkpoint a container
  * @pid: pid of the container init(1) process
@@ -249,7 +267,7 @@ asmlinkage long sys_checkpoint(pid_t pid, int fd, unsigned long flags)
 	if (!ret)
 		ret = ctx->crid;
 
-	cr_ctx_free(ctx);
+	cr_ctx_put(ctx);
 	return ret;
 }
 
@@ -264,7 +282,7 @@ asmlinkage long sys_checkpoint(pid_t pid, int fd, unsigned long flags)
  */
 asmlinkage long sys_restart(int crid, int fd, unsigned long flags)
 {
-	struct cr_ctx *ctx;
+	struct cr_ctx *ctx = NULL;
 	pid_t pid;
 	int ret;
 
@@ -272,15 +290,17 @@ asmlinkage long sys_restart(int crid, int fd, unsigned long flags)
 	if (flags)
 		return -EINVAL;
 
-	ctx = cr_ctx_alloc(fd, flags | CR_CTX_RSTR);
-	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
-
 	/* FIXME: for now, we use 'crid' as a pid */
 	pid = (pid_t) crid;
 
+	if (pid == task_pid_vnr(current))
+		ctx = cr_ctx_alloc(fd, flags | CR_CTX_RSTR);
+
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
 	ret = do_restart(ctx, pid);
 
-	cr_ctx_free(ctx);
+	cr_ctx_put(ctx);
 	return ret;
 }
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 6307511..484be56 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -13,10 +13,13 @@
 #include <linux/path.h>
 #include <linux/fs.h>
 #include <linux/fdtable.h>
+#include <linux/path.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
 
 #ifdef CONFIG_CHECKPOINT_RESTART
 
-#define CR_VERSION  2
+#define CR_VERSION  3
 
 struct cr_ctx {
 	int crid;		/* unique checkpoint id */
@@ -34,8 +37,7 @@ struct cr_ctx {
 	void *hbuf;		/* temporary buffer for headers */
 	int hpos;		/* position in headers buffer */
 
-	struct task_struct **tasks_arr;	/* array of all tasks in container */
-	int tasks_nr;			/* size of tasks array */
+	atomic_t refcount;
 
 	struct cr_objhash *objhash;	/* hash for shared objects */
 
@@ -43,6 +45,20 @@ struct cr_ctx {
 	struct list_head pgarr_pool;	/* pool of empty page arrays chain */
 
 	struct path fs_mnt;	/* container root (FIXME) */
+
+	/* [multi-process checkpoint] */
+	struct task_struct **tasks_arr; /* array of all tasks [checkpoint] */
+	int tasks_nr;                   /* size of tasks array */
+
+	/* [multi-process restart] */
+	struct cr_hdr_pids *pids_arr;	/* array of all pids [restart] */
+	int pids_nr;			/* size of pids array */
+	int pids_pos;			/* position pids array */
+	int pids_err;			/* error occured ? */
+	pid_t pids_active;		/* pid of (next) active task */
+	atomic_t tasks_count;		/* sync of restarting tasks */
+	struct completion complete;	/* sync of restarting tasks */
+	wait_queue_head_t waitq;	/* sync of restarting tasks */
 };
 
 /* cr_ctx: flags */
@@ -55,6 +71,9 @@ extern int cr_kread(struct cr_ctx *ctx, void *buf, int count);
 extern void *cr_hbuf_get(struct cr_ctx *ctx, int n);
 extern void cr_hbuf_put(struct cr_ctx *ctx, int n);
 
+extern void cr_ctx_get(struct cr_ctx *ctx);
+extern void cr_ctx_put(struct cr_ctx *ctx);
+
 /* shared objects handling */
 
 enum {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8c216e0..c5d5987 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1416,6 +1416,9 @@ struct task_struct {
 #ifdef CONFIG_TRACING
 	/* state flags for use by tracers */
 	unsigned long trace;
+#endif 
+#ifdef CONFIG_CHECKPOINT_RESTART
+	struct cr_ctx *checkpoint_ctx;
 #endif
 };
 
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 3/6] Check 'may_checkpoint()' early
       [not found] ` <20090317174359.GA10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-03-17 17:49   ` [PATCH 1/6] Checkpoint multiple processes Sukadev Bhattiprolu
  2009-03-17 17:49   ` [PATCH 2/6] Restart " Sukadev Bhattiprolu
@ 2009-03-17 17:50   ` Sukadev Bhattiprolu
       [not found]     ` <20090317175013.GD10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-03-17 17:50   ` [PATCH 4/6] Deny external checkpoint unless task is frozen Sukadev Bhattiprolu
                     ` (3 subsequent siblings)
  6 siblings, 1 reply; 12+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-17 17:50 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers


From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Date: Thu, 12 Mar 2009 14:19:54 -0700
Subject: [PATCH 3/6] Check 'may_checkpoint()' early

We currently check if a task is checkpointable when writing the task
information to checkpoint file. The small downside of doing this check
late is that we may have processed several processes in the tree before
hitting one that cannot be checkpointed.

We anyway walk the process tree we are checkpointing earlier, while
counting the tasks. We could check if all processes in the tree are
checkpointable at that time and fail early if any of them are not.
Since the process tree should be frozen, checking earlier should not
matter ?

For now, the patch leaves the existing check in cr_write_pids(). We
can remove that later.

Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
 checkpoint/checkpoint.c |    9 +++++++++
 1 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index ed4fd3d..dae9b97 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -341,11 +341,18 @@ static int cr_tree_count_tasks(struct cr_ctx *ctx)
 	struct task_struct **tasks_arr = ctx->tasks_arr;
 	int tasks_nr = ctx->tasks_nr;
 	int nr = 0;
+	int ret;
 
 	read_lock(&tasklist_lock);
 
 	/* count tasks via DFS scan of the tree */
 	while (1) {
+		ret = cr_may_checkpoint_task(task, ctx);
+		if (ret < 0) {
+			nr = ret;
+			break;
+		}
+
 		if (tasks_arr) {
 			/* unlikely, but ... */
 			if (nr == tasks_nr)
@@ -401,6 +408,8 @@ static int cr_build_tree(struct cr_ctx *ctx)
 
 	/* count tasks (no side effects) */
 	n = cr_tree_count_tasks(ctx);
+	if (n < 0)
+		return n;
 
 	ctx->tasks_nr = n;
 	ctx->tasks_arr = kzalloc(n * sizeof(*ctx->tasks_arr), GFP_KERNEL);
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 4/6] Deny external checkpoint unless task is frozen
       [not found] ` <20090317174359.GA10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
                     ` (2 preceding siblings ...)
  2009-03-17 17:50   ` [PATCH 3/6] Check 'may_checkpoint()' early Sukadev Bhattiprolu
@ 2009-03-17 17:50   ` Sukadev Bhattiprolu
  2009-03-17 17:51   ` [PATCH 5/6] Define and use proc_pid_checkpointable() Sukadev Bhattiprolu
                     ` (2 subsequent siblings)
  6 siblings, 0 replies; 12+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-17 17:50 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers


From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Date: Thu, 12 Mar 2009 14:20:02 -0700
Subject: [PATCH 4/6] Deny external checkpoint unless task is frozen

Remove a 'FIXME' and ensure that the tasks we are checkpointing are
frozen unless its a self-checkpoint.

Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
 checkpoint/checkpoint.c |    5 ++++-
 1 files changed, 4 insertions(+), 1 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index dae9b97..c16f30c 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -19,6 +19,7 @@
 #include <linux/mount.h>
 #include <linux/utsname.h>
 #include <linux/magic.h>
+#include <linux/freezer.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
 
@@ -275,7 +276,9 @@ static int cr_may_checkpoint_task(struct task_struct *t, struct cr_ctx *ctx)
 	if (!ptrace_may_access(t, PTRACE_MODE_READ))
 		return -EPERM;
 
-	/* FIXME: verify that the task is frozen (unless self) */
+	/* verify that the task is frozen (unless self) */
+	if (t != current && !frozen(t))
+		return -EBUSY;
 
 	/* FIXME: change this for nested containers */
 	if (task_nsproxy(t) != ctx->root_nsproxy)
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 5/6] Define and use proc_pid_checkpointable()
       [not found] ` <20090317174359.GA10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
                     ` (3 preceding siblings ...)
  2009-03-17 17:50   ` [PATCH 4/6] Deny external checkpoint unless task is frozen Sukadev Bhattiprolu
@ 2009-03-17 17:51   ` Sukadev Bhattiprolu
  2009-03-17 17:51   ` [PATCH 6/6] Explain reason for task being uncheckpointable Sukadev Bhattiprolu
  2009-03-18  9:28   ` [PATCH 0/6] /proc/pid/checkpointable Oren Laadan
  6 siblings, 0 replies; 12+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-17 17:51 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers


From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Date: Fri, 13 Mar 2009 17:25:42 -0700
Subject: [PATCH 5/6] Define and use proc_pid_checkpointable()

Create a proc file, /proc/pid/checkpointable, which shows '1' if
task is checkpointable and '0' if it is not.

To determine whether a task is checkpointable, the handler for this
new proc file, shares the same code with sys_checkpoint().

Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
 checkpoint/checkpoint.c    |   23 ++++++++++++++++-------
 fs/proc/base.c             |   13 +++++++++++++
 include/linux/checkpoint.h |    7 +++++++
 3 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index c16f30c..ad0956c 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -264,22 +264,31 @@ static int cr_write_all_tasks(struct cr_ctx *ctx)
 	return ret;
 }
 
+int task_checkpointable(struct task_struct *t)
+{
+	if (t->state == TASK_DEAD) {
+		pr_warning("c/r: task %d is TASK_DEAD\n", task_pid_vnr(t));
+		return 0;
+	}
+
+	/* Verify that task is frozen, unless it is self-checkpoint */
+	if (t != current && !frozen(t))
+		return 0;
+
+	return 1;
+}
+
 static int cr_may_checkpoint_task(struct task_struct *t, struct cr_ctx *ctx)
 {
 	pr_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns));
 
-	if (t->state == TASK_DEAD) {
-		pr_warning("c/r: task %d is TASK_DEAD\n", task_pid_vnr(t));
+	/* verify that the task is checkpointable */
+	if (!task_checkpointable(t))
 		return -EAGAIN;
-	}
 
 	if (!ptrace_may_access(t, PTRACE_MODE_READ))
 		return -EPERM;
 
-	/* verify that the task is frozen (unless self) */
-	if (t != current && !frozen(t))
-		return -EBUSY;
-
 	/* FIXME: change this for nested containers */
 	if (task_nsproxy(t) != ctx->root_nsproxy)
 		return -EPERM;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index bde92c3..989ca93 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2501,6 +2501,13 @@ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
 	return 0;
 }
 
+#ifdef CONFIG_CHECKPOINT_RESTART
+static int proc_pid_checkpointable(struct task_struct *task, char *buffer)
+{
+	return sprintf(buffer, "%d\n", task_checkpointable(task));
+}
+#endif
+
 /*
  * Thread groups
  */
@@ -2580,6 +2587,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_TASK_IO_ACCOUNTING
 	INF("io",	S_IRUGO, proc_tgid_io_accounting),
 #endif
+#ifdef CONFIG_CHECKPOINT_RESTART
+	INF("checkpointable", S_IRUGO, proc_pid_checkpointable),
+#endif
 };
 
 static int proc_tgid_base_readdir(struct file * filp,
@@ -2915,6 +2925,9 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_TASK_IO_ACCOUNTING
 	INF("io",	S_IRUGO, proc_tid_io_accounting),
 #endif
+#ifdef CONFIG_CHECKPOINT_RESTART
+	INF("checkpointable", S_IRUGO, proc_pid_checkpointable),
+#endif
 };
 
 static int proc_tid_base_readdir(struct file * filp,
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 484be56..d1f53a6 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -142,6 +142,8 @@ static inline int cr_enabled(void)
 	return 1;
 }
 
+int task_checkpointable(struct task_struct *t);
+
 #else /* !CONFIG_CHECKPOINT_RESTART */
 
 static inline void files_deny_checkpointing(struct files_struct *files) {}
@@ -162,5 +164,10 @@ static inline int cr_enabled(void)
 	return 0;
 }
 
+static inline int task_checkpointable(struct task_struct *t)
+{
+	return 0;
+}
+
 #endif /* CONFIG_CHECKPOINT_RESTART */
 #endif /* _CHECKPOINT_CKPT_H_ */
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 6/6] Explain reason for task being uncheckpointable
       [not found] ` <20090317174359.GA10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
                     ` (4 preceding siblings ...)
  2009-03-17 17:51   ` [PATCH 5/6] Define and use proc_pid_checkpointable() Sukadev Bhattiprolu
@ 2009-03-17 17:51   ` Sukadev Bhattiprolu
       [not found]     ` <20090317175149.GG10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-03-18  9:28   ` [PATCH 0/6] /proc/pid/checkpointable Oren Laadan
  6 siblings, 1 reply; 12+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-17 17:51 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers


From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Date: Sat, 14 Mar 2009 10:21:07 -0700
Subject: [PATCH 6/6] Explain reason for task being uncheckpointable

Try to give an useful message on why a task is uncheckpointable.

Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
 checkpoint/checkpoint.c    |   13 +++++++++----
 fs/proc/base.c             |   19 ++++++++++++++++++-
 include/linux/checkpoint.h |    7 +++++++
 3 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index ad0956c..c8c377d 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -264,18 +264,23 @@ static int cr_write_all_tasks(struct cr_ctx *ctx)
 	return ret;
 }
 
-int task_checkpointable(struct task_struct *t)
+int __task_checkpointable(struct task_struct *t)
 {
 	if (t->state == TASK_DEAD) {
 		pr_warning("c/r: task %d is TASK_DEAD\n", task_pid_vnr(t));
-		return 0;
+		return CR_DEAD;
 	}
 
 	/* Verify that task is frozen, unless it is self-checkpoint */
 	if (t != current && !frozen(t))
-		return 0;
+		return CR_NOT_FROZEN;
+
+	return CR_CHECKPOINTABLE;
+}
 
-	return 1;
+int task_checkpointable(struct task_struct *t)
+{
+	return __task_checkpointable(t) == CR_CHECKPOINTABLE;
 }
 
 static int cr_may_checkpoint_task(struct task_struct *t, struct cr_ctx *ctx)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 989ca93..6e4c07c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2502,9 +2502,26 @@ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
 }
 
 #ifdef CONFIG_CHECKPOINT_RESTART
+static char *explain_checkpointable(int reason)
+{
+	char *message;
+
+	switch(reason) {
+	case CR_CHECKPOINTABLE: message = "1"; break;
+	case CR_DEAD: 		message = "0 (dead)"; break;
+	case CR_NOT_FROZEN: 	message = "0 (not frozen)"; break;
+	default:		message = "0 (unknown)"; break;
+	}
+	return message;
+}
+
 static int proc_pid_checkpointable(struct task_struct *task, char *buffer)
 {
-	return sprintf(buffer, "%d\n", task_checkpointable(task));
+	int rc;
+
+	rc = __task_checkpointable(task);
+
+	return scnprintf(buffer, PAGE_SIZE, "%s\n", explain_checkpointable(rc));
 }
 #endif
 
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index d1f53a6..a1eba73 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -142,6 +142,13 @@ static inline int cr_enabled(void)
 	return 1;
 }
 
+enum cr_uncheckpointable_reason {
+	CR_CHECKPOINTABLE = 0,
+	CR_DEAD,
+	CR_NOT_FROZEN
+};
+
+int __task_checkpointable(struct task_struct *t);
 int task_checkpointable(struct task_struct *t);
 
 #else /* !CONFIG_CHECKPOINT_RESTART */
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH 6/6] Explain reason for task being uncheckpointable
       [not found]     ` <20090317175149.GG10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-17 18:52       ` Serge E. Hallyn
       [not found]         ` <20090317185228.GC1039-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Serge E. Hallyn @ 2009-03-17 18:52 UTC (permalink / raw)
  To: Sukadev Bhattiprolu; +Cc: Containers, David C. Hansen

Quoting Sukadev Bhattiprolu (sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org):
> 
> From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
> Date: Sat, 14 Mar 2009 10:21:07 -0700
> Subject: [PATCH 6/6] Explain reason for task being uncheckpointable
> 
> Try to give an useful message on why a task is uncheckpointable.

As I said somewhere else, I think this is what makes it worth
having a procfile (or debugfs file) over just doing a trial
sys_checkpoint() with an invalid fd.

However,

> Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
> ---
>  checkpoint/checkpoint.c    |   13 +++++++++----
>  fs/proc/base.c             |   19 ++++++++++++++++++-
>  include/linux/checkpoint.h |    7 +++++++
>  3 files changed, 34 insertions(+), 5 deletions(-)
> 
> diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
> index ad0956c..c8c377d 100644
> --- a/checkpoint/checkpoint.c
> +++ b/checkpoint/checkpoint.c
> @@ -264,18 +264,23 @@ static int cr_write_all_tasks(struct cr_ctx *ctx)
>  	return ret;
>  }
> 
> -int task_checkpointable(struct task_struct *t)
> +int __task_checkpointable(struct task_struct *t)
>  {
>  	if (t->state == TASK_DEAD) {
>  		pr_warning("c/r: task %d is TASK_DEAD\n", task_pid_vnr(t));
> -		return 0;
> +		return CR_DEAD;
>  	}
> 
>  	/* Verify that task is frozen, unless it is self-checkpoint */
>  	if (t != current && !frozen(t))
> -		return 0;
> +		return CR_NOT_FROZEN;
> +
> +	return CR_CHECKPOINTABLE;
> +}
> 
> -	return 1;
> +int task_checkpointable(struct task_struct *t)
> +{
> +	return __task_checkpointable(t) == CR_CHECKPOINTABLE;
>  }
> 
>  static int cr_may_checkpoint_task(struct task_struct *t, struct cr_ctx *ctx)
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 989ca93..6e4c07c 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -2502,9 +2502,26 @@ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
>  }
> 
>  #ifdef CONFIG_CHECKPOINT_RESTART
> +static char *explain_checkpointable(int reason)
> +{
> +	char *message;
> +
> +	switch(reason) {
> +	case CR_CHECKPOINTABLE: message = "1"; break;
> +	case CR_DEAD: 		message = "0 (dead)"; break;
> +	case CR_NOT_FROZEN: 	message = "0 (not frozen)"; break;
> +	default:		message = "0 (unknown)"; break;

I don't think splitting it up this way is going to work long-term.
It means you need one code for each type of message, and can't give
any more detail beyond that (i.e. the value of
current->nsproxy->uts_ns.refcount, or whatever).

I think it would be better to pass a buffer into __task_checkpointable,
and let task_checkpointable() pass in a NULL buffer meaning "I
(sys_checkpoint) don't care about the explanation".

> +	}
> +	return message;
> +}
> +
>  static int proc_pid_checkpointable(struct task_struct *task, char *buffer)
>  {
> -	return sprintf(buffer, "%d\n", task_checkpointable(task));
> +	int rc;
> +
> +	rc = __task_checkpointable(task);
> +
> +	return scnprintf(buffer, PAGE_SIZE, "%s\n", explain_checkpointable(rc));
>  }
>  #endif
> 
> diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
> index d1f53a6..a1eba73 100644
> --- a/include/linux/checkpoint.h
> +++ b/include/linux/checkpoint.h
> @@ -142,6 +142,13 @@ static inline int cr_enabled(void)
>  	return 1;
>  }
> 
> +enum cr_uncheckpointable_reason {
> +	CR_CHECKPOINTABLE = 0,
> +	CR_DEAD,
> +	CR_NOT_FROZEN
> +};
> +
> +int __task_checkpointable(struct task_struct *t);
>  int task_checkpointable(struct task_struct *t);
> 
>  #else /* !CONFIG_CHECKPOINT_RESTART */
> -- 
> 1.5.2.5
> 
> _______________________________________________
> Containers mailing list
> Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> https://lists.linux-foundation.org/mailman/listinfo/containers

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 6/6] Explain reason for task being uncheckpointable
       [not found]         ` <20090317185228.GC1039-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-18  6:50           ` Sukadev Bhattiprolu
  0 siblings, 0 replies; 12+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-18  6:50 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Containers, David C. Hansen

Serge E. Hallyn [serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org] wrote:
| >  #ifdef CONFIG_CHECKPOINT_RESTART
| > +static char *explain_checkpointable(int reason)
| > +{
| > +	char *message;
| > +
| > +	switch(reason) {
| > +	case CR_CHECKPOINTABLE: message = "1"; break;
| > +	case CR_DEAD: 		message = "0 (dead)"; break;
| > +	case CR_NOT_FROZEN: 	message = "0 (not frozen)"; break;
| > +	default:		message = "0 (unknown)"; break;
| 
| I don't think splitting it up this way is going to work long-term.
| It means you need one code for each type of message, and can't give
| any more detail beyond that (i.e. the value of
| current->nsproxy->uts_ns.refcount, or whatever).
| 
| I think it would be better to pass a buffer into __task_checkpointable,
| and let task_checkpointable() pass in a NULL buffer meaning "I
| (sys_checkpoint) don't care about the explanation".

Ok. I had tried it before, but was not happy about __task_checkpointable()
filling in a message buffer,  but I see your point. Here is an updated patch.

---

From 73dd3ec01320d62881cb5296aa4d68667571ef4c Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Date: Tue, 17 Mar 2009 16:15:16 -0700
Subject: [PATCH 6/6] Explain reason for task being uncheckpointable

Try to give an useful message on why a task is uncheckpointable.

Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
 checkpoint/checkpoint.c    |   16 ++++++++++++++--
 fs/proc/base.c             |    4 +++-
 include/linux/checkpoint.h |    1 +
 3 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index ad0956c..b4dbb22 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -264,19 +264,31 @@ static int cr_write_all_tasks(struct cr_ctx *ctx)
 	return ret;
 }
 
-int task_checkpointable(struct task_struct *t)
+int __task_checkpointable(struct task_struct *t, char *msg_buf, int len)
 {
 	if (t->state == TASK_DEAD) {
+		if (msg_buf)
+			scnprintf(msg_buf, len, "0 (task dead)\n");
 		pr_warning("c/r: task %d is TASK_DEAD\n", task_pid_vnr(t));
 		return 0;
 	}
 
 	/* Verify that task is frozen, unless it is self-checkpoint */
-	if (t != current && !frozen(t))
+	if (t != current && !frozen(t)) {
+		if (msg_buf)
+			scnprintf(msg_buf, len, "0 (task not frozen)\n");
 		return 0;
+	}
+
+	if (msg_buf)
+		scnprintf(msg_buf, len, "1\n");
 
 	return 1;
 }
+int task_checkpointable(struct task_struct *t)
+{
+	return __task_checkpointable(t, NULL, 0);
+}
 
 static int cr_may_checkpoint_task(struct task_struct *t, struct cr_ctx *ctx)
 {
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 989ca93..2d054b7 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2504,7 +2504,9 @@ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
 #ifdef CONFIG_CHECKPOINT_RESTART
 static int proc_pid_checkpointable(struct task_struct *task, char *buffer)
 {
-	return sprintf(buffer, "%d\n", task_checkpointable(task));
+	__task_checkpointable(task, buffer, PAGE_SIZE);
+
+	return strlen(buffer);
 }
 #endif
 
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index d1f53a6..44adb0b 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -142,6 +142,7 @@ static inline int cr_enabled(void)
 	return 1;
 }
 
+int __task_checkpointable(struct task_struct *t, char *msg_buf, int len);
 int task_checkpointable(struct task_struct *t);
 
 #else /* !CONFIG_CHECKPOINT_RESTART */
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH 0/6] /proc/pid/checkpointable
       [not found] ` <20090317174359.GA10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
                     ` (5 preceding siblings ...)
  2009-03-17 17:51   ` [PATCH 6/6] Explain reason for task being uncheckpointable Sukadev Bhattiprolu
@ 2009-03-18  9:28   ` Oren Laadan
  6 siblings, 0 replies; 12+ messages in thread
From: Oren Laadan @ 2009-03-18  9:28 UTC (permalink / raw)
  To: Sukadev Bhattiprolu; +Cc: Containers, David C. Hansen


Catching up chronologically, I sent these to the previous version of your
patch - but it's still relevant:

https://lists.linux-foundation.org/pipermail/containers/2009-March/016307.html
https://lists.linux-foundation.org/pipermail/containers/2009-March/016308.html

Oren.

Sukadev Bhattiprolu wrote:
> Started out implementing '/proc/pid/checkpointable' which reports 0 or 1
> depending on whether a process is checkpointable atm. But realized that
> some patches that were discussed earlier are missing from Dave Hansen's
> tree.
> 
> This patchset includes those missing patches plus two new patches that
> create /proc/pid/checkpointable.
> 
>         [PATCH 1/6] Checkpoint multiple processes
>         [PATCH 2/6] Restart multiple processes
>         [PATCH 3/6] Check 'may_checkpoint()' early
>         [PATCH 4/6] Deny external checkpoint unless task is frozen
>         [PATCH 5/6] Define and use proc_pid_checkpointable()
>         [PATCH 6/6] Explain reason for task being uncheckpointable
> 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 3/6] Check 'may_checkpoint()' early
       [not found]     ` <20090317175013.GD10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-26  5:58       ` Sukadev Bhattiprolu
       [not found]         ` <20090326055840.GA8220-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-26  5:58 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Containers


Oren, 

I see that patch 3/6 (deny external checkpoint unless frozen) in this
set was merged in v14, but do you have any plans/objections to merging
this optimization patch ? Serge had acked an earlier version of this
patch.

Suka

Sukadev Bhattiprolu [sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org] wrote:
| 
| From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
| Date: Thu, 12 Mar 2009 14:19:54 -0700
| Subject: [PATCH 3/6] Check 'may_checkpoint()' early
| 
| We currently check if a task is checkpointable when writing the task
| information to checkpoint file. The small downside of doing this check
| late is that we may have processed several processes in the tree before
| hitting one that cannot be checkpointed.
| 
| We anyway walk the process tree we are checkpointing earlier, while
| counting the tasks. We could check if all processes in the tree are
| checkpointable at that time and fail early if any of them are not.
| Since the process tree should be frozen, checking earlier should not
| matter ?
| 
| For now, the patch leaves the existing check in cr_write_pids(). We
| can remove that later.
| 
| Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
| ---
|  checkpoint/checkpoint.c |    9 +++++++++
|  1 files changed, 9 insertions(+), 0 deletions(-)
| 
| diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
| index ed4fd3d..dae9b97 100644
| --- a/checkpoint/checkpoint.c
| +++ b/checkpoint/checkpoint.c
| @@ -341,11 +341,18 @@ static int cr_tree_count_tasks(struct cr_ctx *ctx)
|  	struct task_struct **tasks_arr = ctx->tasks_arr;
|  	int tasks_nr = ctx->tasks_nr;
|  	int nr = 0;
| +	int ret;
| 
|  	read_lock(&tasklist_lock);
| 
|  	/* count tasks via DFS scan of the tree */
|  	while (1) {
| +		ret = cr_may_checkpoint_task(task, ctx);
| +		if (ret < 0) {
| +			nr = ret;
| +			break;
| +		}
| +
|  		if (tasks_arr) {
|  			/* unlikely, but ... */
|  			if (nr == tasks_nr)
| @@ -401,6 +408,8 @@ static int cr_build_tree(struct cr_ctx *ctx)
| 
|  	/* count tasks (no side effects) */
|  	n = cr_tree_count_tasks(ctx);
| +	if (n < 0)
| +		return n;
| 
|  	ctx->tasks_nr = n;
|  	ctx->tasks_arr = kzalloc(n * sizeof(*ctx->tasks_arr), GFP_KERNEL);
| -- 
| 1.5.2.5
| 
| _______________________________________________
| Containers mailing list
| Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
| https://lists.linux-foundation.org/mailman/listinfo/containers

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 3/6] Check 'may_checkpoint()' early
       [not found]         ` <20090326055840.GA8220-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-26 13:25           ` Serge E. Hallyn
  0 siblings, 0 replies; 12+ messages in thread
From: Serge E. Hallyn @ 2009-03-26 13:25 UTC (permalink / raw)
  To: Sukadev Bhattiprolu; +Cc: Containers

Quoting Sukadev Bhattiprolu (sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org):
> 
> Oren, 
> 
> I see that patch 3/6 (deny external checkpoint unless frozen) in this
> set was merged in v14, but do you have any plans/objections to merging
> this optimization patch ? Serge had acked an earlier version of this
> patch.
> 
> Suka
> 
> Sukadev Bhattiprolu [sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org] wrote:
> | 
> | From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
> | Date: Thu, 12 Mar 2009 14:19:54 -0700
> | Subject: [PATCH 3/6] Check 'may_checkpoint()' early
> | 
> | We currently check if a task is checkpointable when writing the task
> | information to checkpoint file. The small downside of doing this check
> | late is that we may have processed several processes in the tree before
> | hitting one that cannot be checkpointed.
> | 
> | We anyway walk the process tree we are checkpointing earlier, while
> | counting the tasks. We could check if all processes in the tree are
> | checkpointable at that time and fail early if any of them are not.
> | Since the process tree should be frozen, checking earlier should not
> | matter ?
> | 
> | For now, the patch leaves the existing check in cr_write_pids(). We
> | can remove that later.
> | 
> | Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
> | ---
> |  checkpoint/checkpoint.c |    9 +++++++++
> |  1 files changed, 9 insertions(+), 0 deletions(-)
> | 
> | diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
> | index ed4fd3d..dae9b97 100644
> | --- a/checkpoint/checkpoint.c
> | +++ b/checkpoint/checkpoint.c
> | @@ -341,11 +341,18 @@ static int cr_tree_count_tasks(struct cr_ctx *ctx)
> |  	struct task_struct **tasks_arr = ctx->tasks_arr;
> |  	int tasks_nr = ctx->tasks_nr;
> |  	int nr = 0;
> | +	int ret;
> | 
> |  	read_lock(&tasklist_lock);
> | 
> |  	/* count tasks via DFS scan of the tree */
> |  	while (1) {
> | +		ret = cr_may_checkpoint_task(task, ctx);
> | +		if (ret < 0) {
> | +			nr = ret;
> | +			break;
> | +		}
> | +
> |  		if (tasks_arr) {
> |  			/* unlikely, but ... */
> |  			if (nr == tasks_nr)
> | @@ -401,6 +408,8 @@ static int cr_build_tree(struct cr_ctx *ctx)
> | 
> |  	/* count tasks (no side effects) */
> |  	n = cr_tree_count_tasks(ctx);
> | +	if (n < 0)
> | +		return n;

This hunk is already in v14.

-serge

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2009-03-26 13:25 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-03-17 17:43 [PATCH 0/6] /proc/pid/checkpointable Sukadev Bhattiprolu
     [not found] ` <20090317174359.GA10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-17 17:49   ` [PATCH 1/6] Checkpoint multiple processes Sukadev Bhattiprolu
2009-03-17 17:49   ` [PATCH 2/6] Restart " Sukadev Bhattiprolu
2009-03-17 17:50   ` [PATCH 3/6] Check 'may_checkpoint()' early Sukadev Bhattiprolu
     [not found]     ` <20090317175013.GD10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-26  5:58       ` Sukadev Bhattiprolu
     [not found]         ` <20090326055840.GA8220-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-26 13:25           ` Serge E. Hallyn
2009-03-17 17:50   ` [PATCH 4/6] Deny external checkpoint unless task is frozen Sukadev Bhattiprolu
2009-03-17 17:51   ` [PATCH 5/6] Define and use proc_pid_checkpointable() Sukadev Bhattiprolu
2009-03-17 17:51   ` [PATCH 6/6] Explain reason for task being uncheckpointable Sukadev Bhattiprolu
     [not found]     ` <20090317175149.GG10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-17 18:52       ` Serge E. Hallyn
     [not found]         ` <20090317185228.GC1039-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-18  6:50           ` Sukadev Bhattiprolu
2009-03-18  9:28   ` [PATCH 0/6] /proc/pid/checkpointable Oren Laadan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.