All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/6] /proc/pid/checkpointable
@ 2009-03-17 17:43 Sukadev Bhattiprolu
       [not found] ` <20090317174359.GA10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 38+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-17 17:43 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers


Started out implementing '/proc/pid/checkpointable' which reports 0 or 1
depending on whether a process is checkpointable atm. But realized that
some patches that were discussed earlier are missing from Dave Hansen's
tree.

This patchset includes those missing patches plus two new patches that
create /proc/pid/checkpointable.

        [PATCH 1/6] Checkpoint multiple processes
        [PATCH 2/6] Restart multiple processes
        [PATCH 3/6] Check 'may_checkpoint()' early
        [PATCH 4/6] Deny external checkpoint unless task is frozen
        [PATCH 5/6] Define and use proc_pid_checkpointable()
        [PATCH 6/6] Explain reason for task being uncheckpointable

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [PATCH 1/6] Checkpoint multiple processes
       [not found] ` <20090317174359.GA10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-17 17:49   ` Sukadev Bhattiprolu
  2009-03-17 17:49   ` [PATCH 2/6] Restart " Sukadev Bhattiprolu
                     ` (5 subsequent siblings)
  6 siblings, 0 replies; 38+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-17 17:49 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers


From: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
Date: Thu, 12 Mar 2009 14:16:05 -0700
Subject: [PATCH 1/6] Checkpoint multiple processes

Checkpointing of multiple processes works by recording the tasks tree
structure below a given task (usually this task is the container init).

For a given task, do a DFS scan of the tasks tree and collect them
into an array (keeping a reference to each task). Using DFS simplifies
the recreation of tasks either in user space or kernel space. For each
task collected, test if it can be checkpointed, and save its pid, tgid,
and ppid.

The actual work is divided into two passes: a first scan counts the
tasks, then memory is allocated and a second scan fills the array.

The logic is suitable for creation of processes during restart either
in userspace or by the kernel.

Currently we ignore threads and zombies, as well as session ids.

Changelog[v12]:
  - Replace obsolete cr_debug() with pr_debug()

Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
Acked-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 checkpoint/checkpoint.c        |  217 ++++++++++++++++++++++++++++++++++++++--
 checkpoint/sys.c               |   16 +++
 include/linux/checkpoint.h     |    3 +
 include/linux/checkpoint_hdr.h |   13 ++-
 4 files changed, 238 insertions(+), 11 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index e0af8a2..ed4fd3d 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -226,13 +226,6 @@ static int cr_write_task(struct cr_ctx *ctx, struct task_struct *t)
 {
 	int ret;
 
-	/* TODO: verity that the task is frozen (unless self) */
-
-	if (t->state == TASK_DEAD) {
-		pr_warning("c/r: task may not be in state TASK_DEAD\n");
-		return -EAGAIN;
-	}
-
 	ret = cr_write_task_struct(ctx, t);
 	pr_debug("task_struct: ret %d\n", ret);
 	if (ret < 0)
@@ -255,6 +248,200 @@ static int cr_write_task(struct cr_ctx *ctx, struct task_struct *t)
 	return ret;
 }
 
+/* dump all tasks in ctx->tasks_arr[] */
+static int cr_write_all_tasks(struct cr_ctx *ctx)
+{
+	int n, ret = 0;
+
+	for (n = 0; n < ctx->tasks_nr; n++) {
+		pr_debug("dumping task #%d\n", n);
+		ret = cr_write_task(ctx, ctx->tasks_arr[n]);
+		if (ret < 0)
+			break;
+	}
+
+	return ret;
+}
+
+static int cr_may_checkpoint_task(struct task_struct *t, struct cr_ctx *ctx)
+{
+	pr_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns));
+
+	if (t->state == TASK_DEAD) {
+		pr_warning("c/r: task %d is TASK_DEAD\n", task_pid_vnr(t));
+		return -EAGAIN;
+	}
+
+	if (!ptrace_may_access(t, PTRACE_MODE_READ))
+		return -EPERM;
+
+	/* FIXME: verify that the task is frozen (unless self) */
+
+	/* FIXME: change this for nested containers */
+	if (task_nsproxy(t) != ctx->root_nsproxy)
+		return -EPERM;
+
+	return 0;
+}
+
+#define CR_HDR_PIDS_CHUNK	256
+
+static int cr_write_pids(struct cr_ctx *ctx)
+{
+	struct cr_hdr_pids *hh;
+	struct pid_namespace *ns;
+	struct task_struct *task;
+	struct task_struct **tasks_arr;
+	int tasks_nr, n, ret = 0, pos = 0;
+
+	ns = ctx->root_nsproxy->pid_ns;
+	tasks_arr = ctx->tasks_arr;
+	tasks_nr = ctx->tasks_nr;
+
+	hh = cr_hbuf_get(ctx, sizeof(*hh) * CR_HDR_PIDS_CHUNK);
+
+	while (tasks_nr > 0) {
+		rcu_read_lock();
+		for (n = min(tasks_nr, CR_HDR_PIDS_CHUNK); n; n--) {
+			task = tasks_arr[pos];
+
+			/* is this task cool ? */
+			ret = cr_may_checkpoint_task(task, ctx);
+			if (ret < 0) {
+				rcu_read_unlock();
+				goto out;
+			}
+			hh[pos].vpid = task_pid_nr_ns(task, ns);
+			hh[pos].vtgid = task_tgid_nr_ns(task, ns);
+			hh[pos].vppid = task_tgid_nr_ns(task->real_parent, ns);
+			pr_debug("task[%d]: vpid %d vtgid %d parent %d\n", pos,
+				 hh[pos].vpid, hh[pos].vtgid, hh[pos].vppid);
+			pos++;
+		}
+		rcu_read_unlock();
+
+		n = min(tasks_nr, CR_HDR_PIDS_CHUNK);
+		ret = cr_kwrite(ctx, hh, n * sizeof(*hh));
+		if (ret < 0)
+			break;
+
+		tasks_nr -= n;
+	}
+ out:
+	cr_hbuf_put(ctx, sizeof(*hh));
+	return ret;
+}
+
+/* count number of tasks in tree (and optionally fill pid's in array) */
+static int cr_tree_count_tasks(struct cr_ctx *ctx)
+{
+	struct task_struct *root = ctx->root_task;
+	struct task_struct *task = root;
+	struct task_struct *parent = NULL;
+	struct task_struct **tasks_arr = ctx->tasks_arr;
+	int tasks_nr = ctx->tasks_nr;
+	int nr = 0;
+
+	read_lock(&tasklist_lock);
+
+	/* count tasks via DFS scan of the tree */
+	while (1) {
+		if (tasks_arr) {
+			/* unlikely, but ... */
+			if (nr == tasks_nr)
+				return -EBUSY;	/* cleanup in cr_ctx_free() */
+			tasks_arr[nr] = task;
+			get_task_struct(task);
+		}
+
+		nr++;
+
+		/* if has children - proceed with child */
+		if (!list_empty(&task->children)) {
+			parent = task;
+			task = list_entry(task->children.next,
+					  struct task_struct, sibling);
+			continue;
+		}
+
+		while (task != root) {
+			/* if has sibling - proceed with sibling */
+			if (!list_is_last(&task->sibling, &parent->children)) {
+				task = list_entry(task->sibling.next,
+						  struct task_struct, sibling);
+				break;
+			}
+
+			/* else, trace back to parent and proceed */
+			task = parent;
+			parent = parent->real_parent;
+		}
+
+		if (task == root)
+			break;
+	}
+
+	read_unlock(&tasklist_lock);
+	return nr;
+}
+
+/*
+ * cr_build_tree - scan the tasks tree in DFS order and fill in array
+ * @ctx: checkpoint context
+ *
+ * Using DFS order simplifies the restart logic to re-create the tasks.
+ *
+ * On success, ctx->tasks_arr will be allocated and populated with all
+ * tasks (reference taken), and ctx->tasks_nr will hold the total count.
+ * The array is cleaned up by cr_ctx_free().
+ */
+static int cr_build_tree(struct cr_ctx *ctx)
+{
+	int n, m;
+
+	/* count tasks (no side effects) */
+	n = cr_tree_count_tasks(ctx);
+
+	ctx->tasks_nr = n;
+	ctx->tasks_arr = kzalloc(n * sizeof(*ctx->tasks_arr), GFP_KERNEL);
+	if (!ctx->tasks_arr)
+		return -ENOMEM;
+
+	/* count again (now will fill array) */
+	m = cr_tree_count_tasks(ctx);
+
+	/* unlikely, but ... (cleanup in cr_ctx_free) */
+	if (m < 0)
+		return m;
+	else if (m != n)
+		return -EBUSY;
+
+	return 0;
+}
+
+/* dump the array that describes the tasks tree */
+static int cr_write_tree(struct cr_ctx *ctx)
+{
+	struct cr_hdr h;
+	struct cr_hdr_tree *hh;
+	int ret;
+
+	h.type = CR_HDR_TREE;
+	h.len = sizeof(*hh);
+	h.parent = 0;
+
+	hh = cr_hbuf_get(ctx, sizeof(*hh));
+	hh->tasks_nr = ctx->tasks_nr;
+
+	ret = cr_write_obj(ctx, &h, hh);
+	cr_hbuf_put(ctx, sizeof(*hh));
+	if (ret < 0)
+		return ret;
+
+	ret = cr_write_pids(ctx);
+	return ret;
+}
+
 static int cr_get_container(struct cr_ctx *ctx, pid_t pid)
 {
 	struct task_struct *task = NULL;
@@ -272,7 +459,7 @@ static int cr_get_container(struct cr_ctx *ctx, pid_t pid)
 	if (!task)
 		goto out;
 
-#if 0	/* enable to use containers */
+#if 0	/* enable with containers */
 	if (!is_container_init(task)) {
 		err = -EINVAL;
 		goto out;
@@ -294,7 +481,7 @@ static int cr_get_container(struct cr_ctx *ctx, pid_t pid)
 	if (!nsproxy)
 		goto out;
 
-	/* TODO: verify that the container is frozen */
+	/* FIXME: verify that the container is frozen */
 
 	ctx->root_task = task;
 	ctx->root_nsproxy = nsproxy;
@@ -342,12 +529,22 @@ int do_checkpoint(struct cr_ctx *ctx, pid_t pid)
 	ret = cr_ctx_checkpoint(ctx, pid);
 	if (ret < 0)
 		goto out;
+
+	ret = cr_build_tree(ctx);
+	if (ret < 0)
+		goto out;
+
 	ret = cr_write_head(ctx);
 	if (ret < 0)
 		goto out;
-	ret = cr_write_task(ctx, ctx->root_task);
+	ret = cr_write_tree(ctx);
 	if (ret < 0)
 		goto out;
+
+	ret = cr_write_all_tasks(ctx);
+	if (ret < 0)
+		goto out;
+
 	ret = cr_write_tail(ctx);
 	if (ret < 0)
 		goto out;
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 4a51ed3..0436ef3 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -152,6 +152,19 @@ void cr_hbuf_put(struct cr_ctx *ctx, int n)
  * restart operation, and persists until the operation is completed.
  */
 
+static void cr_task_arr_free(struct cr_ctx *ctx)
+{
+	int n;
+
+	for (n = 0; n < ctx->tasks_nr; n++) {
+		if (ctx->tasks_arr[n]) {
+			put_task_struct(ctx->tasks_arr[n]);
+			ctx->tasks_arr[n] = NULL;
+		}
+	}
+	kfree(ctx->tasks_arr);
+}
+
 static void cr_ctx_free(struct cr_ctx *ctx)
 {
 	if (ctx->file)
@@ -164,6 +177,9 @@ static void cr_ctx_free(struct cr_ctx *ctx)
 	cr_pgarr_free(ctx);
 	cr_objhash_free(ctx);
 
+	if (ctx->tasks_arr)
+		cr_task_arr_free(ctx);
+
 	if (ctx->root_nsproxy)
 		put_nsproxy(ctx->root_nsproxy);
 	if (ctx->root_task)
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index e77393f..6307511 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -34,6 +34,9 @@ struct cr_ctx {
 	void *hbuf;		/* temporary buffer for headers */
 	int hpos;		/* position in headers buffer */
 
+	struct task_struct **tasks_arr;	/* array of all tasks in container */
+	int tasks_nr;			/* size of tasks array */
+
 	struct cr_objhash *objhash;	/* hash for shared objects */
 
 	struct list_head pgarr_list;	/* page array to dump VMA contents */
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 66313bd..0acf7c0 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -45,7 +45,8 @@ enum {
 	CR_HDR_STRING,
 	CR_HDR_FNAME,
 
-	CR_HDR_TASK = 101,
+	CR_HDR_TREE = 101,
+	CR_HDR_TASK,
 	CR_HDR_THREAD,
 	CR_HDR_CPU,
 
@@ -81,6 +82,16 @@ struct cr_hdr_tail {
 	__u64 magic;
 } __attribute__((aligned(8)));
 
+struct cr_hdr_tree {
+	__u32 tasks_nr;
+} __attribute__((aligned(8)));
+
+struct cr_hdr_pids {
+	__s32 vpid;
+	__s32 vtgid;
+	__s32 vppid;
+} __attribute__((aligned(8)));
+
 struct cr_hdr_task {
 	__u32 state;
 	__u32 exit_state;
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH 2/6] Restart multiple processes
       [not found] ` <20090317174359.GA10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-03-17 17:49   ` [PATCH 1/6] Checkpoint multiple processes Sukadev Bhattiprolu
@ 2009-03-17 17:49   ` Sukadev Bhattiprolu
  2009-03-17 17:50   ` [PATCH 3/6] Check 'may_checkpoint()' early Sukadev Bhattiprolu
                     ` (4 subsequent siblings)
  6 siblings, 0 replies; 38+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-17 17:49 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers


From: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
Date: Thu, 12 Mar 2009 14:19:41 -0700
Subject: [PATCH 2/6] Restart multiple processes

Restarting of multiple processes expects all restarting tasks to call
sys_restart(). Once inside the system call, each task will restart
itself at the same order that they were saved. The internals of the
syscall will take care of in-kernel synchronization bewteen tasks.

This patch does _not_ create the task tree in the kernel. Instead it
assumes that all tasks are created in some way and then invoke the
restart syscall. You can use the userspace mktree.c program to do
that.

The init task (*) has a special role: it allocates the restart context
(ctx), and coordinates the operation. In particular, it first waits
until all participating tasks enter the kernel, and provides them the
common restart context. Once everyone in ready, it begins to restart
itself.

In contrast, the other tasks enter the kernel, locate the init task (*)
and grab its restart context, and then wait for their turn to restore.

When a task (init or not) completes its restart, it hands the control
over to the next in line, by waking that task.

An array of pids (the one saved during the checkpoint) is used to
synchronize the operation. The first task in the array is the init
task (*). The restart context (ctx) maintain a "current position" in
the array, which indicates which task is currently active. Once the
currently active task completes its own restart, it increments that
position and wakes up the next task.

Restart assumes that userspace provides meaningful data, otherwise
it's garbage-in-garbage-out. In this case, the syscall may block
indefinitely, but in TASK_INTERRUPTIBLE, so the user can ctrl-c or
otherwise kill the stray restarting tasks.

In terms of security, restart runs as the user the invokes it, so it
will not allow a user to do more than is otherwise permitted by the
usual system semantics and policy.

Currently we ignore threads and zombies, as well as session ids.
Add support for multiple processes

(*) For containers, restart should be called inside a fresh container
by the init task of that container. However, it is also possible to
restart applications not necessarily inside a container, and without
restoring the original pids of the processes (that is, provided that
the application can tolerate such behavior). This is useful to allow
multi-process restart of tasks not isolated inside a container, and
also for debugging.

Changelog[v12]:
  - Replace obsolete cr_debug() with pr_debug()

Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 checkpoint/restart.c       |  214 +++++++++++++++++++++++++++++++++++++++++++-
 checkpoint/sys.c           |   34 ++++++--
 include/linux/checkpoint.h |   25 +++++-
 include/linux/sched.h      |    3 +
 4 files changed, 262 insertions(+), 14 deletions(-)

diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index 0c46abf..6b4cd75 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -10,6 +10,7 @@
 
 #include <linux/version.h>
 #include <linux/sched.h>
+#include <linux/wait.h>
 #include <linux/file.h>
 #include <linux/magic.h>
 #include <linux/checkpoint.h>
@@ -276,30 +277,235 @@ static int cr_read_task(struct cr_ctx *ctx)
 	return ret;
 }
 
+/* cr_read_tree - read the tasks tree into the checkpoint context */
+static int cr_read_tree(struct cr_ctx *ctx)
+{
+	struct cr_hdr_tree *hh = cr_hbuf_get(ctx, sizeof(*hh));
+	int parent, size, ret = -EINVAL;
+
+	parent = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_TREE);
+	if (parent < 0) {
+		ret = parent;
+		goto out;
+	} else if (parent != 0)
+		goto out;
+
+	if (hh->tasks_nr < 0)
+		goto out;
+
+	ctx->pids_nr = hh->tasks_nr;
+	size = sizeof(*ctx->pids_arr) * ctx->pids_nr;
+	if (size < 0)		/* overflow ? */
+		goto out;
+
+	ctx->pids_arr = kmalloc(size, GFP_KERNEL);
+	if (!ctx->pids_arr) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	ret = cr_kread(ctx, ctx->pids_arr, size);
+ out:
+	cr_hbuf_put(ctx, sizeof(*hh));
+	return ret;
+}
+
+static int cr_wait_task(struct cr_ctx *ctx)
+{
+	pid_t pid = task_pid_vnr(current);
+
+	pr_debug("pid %d waiting\n", pid);
+	return wait_event_interruptible(ctx->waitq, ctx->pids_active == pid);
+}
+
+static int cr_next_task(struct cr_ctx *ctx)
+{
+	struct task_struct *tsk;
+
+	ctx->pids_pos++;
+
+	pr_debug("pids_pos %d %d\n", ctx->pids_pos, ctx->pids_nr);
+	if (ctx->pids_pos == ctx->pids_nr) {
+		complete(&ctx->complete);
+		return 0;
+	}
+
+	ctx->pids_active = ctx->pids_arr[ctx->pids_pos].vpid;
+
+	pr_debug("pids_next %d\n", ctx->pids_active);
+
+	rcu_read_lock();
+	tsk = find_task_by_pid_ns(ctx->pids_active, ctx->root_nsproxy->pid_ns);
+	if (tsk)
+		wake_up_process(tsk);
+	rcu_read_unlock();
+
+	if (!tsk) {
+		ctx->pids_err = -ESRCH;
+		complete(&ctx->complete);
+		return -ESRCH;
+	}
+
+	return 0;
+}
+
+/* FIXME: this should be per container */
+DECLARE_WAIT_QUEUE_HEAD(cr_restart_waitq);
+
+static int do_restart_task(struct cr_ctx *ctx, pid_t pid)
+{
+	struct task_struct *root_task;
+	int ret;
+
+	rcu_read_lock();
+	root_task = find_task_by_pid_ns(pid, current->nsproxy->pid_ns);
+	if (root_task)
+		get_task_struct(root_task);
+	rcu_read_unlock();
+
+	if (!root_task)
+		return -EINVAL;
+
+	/*
+	 * wait for container init to initialize the restart context, then
+	 * grab a reference to that context, and if we're the last task to
+	 * do it, notify the container init.
+	 */
+	ret = wait_event_interruptible(cr_restart_waitq,
+				       root_task->checkpoint_ctx);
+	if (ret < 0)
+		goto out;
+
+	task_lock(root_task);
+	ctx = root_task->checkpoint_ctx;
+	if (ctx)
+		cr_ctx_get(ctx);
+	task_unlock(root_task);
+
+	if (!ctx) {
+		ret = -EAGAIN;
+		goto out;
+	}
+
+	if (atomic_dec_and_test(&ctx->tasks_count))
+		complete(&ctx->complete);
+
+	/* wait for our turn, do the restore, and tell next task in line */
+	ret = cr_wait_task(ctx);
+	if (ret < 0)
+		goto out;
+	ret = cr_read_task(ctx);
+	if (ret < 0)
+		goto out;
+	ret = cr_next_task(ctx);
+
+ out:
+	cr_ctx_put(ctx);
+	put_task_struct(root_task);
+	return ret;
+}
+
+static int cr_wait_all_tasks_start(struct cr_ctx *ctx)
+{
+	int ret;
+
+	if (ctx->pids_nr == 1)
+		return 0;
+
+	init_completion(&ctx->complete);
+	current->checkpoint_ctx = ctx;
+
+	wake_up_all(&cr_restart_waitq);
+
+	ret = wait_for_completion_interruptible(&ctx->complete);
+	if (ret < 0)
+		return ret;
+
+	task_lock(current);
+	current->checkpoint_ctx = NULL;
+	task_unlock(current);
+
+	return 0;
+}
+
+static int cr_wait_all_tasks_finish(struct cr_ctx *ctx)
+{
+	int ret;
+
+	if (ctx->pids_nr == 1)
+		return 0;
+
+	init_completion(&ctx->complete);
+
+	ret = cr_next_task(ctx);
+	if (ret < 0)
+		return ret;
+
+	ret = wait_for_completion_interruptible(&ctx->complete);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
 /* setup restart-specific parts of ctx */
 static int cr_ctx_restart(struct cr_ctx *ctx, pid_t pid)
 {
+	ctx->root_pid = pid;
+	ctx->root_task = current;
+	ctx->root_nsproxy = current->nsproxy;
+
+	get_task_struct(ctx->root_task);
+	get_nsproxy(ctx->root_nsproxy);
+
+	atomic_set(&ctx->tasks_count, ctx->pids_nr - 1);
+
 	return 0;
 }
 
-int do_restart(struct cr_ctx *ctx, pid_t pid)
+static int do_restart_root(struct cr_ctx *ctx, pid_t pid)
 {
 	int ret;
 
+	ret = cr_read_head(ctx);
+	if (ret < 0)
+		goto out;
+	ret = cr_read_tree(ctx);
+	if (ret < 0)
+		goto out;
+
 	ret = cr_ctx_restart(ctx, pid);
 	if (ret < 0)
 		goto out;
-	ret = cr_read_head(ctx);
+
+	/* wait for all other tasks to enter do_restart_task() */
+	ret = cr_wait_all_tasks_start(ctx);
 	if (ret < 0)
 		goto out;
+
 	ret = cr_read_task(ctx);
 	if (ret < 0)
 		goto out;
-	ret = cr_read_tail(ctx);
+
+	/* wait for all other tasks to complete do_restart_task() */
+	ret = cr_wait_all_tasks_finish(ctx);
 	if (ret < 0)
 		goto out;
 
-	/* on success, adjust the return value if needed [TODO] */
+	ret = cr_read_tail(ctx);
+
  out:
 	return ret;
 }
+
+int do_restart(struct cr_ctx *ctx, pid_t pid)
+{
+	int ret;
+
+	if (ctx)
+		ret = do_restart_root(ctx, pid);
+	else
+		ret = do_restart_task(ctx, pid);
+
+	/* on success, adjust the return value if needed [TODO] */
+	return ret;
+}
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 0436ef3..f26b0c6 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -167,6 +167,8 @@ static void cr_task_arr_free(struct cr_ctx *ctx)
 
 static void cr_ctx_free(struct cr_ctx *ctx)
 {
+	BUG_ON(atomic_read(&ctx->refcount));
+
 	if (ctx->file)
 		fput(ctx->file);
 
@@ -185,6 +187,8 @@ static void cr_ctx_free(struct cr_ctx *ctx)
 	if (ctx->root_task)
 		put_task_struct(ctx->root_task);
 
+	kfree(ctx->pids_arr);
+
 	kfree(ctx);
 }
 
@@ -199,8 +203,10 @@ static struct cr_ctx *cr_ctx_alloc(int fd, unsigned long flags)
 
 	ctx->flags = flags;
 
+	atomic_set(&ctx->refcount, 0);
 	INIT_LIST_HEAD(&ctx->pgarr_list);
 	INIT_LIST_HEAD(&ctx->pgarr_pool);
+	init_waitqueue_head(&ctx->waitq);
 
 	err = -EBADF;
 	ctx->file = fget(fd);
@@ -215,6 +221,7 @@ static struct cr_ctx *cr_ctx_alloc(int fd, unsigned long flags)
 	if (cr_objhash_alloc(ctx) < 0)
 		goto err;
 
+	atomic_inc(&ctx->refcount);
 	return ctx;
 
  err:
@@ -222,6 +229,17 @@ static struct cr_ctx *cr_ctx_alloc(int fd, unsigned long flags)
 	return ERR_PTR(err);
 }
 
+void cr_ctx_get(struct cr_ctx *ctx)
+{
+	atomic_inc(&ctx->refcount);
+}
+
+void cr_ctx_put(struct cr_ctx *ctx)
+{
+	if (ctx && atomic_dec_and_test(&ctx->refcount))
+		cr_ctx_free(ctx);
+}
+
 /**
  * sys_checkpoint - checkpoint a container
  * @pid: pid of the container init(1) process
@@ -249,7 +267,7 @@ asmlinkage long sys_checkpoint(pid_t pid, int fd, unsigned long flags)
 	if (!ret)
 		ret = ctx->crid;
 
-	cr_ctx_free(ctx);
+	cr_ctx_put(ctx);
 	return ret;
 }
 
@@ -264,7 +282,7 @@ asmlinkage long sys_checkpoint(pid_t pid, int fd, unsigned long flags)
  */
 asmlinkage long sys_restart(int crid, int fd, unsigned long flags)
 {
-	struct cr_ctx *ctx;
+	struct cr_ctx *ctx = NULL;
 	pid_t pid;
 	int ret;
 
@@ -272,15 +290,17 @@ asmlinkage long sys_restart(int crid, int fd, unsigned long flags)
 	if (flags)
 		return -EINVAL;
 
-	ctx = cr_ctx_alloc(fd, flags | CR_CTX_RSTR);
-	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
-
 	/* FIXME: for now, we use 'crid' as a pid */
 	pid = (pid_t) crid;
 
+	if (pid == task_pid_vnr(current))
+		ctx = cr_ctx_alloc(fd, flags | CR_CTX_RSTR);
+
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
 	ret = do_restart(ctx, pid);
 
-	cr_ctx_free(ctx);
+	cr_ctx_put(ctx);
 	return ret;
 }
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 6307511..484be56 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -13,10 +13,13 @@
 #include <linux/path.h>
 #include <linux/fs.h>
 #include <linux/fdtable.h>
+#include <linux/path.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
 
 #ifdef CONFIG_CHECKPOINT_RESTART
 
-#define CR_VERSION  2
+#define CR_VERSION  3
 
 struct cr_ctx {
 	int crid;		/* unique checkpoint id */
@@ -34,8 +37,7 @@ struct cr_ctx {
 	void *hbuf;		/* temporary buffer for headers */
 	int hpos;		/* position in headers buffer */
 
-	struct task_struct **tasks_arr;	/* array of all tasks in container */
-	int tasks_nr;			/* size of tasks array */
+	atomic_t refcount;
 
 	struct cr_objhash *objhash;	/* hash for shared objects */
 
@@ -43,6 +45,20 @@ struct cr_ctx {
 	struct list_head pgarr_pool;	/* pool of empty page arrays chain */
 
 	struct path fs_mnt;	/* container root (FIXME) */
+
+	/* [multi-process checkpoint] */
+	struct task_struct **tasks_arr; /* array of all tasks [checkpoint] */
+	int tasks_nr;                   /* size of tasks array */
+
+	/* [multi-process restart] */
+	struct cr_hdr_pids *pids_arr;	/* array of all pids [restart] */
+	int pids_nr;			/* size of pids array */
+	int pids_pos;			/* position pids array */
+	int pids_err;			/* error occured ? */
+	pid_t pids_active;		/* pid of (next) active task */
+	atomic_t tasks_count;		/* sync of restarting tasks */
+	struct completion complete;	/* sync of restarting tasks */
+	wait_queue_head_t waitq;	/* sync of restarting tasks */
 };
 
 /* cr_ctx: flags */
@@ -55,6 +71,9 @@ extern int cr_kread(struct cr_ctx *ctx, void *buf, int count);
 extern void *cr_hbuf_get(struct cr_ctx *ctx, int n);
 extern void cr_hbuf_put(struct cr_ctx *ctx, int n);
 
+extern void cr_ctx_get(struct cr_ctx *ctx);
+extern void cr_ctx_put(struct cr_ctx *ctx);
+
 /* shared objects handling */
 
 enum {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8c216e0..c5d5987 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1416,6 +1416,9 @@ struct task_struct {
 #ifdef CONFIG_TRACING
 	/* state flags for use by tracers */
 	unsigned long trace;
+#endif 
+#ifdef CONFIG_CHECKPOINT_RESTART
+	struct cr_ctx *checkpoint_ctx;
 #endif
 };
 
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH 3/6] Check 'may_checkpoint()' early
       [not found] ` <20090317174359.GA10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-03-17 17:49   ` [PATCH 1/6] Checkpoint multiple processes Sukadev Bhattiprolu
  2009-03-17 17:49   ` [PATCH 2/6] Restart " Sukadev Bhattiprolu
@ 2009-03-17 17:50   ` Sukadev Bhattiprolu
       [not found]     ` <20090317175013.GD10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-03-17 17:50   ` [PATCH 4/6] Deny external checkpoint unless task is frozen Sukadev Bhattiprolu
                     ` (3 subsequent siblings)
  6 siblings, 1 reply; 38+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-17 17:50 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers


From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Date: Thu, 12 Mar 2009 14:19:54 -0700
Subject: [PATCH 3/6] Check 'may_checkpoint()' early

We currently check if a task is checkpointable when writing the task
information to checkpoint file. The small downside of doing this check
late is that we may have processed several processes in the tree before
hitting one that cannot be checkpointed.

We anyway walk the process tree we are checkpointing earlier, while
counting the tasks. We could check if all processes in the tree are
checkpointable at that time and fail early if any of them are not.
Since the process tree should be frozen, checking earlier should not
matter ?

For now, the patch leaves the existing check in cr_write_pids(). We
can remove that later.

Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
 checkpoint/checkpoint.c |    9 +++++++++
 1 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index ed4fd3d..dae9b97 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -341,11 +341,18 @@ static int cr_tree_count_tasks(struct cr_ctx *ctx)
 	struct task_struct **tasks_arr = ctx->tasks_arr;
 	int tasks_nr = ctx->tasks_nr;
 	int nr = 0;
+	int ret;
 
 	read_lock(&tasklist_lock);
 
 	/* count tasks via DFS scan of the tree */
 	while (1) {
+		ret = cr_may_checkpoint_task(task, ctx);
+		if (ret < 0) {
+			nr = ret;
+			break;
+		}
+
 		if (tasks_arr) {
 			/* unlikely, but ... */
 			if (nr == tasks_nr)
@@ -401,6 +408,8 @@ static int cr_build_tree(struct cr_ctx *ctx)
 
 	/* count tasks (no side effects) */
 	n = cr_tree_count_tasks(ctx);
+	if (n < 0)
+		return n;
 
 	ctx->tasks_nr = n;
 	ctx->tasks_arr = kzalloc(n * sizeof(*ctx->tasks_arr), GFP_KERNEL);
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH 4/6] Deny external checkpoint unless task is frozen
       [not found] ` <20090317174359.GA10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
                     ` (2 preceding siblings ...)
  2009-03-17 17:50   ` [PATCH 3/6] Check 'may_checkpoint()' early Sukadev Bhattiprolu
@ 2009-03-17 17:50   ` Sukadev Bhattiprolu
  2009-03-17 17:51   ` [PATCH 5/6] Define and use proc_pid_checkpointable() Sukadev Bhattiprolu
                     ` (2 subsequent siblings)
  6 siblings, 0 replies; 38+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-17 17:50 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers


From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Date: Thu, 12 Mar 2009 14:20:02 -0700
Subject: [PATCH 4/6] Deny external checkpoint unless task is frozen

Remove a 'FIXME' and ensure that the tasks we are checkpointing are
frozen unless its a self-checkpoint.

Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
 checkpoint/checkpoint.c |    5 ++++-
 1 files changed, 4 insertions(+), 1 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index dae9b97..c16f30c 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -19,6 +19,7 @@
 #include <linux/mount.h>
 #include <linux/utsname.h>
 #include <linux/magic.h>
+#include <linux/freezer.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
 
@@ -275,7 +276,9 @@ static int cr_may_checkpoint_task(struct task_struct *t, struct cr_ctx *ctx)
 	if (!ptrace_may_access(t, PTRACE_MODE_READ))
 		return -EPERM;
 
-	/* FIXME: verify that the task is frozen (unless self) */
+	/* verify that the task is frozen (unless self) */
+	if (t != current && !frozen(t))
+		return -EBUSY;
 
 	/* FIXME: change this for nested containers */
 	if (task_nsproxy(t) != ctx->root_nsproxy)
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH 5/6] Define and use proc_pid_checkpointable()
       [not found] ` <20090317174359.GA10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
                     ` (3 preceding siblings ...)
  2009-03-17 17:50   ` [PATCH 4/6] Deny external checkpoint unless task is frozen Sukadev Bhattiprolu
@ 2009-03-17 17:51   ` Sukadev Bhattiprolu
  2009-03-17 17:51   ` [PATCH 6/6] Explain reason for task being uncheckpointable Sukadev Bhattiprolu
  2009-03-18  9:28   ` [PATCH 0/6] /proc/pid/checkpointable Oren Laadan
  6 siblings, 0 replies; 38+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-17 17:51 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers


From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Date: Fri, 13 Mar 2009 17:25:42 -0700
Subject: [PATCH 5/6] Define and use proc_pid_checkpointable()

Create a proc file, /proc/pid/checkpointable, which shows '1' if
task is checkpointable and '0' if it is not.

To determine whether a task is checkpointable, the handler for this
new proc file, shares the same code with sys_checkpoint().

Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
 checkpoint/checkpoint.c    |   23 ++++++++++++++++-------
 fs/proc/base.c             |   13 +++++++++++++
 include/linux/checkpoint.h |    7 +++++++
 3 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index c16f30c..ad0956c 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -264,22 +264,31 @@ static int cr_write_all_tasks(struct cr_ctx *ctx)
 	return ret;
 }
 
+int task_checkpointable(struct task_struct *t)
+{
+	if (t->state == TASK_DEAD) {
+		pr_warning("c/r: task %d is TASK_DEAD\n", task_pid_vnr(t));
+		return 0;
+	}
+
+	/* Verify that task is frozen, unless it is self-checkpoint */
+	if (t != current && !frozen(t))
+		return 0;
+
+	return 1;
+}
+
 static int cr_may_checkpoint_task(struct task_struct *t, struct cr_ctx *ctx)
 {
 	pr_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns));
 
-	if (t->state == TASK_DEAD) {
-		pr_warning("c/r: task %d is TASK_DEAD\n", task_pid_vnr(t));
+	/* verify that the task is checkpointable */
+	if (!task_checkpointable(t))
 		return -EAGAIN;
-	}
 
 	if (!ptrace_may_access(t, PTRACE_MODE_READ))
 		return -EPERM;
 
-	/* verify that the task is frozen (unless self) */
-	if (t != current && !frozen(t))
-		return -EBUSY;
-
 	/* FIXME: change this for nested containers */
 	if (task_nsproxy(t) != ctx->root_nsproxy)
 		return -EPERM;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index bde92c3..989ca93 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2501,6 +2501,13 @@ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
 	return 0;
 }
 
+#ifdef CONFIG_CHECKPOINT_RESTART
+static int proc_pid_checkpointable(struct task_struct *task, char *buffer)
+{
+	return sprintf(buffer, "%d\n", task_checkpointable(task));
+}
+#endif
+
 /*
  * Thread groups
  */
@@ -2580,6 +2587,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_TASK_IO_ACCOUNTING
 	INF("io",	S_IRUGO, proc_tgid_io_accounting),
 #endif
+#ifdef CONFIG_CHECKPOINT_RESTART
+	INF("checkpointable", S_IRUGO, proc_pid_checkpointable),
+#endif
 };
 
 static int proc_tgid_base_readdir(struct file * filp,
@@ -2915,6 +2925,9 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_TASK_IO_ACCOUNTING
 	INF("io",	S_IRUGO, proc_tid_io_accounting),
 #endif
+#ifdef CONFIG_CHECKPOINT_RESTART
+	INF("checkpointable", S_IRUGO, proc_pid_checkpointable),
+#endif
 };
 
 static int proc_tid_base_readdir(struct file * filp,
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 484be56..d1f53a6 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -142,6 +142,8 @@ static inline int cr_enabled(void)
 	return 1;
 }
 
+int task_checkpointable(struct task_struct *t);
+
 #else /* !CONFIG_CHECKPOINT_RESTART */
 
 static inline void files_deny_checkpointing(struct files_struct *files) {}
@@ -162,5 +164,10 @@ static inline int cr_enabled(void)
 	return 0;
 }
 
+static inline int task_checkpointable(struct task_struct *t)
+{
+	return 0;
+}
+
 #endif /* CONFIG_CHECKPOINT_RESTART */
 #endif /* _CHECKPOINT_CKPT_H_ */
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH 6/6] Explain reason for task being uncheckpointable
       [not found] ` <20090317174359.GA10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
                     ` (4 preceding siblings ...)
  2009-03-17 17:51   ` [PATCH 5/6] Define and use proc_pid_checkpointable() Sukadev Bhattiprolu
@ 2009-03-17 17:51   ` Sukadev Bhattiprolu
       [not found]     ` <20090317175149.GG10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-03-18  9:28   ` [PATCH 0/6] /proc/pid/checkpointable Oren Laadan
  6 siblings, 1 reply; 38+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-17 17:51 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers


From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Date: Sat, 14 Mar 2009 10:21:07 -0700
Subject: [PATCH 6/6] Explain reason for task being uncheckpointable

Try to give an useful message on why a task is uncheckpointable.

Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
 checkpoint/checkpoint.c    |   13 +++++++++----
 fs/proc/base.c             |   19 ++++++++++++++++++-
 include/linux/checkpoint.h |    7 +++++++
 3 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index ad0956c..c8c377d 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -264,18 +264,23 @@ static int cr_write_all_tasks(struct cr_ctx *ctx)
 	return ret;
 }
 
-int task_checkpointable(struct task_struct *t)
+int __task_checkpointable(struct task_struct *t)
 {
 	if (t->state == TASK_DEAD) {
 		pr_warning("c/r: task %d is TASK_DEAD\n", task_pid_vnr(t));
-		return 0;
+		return CR_DEAD;
 	}
 
 	/* Verify that task is frozen, unless it is self-checkpoint */
 	if (t != current && !frozen(t))
-		return 0;
+		return CR_NOT_FROZEN;
+
+	return CR_CHECKPOINTABLE;
+}
 
-	return 1;
+int task_checkpointable(struct task_struct *t)
+{
+	return __task_checkpointable(t) == CR_CHECKPOINTABLE;
 }
 
 static int cr_may_checkpoint_task(struct task_struct *t, struct cr_ctx *ctx)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 989ca93..6e4c07c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2502,9 +2502,26 @@ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
 }
 
 #ifdef CONFIG_CHECKPOINT_RESTART
+static char *explain_checkpointable(int reason)
+{
+	char *message;
+
+	switch(reason) {
+	case CR_CHECKPOINTABLE: message = "1"; break;
+	case CR_DEAD: 		message = "0 (dead)"; break;
+	case CR_NOT_FROZEN: 	message = "0 (not frozen)"; break;
+	default:		message = "0 (unknown)"; break;
+	}
+	return message;
+}
+
 static int proc_pid_checkpointable(struct task_struct *task, char *buffer)
 {
-	return sprintf(buffer, "%d\n", task_checkpointable(task));
+	int rc;
+
+	rc = __task_checkpointable(task);
+
+	return scnprintf(buffer, PAGE_SIZE, "%s\n", explain_checkpointable(rc));
 }
 #endif
 
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index d1f53a6..a1eba73 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -142,6 +142,13 @@ static inline int cr_enabled(void)
 	return 1;
 }
 
+enum cr_uncheckpointable_reason {
+	CR_CHECKPOINTABLE = 0,
+	CR_DEAD,
+	CR_NOT_FROZEN
+};
+
+int __task_checkpointable(struct task_struct *t);
 int task_checkpointable(struct task_struct *t);
 
 #else /* !CONFIG_CHECKPOINT_RESTART */
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH 6/6] Explain reason for task being uncheckpointable
       [not found]     ` <20090317175149.GG10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-17 18:52       ` Serge E. Hallyn
       [not found]         ` <20090317185228.GC1039-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 38+ messages in thread
From: Serge E. Hallyn @ 2009-03-17 18:52 UTC (permalink / raw)
  To: Sukadev Bhattiprolu; +Cc: Containers, David C. Hansen

Quoting Sukadev Bhattiprolu (sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org):
> 
> From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
> Date: Sat, 14 Mar 2009 10:21:07 -0700
> Subject: [PATCH 6/6] Explain reason for task being uncheckpointable
> 
> Try to give an useful message on why a task is uncheckpointable.

As I said somewhere else, I think this is what makes it worth
having a procfile (or debugfs file) over just doing a trial
sys_checkpoint() with an invalid fd.

However,

> Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
> ---
>  checkpoint/checkpoint.c    |   13 +++++++++----
>  fs/proc/base.c             |   19 ++++++++++++++++++-
>  include/linux/checkpoint.h |    7 +++++++
>  3 files changed, 34 insertions(+), 5 deletions(-)
> 
> diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
> index ad0956c..c8c377d 100644
> --- a/checkpoint/checkpoint.c
> +++ b/checkpoint/checkpoint.c
> @@ -264,18 +264,23 @@ static int cr_write_all_tasks(struct cr_ctx *ctx)
>  	return ret;
>  }
> 
> -int task_checkpointable(struct task_struct *t)
> +int __task_checkpointable(struct task_struct *t)
>  {
>  	if (t->state == TASK_DEAD) {
>  		pr_warning("c/r: task %d is TASK_DEAD\n", task_pid_vnr(t));
> -		return 0;
> +		return CR_DEAD;
>  	}
> 
>  	/* Verify that task is frozen, unless it is self-checkpoint */
>  	if (t != current && !frozen(t))
> -		return 0;
> +		return CR_NOT_FROZEN;
> +
> +	return CR_CHECKPOINTABLE;
> +}
> 
> -	return 1;
> +int task_checkpointable(struct task_struct *t)
> +{
> +	return __task_checkpointable(t) == CR_CHECKPOINTABLE;
>  }
> 
>  static int cr_may_checkpoint_task(struct task_struct *t, struct cr_ctx *ctx)
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 989ca93..6e4c07c 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -2502,9 +2502,26 @@ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
>  }
> 
>  #ifdef CONFIG_CHECKPOINT_RESTART
> +static char *explain_checkpointable(int reason)
> +{
> +	char *message;
> +
> +	switch(reason) {
> +	case CR_CHECKPOINTABLE: message = "1"; break;
> +	case CR_DEAD: 		message = "0 (dead)"; break;
> +	case CR_NOT_FROZEN: 	message = "0 (not frozen)"; break;
> +	default:		message = "0 (unknown)"; break;

I don't think splitting it up this way is going to work long-term.
It means you need one code for each type of message, and can't give
any more detail beyond that (i.e. the value of
current->nsproxy->uts_ns.refcount, or whatever).

I think it would be better to pass a buffer into __task_checkpointable,
and let task_checkpointable() pass in a NULL buffer meaning "I
(sys_checkpoint) don't care about the explanation".

> +	}
> +	return message;
> +}
> +
>  static int proc_pid_checkpointable(struct task_struct *task, char *buffer)
>  {
> -	return sprintf(buffer, "%d\n", task_checkpointable(task));
> +	int rc;
> +
> +	rc = __task_checkpointable(task);
> +
> +	return scnprintf(buffer, PAGE_SIZE, "%s\n", explain_checkpointable(rc));
>  }
>  #endif
> 
> diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
> index d1f53a6..a1eba73 100644
> --- a/include/linux/checkpoint.h
> +++ b/include/linux/checkpoint.h
> @@ -142,6 +142,13 @@ static inline int cr_enabled(void)
>  	return 1;
>  }
> 
> +enum cr_uncheckpointable_reason {
> +	CR_CHECKPOINTABLE = 0,
> +	CR_DEAD,
> +	CR_NOT_FROZEN
> +};
> +
> +int __task_checkpointable(struct task_struct *t);
>  int task_checkpointable(struct task_struct *t);
> 
>  #else /* !CONFIG_CHECKPOINT_RESTART */
> -- 
> 1.5.2.5
> 
> _______________________________________________
> Containers mailing list
> Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> https://lists.linux-foundation.org/mailman/listinfo/containers

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 6/6] Explain reason for task being uncheckpointable
       [not found]         ` <20090317185228.GC1039-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-18  6:50           ` Sukadev Bhattiprolu
  0 siblings, 0 replies; 38+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-18  6:50 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Containers, David C. Hansen

Serge E. Hallyn [serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org] wrote:
| >  #ifdef CONFIG_CHECKPOINT_RESTART
| > +static char *explain_checkpointable(int reason)
| > +{
| > +	char *message;
| > +
| > +	switch(reason) {
| > +	case CR_CHECKPOINTABLE: message = "1"; break;
| > +	case CR_DEAD: 		message = "0 (dead)"; break;
| > +	case CR_NOT_FROZEN: 	message = "0 (not frozen)"; break;
| > +	default:		message = "0 (unknown)"; break;
| 
| I don't think splitting it up this way is going to work long-term.
| It means you need one code for each type of message, and can't give
| any more detail beyond that (i.e. the value of
| current->nsproxy->uts_ns.refcount, or whatever).
| 
| I think it would be better to pass a buffer into __task_checkpointable,
| and let task_checkpointable() pass in a NULL buffer meaning "I
| (sys_checkpoint) don't care about the explanation".

Ok. I had tried it before, but was not happy about __task_checkpointable()
filling in a message buffer,  but I see your point. Here is an updated patch.

---

From 73dd3ec01320d62881cb5296aa4d68667571ef4c Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Date: Tue, 17 Mar 2009 16:15:16 -0700
Subject: [PATCH 6/6] Explain reason for task being uncheckpointable

Try to give an useful message on why a task is uncheckpointable.

Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
 checkpoint/checkpoint.c    |   16 ++++++++++++++--
 fs/proc/base.c             |    4 +++-
 include/linux/checkpoint.h |    1 +
 3 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index ad0956c..b4dbb22 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -264,19 +264,31 @@ static int cr_write_all_tasks(struct cr_ctx *ctx)
 	return ret;
 }
 
-int task_checkpointable(struct task_struct *t)
+int __task_checkpointable(struct task_struct *t, char *msg_buf, int len)
 {
 	if (t->state == TASK_DEAD) {
+		if (msg_buf)
+			scnprintf(msg_buf, len, "0 (task dead)\n");
 		pr_warning("c/r: task %d is TASK_DEAD\n", task_pid_vnr(t));
 		return 0;
 	}
 
 	/* Verify that task is frozen, unless it is self-checkpoint */
-	if (t != current && !frozen(t))
+	if (t != current && !frozen(t)) {
+		if (msg_buf)
+			scnprintf(msg_buf, len, "0 (task not frozen)\n");
 		return 0;
+	}
+
+	if (msg_buf)
+		scnprintf(msg_buf, len, "1\n");
 
 	return 1;
 }
+int task_checkpointable(struct task_struct *t)
+{
+	return __task_checkpointable(t, NULL, 0);
+}
 
 static int cr_may_checkpoint_task(struct task_struct *t, struct cr_ctx *ctx)
 {
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 989ca93..2d054b7 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2504,7 +2504,9 @@ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
 #ifdef CONFIG_CHECKPOINT_RESTART
 static int proc_pid_checkpointable(struct task_struct *task, char *buffer)
 {
-	return sprintf(buffer, "%d\n", task_checkpointable(task));
+	__task_checkpointable(task, buffer, PAGE_SIZE);
+
+	return strlen(buffer);
 }
 #endif
 
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index d1f53a6..44adb0b 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -142,6 +142,7 @@ static inline int cr_enabled(void)
 	return 1;
 }
 
+int __task_checkpointable(struct task_struct *t, char *msg_buf, int len);
 int task_checkpointable(struct task_struct *t);
 
 #else /* !CONFIG_CHECKPOINT_RESTART */
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH 0/6] /proc/pid/checkpointable
       [not found] ` <20090317174359.GA10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
                     ` (5 preceding siblings ...)
  2009-03-17 17:51   ` [PATCH 6/6] Explain reason for task being uncheckpointable Sukadev Bhattiprolu
@ 2009-03-18  9:28   ` Oren Laadan
  6 siblings, 0 replies; 38+ messages in thread
From: Oren Laadan @ 2009-03-18  9:28 UTC (permalink / raw)
  To: Sukadev Bhattiprolu; +Cc: Containers, David C. Hansen


Catching up chronologically, I sent these to the previous version of your
patch - but it's still relevant:

https://lists.linux-foundation.org/pipermail/containers/2009-March/016307.html
https://lists.linux-foundation.org/pipermail/containers/2009-March/016308.html

Oren.

Sukadev Bhattiprolu wrote:
> Started out implementing '/proc/pid/checkpointable' which reports 0 or 1
> depending on whether a process is checkpointable atm. But realized that
> some patches that were discussed earlier are missing from Dave Hansen's
> tree.
> 
> This patchset includes those missing patches plus two new patches that
> create /proc/pid/checkpointable.
> 
>         [PATCH 1/6] Checkpoint multiple processes
>         [PATCH 2/6] Restart multiple processes
>         [PATCH 3/6] Check 'may_checkpoint()' early
>         [PATCH 4/6] Deny external checkpoint unless task is frozen
>         [PATCH 5/6] Define and use proc_pid_checkpointable()
>         [PATCH 6/6] Explain reason for task being uncheckpointable
> 

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 3/6] Check 'may_checkpoint()' early
       [not found]     ` <20090317175013.GD10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-26  5:58       ` Sukadev Bhattiprolu
       [not found]         ` <20090326055840.GA8220-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 38+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-26  5:58 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Containers


Oren, 

I see that patch 3/6 (deny external checkpoint unless frozen) in this
set was merged in v14, but do you have any plans/objections to merging
this optimization patch ? Serge had acked an earlier version of this
patch.

Suka

Sukadev Bhattiprolu [sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org] wrote:
| 
| From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
| Date: Thu, 12 Mar 2009 14:19:54 -0700
| Subject: [PATCH 3/6] Check 'may_checkpoint()' early
| 
| We currently check if a task is checkpointable when writing the task
| information to checkpoint file. The small downside of doing this check
| late is that we may have processed several processes in the tree before
| hitting one that cannot be checkpointed.
| 
| We anyway walk the process tree we are checkpointing earlier, while
| counting the tasks. We could check if all processes in the tree are
| checkpointable at that time and fail early if any of them are not.
| Since the process tree should be frozen, checking earlier should not
| matter ?
| 
| For now, the patch leaves the existing check in cr_write_pids(). We
| can remove that later.
| 
| Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
| ---
|  checkpoint/checkpoint.c |    9 +++++++++
|  1 files changed, 9 insertions(+), 0 deletions(-)
| 
| diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
| index ed4fd3d..dae9b97 100644
| --- a/checkpoint/checkpoint.c
| +++ b/checkpoint/checkpoint.c
| @@ -341,11 +341,18 @@ static int cr_tree_count_tasks(struct cr_ctx *ctx)
|  	struct task_struct **tasks_arr = ctx->tasks_arr;
|  	int tasks_nr = ctx->tasks_nr;
|  	int nr = 0;
| +	int ret;
| 
|  	read_lock(&tasklist_lock);
| 
|  	/* count tasks via DFS scan of the tree */
|  	while (1) {
| +		ret = cr_may_checkpoint_task(task, ctx);
| +		if (ret < 0) {
| +			nr = ret;
| +			break;
| +		}
| +
|  		if (tasks_arr) {
|  			/* unlikely, but ... */
|  			if (nr == tasks_nr)
| @@ -401,6 +408,8 @@ static int cr_build_tree(struct cr_ctx *ctx)
| 
|  	/* count tasks (no side effects) */
|  	n = cr_tree_count_tasks(ctx);
| +	if (n < 0)
| +		return n;
| 
|  	ctx->tasks_nr = n;
|  	ctx->tasks_arr = kzalloc(n * sizeof(*ctx->tasks_arr), GFP_KERNEL);
| -- 
| 1.5.2.5
| 
| _______________________________________________
| Containers mailing list
| Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
| https://lists.linux-foundation.org/mailman/listinfo/containers

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 3/6] Check 'may_checkpoint()' early
       [not found]         ` <20090326055840.GA8220-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-26 13:25           ` Serge E. Hallyn
  0 siblings, 0 replies; 38+ messages in thread
From: Serge E. Hallyn @ 2009-03-26 13:25 UTC (permalink / raw)
  To: Sukadev Bhattiprolu; +Cc: Containers

Quoting Sukadev Bhattiprolu (sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org):
> 
> Oren, 
> 
> I see that patch 3/6 (deny external checkpoint unless frozen) in this
> set was merged in v14, but do you have any plans/objections to merging
> this optimization patch ? Serge had acked an earlier version of this
> patch.
> 
> Suka
> 
> Sukadev Bhattiprolu [sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org] wrote:
> | 
> | From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
> | Date: Thu, 12 Mar 2009 14:19:54 -0700
> | Subject: [PATCH 3/6] Check 'may_checkpoint()' early
> | 
> | We currently check if a task is checkpointable when writing the task
> | information to checkpoint file. The small downside of doing this check
> | late is that we may have processed several processes in the tree before
> | hitting one that cannot be checkpointed.
> | 
> | We anyway walk the process tree we are checkpointing earlier, while
> | counting the tasks. We could check if all processes in the tree are
> | checkpointable at that time and fail early if any of them are not.
> | Since the process tree should be frozen, checking earlier should not
> | matter ?
> | 
> | For now, the patch leaves the existing check in cr_write_pids(). We
> | can remove that later.
> | 
> | Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
> | ---
> |  checkpoint/checkpoint.c |    9 +++++++++
> |  1 files changed, 9 insertions(+), 0 deletions(-)
> | 
> | diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
> | index ed4fd3d..dae9b97 100644
> | --- a/checkpoint/checkpoint.c
> | +++ b/checkpoint/checkpoint.c
> | @@ -341,11 +341,18 @@ static int cr_tree_count_tasks(struct cr_ctx *ctx)
> |  	struct task_struct **tasks_arr = ctx->tasks_arr;
> |  	int tasks_nr = ctx->tasks_nr;
> |  	int nr = 0;
> | +	int ret;
> | 
> |  	read_lock(&tasklist_lock);
> | 
> |  	/* count tasks via DFS scan of the tree */
> |  	while (1) {
> | +		ret = cr_may_checkpoint_task(task, ctx);
> | +		if (ret < 0) {
> | +			nr = ret;
> | +			break;
> | +		}
> | +
> |  		if (tasks_arr) {
> |  			/* unlikely, but ... */
> |  			if (nr == tasks_nr)
> | @@ -401,6 +408,8 @@ static int cr_build_tree(struct cr_ctx *ctx)
> | 
> |  	/* count tasks (no side effects) */
> |  	n = cr_tree_count_tasks(ctx);
> | +	if (n < 0)
> | +		return n;

This hunk is already in v14.

-serge

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 0/6] /proc/pid/checkpointable
       [not found]                                           ` <49CB504A.2080400-GANU6spQydw@public.gmane.org>
@ 2009-03-26 13:29                                             ` Serge E. Hallyn
  0 siblings, 0 replies; 38+ messages in thread
From: Serge E. Hallyn @ 2009-03-26 13:29 UTC (permalink / raw)
  To: Cedric Le Goater
  Cc: Containers, Sukadev Bhattiprolu, David C. Hansen,
	Eric W. Biederman, Dave Hansen

Quoting Cedric Le Goater (legoater-GANU6spQydw@public.gmane.org):
> Serge E. Hallyn wrote:
> > Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
> >> Dave Hansen <dave-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org> writes:
> >>
> >>> On Wed, 2009-03-18 at 13:03 -0700, Mike Waychison wrote:
> >>>> Polluting the dmesg buffer with messages from common failures (consider 
> >>>> a multi-user cluster where checkpoints may or may not succeed) isn't 
> >>>> very useful.
> >>> Yeah, I've already gotten an earful from Serge and Dan S. about this. :)
> >>>
> >>> Serge suggested that, perhaps, the audit framework could be used.  We
> >>> might also use an ftrace buffer if we want to keep a whole ton of
> >>> messages around, too.
> >>>
> >>> dmesg is definitely not workable long-term at all.
> >> How about having place holder objects in the generated checkpoint.
> >> Then instead of having a failure you have a non-restoreable checkpoint.
> >> But you know which fd, or which mmaped region, or which other thing
> >> is causing the problem and if you want more information you can
> >> look at that resource.
> >>
> >> That gives user space the freedom and scrub out the non-checkpointable
> >> bits and replace them with something like /dev/null so that we can
> >> continue on and restore the checkpoint anyway, if we think our
> >> app can cope with some things going away.
> >>
> >> Eric
> > 
> > I like this idea.
> 
> yes. This is something required to replace stdios for example, when 
> you execute an application under ssh, checkpoint and then restart on 
> an other host. This a topical scenario for a batch manager in an HPC 
> environment. 
> 
> identified resources of the container are tracked to be ignored by 
> checkpoint and to be replaced by similar ones at restart.

So in that case how are the resources identified?  Does the user
specify them at checkpoint?  Do you look for specific strings
(/dev/pts/*) at restart?

-serge

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 0/6] /proc/pid/checkpointable
       [not found]                                       ` <20090325172938.GA18957-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-26  9:52                                         ` Cedric Le Goater
       [not found]                                           ` <49CB504A.2080400-GANU6spQydw@public.gmane.org>
  0 siblings, 1 reply; 38+ messages in thread
From: Cedric Le Goater @ 2009-03-26  9:52 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Containers, Sukadev Bhattiprolu, David C. Hansen,
	Eric W. Biederman, Dave Hansen

Serge E. Hallyn wrote:
> Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
>> Dave Hansen <dave-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org> writes:
>>
>>> On Wed, 2009-03-18 at 13:03 -0700, Mike Waychison wrote:
>>>> Polluting the dmesg buffer with messages from common failures (consider 
>>>> a multi-user cluster where checkpoints may or may not succeed) isn't 
>>>> very useful.
>>> Yeah, I've already gotten an earful from Serge and Dan S. about this. :)
>>>
>>> Serge suggested that, perhaps, the audit framework could be used.  We
>>> might also use an ftrace buffer if we want to keep a whole ton of
>>> messages around, too.
>>>
>>> dmesg is definitely not workable long-term at all.
>> How about having place holder objects in the generated checkpoint.
>> Then instead of having a failure you have a non-restoreable checkpoint.
>> But you know which fd, or which mmaped region, or which other thing
>> is causing the problem and if you want more information you can
>> look at that resource.
>>
>> That gives user space the freedom and scrub out the non-checkpointable
>> bits and replace them with something like /dev/null so that we can
>> continue on and restore the checkpoint anyway, if we think our
>> app can cope with some things going away.
>>
>> Eric
> 
> I like this idea.

yes. This is something required to replace stdios for example, when 
you execute an application under ssh, checkpoint and then restart on 
an other host. This a topical scenario for a batch manager in an HPC 
environment. 

identified resources of the container are tracked to be ignored by 
checkpoint and to be replaced by similar ones at restart.


C.

> Subystems which are temporarily entirely unsupported (like sysvipc)
> would need at least a dummy section in the format wherein we can at
> least say 'unsupported', otherwise we'll still just get a meaningless
> -EINVAL.
>
> I actually got bitten yesterday by trying to checkpoint a task that
> wasn't frozen.  I forgot v14 had that check, and my failures (a
> segfault actually) weren't helpful.
> 
> -serge
> _______________________________________________
> Containers mailing list
> Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> https://lists.linux-foundation.org/mailman/listinfo/containers
> 

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 0/6] /proc/pid/checkpointable
       [not found]                                   ` <m17i2dx00b.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
@ 2009-03-25 17:29                                     ` Serge E. Hallyn
       [not found]                                       ` <20090325172938.GA18957-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 38+ messages in thread
From: Serge E. Hallyn @ 2009-03-25 17:29 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Containers, Sukadev Bhattiprolu, David C. Hansen, Dave Hansen

Quoting Eric W. Biederman (ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org):
> Dave Hansen <dave-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org> writes:
> 
> > On Wed, 2009-03-18 at 13:03 -0700, Mike Waychison wrote:
> >> Polluting the dmesg buffer with messages from common failures (consider 
> >> a multi-user cluster where checkpoints may or may not succeed) isn't 
> >> very useful.
> >
> > Yeah, I've already gotten an earful from Serge and Dan S. about this. :)
> >
> > Serge suggested that, perhaps, the audit framework could be used.  We
> > might also use an ftrace buffer if we want to keep a whole ton of
> > messages around, too.
> >
> > dmesg is definitely not workable long-term at all.
> 
> How about having place holder objects in the generated checkpoint.
> Then instead of having a failure you have a non-restoreable checkpoint.
> But you know which fd, or which mmaped region, or which other thing
> is causing the problem and if you want more information you can
> look at that resource.
> 
> That gives user space the freedom and scrub out the non-checkpointable
> bits and replace them with something like /dev/null so that we can
> continue on and restore the checkpoint anyway, if we think our
> app can cope with some things going away.
> 
> Eric

I like this idea.

Subystems which are temporarily entirely unsupported (like sysvipc)
would need at least a dummy section in the format wherein we can at
least say 'unsupported', otherwise we'll still just get a meaningless
-EINVAL.

I actually got bitten yesterday by trying to checkpoint a task that
wasn't frozen.  I forgot v14 had that check, and my failures (a
segfault actually) weren't helpful.

-serge

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 0/6] /proc/pid/checkpointable
  2009-03-18 20:13                               ` Dave Hansen
@ 2009-03-25 12:25                                 ` Eric W. Biederman
       [not found]                                   ` <m17i2dx00b.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
  0 siblings, 1 reply; 38+ messages in thread
From: Eric W. Biederman @ 2009-03-25 12:25 UTC (permalink / raw)
  To: Dave Hansen; +Cc: Containers, Sukadev Bhattiprolu, David C. Hansen

Dave Hansen <dave-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org> writes:

> On Wed, 2009-03-18 at 13:03 -0700, Mike Waychison wrote:
>> Polluting the dmesg buffer with messages from common failures (consider 
>> a multi-user cluster where checkpoints may or may not succeed) isn't 
>> very useful.
>
> Yeah, I've already gotten an earful from Serge and Dan S. about this. :)
>
> Serge suggested that, perhaps, the audit framework could be used.  We
> might also use an ftrace buffer if we want to keep a whole ton of
> messages around, too.
>
> dmesg is definitely not workable long-term at all.

How about having place holder objects in the generated checkpoint.
Then instead of having a failure you have a non-restoreable checkpoint.
But you know which fd, or which mmaped region, or which other thing
is causing the problem and if you want more information you can
look at that resource.

That gives user space the freedom and scrub out the non-checkpointable
bits and replace them with something like /dev/null so that we can
continue on and restore the checkpoint anyway, if we think our
app can cope with some things going away.

Eric

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 0/6] /proc/pid/checkpointable
       [not found]                             ` <49C153AF.7070504-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
@ 2009-03-18 20:13                               ` Dave Hansen
  2009-03-25 12:25                                 ` Eric W. Biederman
  0 siblings, 1 reply; 38+ messages in thread
From: Dave Hansen @ 2009-03-18 20:13 UTC (permalink / raw)
  To: Mike Waychison; +Cc: Containers, Sukadev Bhattiprolu, David C. Hansen

On Wed, 2009-03-18 at 13:03 -0700, Mike Waychison wrote:
> Polluting the dmesg buffer with messages from common failures (consider 
> a multi-user cluster where checkpoints may or may not succeed) isn't 
> very useful.

Yeah, I've already gotten an earful from Serge and Dan S. about this. :)

Serge suggested that, perhaps, the audit framework could be used.  We
might also use an ftrace buffer if we want to keep a whole ton of
messages around, too.

dmesg is definitely not workable long-term at all.

-- Dave

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 0/6] /proc/pid/checkpointable
       [not found]                         ` <49C1347F.3000601-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-03-18 20:03                           ` Mike Waychison
       [not found]                             ` <49C153AF.7070504-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 38+ messages in thread
From: Mike Waychison @ 2009-03-18 20:03 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Containers, Sukadev Bhattiprolu, David C. Hansen

Oren Laadan wrote:
> 
> Serge E. Hallyn wrote:
>> Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>>> Serge E. Hallyn wrote:
>>>> Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>>>>> Sukadev Bhattiprolu wrote:
>>>>>> From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
>>>>>> Date: Fri, 13 Mar 2009 17:25:42 -0700
>>>>>> Subject: [PATCH 5/6] Define and use proc_pid_checkpointable()
>>>>>>
>>>>>> Create a proc file, /proc/pid/checkpointable, which shows '1' if
>>>>>> task is checkpointable and '0' if it is not.
>>>>>>
>>>>>> To determine whether a task is checkpointable, the handler for this
>>>>>> new proc file, shares the same code with sys_checkpoint().
>>>> Hey Oren,
>>>>
>>>> 3 counter-points:
>>>>
>>>>> I still don't understand why we would like to do it this way.
>>>>>
>>>>> First, it makes little sense to do it per-task, because we are supposed
>>>>> to checkpoint an entire container.
>>>> Yes we need per-container info too.  Actually, per-checkpoint-job-init,
>>>> so if we send pids in for that, it should return false if we send in the
>>>> pid of a task which isn't a proper checkpoint-job-init.
>>>>
>>>> But we also want the info per-task, for debugging info.
>>>>
>>> My suggestions works for this two: we add a flag CR_CTX_DRYRUN; a task
>>> can ask to checkpoint itself, or another task, with CR_CTX_DRYRUN and
>>> the checkpoint code runs without actual effect. (If we don't want to
>>> expose the actual flag to userspace, then we simply use it in an
>>> implementation of a /proc/PID/checkpointable operation).
>> Hmm, so if we pass in CR_CTX_DRYRUN, then the fd can point to a file
>> wherein to store a text represenation of the reason?
>>
>> Dave will probably hate it, but it could be worse...
> 
> Either that.
> 
> Or continue using a debugfs interface, except that the  implementation
> will go through an internal interface as I suggest.
> 
> Or spit the reason on the kernel console so the user can check dmesg
> for it (like when 'modprobe' fails).

It'd be nice if the error message could be gotten directly from the 
call.  Would something like a new packet in the output stream 
(cr_hdr->type == CR_HDR_FAILURE) with something descriptive in the body 
of the packet make sense?  That could then be scanned for when 
sys_checkpoint fails..

Polluting the dmesg buffer with messages from common failures (consider 
a multi-user cluster where checkpoints may or may not succeed) isn't 
very useful.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 0/6] /proc/pid/checkpointable
       [not found]                       ` <49C133F9.2020505-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-03-18 18:06                         ` Dave Hansen
  0 siblings, 0 replies; 38+ messages in thread
From: Dave Hansen @ 2009-03-18 18:06 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Containers, Sukadev Bhattiprolu

On Wed, 2009-03-18 at 13:48 -0400, Oren Laadan wrote:
> Dave Hansen wrote:
> > On Wed, 2009-03-18 at 12:16 -0400, Oren Laadan wrote:
> >> My suggestions works for this two: we add a flag CR_CTX_DRYRUN; a task
> >> can ask to checkpoint itself, or another task, with CR_CTX_DRYRUN and
> >> the checkpoint code runs without actual effect. (If we don't want to
> >> expose the actual flag to userspace, then we simply use it in an
> >> implementation of a /proc/PID/checkpointable operation).
> > 
> > This mostly falls short answering the question "Where/when did I go
> > wrong?"  Personally, I think that is critical for getting good bug
> > reports out of normal Joes that might not really be interested in c/r
> > development itself.  It is like lockdep.  The guys/gals posting those
> > reports really don't know about kernel locking, but they are able to
> > improve it anyway.
> > 
> 
> As long as we have the descriptive text accompanied, it will answer the
> "where/why" question.
> 
> I can't think of an example where for me, as the developer, the "when"
> question was important in finding and fixing an unsupported corner.
> 
> Can you think of examples where this would be that much more informative
> then simply answering the "where/what" question ?

I think it is crucially important if a ->may_checkpoint-style flag is
persistent and sticky.

If we're *just* considering moment-by-moment state it doesn't matter at
all.

How about I rephrase: "Where/when did I first go wrong?"

I consider that an important question to answer.  What was the first
thing that I did that made this thing uncheckpointable?  If I want my
container to always be checkpointable, how else do I find race
conditions that may may it uncheckpointable for a moment?

Yep, it requires extra code.  But it also completely changes the way in
which we can go after features.  Oh, well.  You don't like it.  It is
different from how Zap did it.  Let's drop it.

-- Dave

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 0/6] /proc/pid/checkpointable
       [not found]                     ` <20090318171840.GA29523-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-18 17:50                       ` Oren Laadan
       [not found]                         ` <49C1347F.3000601-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  0 siblings, 1 reply; 38+ messages in thread
From: Oren Laadan @ 2009-03-18 17:50 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Containers, Sukadev Bhattiprolu, David C. Hansen



Serge E. Hallyn wrote:
> Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>>
>> Serge E. Hallyn wrote:
>>> Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>>>> Sukadev Bhattiprolu wrote:
>>>>> From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
>>>>> Date: Fri, 13 Mar 2009 17:25:42 -0700
>>>>> Subject: [PATCH 5/6] Define and use proc_pid_checkpointable()
>>>>>
>>>>> Create a proc file, /proc/pid/checkpointable, which shows '1' if
>>>>> task is checkpointable and '0' if it is not.
>>>>>
>>>>> To determine whether a task is checkpointable, the handler for this
>>>>> new proc file, shares the same code with sys_checkpoint().
>>> Hey Oren,
>>>
>>> 3 counter-points:
>>>
>>>> I still don't understand why we would like to do it this way.
>>>>
>>>> First, it makes little sense to do it per-task, because we are supposed
>>>> to checkpoint an entire container.
>>> Yes we need per-container info too.  Actually, per-checkpoint-job-init,
>>> so if we send pids in for that, it should return false if we send in the
>>> pid of a task which isn't a proper checkpoint-job-init.
>>>
>>> But we also want the info per-task, for debugging info.
>>>
>> My suggestions works for this two: we add a flag CR_CTX_DRYRUN; a task
>> can ask to checkpoint itself, or another task, with CR_CTX_DRYRUN and
>> the checkpoint code runs without actual effect. (If we don't want to
>> expose the actual flag to userspace, then we simply use it in an
>> implementation of a /proc/PID/checkpointable operation).
> 
> Hmm, so if we pass in CR_CTX_DRYRUN, then the fd can point to a file
> wherein to store a text represenation of the reason?
> 
> Dave will probably hate it, but it could be worse...

Either that.

Or continue using a debugfs interface, except that the  implementation
will go through an internal interface as I suggest.

Or spit the reason on the kernel console so the user can check dmesg
for it (like when 'modprobe' fails).

Oren.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 0/6] /proc/pid/checkpointable
  2009-03-18 16:24                   ` Dave Hansen
@ 2009-03-18 17:48                     ` Oren Laadan
       [not found]                       ` <49C133F9.2020505-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  0 siblings, 1 reply; 38+ messages in thread
From: Oren Laadan @ 2009-03-18 17:48 UTC (permalink / raw)
  To: Dave Hansen; +Cc: Containers, Sukadev Bhattiprolu



Dave Hansen wrote:
> On Wed, 2009-03-18 at 12:16 -0400, Oren Laadan wrote:
>> My suggestions works for this two: we add a flag CR_CTX_DRYRUN; a task
>> can ask to checkpoint itself, or another task, with CR_CTX_DRYRUN and
>> the checkpoint code runs without actual effect. (If we don't want to
>> expose the actual flag to userspace, then we simply use it in an
>> implementation of a /proc/PID/checkpointable operation).
> 
> This mostly falls short answering the question "Where/when did I go
> wrong?"  Personally, I think that is critical for getting good bug
> reports out of normal Joes that might not really be interested in c/r
> development itself.  It is like lockdep.  The guys/gals posting those
> reports really don't know about kernel locking, but they are able to
> improve it anyway.
> 

As long as we have the descriptive text accompanied, it will answer the
"where/why" question.

I can't think of an example where for me, as the developer, the "when"
question was important in finding and fixing an unsupported corner.

Can you think of examples where this would be that much more informative
then simply answering the "where/what" question ?

Oren.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 0/6] /proc/pid/checkpointable
       [not found]                 ` <49C1201A.3050604-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-03-18 17:18                   ` Serge E. Hallyn
       [not found]                     ` <20090318171840.GA29523-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 38+ messages in thread
From: Serge E. Hallyn @ 2009-03-18 17:18 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Containers, Sukadev Bhattiprolu, David C. Hansen

Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
> 
> 
> Serge E. Hallyn wrote:
> > Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
> >>
> >> Sukadev Bhattiprolu wrote:
> >>> From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
> >>> Date: Fri, 13 Mar 2009 17:25:42 -0700
> >>> Subject: [PATCH 5/6] Define and use proc_pid_checkpointable()
> >>>
> >>> Create a proc file, /proc/pid/checkpointable, which shows '1' if
> >>> task is checkpointable and '0' if it is not.
> >>>
> >>> To determine whether a task is checkpointable, the handler for this
> >>> new proc file, shares the same code with sys_checkpoint().
> > 
> > Hey Oren,
> > 
> > 3 counter-points:
> > 
> >> I still don't understand why we would like to do it this way.
> >>
> >> First, it makes little sense to do it per-task, because we are supposed
> >> to checkpoint an entire container.
> > 
> > Yes we need per-container info too.  Actually, per-checkpoint-job-init,
> > so if we send pids in for that, it should return false if we send in the
> > pid of a task which isn't a proper checkpoint-job-init.
> > 
> > But we also want the info per-task, for debugging info.
> > 
> 
> My suggestions works for this two: we add a flag CR_CTX_DRYRUN; a task
> can ask to checkpoint itself, or another task, with CR_CTX_DRYRUN and
> the checkpoint code runs without actual effect. (If we don't want to
> expose the actual flag to userspace, then we simply use it in an
> implementation of a /proc/PID/checkpointable operation).

Hmm, so if we pass in CR_CTX_DRYRUN, then the fd can point to a file
wherein to store a text represenation of the reason?

Dave will probably hate it, but it could be worse...

-serge

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 0/6] /proc/pid/checkpointable
       [not found]                 ` <49C11E61.4010505-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-03-18 16:24                   ` Dave Hansen
  2009-03-18 17:48                     ` Oren Laadan
  0 siblings, 1 reply; 38+ messages in thread
From: Dave Hansen @ 2009-03-18 16:24 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Containers, Sukadev Bhattiprolu

On Wed, 2009-03-18 at 12:16 -0400, Oren Laadan wrote:
> My suggestions works for this two: we add a flag CR_CTX_DRYRUN; a task
> can ask to checkpoint itself, or another task, with CR_CTX_DRYRUN and
> the checkpoint code runs without actual effect. (If we don't want to
> expose the actual flag to userspace, then we simply use it in an
> implementation of a /proc/PID/checkpointable operation).

This mostly falls short answering the question "Where/when did I go
wrong?"  Personally, I think that is critical for getting good bug
reports out of normal Joes that might not really be interested in c/r
development itself.  It is like lockdep.  The guys/gals posting those
reports really don't know about kernel locking, but they are able to
improve it anyway.

-- Dave

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 0/6] /proc/pid/checkpointable
       [not found]             ` <20090318135953.GE22636-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-03-18 16:16               ` Oren Laadan
@ 2009-03-18 16:23               ` Oren Laadan
       [not found]                 ` <49C1201A.3050604-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  1 sibling, 1 reply; 38+ messages in thread
From: Oren Laadan @ 2009-03-18 16:23 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Containers, Sukadev Bhattiprolu, David C. Hansen



Serge E. Hallyn wrote:
> Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>>
>> Sukadev Bhattiprolu wrote:
>>> From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
>>> Date: Fri, 13 Mar 2009 17:25:42 -0700
>>> Subject: [PATCH 5/6] Define and use proc_pid_checkpointable()
>>>
>>> Create a proc file, /proc/pid/checkpointable, which shows '1' if
>>> task is checkpointable and '0' if it is not.
>>>
>>> To determine whether a task is checkpointable, the handler for this
>>> new proc file, shares the same code with sys_checkpoint().
> 
> Hey Oren,
> 
> 3 counter-points:
> 
>> I still don't understand why we would like to do it this way.
>>
>> First, it makes little sense to do it per-task, because we are supposed
>> to checkpoint an entire container.
> 
> Yes we need per-container info too.  Actually, per-checkpoint-job-init,
> so if we send pids in for that, it should return false if we send in the
> pid of a task which isn't a proper checkpoint-job-init.
> 
> But we also want the info per-task, for debugging info.
> 

My suggestions works for this two: we add a flag CR_CTX_DRYRUN; a task
can ask to checkpoint itself, or another task, with CR_CTX_DRYRUN and
the checkpoint code runs without actual effect. (If we don't want to
expose the actual flag to userspace, then we simply use it in an
implementation of a /proc/PID/checkpointable operation).

> I don't know how to represent those two cases, though.
> 
> Also debugfs may be a more appropriate medium.

Yes, I think that's better the /proc (which is then carved in stone ...)

> 
>> Second, what's wrong with doing a "dry" checkpoint on the container (or
>> if you prefer, the task, for what it's worth), that will not buffer nor
>> write out any data - just say "yes" or "no" ?
> 
> You can't get a text explanation like 'fd 4 (/sys/class/net/eth0) is not
> checkpointable'.  That's what's wrong with it.

I'm all for it.

> 
>> (we could use a flag "CR_CTX_DRYRUN" when calling sys_checkpoint() for
>> this, and test for this flag in, say, kwrite/kread).
>>
>> After all, we don't expect applications or users to continuously and
>> repeatedly test if they can checkpoint, so it isn't performance critical.
>> So we simply reuse the existing code.
> 
> No, but we do expect someone trying to checkpoint their job and failing
> to be curious as to why.

I'm not opposing the idea of a descriptive text message - on the contrary,
and I have supported this in earlier emails.

However, I suggest an alternative implementation to the approach in this
patch set (or, more precisely, a generalization).

And I argue that we  don't need anything beyond that (a la Ingo's
may_checkpoint), because it does not add enough value to justify the extra
code, efforts and maintenance.

> 
>> This would also catch cases where we can't checkpoint because the kernel
>> is low on memory - which wouldn't show up otherwise.
>>
>> And in any case, this is orthogonal to what Dave is pushing, following
>> Ingo's comment, to know when a task _becomes_ not-checkpointable. (And
>> in any case, I think our time is better spent on adding functionality
>> instead).
> 
> I think Suka's patch is small enough that there's not a lot of pursuing
> going on.  Dave's may_checkpoint is a lot more ambitious.
> 
> Just to be clear - are you saying you think both this patch and Dave's
> may_checkpoint patchsets ought to be delayed, or just Suka's, or just
> Dave's?  :)
> 

I think Suka's patch is the right way to go - and that we can generalize
the approach to test for a "dryrun" attribute (e.g. on ctx->flags) throughout
the checkpointing code.

Dave's patch has two parts now: the concept of "may_checkpoint" which, I
think, is diverting our efforts from other, more important, issues. This
is because knowing exactly when an application becomes 'uncheckpointable'
is not that more useful then knowing why it fails to checkpoint at the
time of the checkpoint. Also, there are all these issues of how to
transition back into 'checkpointable' state.

The other part is the fops->checkpoint() method, which saves the (or some)
state of the file, and can also be used to report "no-go", and I think
that is a good idea. Moreover, it can be made to understand a "dryrun"
flag and do nothing but check that a checkpoint would work...

Oren.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 0/6] /proc/pid/checkpointable
       [not found]             ` <20090318135953.GE22636-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-18 16:16               ` Oren Laadan
       [not found]                 ` <49C11E61.4010505-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  2009-03-18 16:23               ` Oren Laadan
  1 sibling, 1 reply; 38+ messages in thread
From: Oren Laadan @ 2009-03-18 16:16 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Containers, Sukadev Bhattiprolu, David C. Hansen



Serge E. Hallyn wrote:
> Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
>>
>> Sukadev Bhattiprolu wrote:
>>> From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
>>> Date: Fri, 13 Mar 2009 17:25:42 -0700
>>> Subject: [PATCH 5/6] Define and use proc_pid_checkpointable()
>>>
>>> Create a proc file, /proc/pid/checkpointable, which shows '1' if
>>> task is checkpointable and '0' if it is not.
>>>
>>> To determine whether a task is checkpointable, the handler for this
>>> new proc file, shares the same code with sys_checkpoint().
> 
> Hey Oren,
> 
> 3 counter-points:
> 
>> I still don't understand why we would like to do it this way.
>>
>> First, it makes little sense to do it per-task, because we are supposed
>> to checkpoint an entire container.
> 
> Yes we need per-container info too.  Actually, per-checkpoint-job-init,
> so if we send pids in for that, it should return false if we send in the
> pid of a task which isn't a proper checkpoint-job-init.
> 
> But we also want the info per-task, for debugging info.
> 

My suggestions works for this two: we add a flag CR_CTX_DRYRUN; a task
can ask to checkpoint itself, or another task, with CR_CTX_DRYRUN and
the checkpoint code runs without actual effect. (If we don't want to
expose the actual flag to userspace, then we simply use it in an
implementation of a /proc/PID/checkpointable operation).

> I don't know how to represent those two cases, though.
> 
> Also debugfs may be a more appropriate medium.

True.

> 
>> Second, what's wrong with doing a "dry" checkpoint on the container (or
>> if you prefer, the task, for what it's worth), that will not buffer nor
>> write out any data - just say "yes" or "no" ?
> 
> You can't get a text explanation like 'fd 4 (/sys/class/net/eth0) is not
> checkpointable'.  That's what's wrong with it.

I'm not opposing the idea of a descriptive text message - on the contrary.
I suggest an alternative implementation to the approach in this patch set
(or, more precisely, a generalization). And I argue that we don't need
anything beyond that (a la Ingo's may_checkpoint), because it does not
add enough value to justify the extra code, efforts and maintenance.

> 
>> (we could use a flag "CR_CTX_DRYRUN" when calling sys_checkpoint() for
>> this, and test for this flag in, say, kwrite/kread).
>>
>> After all, we don't expect applications or users to continuously and
>> repeatedly test if they can checkpoint, so it isn't performance critical.
>> So we simply reuse the existing code.
> 
> No, but we do expect someone trying to checkpoint their job and failing
> to be curious as to why.

Which is why I think a descriptive text message is a great idea.

>> This would also catch cases where we can't checkpoint because the kernel
>> is low on memory - which wouldn't show up otherwise.
>>
>> And in any case, this is orthogonal to what Dave is pushing, following
>> Ingo's comment, to know when a task _becomes_ not-checkpointable. (And
>> in any case, I think our time is better spent on adding functionality
>> instead).
> 
> I think Suka's patch is small enough that there's not a lot of pursuing
> going on.  Dave's may_checkpoint is a lot more ambitious.
> 
> Just to be clear - are you saying you think both this patch and Dave's
> may_checkpoint patchsets ought to be delayed, or just Suka's, or just
> Dave's?  :)
> 

I think Suka's patch is the right way to go - and that we can generalize
the approach to test for a "dryrun" attribute (e.g. on ctx->flags) throughout
the checkpointing code.

Dave's patch has two parts now: the concept of "may_checkpoint" which, I
think, is diverting our efforts from other, more important, issues. This
is because knowing exactly when an application becomes 'uncheckpointable'
is not that more useful then knowing why it fails to checkpoint at the
time of the checkpoint. Also, there are all these issues of how to
transition back into 'checkpointable' state.

The other part is the fops->checkpoint() method, which saves the (or some)
state of the file, and can also be used to report "no-go", and I think
that is a good idea. Moreover, it can be made to understand a "dryrun"
flag and do nothing but check that a checkpoint would work...

Oren.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 0/6] /proc/pid/checkpointable
       [not found]         ` <49C0B6FF.5030104-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  2009-03-18 13:59           ` Serge E. Hallyn
@ 2009-03-18 14:42           ` Dave Hansen
  1 sibling, 0 replies; 38+ messages in thread
From: Dave Hansen @ 2009-03-18 14:42 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Containers, Sukadev Bhattiprolu

On Wed, 2009-03-18 at 04:55 -0400, Oren Laadan wrote: Bhattiprolu wrote:
> > From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
> > Date: Fri, 13 Mar 2009 17:25:42 -0700
> > Subject: [PATCH 5/6] Define and use proc_pid_checkpointable()
> > 
> > Create a proc file, /proc/pid/checkpointable, which shows '1' if
> > task is checkpointable and '0' if it is not.
> > 
> > To determine whether a task is checkpointable, the handler for this
> > new proc file, shares the same code with sys_checkpoint().
> 
> I still don't understand why we would like to do it this way.
> 
> First, it makes little sense to do it per-task, because we are supposed
> to checkpoint an entire container.

I take it you didn't like the 'files' or 'uts' versions of this, either.
Do we have a container object in the kernel that we could tag, anyway?

> Second, what's wrong with doing a "dry" checkpoint on the container (or
> if you prefer, the task, for what it's worth), that will not buffer nor
> write out any data - just say "yes" or "no" ?
> 
> (we could use a flag "CR_CTX_DRYRUN" when calling sys_checkpoint() for
> this, and test for this flag in, say, kwrite/kread).
> 
> After all, we don't expect applications or users to continuously and
> repeatedly test if they can checkpoint, so it isn't performance critical.
> So we simply reuse the existing code.

Heh.  Did you read Ingo's suggestions?  He suggested *exactly* that.

http://lkml.org/lkml/2008/10/9/196

> This would also catch cases where we can't checkpoint because the kernel
> is low on memory - which wouldn't show up otherwise.
> 
> And in any case, this is orthogonal to what Dave is pushing, following
> Ingo's comment, to know when a task _becomes_ not-checkpointable. (And
> in any case, I think our time is better spent on adding functionality
> instead).
> 
> Oren.
-- Dave

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 0/6] /proc/pid/checkpointable
       [not found]         ` <49C0B6FF.5030104-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-03-18 13:59           ` Serge E. Hallyn
       [not found]             ` <20090318135953.GE22636-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-03-18 14:42           ` Dave Hansen
  1 sibling, 1 reply; 38+ messages in thread
From: Serge E. Hallyn @ 2009-03-18 13:59 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Containers, Sukadev Bhattiprolu, David C. Hansen

Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
> 
> 
> Sukadev Bhattiprolu wrote:
> > From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
> > Date: Fri, 13 Mar 2009 17:25:42 -0700
> > Subject: [PATCH 5/6] Define and use proc_pid_checkpointable()
> > 
> > Create a proc file, /proc/pid/checkpointable, which shows '1' if
> > task is checkpointable and '0' if it is not.
> > 
> > To determine whether a task is checkpointable, the handler for this
> > new proc file, shares the same code with sys_checkpoint().

Hey Oren,

3 counter-points:

> I still don't understand why we would like to do it this way.
> 
> First, it makes little sense to do it per-task, because we are supposed
> to checkpoint an entire container.

Yes we need per-container info too.  Actually, per-checkpoint-job-init,
so if we send pids in for that, it should return false if we send in the
pid of a task which isn't a proper checkpoint-job-init.

But we also want the info per-task, for debugging info.

I don't know how to represent those two cases, though.

Also debugfs may be a more appropriate medium.

> Second, what's wrong with doing a "dry" checkpoint on the container (or
> if you prefer, the task, for what it's worth), that will not buffer nor
> write out any data - just say "yes" or "no" ?

You can't get a text explanation like 'fd 4 (/sys/class/net/eth0) is not
checkpointable'.  That's what's wrong with it.

> (we could use a flag "CR_CTX_DRYRUN" when calling sys_checkpoint() for
> this, and test for this flag in, say, kwrite/kread).
> 
> After all, we don't expect applications or users to continuously and
> repeatedly test if they can checkpoint, so it isn't performance critical.
> So we simply reuse the existing code.

No, but we do expect someone trying to checkpoint their job and failing
to be curious as to why.

> This would also catch cases where we can't checkpoint because the kernel
> is low on memory - which wouldn't show up otherwise.
> 
> And in any case, this is orthogonal to what Dave is pushing, following
> Ingo's comment, to know when a task _becomes_ not-checkpointable. (And
> in any case, I think our time is better spent on adding functionality
> instead).

I think Suka's patch is small enough that there's not a lot of pursuing
going on.  Dave's may_checkpoint is a lot more ambitious.

Just to be clear - are you saying you think both this patch and Dave's
may_checkpoint patchsets ought to be delayed, or just Suka's, or just
Dave's?  :)

-serge

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 0/6] /proc/pid/checkpointable
       [not found]         ` <49C0B750.4050109-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-03-18 13:53           ` Serge E. Hallyn
  0 siblings, 0 replies; 38+ messages in thread
From: Serge E. Hallyn @ 2009-03-18 13:53 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Containers, Sukadev Bhattiprolu, David C. Hansen

Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
> 
> 
> Sukadev Bhattiprolu wrote:
> > From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
> > Date: Sat, 14 Mar 2009 10:21:07 -0700
> > Subject: [PATCH 6/6] Explain reason for task being uncheckpointable
> > 
> > Try to give an useful message on why a task is uncheckpointable.
> > 
> > Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
> > ---
> >  checkpoint/checkpoint.c    |   13 +++++++++----
> >  fs/proc/base.c             |   19 ++++++++++++++++++-
> >  include/linux/checkpoint.h |    7 +++++++
> >  3 files changed, 34 insertions(+), 5 deletions(-)
> > 
> 
> [...]
> 
> >  #ifdef CONFIG_CHECKPOINT_RESTART
> > +static char *explain_checkpointable(int reason)
> > +{
> > +	char *message;
> > +
> > +	switch(reason) {
> > +	case CR_CHECKPOINTABLE: message = "1"; break;
> > +	case CR_DEAD: 		message = "0 (dead)"; break;
> > +	case CR_NOT_FROZEN: 	message = "0 (not frozen)"; break;
> 
> Hmmm... so a task cannot test if it itself is checkpointable, because
> it will never be frozen and will fail right there before proceeding to
> addition (future) tests ?

No, no...  :

/* Verify that task is frozen, unless it is self-checkpoint */
if (t != current && !frozen(t))
-               return 0;
+               return CR_NOT_FROZEN;

-serge

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 0/6] /proc/pid/checkpointable
       [not found]     ` <20090317063958.GG2377-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-18  8:56       ` Oren Laadan
       [not found]         ` <49C0B750.4050109-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  0 siblings, 1 reply; 38+ messages in thread
From: Oren Laadan @ 2009-03-18  8:56 UTC (permalink / raw)
  To: Sukadev Bhattiprolu; +Cc: Containers, David C. Hansen



Sukadev Bhattiprolu wrote:
> From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
> Date: Sat, 14 Mar 2009 10:21:07 -0700
> Subject: [PATCH 6/6] Explain reason for task being uncheckpointable
> 
> Try to give an useful message on why a task is uncheckpointable.
> 
> Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
> ---
>  checkpoint/checkpoint.c    |   13 +++++++++----
>  fs/proc/base.c             |   19 ++++++++++++++++++-
>  include/linux/checkpoint.h |    7 +++++++
>  3 files changed, 34 insertions(+), 5 deletions(-)
> 

[...]

>  #ifdef CONFIG_CHECKPOINT_RESTART
> +static char *explain_checkpointable(int reason)
> +{
> +	char *message;
> +
> +	switch(reason) {
> +	case CR_CHECKPOINTABLE: message = "1"; break;
> +	case CR_DEAD: 		message = "0 (dead)"; break;
> +	case CR_NOT_FROZEN: 	message = "0 (not frozen)"; break;

Hmmm... so a task cannot test if it itself is checkpointable, because
it will never be frozen and will fail right there before proceeding to
addition (future) tests ?

[...]

Oren.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 0/6] /proc/pid/checkpointable
       [not found]     ` <20090317063940.GF2377-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-18  8:55       ` Oren Laadan
       [not found]         ` <49C0B6FF.5030104-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  0 siblings, 1 reply; 38+ messages in thread
From: Oren Laadan @ 2009-03-18  8:55 UTC (permalink / raw)
  To: Sukadev Bhattiprolu; +Cc: Containers, David C. Hansen



Sukadev Bhattiprolu wrote:
> From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
> Date: Fri, 13 Mar 2009 17:25:42 -0700
> Subject: [PATCH 5/6] Define and use proc_pid_checkpointable()
> 
> Create a proc file, /proc/pid/checkpointable, which shows '1' if
> task is checkpointable and '0' if it is not.
> 
> To determine whether a task is checkpointable, the handler for this
> new proc file, shares the same code with sys_checkpoint().

I still don't understand why we would like to do it this way.

First, it makes little sense to do it per-task, because we are supposed
to checkpoint an entire container.

Second, what's wrong with doing a "dry" checkpoint on the container (or
if you prefer, the task, for what it's worth), that will not buffer nor
write out any data - just say "yes" or "no" ?

(we could use a flag "CR_CTX_DRYRUN" when calling sys_checkpoint() for
this, and test for this flag in, say, kwrite/kread).

After all, we don't expect applications or users to continuously and
repeatedly test if they can checkpoint, so it isn't performance critical.
So we simply reuse the existing code.

This would also catch cases where we can't checkpoint because the kernel
is low on memory - which wouldn't show up otherwise.

And in any case, this is orthogonal to what Dave is pushing, following
Ingo's comment, to know when a task _becomes_ not-checkpointable. (And
in any case, I think our time is better spent on adding functionality
instead).

Oren.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 0/6] /proc/pid/checkpointable
       [not found] ` <20090317062754.GA2377-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
                     ` (5 preceding siblings ...)
  2009-03-17  6:39   ` Sukadev Bhattiprolu
@ 2009-03-17  6:55   ` Sukadev Bhattiprolu
  6 siblings, 0 replies; 38+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-17  6:55 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers

Darn,

Sorry about he screwed up subject lines. I will repost the patchset.

Sukadev

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 0/6] /proc/pid/checkpointable
       [not found] ` <20090317062754.GA2377-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
                     ` (4 preceding siblings ...)
  2009-03-17  6:39   ` Sukadev Bhattiprolu
@ 2009-03-17  6:39   ` Sukadev Bhattiprolu
       [not found]     ` <20090317063958.GG2377-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-03-17  6:55   ` Sukadev Bhattiprolu
  6 siblings, 1 reply; 38+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-17  6:39 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers


From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Date: Sat, 14 Mar 2009 10:21:07 -0700
Subject: [PATCH 6/6] Explain reason for task being uncheckpointable

Try to give an useful message on why a task is uncheckpointable.

Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
 checkpoint/checkpoint.c    |   13 +++++++++----
 fs/proc/base.c             |   19 ++++++++++++++++++-
 include/linux/checkpoint.h |    7 +++++++
 3 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index ad0956c..c8c377d 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -264,18 +264,23 @@ static int cr_write_all_tasks(struct cr_ctx *ctx)
 	return ret;
 }
 
-int task_checkpointable(struct task_struct *t)
+int __task_checkpointable(struct task_struct *t)
 {
 	if (t->state == TASK_DEAD) {
 		pr_warning("c/r: task %d is TASK_DEAD\n", task_pid_vnr(t));
-		return 0;
+		return CR_DEAD;
 	}
 
 	/* Verify that task is frozen, unless it is self-checkpoint */
 	if (t != current && !frozen(t))
-		return 0;
+		return CR_NOT_FROZEN;
+
+	return CR_CHECKPOINTABLE;
+}
 
-	return 1;
+int task_checkpointable(struct task_struct *t)
+{
+	return __task_checkpointable(t) == CR_CHECKPOINTABLE;
 }
 
 static int cr_may_checkpoint_task(struct task_struct *t, struct cr_ctx *ctx)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 989ca93..6e4c07c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2502,9 +2502,26 @@ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
 }
 
 #ifdef CONFIG_CHECKPOINT_RESTART
+static char *explain_checkpointable(int reason)
+{
+	char *message;
+
+	switch(reason) {
+	case CR_CHECKPOINTABLE: message = "1"; break;
+	case CR_DEAD: 		message = "0 (dead)"; break;
+	case CR_NOT_FROZEN: 	message = "0 (not frozen)"; break;
+	default:		message = "0 (unknown)"; break;
+	}
+	return message;
+}
+
 static int proc_pid_checkpointable(struct task_struct *task, char *buffer)
 {
-	return sprintf(buffer, "%d\n", task_checkpointable(task));
+	int rc;
+
+	rc = __task_checkpointable(task);
+
+	return scnprintf(buffer, PAGE_SIZE, "%s\n", explain_checkpointable(rc));
 }
 #endif
 
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index d1f53a6..a1eba73 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -142,6 +142,13 @@ static inline int cr_enabled(void)
 	return 1;
 }
 
+enum cr_uncheckpointable_reason {
+	CR_CHECKPOINTABLE = 0,
+	CR_DEAD,
+	CR_NOT_FROZEN
+};
+
+int __task_checkpointable(struct task_struct *t);
 int task_checkpointable(struct task_struct *t);
 
 #else /* !CONFIG_CHECKPOINT_RESTART */
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH 0/6] /proc/pid/checkpointable
       [not found] ` <20090317062754.GA2377-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
                     ` (3 preceding siblings ...)
  2009-03-17  6:39   ` Sukadev Bhattiprolu
@ 2009-03-17  6:39   ` Sukadev Bhattiprolu
       [not found]     ` <20090317063940.GF2377-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-03-17  6:39   ` Sukadev Bhattiprolu
  2009-03-17  6:55   ` Sukadev Bhattiprolu
  6 siblings, 1 reply; 38+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-17  6:39 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers


From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Date: Fri, 13 Mar 2009 17:25:42 -0700
Subject: [PATCH 5/6] Define and use proc_pid_checkpointable()

Create a proc file, /proc/pid/checkpointable, which shows '1' if
task is checkpointable and '0' if it is not.

To determine whether a task is checkpointable, the handler for this
new proc file, shares the same code with sys_checkpoint().

Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
 checkpoint/checkpoint.c    |   23 ++++++++++++++++-------
 fs/proc/base.c             |   13 +++++++++++++
 include/linux/checkpoint.h |    7 +++++++
 3 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index c16f30c..ad0956c 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -264,22 +264,31 @@ static int cr_write_all_tasks(struct cr_ctx *ctx)
 	return ret;
 }
 
+int task_checkpointable(struct task_struct *t)
+{
+	if (t->state == TASK_DEAD) {
+		pr_warning("c/r: task %d is TASK_DEAD\n", task_pid_vnr(t));
+		return 0;
+	}
+
+	/* Verify that task is frozen, unless it is self-checkpoint */
+	if (t != current && !frozen(t))
+		return 0;
+
+	return 1;
+}
+
 static int cr_may_checkpoint_task(struct task_struct *t, struct cr_ctx *ctx)
 {
 	pr_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns));
 
-	if (t->state == TASK_DEAD) {
-		pr_warning("c/r: task %d is TASK_DEAD\n", task_pid_vnr(t));
+	/* verify that the task is checkpointable */
+	if (!task_checkpointable(t))
 		return -EAGAIN;
-	}
 
 	if (!ptrace_may_access(t, PTRACE_MODE_READ))
 		return -EPERM;
 
-	/* verify that the task is frozen (unless self) */
-	if (t != current && !frozen(t))
-		return -EBUSY;
-
 	/* FIXME: change this for nested containers */
 	if (task_nsproxy(t) != ctx->root_nsproxy)
 		return -EPERM;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index bde92c3..989ca93 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2501,6 +2501,13 @@ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
 	return 0;
 }
 
+#ifdef CONFIG_CHECKPOINT_RESTART
+static int proc_pid_checkpointable(struct task_struct *task, char *buffer)
+{
+	return sprintf(buffer, "%d\n", task_checkpointable(task));
+}
+#endif
+
 /*
  * Thread groups
  */
@@ -2580,6 +2587,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_TASK_IO_ACCOUNTING
 	INF("io",	S_IRUGO, proc_tgid_io_accounting),
 #endif
+#ifdef CONFIG_CHECKPOINT_RESTART
+	INF("checkpointable", S_IRUGO, proc_pid_checkpointable),
+#endif
 };
 
 static int proc_tgid_base_readdir(struct file * filp,
@@ -2915,6 +2925,9 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_TASK_IO_ACCOUNTING
 	INF("io",	S_IRUGO, proc_tid_io_accounting),
 #endif
+#ifdef CONFIG_CHECKPOINT_RESTART
+	INF("checkpointable", S_IRUGO, proc_pid_checkpointable),
+#endif
 };
 
 static int proc_tid_base_readdir(struct file * filp,
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 484be56..d1f53a6 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -142,6 +142,8 @@ static inline int cr_enabled(void)
 	return 1;
 }
 
+int task_checkpointable(struct task_struct *t);
+
 #else /* !CONFIG_CHECKPOINT_RESTART */
 
 static inline void files_deny_checkpointing(struct files_struct *files) {}
@@ -162,5 +164,10 @@ static inline int cr_enabled(void)
 	return 0;
 }
 
+static inline int task_checkpointable(struct task_struct *t)
+{
+	return 0;
+}
+
 #endif /* CONFIG_CHECKPOINT_RESTART */
 #endif /* _CHECKPOINT_CKPT_H_ */
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH 0/6] /proc/pid/checkpointable
       [not found] ` <20090317062754.GA2377-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
                     ` (2 preceding siblings ...)
  2009-03-17  6:39   ` Sukadev Bhattiprolu
@ 2009-03-17  6:39   ` Sukadev Bhattiprolu
  2009-03-17  6:39   ` Sukadev Bhattiprolu
                     ` (2 subsequent siblings)
  6 siblings, 0 replies; 38+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-17  6:39 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers


From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Date: Thu, 12 Mar 2009 14:20:02 -0700
Subject: [PATCH 4/6] Deny external checkpoint unless task is frozen

Remove a 'FIXME' and ensure that the tasks we are checkpointing are
frozen unless its a self-checkpoint.

Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
 checkpoint/checkpoint.c |    5 ++++-
 1 files changed, 4 insertions(+), 1 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index dae9b97..c16f30c 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -19,6 +19,7 @@
 #include <linux/mount.h>
 #include <linux/utsname.h>
 #include <linux/magic.h>
+#include <linux/freezer.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
 
@@ -275,7 +276,9 @@ static int cr_may_checkpoint_task(struct task_struct *t, struct cr_ctx *ctx)
 	if (!ptrace_may_access(t, PTRACE_MODE_READ))
 		return -EPERM;
 
-	/* FIXME: verify that the task is frozen (unless self) */
+	/* verify that the task is frozen (unless self) */
+	if (t != current && !frozen(t))
+		return -EBUSY;
 
 	/* FIXME: change this for nested containers */
 	if (task_nsproxy(t) != ctx->root_nsproxy)
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH 0/6] /proc/pid/checkpointable
       [not found] ` <20090317062754.GA2377-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-03-17  6:38   ` Sukadev Bhattiprolu
  2009-03-17  6:38   ` Sukadev Bhattiprolu
@ 2009-03-17  6:39   ` Sukadev Bhattiprolu
  2009-03-17  6:39   ` Sukadev Bhattiprolu
                     ` (3 subsequent siblings)
  6 siblings, 0 replies; 38+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-17  6:39 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers


From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Date: Thu, 12 Mar 2009 14:19:54 -0700
Subject: [PATCH 3/6] Check 'may_checkpoint()' early

We currently check if a task is checkpointable when writing the task
information to checkpoint file. The small downside of doing this check
late is that we may have processed several processes in the tree before
hitting one that cannot be checkpointed.

We anyway walk the process tree we are checkpointing earlier, while
counting the tasks. We could check if all processes in the tree are
checkpointable at that time and fail early if any of them are not.
Since the process tree should be frozen, checking earlier should not
matter ?

For now, the patch leaves the existing check in cr_write_pids(). We
can remove that later.

Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
 checkpoint/checkpoint.c |    9 +++++++++
 1 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index ed4fd3d..dae9b97 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -341,11 +341,18 @@ static int cr_tree_count_tasks(struct cr_ctx *ctx)
 	struct task_struct **tasks_arr = ctx->tasks_arr;
 	int tasks_nr = ctx->tasks_nr;
 	int nr = 0;
+	int ret;
 
 	read_lock(&tasklist_lock);
 
 	/* count tasks via DFS scan of the tree */
 	while (1) {
+		ret = cr_may_checkpoint_task(task, ctx);
+		if (ret < 0) {
+			nr = ret;
+			break;
+		}
+
 		if (tasks_arr) {
 			/* unlikely, but ... */
 			if (nr == tasks_nr)
@@ -401,6 +408,8 @@ static int cr_build_tree(struct cr_ctx *ctx)
 
 	/* count tasks (no side effects) */
 	n = cr_tree_count_tasks(ctx);
+	if (n < 0)
+		return n;
 
 	ctx->tasks_nr = n;
 	ctx->tasks_arr = kzalloc(n * sizeof(*ctx->tasks_arr), GFP_KERNEL);
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH 0/6] /proc/pid/checkpointable
       [not found] ` <20090317062754.GA2377-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-03-17  6:38   ` Sukadev Bhattiprolu
@ 2009-03-17  6:38   ` Sukadev Bhattiprolu
  2009-03-17  6:39   ` Sukadev Bhattiprolu
                     ` (4 subsequent siblings)
  6 siblings, 0 replies; 38+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-17  6:38 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers


From: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
Date: Thu, 12 Mar 2009 14:19:41 -0700
Subject: [PATCH 2/6] Restart multiple processes

Restarting of multiple processes expects all restarting tasks to call
sys_restart(). Once inside the system call, each task will restart
itself at the same order that they were saved. The internals of the
syscall will take care of in-kernel synchronization bewteen tasks.

This patch does _not_ create the task tree in the kernel. Instead it
assumes that all tasks are created in some way and then invoke the
restart syscall. You can use the userspace mktree.c program to do
that.

The init task (*) has a special role: it allocates the restart context
(ctx), and coordinates the operation. In particular, it first waits
until all participating tasks enter the kernel, and provides them the
common restart context. Once everyone in ready, it begins to restart
itself.

In contrast, the other tasks enter the kernel, locate the init task (*)
and grab its restart context, and then wait for their turn to restore.

When a task (init or not) completes its restart, it hands the control
over to the next in line, by waking that task.

An array of pids (the one saved during the checkpoint) is used to
synchronize the operation. The first task in the array is the init
task (*). The restart context (ctx) maintain a "current position" in
the array, which indicates which task is currently active. Once the
currently active task completes its own restart, it increments that
position and wakes up the next task.

Restart assumes that userspace provides meaningful data, otherwise
it's garbage-in-garbage-out. In this case, the syscall may block
indefinitely, but in TASK_INTERRUPTIBLE, so the user can ctrl-c or
otherwise kill the stray restarting tasks.

In terms of security, restart runs as the user the invokes it, so it
will not allow a user to do more than is otherwise permitted by the
usual system semantics and policy.

Currently we ignore threads and zombies, as well as session ids.
Add support for multiple processes

(*) For containers, restart should be called inside a fresh container
by the init task of that container. However, it is also possible to
restart applications not necessarily inside a container, and without
restoring the original pids of the processes (that is, provided that
the application can tolerate such behavior). This is useful to allow
multi-process restart of tasks not isolated inside a container, and
also for debugging.

Changelog[v12]:
  - Replace obsolete cr_debug() with pr_debug()

Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 checkpoint/restart.c       |  214 +++++++++++++++++++++++++++++++++++++++++++-
 checkpoint/sys.c           |   34 ++++++--
 include/linux/checkpoint.h |   25 +++++-
 include/linux/sched.h      |    3 +
 4 files changed, 262 insertions(+), 14 deletions(-)

diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index 0c46abf..6b4cd75 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -10,6 +10,7 @@
 
 #include <linux/version.h>
 #include <linux/sched.h>
+#include <linux/wait.h>
 #include <linux/file.h>
 #include <linux/magic.h>
 #include <linux/checkpoint.h>
@@ -276,30 +277,235 @@ static int cr_read_task(struct cr_ctx *ctx)
 	return ret;
 }
 
+/* cr_read_tree - read the tasks tree into the checkpoint context */
+static int cr_read_tree(struct cr_ctx *ctx)
+{
+	struct cr_hdr_tree *hh = cr_hbuf_get(ctx, sizeof(*hh));
+	int parent, size, ret = -EINVAL;
+
+	parent = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_TREE);
+	if (parent < 0) {
+		ret = parent;
+		goto out;
+	} else if (parent != 0)
+		goto out;
+
+	if (hh->tasks_nr < 0)
+		goto out;
+
+	ctx->pids_nr = hh->tasks_nr;
+	size = sizeof(*ctx->pids_arr) * ctx->pids_nr;
+	if (size < 0)		/* overflow ? */
+		goto out;
+
+	ctx->pids_arr = kmalloc(size, GFP_KERNEL);
+	if (!ctx->pids_arr) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	ret = cr_kread(ctx, ctx->pids_arr, size);
+ out:
+	cr_hbuf_put(ctx, sizeof(*hh));
+	return ret;
+}
+
+static int cr_wait_task(struct cr_ctx *ctx)
+{
+	pid_t pid = task_pid_vnr(current);
+
+	pr_debug("pid %d waiting\n", pid);
+	return wait_event_interruptible(ctx->waitq, ctx->pids_active == pid);
+}
+
+static int cr_next_task(struct cr_ctx *ctx)
+{
+	struct task_struct *tsk;
+
+	ctx->pids_pos++;
+
+	pr_debug("pids_pos %d %d\n", ctx->pids_pos, ctx->pids_nr);
+	if (ctx->pids_pos == ctx->pids_nr) {
+		complete(&ctx->complete);
+		return 0;
+	}
+
+	ctx->pids_active = ctx->pids_arr[ctx->pids_pos].vpid;
+
+	pr_debug("pids_next %d\n", ctx->pids_active);
+
+	rcu_read_lock();
+	tsk = find_task_by_pid_ns(ctx->pids_active, ctx->root_nsproxy->pid_ns);
+	if (tsk)
+		wake_up_process(tsk);
+	rcu_read_unlock();
+
+	if (!tsk) {
+		ctx->pids_err = -ESRCH;
+		complete(&ctx->complete);
+		return -ESRCH;
+	}
+
+	return 0;
+}
+
+/* FIXME: this should be per container */
+DECLARE_WAIT_QUEUE_HEAD(cr_restart_waitq);
+
+static int do_restart_task(struct cr_ctx *ctx, pid_t pid)
+{
+	struct task_struct *root_task;
+	int ret;
+
+	rcu_read_lock();
+	root_task = find_task_by_pid_ns(pid, current->nsproxy->pid_ns);
+	if (root_task)
+		get_task_struct(root_task);
+	rcu_read_unlock();
+
+	if (!root_task)
+		return -EINVAL;
+
+	/*
+	 * wait for container init to initialize the restart context, then
+	 * grab a reference to that context, and if we're the last task to
+	 * do it, notify the container init.
+	 */
+	ret = wait_event_interruptible(cr_restart_waitq,
+				       root_task->checkpoint_ctx);
+	if (ret < 0)
+		goto out;
+
+	task_lock(root_task);
+	ctx = root_task->checkpoint_ctx;
+	if (ctx)
+		cr_ctx_get(ctx);
+	task_unlock(root_task);
+
+	if (!ctx) {
+		ret = -EAGAIN;
+		goto out;
+	}
+
+	if (atomic_dec_and_test(&ctx->tasks_count))
+		complete(&ctx->complete);
+
+	/* wait for our turn, do the restore, and tell next task in line */
+	ret = cr_wait_task(ctx);
+	if (ret < 0)
+		goto out;
+	ret = cr_read_task(ctx);
+	if (ret < 0)
+		goto out;
+	ret = cr_next_task(ctx);
+
+ out:
+	cr_ctx_put(ctx);
+	put_task_struct(root_task);
+	return ret;
+}
+
+static int cr_wait_all_tasks_start(struct cr_ctx *ctx)
+{
+	int ret;
+
+	if (ctx->pids_nr == 1)
+		return 0;
+
+	init_completion(&ctx->complete);
+	current->checkpoint_ctx = ctx;
+
+	wake_up_all(&cr_restart_waitq);
+
+	ret = wait_for_completion_interruptible(&ctx->complete);
+	if (ret < 0)
+		return ret;
+
+	task_lock(current);
+	current->checkpoint_ctx = NULL;
+	task_unlock(current);
+
+	return 0;
+}
+
+static int cr_wait_all_tasks_finish(struct cr_ctx *ctx)
+{
+	int ret;
+
+	if (ctx->pids_nr == 1)
+		return 0;
+
+	init_completion(&ctx->complete);
+
+	ret = cr_next_task(ctx);
+	if (ret < 0)
+		return ret;
+
+	ret = wait_for_completion_interruptible(&ctx->complete);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
 /* setup restart-specific parts of ctx */
 static int cr_ctx_restart(struct cr_ctx *ctx, pid_t pid)
 {
+	ctx->root_pid = pid;
+	ctx->root_task = current;
+	ctx->root_nsproxy = current->nsproxy;
+
+	get_task_struct(ctx->root_task);
+	get_nsproxy(ctx->root_nsproxy);
+
+	atomic_set(&ctx->tasks_count, ctx->pids_nr - 1);
+
 	return 0;
 }
 
-int do_restart(struct cr_ctx *ctx, pid_t pid)
+static int do_restart_root(struct cr_ctx *ctx, pid_t pid)
 {
 	int ret;
 
+	ret = cr_read_head(ctx);
+	if (ret < 0)
+		goto out;
+	ret = cr_read_tree(ctx);
+	if (ret < 0)
+		goto out;
+
 	ret = cr_ctx_restart(ctx, pid);
 	if (ret < 0)
 		goto out;
-	ret = cr_read_head(ctx);
+
+	/* wait for all other tasks to enter do_restart_task() */
+	ret = cr_wait_all_tasks_start(ctx);
 	if (ret < 0)
 		goto out;
+
 	ret = cr_read_task(ctx);
 	if (ret < 0)
 		goto out;
-	ret = cr_read_tail(ctx);
+
+	/* wait for all other tasks to complete do_restart_task() */
+	ret = cr_wait_all_tasks_finish(ctx);
 	if (ret < 0)
 		goto out;
 
-	/* on success, adjust the return value if needed [TODO] */
+	ret = cr_read_tail(ctx);
+
  out:
 	return ret;
 }
+
+int do_restart(struct cr_ctx *ctx, pid_t pid)
+{
+	int ret;
+
+	if (ctx)
+		ret = do_restart_root(ctx, pid);
+	else
+		ret = do_restart_task(ctx, pid);
+
+	/* on success, adjust the return value if needed [TODO] */
+	return ret;
+}
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 0436ef3..f26b0c6 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -167,6 +167,8 @@ static void cr_task_arr_free(struct cr_ctx *ctx)
 
 static void cr_ctx_free(struct cr_ctx *ctx)
 {
+	BUG_ON(atomic_read(&ctx->refcount));
+
 	if (ctx->file)
 		fput(ctx->file);
 
@@ -185,6 +187,8 @@ static void cr_ctx_free(struct cr_ctx *ctx)
 	if (ctx->root_task)
 		put_task_struct(ctx->root_task);
 
+	kfree(ctx->pids_arr);
+
 	kfree(ctx);
 }
 
@@ -199,8 +203,10 @@ static struct cr_ctx *cr_ctx_alloc(int fd, unsigned long flags)
 
 	ctx->flags = flags;
 
+	atomic_set(&ctx->refcount, 0);
 	INIT_LIST_HEAD(&ctx->pgarr_list);
 	INIT_LIST_HEAD(&ctx->pgarr_pool);
+	init_waitqueue_head(&ctx->waitq);
 
 	err = -EBADF;
 	ctx->file = fget(fd);
@@ -215,6 +221,7 @@ static struct cr_ctx *cr_ctx_alloc(int fd, unsigned long flags)
 	if (cr_objhash_alloc(ctx) < 0)
 		goto err;
 
+	atomic_inc(&ctx->refcount);
 	return ctx;
 
  err:
@@ -222,6 +229,17 @@ static struct cr_ctx *cr_ctx_alloc(int fd, unsigned long flags)
 	return ERR_PTR(err);
 }
 
+void cr_ctx_get(struct cr_ctx *ctx)
+{
+	atomic_inc(&ctx->refcount);
+}
+
+void cr_ctx_put(struct cr_ctx *ctx)
+{
+	if (ctx && atomic_dec_and_test(&ctx->refcount))
+		cr_ctx_free(ctx);
+}
+
 /**
  * sys_checkpoint - checkpoint a container
  * @pid: pid of the container init(1) process
@@ -249,7 +267,7 @@ asmlinkage long sys_checkpoint(pid_t pid, int fd, unsigned long flags)
 	if (!ret)
 		ret = ctx->crid;
 
-	cr_ctx_free(ctx);
+	cr_ctx_put(ctx);
 	return ret;
 }
 
@@ -264,7 +282,7 @@ asmlinkage long sys_checkpoint(pid_t pid, int fd, unsigned long flags)
  */
 asmlinkage long sys_restart(int crid, int fd, unsigned long flags)
 {
-	struct cr_ctx *ctx;
+	struct cr_ctx *ctx = NULL;
 	pid_t pid;
 	int ret;
 
@@ -272,15 +290,17 @@ asmlinkage long sys_restart(int crid, int fd, unsigned long flags)
 	if (flags)
 		return -EINVAL;
 
-	ctx = cr_ctx_alloc(fd, flags | CR_CTX_RSTR);
-	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
-
 	/* FIXME: for now, we use 'crid' as a pid */
 	pid = (pid_t) crid;
 
+	if (pid == task_pid_vnr(current))
+		ctx = cr_ctx_alloc(fd, flags | CR_CTX_RSTR);
+
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
 	ret = do_restart(ctx, pid);
 
-	cr_ctx_free(ctx);
+	cr_ctx_put(ctx);
 	return ret;
 }
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 6307511..484be56 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -13,10 +13,13 @@
 #include <linux/path.h>
 #include <linux/fs.h>
 #include <linux/fdtable.h>
+#include <linux/path.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
 
 #ifdef CONFIG_CHECKPOINT_RESTART
 
-#define CR_VERSION  2
+#define CR_VERSION  3
 
 struct cr_ctx {
 	int crid;		/* unique checkpoint id */
@@ -34,8 +37,7 @@ struct cr_ctx {
 	void *hbuf;		/* temporary buffer for headers */
 	int hpos;		/* position in headers buffer */
 
-	struct task_struct **tasks_arr;	/* array of all tasks in container */
-	int tasks_nr;			/* size of tasks array */
+	atomic_t refcount;
 
 	struct cr_objhash *objhash;	/* hash for shared objects */
 
@@ -43,6 +45,20 @@ struct cr_ctx {
 	struct list_head pgarr_pool;	/* pool of empty page arrays chain */
 
 	struct path fs_mnt;	/* container root (FIXME) */
+
+	/* [multi-process checkpoint] */
+	struct task_struct **tasks_arr; /* array of all tasks [checkpoint] */
+	int tasks_nr;                   /* size of tasks array */
+
+	/* [multi-process restart] */
+	struct cr_hdr_pids *pids_arr;	/* array of all pids [restart] */
+	int pids_nr;			/* size of pids array */
+	int pids_pos;			/* position pids array */
+	int pids_err;			/* error occured ? */
+	pid_t pids_active;		/* pid of (next) active task */
+	atomic_t tasks_count;		/* sync of restarting tasks */
+	struct completion complete;	/* sync of restarting tasks */
+	wait_queue_head_t waitq;	/* sync of restarting tasks */
 };
 
 /* cr_ctx: flags */
@@ -55,6 +71,9 @@ extern int cr_kread(struct cr_ctx *ctx, void *buf, int count);
 extern void *cr_hbuf_get(struct cr_ctx *ctx, int n);
 extern void cr_hbuf_put(struct cr_ctx *ctx, int n);
 
+extern void cr_ctx_get(struct cr_ctx *ctx);
+extern void cr_ctx_put(struct cr_ctx *ctx);
+
 /* shared objects handling */
 
 enum {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8c216e0..c5d5987 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1416,6 +1416,9 @@ struct task_struct {
 #ifdef CONFIG_TRACING
 	/* state flags for use by tracers */
 	unsigned long trace;
+#endif 
+#ifdef CONFIG_CHECKPOINT_RESTART
+	struct cr_ctx *checkpoint_ctx;
 #endif
 };
 
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH 0/6] /proc/pid/checkpointable
       [not found] ` <20090317062754.GA2377-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-17  6:38   ` Sukadev Bhattiprolu
  2009-03-17  6:38   ` Sukadev Bhattiprolu
                     ` (5 subsequent siblings)
  6 siblings, 0 replies; 38+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-17  6:38 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers


From: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
Date: Thu, 12 Mar 2009 14:16:05 -0700
Subject: [PATCH 1/6] Checkpoint multiple processes

Checkpointing of multiple processes works by recording the tasks tree
structure below a given task (usually this task is the container init).

For a given task, do a DFS scan of the tasks tree and collect them
into an array (keeping a reference to each task). Using DFS simplifies
the recreation of tasks either in user space or kernel space. For each
task collected, test if it can be checkpointed, and save its pid, tgid,
and ppid.

The actual work is divided into two passes: a first scan counts the
tasks, then memory is allocated and a second scan fills the array.

The logic is suitable for creation of processes during restart either
in userspace or by the kernel.

Currently we ignore threads and zombies, as well as session ids.

Changelog[v12]:
  - Replace obsolete cr_debug() with pr_debug()

Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
Acked-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 checkpoint/checkpoint.c        |  217 ++++++++++++++++++++++++++++++++++++++--
 checkpoint/sys.c               |   16 +++
 include/linux/checkpoint.h     |    3 +
 include/linux/checkpoint_hdr.h |   13 ++-
 4 files changed, 238 insertions(+), 11 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index e0af8a2..ed4fd3d 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -226,13 +226,6 @@ static int cr_write_task(struct cr_ctx *ctx, struct task_struct *t)
 {
 	int ret;
 
-	/* TODO: verity that the task is frozen (unless self) */
-
-	if (t->state == TASK_DEAD) {
-		pr_warning("c/r: task may not be in state TASK_DEAD\n");
-		return -EAGAIN;
-	}
-
 	ret = cr_write_task_struct(ctx, t);
 	pr_debug("task_struct: ret %d\n", ret);
 	if (ret < 0)
@@ -255,6 +248,200 @@ static int cr_write_task(struct cr_ctx *ctx, struct task_struct *t)
 	return ret;
 }
 
+/* dump all tasks in ctx->tasks_arr[] */
+static int cr_write_all_tasks(struct cr_ctx *ctx)
+{
+	int n, ret = 0;
+
+	for (n = 0; n < ctx->tasks_nr; n++) {
+		pr_debug("dumping task #%d\n", n);
+		ret = cr_write_task(ctx, ctx->tasks_arr[n]);
+		if (ret < 0)
+			break;
+	}
+
+	return ret;
+}
+
+static int cr_may_checkpoint_task(struct task_struct *t, struct cr_ctx *ctx)
+{
+	pr_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns));
+
+	if (t->state == TASK_DEAD) {
+		pr_warning("c/r: task %d is TASK_DEAD\n", task_pid_vnr(t));
+		return -EAGAIN;
+	}
+
+	if (!ptrace_may_access(t, PTRACE_MODE_READ))
+		return -EPERM;
+
+	/* FIXME: verify that the task is frozen (unless self) */
+
+	/* FIXME: change this for nested containers */
+	if (task_nsproxy(t) != ctx->root_nsproxy)
+		return -EPERM;
+
+	return 0;
+}
+
+#define CR_HDR_PIDS_CHUNK	256
+
+static int cr_write_pids(struct cr_ctx *ctx)
+{
+	struct cr_hdr_pids *hh;
+	struct pid_namespace *ns;
+	struct task_struct *task;
+	struct task_struct **tasks_arr;
+	int tasks_nr, n, ret = 0, pos = 0;
+
+	ns = ctx->root_nsproxy->pid_ns;
+	tasks_arr = ctx->tasks_arr;
+	tasks_nr = ctx->tasks_nr;
+
+	hh = cr_hbuf_get(ctx, sizeof(*hh) * CR_HDR_PIDS_CHUNK);
+
+	while (tasks_nr > 0) {
+		rcu_read_lock();
+		for (n = min(tasks_nr, CR_HDR_PIDS_CHUNK); n; n--) {
+			task = tasks_arr[pos];
+
+			/* is this task cool ? */
+			ret = cr_may_checkpoint_task(task, ctx);
+			if (ret < 0) {
+				rcu_read_unlock();
+				goto out;
+			}
+			hh[pos].vpid = task_pid_nr_ns(task, ns);
+			hh[pos].vtgid = task_tgid_nr_ns(task, ns);
+			hh[pos].vppid = task_tgid_nr_ns(task->real_parent, ns);
+			pr_debug("task[%d]: vpid %d vtgid %d parent %d\n", pos,
+				 hh[pos].vpid, hh[pos].vtgid, hh[pos].vppid);
+			pos++;
+		}
+		rcu_read_unlock();
+
+		n = min(tasks_nr, CR_HDR_PIDS_CHUNK);
+		ret = cr_kwrite(ctx, hh, n * sizeof(*hh));
+		if (ret < 0)
+			break;
+
+		tasks_nr -= n;
+	}
+ out:
+	cr_hbuf_put(ctx, sizeof(*hh));
+	return ret;
+}
+
+/* count number of tasks in tree (and optionally fill pid's in array) */
+static int cr_tree_count_tasks(struct cr_ctx *ctx)
+{
+	struct task_struct *root = ctx->root_task;
+	struct task_struct *task = root;
+	struct task_struct *parent = NULL;
+	struct task_struct **tasks_arr = ctx->tasks_arr;
+	int tasks_nr = ctx->tasks_nr;
+	int nr = 0;
+
+	read_lock(&tasklist_lock);
+
+	/* count tasks via DFS scan of the tree */
+	while (1) {
+		if (tasks_arr) {
+			/* unlikely, but ... */
+			if (nr == tasks_nr)
+				return -EBUSY;	/* cleanup in cr_ctx_free() */
+			tasks_arr[nr] = task;
+			get_task_struct(task);
+		}
+
+		nr++;
+
+		/* if has children - proceed with child */
+		if (!list_empty(&task->children)) {
+			parent = task;
+			task = list_entry(task->children.next,
+					  struct task_struct, sibling);
+			continue;
+		}
+
+		while (task != root) {
+			/* if has sibling - proceed with sibling */
+			if (!list_is_last(&task->sibling, &parent->children)) {
+				task = list_entry(task->sibling.next,
+						  struct task_struct, sibling);
+				break;
+			}
+
+			/* else, trace back to parent and proceed */
+			task = parent;
+			parent = parent->real_parent;
+		}
+
+		if (task == root)
+			break;
+	}
+
+	read_unlock(&tasklist_lock);
+	return nr;
+}
+
+/*
+ * cr_build_tree - scan the tasks tree in DFS order and fill in array
+ * @ctx: checkpoint context
+ *
+ * Using DFS order simplifies the restart logic to re-create the tasks.
+ *
+ * On success, ctx->tasks_arr will be allocated and populated with all
+ * tasks (reference taken), and ctx->tasks_nr will hold the total count.
+ * The array is cleaned up by cr_ctx_free().
+ */
+static int cr_build_tree(struct cr_ctx *ctx)
+{
+	int n, m;
+
+	/* count tasks (no side effects) */
+	n = cr_tree_count_tasks(ctx);
+
+	ctx->tasks_nr = n;
+	ctx->tasks_arr = kzalloc(n * sizeof(*ctx->tasks_arr), GFP_KERNEL);
+	if (!ctx->tasks_arr)
+		return -ENOMEM;
+
+	/* count again (now will fill array) */
+	m = cr_tree_count_tasks(ctx);
+
+	/* unlikely, but ... (cleanup in cr_ctx_free) */
+	if (m < 0)
+		return m;
+	else if (m != n)
+		return -EBUSY;
+
+	return 0;
+}
+
+/* dump the array that describes the tasks tree */
+static int cr_write_tree(struct cr_ctx *ctx)
+{
+	struct cr_hdr h;
+	struct cr_hdr_tree *hh;
+	int ret;
+
+	h.type = CR_HDR_TREE;
+	h.len = sizeof(*hh);
+	h.parent = 0;
+
+	hh = cr_hbuf_get(ctx, sizeof(*hh));
+	hh->tasks_nr = ctx->tasks_nr;
+
+	ret = cr_write_obj(ctx, &h, hh);
+	cr_hbuf_put(ctx, sizeof(*hh));
+	if (ret < 0)
+		return ret;
+
+	ret = cr_write_pids(ctx);
+	return ret;
+}
+
 static int cr_get_container(struct cr_ctx *ctx, pid_t pid)
 {
 	struct task_struct *task = NULL;
@@ -272,7 +459,7 @@ static int cr_get_container(struct cr_ctx *ctx, pid_t pid)
 	if (!task)
 		goto out;
 
-#if 0	/* enable to use containers */
+#if 0	/* enable with containers */
 	if (!is_container_init(task)) {
 		err = -EINVAL;
 		goto out;
@@ -294,7 +481,7 @@ static int cr_get_container(struct cr_ctx *ctx, pid_t pid)
 	if (!nsproxy)
 		goto out;
 
-	/* TODO: verify that the container is frozen */
+	/* FIXME: verify that the container is frozen */
 
 	ctx->root_task = task;
 	ctx->root_nsproxy = nsproxy;
@@ -342,12 +529,22 @@ int do_checkpoint(struct cr_ctx *ctx, pid_t pid)
 	ret = cr_ctx_checkpoint(ctx, pid);
 	if (ret < 0)
 		goto out;
+
+	ret = cr_build_tree(ctx);
+	if (ret < 0)
+		goto out;
+
 	ret = cr_write_head(ctx);
 	if (ret < 0)
 		goto out;
-	ret = cr_write_task(ctx, ctx->root_task);
+	ret = cr_write_tree(ctx);
 	if (ret < 0)
 		goto out;
+
+	ret = cr_write_all_tasks(ctx);
+	if (ret < 0)
+		goto out;
+
 	ret = cr_write_tail(ctx);
 	if (ret < 0)
 		goto out;
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 4a51ed3..0436ef3 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -152,6 +152,19 @@ void cr_hbuf_put(struct cr_ctx *ctx, int n)
  * restart operation, and persists until the operation is completed.
  */
 
+static void cr_task_arr_free(struct cr_ctx *ctx)
+{
+	int n;
+
+	for (n = 0; n < ctx->tasks_nr; n++) {
+		if (ctx->tasks_arr[n]) {
+			put_task_struct(ctx->tasks_arr[n]);
+			ctx->tasks_arr[n] = NULL;
+		}
+	}
+	kfree(ctx->tasks_arr);
+}
+
 static void cr_ctx_free(struct cr_ctx *ctx)
 {
 	if (ctx->file)
@@ -164,6 +177,9 @@ static void cr_ctx_free(struct cr_ctx *ctx)
 	cr_pgarr_free(ctx);
 	cr_objhash_free(ctx);
 
+	if (ctx->tasks_arr)
+		cr_task_arr_free(ctx);
+
 	if (ctx->root_nsproxy)
 		put_nsproxy(ctx->root_nsproxy);
 	if (ctx->root_task)
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index e77393f..6307511 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -34,6 +34,9 @@ struct cr_ctx {
 	void *hbuf;		/* temporary buffer for headers */
 	int hpos;		/* position in headers buffer */
 
+	struct task_struct **tasks_arr;	/* array of all tasks in container */
+	int tasks_nr;			/* size of tasks array */
+
 	struct cr_objhash *objhash;	/* hash for shared objects */
 
 	struct list_head pgarr_list;	/* page array to dump VMA contents */
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 66313bd..0acf7c0 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -45,7 +45,8 @@ enum {
 	CR_HDR_STRING,
 	CR_HDR_FNAME,
 
-	CR_HDR_TASK = 101,
+	CR_HDR_TREE = 101,
+	CR_HDR_TASK,
 	CR_HDR_THREAD,
 	CR_HDR_CPU,
 
@@ -81,6 +82,16 @@ struct cr_hdr_tail {
 	__u64 magic;
 } __attribute__((aligned(8)));
 
+struct cr_hdr_tree {
+	__u32 tasks_nr;
+} __attribute__((aligned(8)));
+
+struct cr_hdr_pids {
+	__s32 vpid;
+	__s32 vtgid;
+	__s32 vppid;
+} __attribute__((aligned(8)));
+
 struct cr_hdr_task {
 	__u32 state;
 	__u32 exit_state;
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH 0/6] /proc/pid/checkpointable
@ 2009-03-17  6:27 Sukadev Bhattiprolu
       [not found] ` <20090317062754.GA2377-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 38+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-17  6:27 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers


Started out implementing '/proc/pid/checkpointable' which reports 0 or 1
depending on whether a process is checkpointable atm. But realized hat
some patches that were discussed earlier are missing from Dave Hansen's
tree.

This patchset includes those missing patches plus two new patches that
create /proc/pid/checkpointable.

	[PATCH 1/6] Checkpoint multiple processes
	[PATCH 2/6] Restart multiple processes
	[PATCH 3/6] Check 'may_checkpoint()' early
	[PATCH 4/6] Deny external checkpoint unless task is frozen
	[PATCH 5/6] Define and use proc_pid_checkpointable()
	[PATCH 6/6] Explain reason for task being uncheckpointable

^ permalink raw reply	[flat|nested] 38+ messages in thread

end of thread, other threads:[~2009-03-26 13:29 UTC | newest]

Thread overview: 38+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-03-17 17:43 [PATCH 0/6] /proc/pid/checkpointable Sukadev Bhattiprolu
     [not found] ` <20090317174359.GA10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-17 17:49   ` [PATCH 1/6] Checkpoint multiple processes Sukadev Bhattiprolu
2009-03-17 17:49   ` [PATCH 2/6] Restart " Sukadev Bhattiprolu
2009-03-17 17:50   ` [PATCH 3/6] Check 'may_checkpoint()' early Sukadev Bhattiprolu
     [not found]     ` <20090317175013.GD10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-26  5:58       ` Sukadev Bhattiprolu
     [not found]         ` <20090326055840.GA8220-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-26 13:25           ` Serge E. Hallyn
2009-03-17 17:50   ` [PATCH 4/6] Deny external checkpoint unless task is frozen Sukadev Bhattiprolu
2009-03-17 17:51   ` [PATCH 5/6] Define and use proc_pid_checkpointable() Sukadev Bhattiprolu
2009-03-17 17:51   ` [PATCH 6/6] Explain reason for task being uncheckpointable Sukadev Bhattiprolu
     [not found]     ` <20090317175149.GG10796-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-17 18:52       ` Serge E. Hallyn
     [not found]         ` <20090317185228.GC1039-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-18  6:50           ` Sukadev Bhattiprolu
2009-03-18  9:28   ` [PATCH 0/6] /proc/pid/checkpointable Oren Laadan
  -- strict thread matches above, loose matches on Subject: below --
2009-03-17  6:27 Sukadev Bhattiprolu
     [not found] ` <20090317062754.GA2377-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-17  6:38   ` Sukadev Bhattiprolu
2009-03-17  6:38   ` Sukadev Bhattiprolu
2009-03-17  6:39   ` Sukadev Bhattiprolu
2009-03-17  6:39   ` Sukadev Bhattiprolu
2009-03-17  6:39   ` Sukadev Bhattiprolu
     [not found]     ` <20090317063940.GF2377-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-18  8:55       ` Oren Laadan
     [not found]         ` <49C0B6FF.5030104-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-03-18 13:59           ` Serge E. Hallyn
     [not found]             ` <20090318135953.GE22636-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-18 16:16               ` Oren Laadan
     [not found]                 ` <49C11E61.4010505-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-03-18 16:24                   ` Dave Hansen
2009-03-18 17:48                     ` Oren Laadan
     [not found]                       ` <49C133F9.2020505-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-03-18 18:06                         ` Dave Hansen
2009-03-18 16:23               ` Oren Laadan
     [not found]                 ` <49C1201A.3050604-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-03-18 17:18                   ` Serge E. Hallyn
     [not found]                     ` <20090318171840.GA29523-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-18 17:50                       ` Oren Laadan
     [not found]                         ` <49C1347F.3000601-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-03-18 20:03                           ` Mike Waychison
     [not found]                             ` <49C153AF.7070504-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2009-03-18 20:13                               ` Dave Hansen
2009-03-25 12:25                                 ` Eric W. Biederman
     [not found]                                   ` <m17i2dx00b.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>
2009-03-25 17:29                                     ` Serge E. Hallyn
     [not found]                                       ` <20090325172938.GA18957-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-26  9:52                                         ` Cedric Le Goater
     [not found]                                           ` <49CB504A.2080400-GANU6spQydw@public.gmane.org>
2009-03-26 13:29                                             ` Serge E. Hallyn
2009-03-18 14:42           ` Dave Hansen
2009-03-17  6:39   ` Sukadev Bhattiprolu
     [not found]     ` <20090317063958.GG2377-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-18  8:56       ` Oren Laadan
     [not found]         ` <49C0B750.4050109-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-03-18 13:53           ` Serge E. Hallyn
2009-03-17  6:55   ` Sukadev Bhattiprolu

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.