All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/5] Missing patches in Dave Hansen's tree
@ 2009-03-12  4:24 Sukadev Bhattiprolu
       [not found] ` <20090312042416.GC7733-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 11+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-12  4:24 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers, sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8


Following patches were in or meant to be in Oren's tree
and are currently missing from Dave Hansen's tree.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 1/5] Track in-kernel when we expect checkpoint/restart to work
       [not found] ` <20090312042416.GC7733-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-12  4:25   ` Sukadev Bhattiprolu
       [not found]     ` <20090312042503.GA8703-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-03-12  4:25   ` [PATCH 2/5] Checkpoint multiple processes Sukadev Bhattiprolu
                     ` (3 subsequent siblings)
  4 siblings, 1 reply; 11+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-12  4:25 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers


From: Dave Hansen <dave-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Subject: [PATCH 1/5] Track in-kernel when we expect checkpoint/restart to work

Suggested by Ingo.

Checkpoint/restart is going to be a long effort to get things working.
We're going to have a lot of things that we know just don't work for
a long time.  That doesn't mean that it will be useless, it just means
that there's some complicated features that we are going to have to
work incrementally to fix.

This patch introduces a new mechanism to help the checkpoint/restart
developers.  A new function pair: task/process_deny_checkpoint() is
created.  When called, these tell the kernel that we *know* that the
process has performed some activity that will keep it from being
properly checkpointed.

The 'flag' is an atomic_t for now so that we can have some level
of atomicity and make sure to only warn once.

For now, this is a one-way trip.  Once a process is no longer
'may_checkpoint' capable, neither it nor its children ever will be.
This can, of course, be fixed up in the future.  We might want to
reset the flag when a new pid namespace is created, for instance.

Signed-off-by: Dave Hansen <dave-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 checkpoint/checkpoint.c    |    6 ++++++
 include/linux/checkpoint.h |   28 +++++++++++++++++++++++++++-
 include/linux/sched.h      |    3 +++
 kernel/fork.c              |   10 ++++++++++
 4 files changed, 46 insertions(+), 1 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index e0af8a2..35e3c6b 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -233,6 +233,12 @@ static int cr_write_task(struct cr_ctx *ctx, struct task_struct *t)
 		return -EAGAIN;
 	}
 
+	if (!atomic_read(&t->may_checkpoint)) {
+		pr_warning("c/r: task %d may not checkpoint\n",
+			   task_pid_vnr(t));
+		return -EBUSY;
+	}
+
 	ret = cr_write_task_struct(ctx, t);
 	pr_debug("task_struct: ret %d\n", ret);
 	if (ret < 0)
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index e77393f..e0aa60a 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -10,9 +10,10 @@
  *  distribution for more details.
  */
 
-#include <linux/path.h>
 #include <linux/fs.h>
 #include <linux/fdtable.h>
+#include <linux/path.h>
+#include <linux/sched.h>
 
 #ifdef CONFIG_CHECKPOINT_RESTART
 
@@ -109,9 +110,23 @@ static inline void __files_deny_checkpointing(struct files_struct *files,
 		return;
 	printk(KERN_INFO "process performed an action that can not be "
 			"checkpointed at: %s:%d\n", file, line);
+	WARN_ON(1);
+}
+
+static inline void __task_deny_checkpointing(struct task_struct *task,
+		char *file, int line)
+{
+	if (!atomic_dec_and_test(&task->may_checkpoint))
+		return;
+
+	printk(KERN_INFO "process performed an action that can not be "
+			"checkpointed at: %s:%d\n", file, line);
+	WARN_ON(1);
 }
 #define files_deny_checkpointing(f)  \
 	__files_deny_checkpointing(f, __FILE__, __LINE__)
+#define process_deny_checkpointing(p)  \
+	__task_deny_checkpointing(p, __FILE__, __LINE__)
 
 int cr_explain_file(struct file *file, char *explain, int left);
 int cr_file_supported(struct file *file);
@@ -120,6 +135,15 @@ static inline int cr_enabled(void)
 	return 1;
 }
 
+/*
+ * For now, we're not going to have a distinction between
+ * tasks and processes for the purpose of c/r.  But, allow
+ * these two calls anyway to make new users at least think
+ * about it.
+ */
+#define task_deny_checkpointing(p)  \
+	__task_deny_checkpointing(p, __FILE__, __LINE__)
+
 #else /* !CONFIG_CHECKPOINT_RESTART */
 
 static inline void files_deny_checkpointing(struct files_struct *files) {}
@@ -127,6 +151,8 @@ static inline int cr_explain_file(struct file *file, char *explain, int left)
 {
 	return 0;
 }
+static inline void task_deny_checkpointing(struct task_struct *task) {}
+static inline void process_deny_checkpointing(struct task_struct *task) {}
 
 static inline int cr_file_supported(struct file *file)
 {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8c216e0..050b25c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1416,6 +1416,9 @@ struct task_struct {
 #ifdef CONFIG_TRACING
 	/* state flags for use by tracers */
 	unsigned long trace;
+#endif 
+#ifdef CONFIG_CHECKPOINT_RESTART
+	atomic_t may_checkpoint;
 #endif
 };
 
diff --git a/kernel/fork.c b/kernel/fork.c
index a66fbde..29c5a28 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -199,6 +199,13 @@ void __init fork_init(unsigned long mempages)
 	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
 	init_task.signal->rlim[RLIMIT_SIGPENDING] =
 		init_task.signal->rlim[RLIMIT_NPROC];
+
+#ifdef CONFIG_CHECKPOINT_RESTART
+	/*
+	 * This probably won't stay set for long...
+	 */
+	atomic_set(&init_task.may_checkpoint, 1);
+#endif
 }
 
 int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst,
@@ -249,6 +256,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	tsk->btrace_seq = 0;
 #endif
 	tsk->splice_pipe = NULL;
+#ifdef CONFIG_CHECKPOINT_RESTART
+	atomic_set(&tsk->may_checkpoint, atomic_read(&orig->may_checkpoint));
+#endif
 	return tsk;
 
 out:
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 2/5] Checkpoint multiple processes
       [not found] ` <20090312042416.GC7733-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-03-12  4:25   ` [PATCH 1/5] Track in-kernel when we expect checkpoint/restart to work Sukadev Bhattiprolu
@ 2009-03-12  4:25   ` Sukadev Bhattiprolu
  2009-03-12  4:25   ` [PATCH 3/5] Restart " Sukadev Bhattiprolu
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 11+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-12  4:25 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers


From: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
Subject: [PATCH 2/5] Checkpoint multiple processes

Checkpointing of multiple processes works by recording the tasks tree
structure below a given task (usually this task is the container init).

For a given task, do a DFS scan of the tasks tree and collect them
into an array (keeping a reference to each task). Using DFS simplifies
the recreation of tasks either in user space or kernel space. For each
task collected, test if it can be checkpointed, and save its pid, tgid,
and ppid.

The actual work is divided into two passes: a first scan counts the
tasks, then memory is allocated and a second scan fills the array.

The logic is suitable for creation of processes during restart either
in userspace or by the kernel.

Currently we ignore threads and zombies, as well as session ids.

Changelog[v12]:
  - Replace obsolete cr_debug() with pr_debug()

Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
Acked-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 checkpoint/checkpoint.c        |  228 +++++++++++++++++++++++++++++++++++++---
 checkpoint/sys.c               |   16 +++
 include/linux/checkpoint.h     |    3 +
 include/linux/checkpoint_hdr.h |   13 ++-
 4 files changed, 243 insertions(+), 17 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index 35e3c6b..fbcd9eb 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -226,19 +226,6 @@ static int cr_write_task(struct cr_ctx *ctx, struct task_struct *t)
 {
 	int ret;
 
-	/* TODO: verity that the task is frozen (unless self) */
-
-	if (t->state == TASK_DEAD) {
-		pr_warning("c/r: task may not be in state TASK_DEAD\n");
-		return -EAGAIN;
-	}
-
-	if (!atomic_read(&t->may_checkpoint)) {
-		pr_warning("c/r: task %d may not checkpoint\n",
-			   task_pid_vnr(t));
-		return -EBUSY;
-	}
-
 	ret = cr_write_task_struct(ctx, t);
 	pr_debug("task_struct: ret %d\n", ret);
 	if (ret < 0)
@@ -261,6 +248,205 @@ static int cr_write_task(struct cr_ctx *ctx, struct task_struct *t)
 	return ret;
 }
 
+/* dump all tasks in ctx->tasks_arr[] */
+static int cr_write_all_tasks(struct cr_ctx *ctx)
+{
+	int n, ret = 0;
+
+	for (n = 0; n < ctx->tasks_nr; n++) {
+		pr_debug("dumping task #%d\n", n);
+		ret = cr_write_task(ctx, ctx->tasks_arr[n]);
+		if (ret < 0)
+			break;
+	}
+
+	return ret;
+}
+
+static int cr_may_checkpoint_task(struct task_struct *t, struct cr_ctx *ctx)
+{
+	pr_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns));
+
+	if (t->state == TASK_DEAD) {
+		pr_warning("c/r: task %d is TASK_DEAD\n", task_pid_vnr(t));
+		return -EAGAIN;
+	}
+
+	if (!atomic_read(&t->may_checkpoint)) {
+		pr_warning("c/r: task %d uncheckpointable\n", task_pid_vnr(t));
+		return -EBUSY;
+	}
+
+	if (!ptrace_may_access(t, PTRACE_MODE_READ))
+		return -EPERM;
+
+	/* FIXME: verify that the task is frozen (unless self) */
+
+	/* FIXME: change this for nested containers */
+	if (task_nsproxy(t) != ctx->root_nsproxy)
+		return -EPERM;
+
+	return 0;
+}
+
+#define CR_HDR_PIDS_CHUNK	256
+
+static int cr_write_pids(struct cr_ctx *ctx)
+{
+	struct cr_hdr_pids *hh;
+	struct pid_namespace *ns;
+	struct task_struct *task;
+	struct task_struct **tasks_arr;
+	int tasks_nr, n, ret = 0, pos = 0;
+
+	ns = ctx->root_nsproxy->pid_ns;
+	tasks_arr = ctx->tasks_arr;
+	tasks_nr = ctx->tasks_nr;
+
+	hh = cr_hbuf_get(ctx, sizeof(*hh) * CR_HDR_PIDS_CHUNK);
+
+	while (tasks_nr > 0) {
+		rcu_read_lock();
+		for (n = min(tasks_nr, CR_HDR_PIDS_CHUNK); n; n--) {
+			task = tasks_arr[pos];
+
+			/* is this task cool ? */
+			ret = cr_may_checkpoint_task(task, ctx);
+			if (ret < 0) {
+				rcu_read_unlock();
+				goto out;
+			}
+			hh[pos].vpid = task_pid_nr_ns(task, ns);
+			hh[pos].vtgid = task_tgid_nr_ns(task, ns);
+			hh[pos].vppid = task_tgid_nr_ns(task->real_parent, ns);
+			pr_debug("task[%d]: vpid %d vtgid %d parent %d\n", pos,
+				 hh[pos].vpid, hh[pos].vtgid, hh[pos].vppid);
+			pos++;
+		}
+		rcu_read_unlock();
+
+		n = min(tasks_nr, CR_HDR_PIDS_CHUNK);
+		ret = cr_kwrite(ctx, hh, n * sizeof(*hh));
+		if (ret < 0)
+			break;
+
+		tasks_nr -= n;
+	}
+ out:
+	cr_hbuf_put(ctx, sizeof(*hh));
+	return ret;
+}
+
+/* count number of tasks in tree (and optionally fill pid's in array) */
+static int cr_tree_count_tasks(struct cr_ctx *ctx)
+{
+	struct task_struct *root = ctx->root_task;
+	struct task_struct *task = root;
+	struct task_struct *parent = NULL;
+	struct task_struct **tasks_arr = ctx->tasks_arr;
+	int tasks_nr = ctx->tasks_nr;
+	int nr = 0;
+
+	read_lock(&tasklist_lock);
+
+	/* count tasks via DFS scan of the tree */
+	while (1) {
+		if (tasks_arr) {
+			/* unlikely, but ... */
+			if (nr == tasks_nr)
+				return -EBUSY;	/* cleanup in cr_ctx_free() */
+			tasks_arr[nr] = task;
+			get_task_struct(task);
+		}
+
+		nr++;
+
+		/* if has children - proceed with child */
+		if (!list_empty(&task->children)) {
+			parent = task;
+			task = list_entry(task->children.next,
+					  struct task_struct, sibling);
+			continue;
+		}
+
+		while (task != root) {
+			/* if has sibling - proceed with sibling */
+			if (!list_is_last(&task->sibling, &parent->children)) {
+				task = list_entry(task->sibling.next,
+						  struct task_struct, sibling);
+				break;
+			}
+
+			/* else, trace back to parent and proceed */
+			task = parent;
+			parent = parent->real_parent;
+		}
+
+		if (task == root)
+			break;
+	}
+
+	read_unlock(&tasklist_lock);
+	return nr;
+}
+
+/*
+ * cr_build_tree - scan the tasks tree in DFS order and fill in array
+ * @ctx: checkpoint context
+ *
+ * Using DFS order simplifies the restart logic to re-create the tasks.
+ *
+ * On success, ctx->tasks_arr will be allocated and populated with all
+ * tasks (reference taken), and ctx->tasks_nr will hold the total count.
+ * The array is cleaned up by cr_ctx_free().
+ */
+static int cr_build_tree(struct cr_ctx *ctx)
+{
+	int n, m;
+
+	/* count tasks (no side effects) */
+	n = cr_tree_count_tasks(ctx);
+
+	ctx->tasks_nr = n;
+	ctx->tasks_arr = kzalloc(n * sizeof(*ctx->tasks_arr), GFP_KERNEL);
+	if (!ctx->tasks_arr)
+		return -ENOMEM;
+
+	/* count again (now will fill array) */
+	m = cr_tree_count_tasks(ctx);
+
+	/* unlikely, but ... (cleanup in cr_ctx_free) */
+	if (m < 0)
+		return m;
+	else if (m != n)
+		return -EBUSY;
+
+	return 0;
+}
+
+/* dump the array that describes the tasks tree */
+static int cr_write_tree(struct cr_ctx *ctx)
+{
+	struct cr_hdr h;
+	struct cr_hdr_tree *hh;
+	int ret;
+
+	h.type = CR_HDR_TREE;
+	h.len = sizeof(*hh);
+	h.parent = 0;
+
+	hh = cr_hbuf_get(ctx, sizeof(*hh));
+	hh->tasks_nr = ctx->tasks_nr;
+
+	ret = cr_write_obj(ctx, &h, hh);
+	cr_hbuf_put(ctx, sizeof(*hh));
+	if (ret < 0)
+		return ret;
+
+	ret = cr_write_pids(ctx);
+	return ret;
+}
+
 static int cr_get_container(struct cr_ctx *ctx, pid_t pid)
 {
 	struct task_struct *task = NULL;
@@ -278,7 +464,7 @@ static int cr_get_container(struct cr_ctx *ctx, pid_t pid)
 	if (!task)
 		goto out;
 
-#if 0	/* enable to use containers */
+#if 0	/* enable with containers */
 	if (!is_container_init(task)) {
 		err = -EINVAL;
 		goto out;
@@ -300,7 +486,7 @@ static int cr_get_container(struct cr_ctx *ctx, pid_t pid)
 	if (!nsproxy)
 		goto out;
 
-	/* TODO: verify that the container is frozen */
+	/* FIXME: verify that the container is frozen */
 
 	ctx->root_task = task;
 	ctx->root_nsproxy = nsproxy;
@@ -348,12 +534,22 @@ int do_checkpoint(struct cr_ctx *ctx, pid_t pid)
 	ret = cr_ctx_checkpoint(ctx, pid);
 	if (ret < 0)
 		goto out;
+
+	ret = cr_build_tree(ctx);
+	if (ret < 0)
+		goto out;
+
 	ret = cr_write_head(ctx);
 	if (ret < 0)
 		goto out;
-	ret = cr_write_task(ctx, ctx->root_task);
+	ret = cr_write_tree(ctx);
 	if (ret < 0)
 		goto out;
+
+	ret = cr_write_all_tasks(ctx);
+	if (ret < 0)
+		goto out;
+
 	ret = cr_write_tail(ctx);
 	if (ret < 0)
 		goto out;
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 4a51ed3..0436ef3 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -152,6 +152,19 @@ void cr_hbuf_put(struct cr_ctx *ctx, int n)
  * restart operation, and persists until the operation is completed.
  */
 
+static void cr_task_arr_free(struct cr_ctx *ctx)
+{
+	int n;
+
+	for (n = 0; n < ctx->tasks_nr; n++) {
+		if (ctx->tasks_arr[n]) {
+			put_task_struct(ctx->tasks_arr[n]);
+			ctx->tasks_arr[n] = NULL;
+		}
+	}
+	kfree(ctx->tasks_arr);
+}
+
 static void cr_ctx_free(struct cr_ctx *ctx)
 {
 	if (ctx->file)
@@ -164,6 +177,9 @@ static void cr_ctx_free(struct cr_ctx *ctx)
 	cr_pgarr_free(ctx);
 	cr_objhash_free(ctx);
 
+	if (ctx->tasks_arr)
+		cr_task_arr_free(ctx);
+
 	if (ctx->root_nsproxy)
 		put_nsproxy(ctx->root_nsproxy);
 	if (ctx->root_task)
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index e0aa60a..4b0446d 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -35,6 +35,9 @@ struct cr_ctx {
 	void *hbuf;		/* temporary buffer for headers */
 	int hpos;		/* position in headers buffer */
 
+	struct task_struct **tasks_arr;	/* array of all tasks in container */
+	int tasks_nr;			/* size of tasks array */
+
 	struct cr_objhash *objhash;	/* hash for shared objects */
 
 	struct list_head pgarr_list;	/* page array to dump VMA contents */
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 66313bd..0acf7c0 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -45,7 +45,8 @@ enum {
 	CR_HDR_STRING,
 	CR_HDR_FNAME,
 
-	CR_HDR_TASK = 101,
+	CR_HDR_TREE = 101,
+	CR_HDR_TASK,
 	CR_HDR_THREAD,
 	CR_HDR_CPU,
 
@@ -81,6 +82,16 @@ struct cr_hdr_tail {
 	__u64 magic;
 } __attribute__((aligned(8)));
 
+struct cr_hdr_tree {
+	__u32 tasks_nr;
+} __attribute__((aligned(8)));
+
+struct cr_hdr_pids {
+	__s32 vpid;
+	__s32 vtgid;
+	__s32 vppid;
+} __attribute__((aligned(8)));
+
 struct cr_hdr_task {
 	__u32 state;
 	__u32 exit_state;
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 3/5] Restart multiple processes
       [not found] ` <20090312042416.GC7733-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-03-12  4:25   ` [PATCH 1/5] Track in-kernel when we expect checkpoint/restart to work Sukadev Bhattiprolu
  2009-03-12  4:25   ` [PATCH 2/5] Checkpoint multiple processes Sukadev Bhattiprolu
@ 2009-03-12  4:25   ` Sukadev Bhattiprolu
  2009-03-12  4:25   ` [PATCH 4/5] Check 'may_checkpoint()' early Sukadev Bhattiprolu
  2009-03-12  4:26   ` [PATCH 5/5] Deny external checkpoint unless task is frozen Sukadev Bhattiprolu
  4 siblings, 0 replies; 11+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-12  4:25 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers


From: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
Subject: [PATCH 3/5] Restart multiple processes

Restarting of multiple processes expects all restarting tasks to call
sys_restart(). Once inside the system call, each task will restart
itself at the same order that they were saved. The internals of the
syscall will take care of in-kernel synchronization bewteen tasks.

This patch does _not_ create the task tree in the kernel. Instead it
assumes that all tasks are created in some way and then invoke the
restart syscall. You can use the userspace mktree.c program to do
that.

The init task (*) has a special role: it allocates the restart context
(ctx), and coordinates the operation. In particular, it first waits
until all participating tasks enter the kernel, and provides them the
common restart context. Once everyone in ready, it begins to restart
itself.

In contrast, the other tasks enter the kernel, locate the init task (*)
and grab its restart context, and then wait for their turn to restore.

When a task (init or not) completes its restart, it hands the control
over to the next in line, by waking that task.

An array of pids (the one saved during the checkpoint) is used to
synchronize the operation. The first task in the array is the init
task (*). The restart context (ctx) maintain a "current position" in
the array, which indicates which task is currently active. Once the
currently active task completes its own restart, it increments that
position and wakes up the next task.

Restart assumes that userspace provides meaningful data, otherwise
it's garbage-in-garbage-out. In this case, the syscall may block
indefinitely, but in TASK_INTERRUPTIBLE, so the user can ctrl-c or
otherwise kill the stray restarting tasks.

In terms of security, restart runs as the user the invokes it, so it
will not allow a user to do more than is otherwise permitted by the
usual system semantics and policy.

Currently we ignore threads and zombies, as well as session ids.
Add support for multiple processes

(*) For containers, restart should be called inside a fresh container
by the init task of that container. However, it is also possible to
restart applications not necessarily inside a container, and without
restoring the original pids of the processes (that is, provided that
the application can tolerate such behavior). This is useful to allow
multi-process restart of tasks not isolated inside a container, and
also for debugging.

Changelog[v12]:
  - Replace obsolete cr_debug() with pr_debug()

Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
---
 checkpoint/restart.c       |  214 +++++++++++++++++++++++++++++++++++++++++++-
 checkpoint/sys.c           |   34 ++++++--
 include/linux/checkpoint.h |   23 ++++-
 include/linux/sched.h      |    1 +
 4 files changed, 258 insertions(+), 14 deletions(-)

diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index 0c46abf..6b4cd75 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -10,6 +10,7 @@
 
 #include <linux/version.h>
 #include <linux/sched.h>
+#include <linux/wait.h>
 #include <linux/file.h>
 #include <linux/magic.h>
 #include <linux/checkpoint.h>
@@ -276,30 +277,235 @@ static int cr_read_task(struct cr_ctx *ctx)
 	return ret;
 }
 
+/* cr_read_tree - read the tasks tree into the checkpoint context */
+static int cr_read_tree(struct cr_ctx *ctx)
+{
+	struct cr_hdr_tree *hh = cr_hbuf_get(ctx, sizeof(*hh));
+	int parent, size, ret = -EINVAL;
+
+	parent = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_TREE);
+	if (parent < 0) {
+		ret = parent;
+		goto out;
+	} else if (parent != 0)
+		goto out;
+
+	if (hh->tasks_nr < 0)
+		goto out;
+
+	ctx->pids_nr = hh->tasks_nr;
+	size = sizeof(*ctx->pids_arr) * ctx->pids_nr;
+	if (size < 0)		/* overflow ? */
+		goto out;
+
+	ctx->pids_arr = kmalloc(size, GFP_KERNEL);
+	if (!ctx->pids_arr) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	ret = cr_kread(ctx, ctx->pids_arr, size);
+ out:
+	cr_hbuf_put(ctx, sizeof(*hh));
+	return ret;
+}
+
+static int cr_wait_task(struct cr_ctx *ctx)
+{
+	pid_t pid = task_pid_vnr(current);
+
+	pr_debug("pid %d waiting\n", pid);
+	return wait_event_interruptible(ctx->waitq, ctx->pids_active == pid);
+}
+
+static int cr_next_task(struct cr_ctx *ctx)
+{
+	struct task_struct *tsk;
+
+	ctx->pids_pos++;
+
+	pr_debug("pids_pos %d %d\n", ctx->pids_pos, ctx->pids_nr);
+	if (ctx->pids_pos == ctx->pids_nr) {
+		complete(&ctx->complete);
+		return 0;
+	}
+
+	ctx->pids_active = ctx->pids_arr[ctx->pids_pos].vpid;
+
+	pr_debug("pids_next %d\n", ctx->pids_active);
+
+	rcu_read_lock();
+	tsk = find_task_by_pid_ns(ctx->pids_active, ctx->root_nsproxy->pid_ns);
+	if (tsk)
+		wake_up_process(tsk);
+	rcu_read_unlock();
+
+	if (!tsk) {
+		ctx->pids_err = -ESRCH;
+		complete(&ctx->complete);
+		return -ESRCH;
+	}
+
+	return 0;
+}
+
+/* FIXME: this should be per container */
+DECLARE_WAIT_QUEUE_HEAD(cr_restart_waitq);
+
+static int do_restart_task(struct cr_ctx *ctx, pid_t pid)
+{
+	struct task_struct *root_task;
+	int ret;
+
+	rcu_read_lock();
+	root_task = find_task_by_pid_ns(pid, current->nsproxy->pid_ns);
+	if (root_task)
+		get_task_struct(root_task);
+	rcu_read_unlock();
+
+	if (!root_task)
+		return -EINVAL;
+
+	/*
+	 * wait for container init to initialize the restart context, then
+	 * grab a reference to that context, and if we're the last task to
+	 * do it, notify the container init.
+	 */
+	ret = wait_event_interruptible(cr_restart_waitq,
+				       root_task->checkpoint_ctx);
+	if (ret < 0)
+		goto out;
+
+	task_lock(root_task);
+	ctx = root_task->checkpoint_ctx;
+	if (ctx)
+		cr_ctx_get(ctx);
+	task_unlock(root_task);
+
+	if (!ctx) {
+		ret = -EAGAIN;
+		goto out;
+	}
+
+	if (atomic_dec_and_test(&ctx->tasks_count))
+		complete(&ctx->complete);
+
+	/* wait for our turn, do the restore, and tell next task in line */
+	ret = cr_wait_task(ctx);
+	if (ret < 0)
+		goto out;
+	ret = cr_read_task(ctx);
+	if (ret < 0)
+		goto out;
+	ret = cr_next_task(ctx);
+
+ out:
+	cr_ctx_put(ctx);
+	put_task_struct(root_task);
+	return ret;
+}
+
+static int cr_wait_all_tasks_start(struct cr_ctx *ctx)
+{
+	int ret;
+
+	if (ctx->pids_nr == 1)
+		return 0;
+
+	init_completion(&ctx->complete);
+	current->checkpoint_ctx = ctx;
+
+	wake_up_all(&cr_restart_waitq);
+
+	ret = wait_for_completion_interruptible(&ctx->complete);
+	if (ret < 0)
+		return ret;
+
+	task_lock(current);
+	current->checkpoint_ctx = NULL;
+	task_unlock(current);
+
+	return 0;
+}
+
+static int cr_wait_all_tasks_finish(struct cr_ctx *ctx)
+{
+	int ret;
+
+	if (ctx->pids_nr == 1)
+		return 0;
+
+	init_completion(&ctx->complete);
+
+	ret = cr_next_task(ctx);
+	if (ret < 0)
+		return ret;
+
+	ret = wait_for_completion_interruptible(&ctx->complete);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
 /* setup restart-specific parts of ctx */
 static int cr_ctx_restart(struct cr_ctx *ctx, pid_t pid)
 {
+	ctx->root_pid = pid;
+	ctx->root_task = current;
+	ctx->root_nsproxy = current->nsproxy;
+
+	get_task_struct(ctx->root_task);
+	get_nsproxy(ctx->root_nsproxy);
+
+	atomic_set(&ctx->tasks_count, ctx->pids_nr - 1);
+
 	return 0;
 }
 
-int do_restart(struct cr_ctx *ctx, pid_t pid)
+static int do_restart_root(struct cr_ctx *ctx, pid_t pid)
 {
 	int ret;
 
+	ret = cr_read_head(ctx);
+	if (ret < 0)
+		goto out;
+	ret = cr_read_tree(ctx);
+	if (ret < 0)
+		goto out;
+
 	ret = cr_ctx_restart(ctx, pid);
 	if (ret < 0)
 		goto out;
-	ret = cr_read_head(ctx);
+
+	/* wait for all other tasks to enter do_restart_task() */
+	ret = cr_wait_all_tasks_start(ctx);
 	if (ret < 0)
 		goto out;
+
 	ret = cr_read_task(ctx);
 	if (ret < 0)
 		goto out;
-	ret = cr_read_tail(ctx);
+
+	/* wait for all other tasks to complete do_restart_task() */
+	ret = cr_wait_all_tasks_finish(ctx);
 	if (ret < 0)
 		goto out;
 
-	/* on success, adjust the return value if needed [TODO] */
+	ret = cr_read_tail(ctx);
+
  out:
 	return ret;
 }
+
+int do_restart(struct cr_ctx *ctx, pid_t pid)
+{
+	int ret;
+
+	if (ctx)
+		ret = do_restart_root(ctx, pid);
+	else
+		ret = do_restart_task(ctx, pid);
+
+	/* on success, adjust the return value if needed [TODO] */
+	return ret;
+}
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 0436ef3..f26b0c6 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -167,6 +167,8 @@ static void cr_task_arr_free(struct cr_ctx *ctx)
 
 static void cr_ctx_free(struct cr_ctx *ctx)
 {
+	BUG_ON(atomic_read(&ctx->refcount));
+
 	if (ctx->file)
 		fput(ctx->file);
 
@@ -185,6 +187,8 @@ static void cr_ctx_free(struct cr_ctx *ctx)
 	if (ctx->root_task)
 		put_task_struct(ctx->root_task);
 
+	kfree(ctx->pids_arr);
+
 	kfree(ctx);
 }
 
@@ -199,8 +203,10 @@ static struct cr_ctx *cr_ctx_alloc(int fd, unsigned long flags)
 
 	ctx->flags = flags;
 
+	atomic_set(&ctx->refcount, 0);
 	INIT_LIST_HEAD(&ctx->pgarr_list);
 	INIT_LIST_HEAD(&ctx->pgarr_pool);
+	init_waitqueue_head(&ctx->waitq);
 
 	err = -EBADF;
 	ctx->file = fget(fd);
@@ -215,6 +221,7 @@ static struct cr_ctx *cr_ctx_alloc(int fd, unsigned long flags)
 	if (cr_objhash_alloc(ctx) < 0)
 		goto err;
 
+	atomic_inc(&ctx->refcount);
 	return ctx;
 
  err:
@@ -222,6 +229,17 @@ static struct cr_ctx *cr_ctx_alloc(int fd, unsigned long flags)
 	return ERR_PTR(err);
 }
 
+void cr_ctx_get(struct cr_ctx *ctx)
+{
+	atomic_inc(&ctx->refcount);
+}
+
+void cr_ctx_put(struct cr_ctx *ctx)
+{
+	if (ctx && atomic_dec_and_test(&ctx->refcount))
+		cr_ctx_free(ctx);
+}
+
 /**
  * sys_checkpoint - checkpoint a container
  * @pid: pid of the container init(1) process
@@ -249,7 +267,7 @@ asmlinkage long sys_checkpoint(pid_t pid, int fd, unsigned long flags)
 	if (!ret)
 		ret = ctx->crid;
 
-	cr_ctx_free(ctx);
+	cr_ctx_put(ctx);
 	return ret;
 }
 
@@ -264,7 +282,7 @@ asmlinkage long sys_checkpoint(pid_t pid, int fd, unsigned long flags)
  */
 asmlinkage long sys_restart(int crid, int fd, unsigned long flags)
 {
-	struct cr_ctx *ctx;
+	struct cr_ctx *ctx = NULL;
 	pid_t pid;
 	int ret;
 
@@ -272,15 +290,17 @@ asmlinkage long sys_restart(int crid, int fd, unsigned long flags)
 	if (flags)
 		return -EINVAL;
 
-	ctx = cr_ctx_alloc(fd, flags | CR_CTX_RSTR);
-	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
-
 	/* FIXME: for now, we use 'crid' as a pid */
 	pid = (pid_t) crid;
 
+	if (pid == task_pid_vnr(current))
+		ctx = cr_ctx_alloc(fd, flags | CR_CTX_RSTR);
+
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
 	ret = do_restart(ctx, pid);
 
-	cr_ctx_free(ctx);
+	cr_ctx_put(ctx);
 	return ret;
 }
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 4b0446d..191d32a 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -14,10 +14,11 @@
 #include <linux/fdtable.h>
 #include <linux/path.h>
 #include <linux/sched.h>
+#include <asm/atomic.h>
 
 #ifdef CONFIG_CHECKPOINT_RESTART
 
-#define CR_VERSION  2
+#define CR_VERSION  3
 
 struct cr_ctx {
 	int crid;		/* unique checkpoint id */
@@ -35,8 +36,7 @@ struct cr_ctx {
 	void *hbuf;		/* temporary buffer for headers */
 	int hpos;		/* position in headers buffer */
 
-	struct task_struct **tasks_arr;	/* array of all tasks in container */
-	int tasks_nr;			/* size of tasks array */
+	atomic_t refcount;
 
 	struct cr_objhash *objhash;	/* hash for shared objects */
 
@@ -44,6 +44,20 @@ struct cr_ctx {
 	struct list_head pgarr_pool;	/* pool of empty page arrays chain */
 
 	struct path fs_mnt;	/* container root (FIXME) */
+
+	/* [multi-process checkpoint] */
+	struct task_struct **tasks_arr; /* array of all tasks [checkpoint] */
+	int tasks_nr;                   /* size of tasks array */
+
+	/* [multi-process restart] */
+	struct cr_hdr_pids *pids_arr;	/* array of all pids [restart] */
+	int pids_nr;			/* size of pids array */
+	int pids_pos;			/* position pids array */
+	int pids_err;			/* error occured ? */
+	pid_t pids_active;		/* pid of (next) active task */
+	atomic_t tasks_count;		/* sync of restarting tasks */
+	struct completion complete;	/* sync of restarting tasks */
+	wait_queue_head_t waitq;	/* sync of restarting tasks */
 };
 
 /* cr_ctx: flags */
@@ -56,6 +70,9 @@ extern int cr_kread(struct cr_ctx *ctx, void *buf, int count);
 extern void *cr_hbuf_get(struct cr_ctx *ctx, int n);
 extern void cr_hbuf_put(struct cr_ctx *ctx, int n);
 
+extern void cr_ctx_get(struct cr_ctx *ctx);
+extern void cr_ctx_put(struct cr_ctx *ctx);
+
 /* shared objects handling */
 
 enum {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 050b25c..93adad0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1419,6 +1419,7 @@ struct task_struct {
 #endif 
 #ifdef CONFIG_CHECKPOINT_RESTART
 	atomic_t may_checkpoint;
+	struct cr_ctx *checkpoint_ctx;
 #endif
 };
 
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 4/5] Check 'may_checkpoint()' early
       [not found] ` <20090312042416.GC7733-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
                     ` (2 preceding siblings ...)
  2009-03-12  4:25   ` [PATCH 3/5] Restart " Sukadev Bhattiprolu
@ 2009-03-12  4:25   ` Sukadev Bhattiprolu
       [not found]     ` <20090312042559.GD8703-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-03-12  4:26   ` [PATCH 5/5] Deny external checkpoint unless task is frozen Sukadev Bhattiprolu
  4 siblings, 1 reply; 11+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-12  4:25 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers


From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Subject: [PATCH 4/5] Check 'may_checkpoint()' early

We currently check if a task is checkpointable when writing the task
information to checkpoint file. The small downside of doing this check
late is that we may have processed several processes in the tree before
hitting one that cannot be checkpointed.

We anyway walk the process tree we are checkpointing earlier, while
counting the tasks. We could check if all processes in the tree are
checkpointable at that time and fail early if any of them are not.
Since the process tree should be frozen, checking earlier should not
matter ?

For now, the patch leaves the existing check in cr_write_pids(). We
can remove that later.

Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
 checkpoint/checkpoint.c |    9 +++++++++
 1 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index fbcd9eb..9189abb 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -346,11 +346,18 @@ static int cr_tree_count_tasks(struct cr_ctx *ctx)
 	struct task_struct **tasks_arr = ctx->tasks_arr;
 	int tasks_nr = ctx->tasks_nr;
 	int nr = 0;
+	int ret;
 
 	read_lock(&tasklist_lock);
 
 	/* count tasks via DFS scan of the tree */
 	while (1) {
+		ret = cr_may_checkpoint_task(task, ctx);
+		if (ret < 0) {
+			nr = ret;
+			break;
+		}
+
 		if (tasks_arr) {
 			/* unlikely, but ... */
 			if (nr == tasks_nr)
@@ -406,6 +413,8 @@ static int cr_build_tree(struct cr_ctx *ctx)
 
 	/* count tasks (no side effects) */
 	n = cr_tree_count_tasks(ctx);
+	if (n < 0)
+		return n;
 
 	ctx->tasks_nr = n;
 	ctx->tasks_arr = kzalloc(n * sizeof(*ctx->tasks_arr), GFP_KERNEL);
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 5/5] Deny external checkpoint unless task is frozen
       [not found] ` <20090312042416.GC7733-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
                     ` (3 preceding siblings ...)
  2009-03-12  4:25   ` [PATCH 4/5] Check 'may_checkpoint()' early Sukadev Bhattiprolu
@ 2009-03-12  4:26   ` Sukadev Bhattiprolu
       [not found]     ` <20090312042614.GE8703-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  4 siblings, 1 reply; 11+ messages in thread
From: Sukadev Bhattiprolu @ 2009-03-12  4:26 UTC (permalink / raw)
  To: David C. Hansen; +Cc: Containers


From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Subject: [PATCH 5/5] Deny external checkpoint unless task is frozen

Remove a 'FIXME' and ensure that the tasks we are checkpointing are
frozen unless its a self-checkpoint.

Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
 checkpoint/checkpoint.c |    5 ++++-
 1 files changed, 4 insertions(+), 1 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index 9189abb..cfa6b4f 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -19,6 +19,7 @@
 #include <linux/mount.h>
 #include <linux/utsname.h>
 #include <linux/magic.h>
+#include <linux/freezer.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
 
@@ -280,7 +281,9 @@ static int cr_may_checkpoint_task(struct task_struct *t, struct cr_ctx *ctx)
 	if (!ptrace_may_access(t, PTRACE_MODE_READ))
 		return -EPERM;
 
-	/* FIXME: verify that the task is frozen (unless self) */
+	/* verify that the task is frozen (unless self) */
+	if (t != current && !frozen(t))
+		return -EBUSY;
 
 	/* FIXME: change this for nested containers */
 	if (task_nsproxy(t) != ctx->root_nsproxy)
-- 
1.5.2.5

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/5] Track in-kernel when we expect checkpoint/restart to work
       [not found]     ` <20090312042503.GA8703-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-12 16:57       ` Serge E. Hallyn
  0 siblings, 0 replies; 11+ messages in thread
From: Serge E. Hallyn @ 2009-03-12 16:57 UTC (permalink / raw)
  To: Sukadev Bhattiprolu; +Cc: Containers, David C. Hansen

Quoting Sukadev Bhattiprolu (sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org):

Hi Suka,

the reason this patch was dropped is that we've moved from
task to slightly more per-resource checkpointability tracking.

So right now there is no t->may_checkpoint, but there is
t->files->may_checkpoint and t->mm->may_checkpoint.

-serge

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 4/5] Check 'may_checkpoint()' early
       [not found]     ` <20090312042559.GD8703-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-12 17:33       ` Serge E. Hallyn
  0 siblings, 0 replies; 11+ messages in thread
From: Serge E. Hallyn @ 2009-03-12 17:33 UTC (permalink / raw)
  To: Sukadev Bhattiprolu; +Cc: Containers, David C. Hansen

Quoting Sukadev Bhattiprolu (sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org):
> 
> From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
> Subject: [PATCH 4/5] Check 'may_checkpoint()' early
> 
> We currently check if a task is checkpointable when writing the task
> information to checkpoint file. The small downside of doing this check
> late is that we may have processed several processes in the tree before
> hitting one that cannot be checkpointed.
> 
> We anyway walk the process tree we are checkpointing earlier, while
> counting the tasks. We could check if all processes in the tree are
> checkpointable at that time and fail early if any of them are not.
> Since the process tree should be frozen, checking earlier should not
> matter ?
> 
> For now, the patch leaves the existing check in cr_write_pids(). We
> can remove that later.
> 
> Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>

Yes this seems like a good idea to me.

Acked-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>

> ---
>  checkpoint/checkpoint.c |    9 +++++++++
>  1 files changed, 9 insertions(+), 0 deletions(-)
> 
> diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
> index fbcd9eb..9189abb 100644
> --- a/checkpoint/checkpoint.c
> +++ b/checkpoint/checkpoint.c
> @@ -346,11 +346,18 @@ static int cr_tree_count_tasks(struct cr_ctx *ctx)
>  	struct task_struct **tasks_arr = ctx->tasks_arr;
>  	int tasks_nr = ctx->tasks_nr;
>  	int nr = 0;
> +	int ret;
> 
>  	read_lock(&tasklist_lock);
> 
>  	/* count tasks via DFS scan of the tree */
>  	while (1) {
> +		ret = cr_may_checkpoint_task(task, ctx);
> +		if (ret < 0) {
> +			nr = ret;
> +			break;
> +		}
> +
>  		if (tasks_arr) {
>  			/* unlikely, but ... */
>  			if (nr == tasks_nr)
> @@ -406,6 +413,8 @@ static int cr_build_tree(struct cr_ctx *ctx)
> 
>  	/* count tasks (no side effects) */
>  	n = cr_tree_count_tasks(ctx);
> +	if (n < 0)
> +		return n;
> 
>  	ctx->tasks_nr = n;
>  	ctx->tasks_arr = kzalloc(n * sizeof(*ctx->tasks_arr), GFP_KERNEL);
> -- 
> 1.5.2.5
> 
> _______________________________________________
> Containers mailing list
> Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> https://lists.linux-foundation.org/mailman/listinfo/containers

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 5/5] Deny external checkpoint unless task is frozen
       [not found]     ` <20090312042614.GE8703-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-12 17:37       ` Serge E. Hallyn
       [not found]         ` <20090312173707.GC17253-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 11+ messages in thread
From: Serge E. Hallyn @ 2009-03-12 17:37 UTC (permalink / raw)
  To: Sukadev Bhattiprolu; +Cc: Containers, David C. Hansen

Quoting Sukadev Bhattiprolu (sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org):
> 
> From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
> Subject: [PATCH 5/5] Deny external checkpoint unless task is frozen
> 
> Remove a 'FIXME' and ensure that the tasks we are checkpointing are
> frozen unless its a self-checkpoint.
> 
> Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>

I remain not-a-fan of self-checkpoint, and think it needlessly
complicates locking, but it isn't for this patch to change that :)

Acked-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>

> ---
>  checkpoint/checkpoint.c |    5 ++++-
>  1 files changed, 4 insertions(+), 1 deletions(-)
> 
> diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
> index 9189abb..cfa6b4f 100644
> --- a/checkpoint/checkpoint.c
> +++ b/checkpoint/checkpoint.c
> @@ -19,6 +19,7 @@
>  #include <linux/mount.h>
>  #include <linux/utsname.h>
>  #include <linux/magic.h>
> +#include <linux/freezer.h>
>  #include <linux/checkpoint.h>
>  #include <linux/checkpoint_hdr.h>
> 
> @@ -280,7 +281,9 @@ static int cr_may_checkpoint_task(struct task_struct *t, struct cr_ctx *ctx)
>  	if (!ptrace_may_access(t, PTRACE_MODE_READ))
>  		return -EPERM;
> 
> -	/* FIXME: verify that the task is frozen (unless self) */
> +	/* verify that the task is frozen (unless self) */
> +	if (t != current && !frozen(t))
> +		return -EBUSY;
> 
>  	/* FIXME: change this for nested containers */
>  	if (task_nsproxy(t) != ctx->root_nsproxy)
> -- 
> 1.5.2.5
> 
> _______________________________________________
> Containers mailing list
> Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> https://lists.linux-foundation.org/mailman/listinfo/containers

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 5/5] Deny external checkpoint unless task is frozen
       [not found]         ` <20090312173707.GC17253-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-03-13  3:19           ` Oren Laadan
       [not found]             ` <49B9D0BD.40701-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  0 siblings, 1 reply; 11+ messages in thread
From: Oren Laadan @ 2009-03-13  3:19 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: Containers, Sukadev Bhattiprolu, David C. Hansen



Serge E. Hallyn wrote:
> Quoting Sukadev Bhattiprolu (sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org):
>> From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
>> Subject: [PATCH 5/5] Deny external checkpoint unless task is frozen
>>
>> Remove a 'FIXME' and ensure that the tasks we are checkpointing are
>> frozen unless its a self-checkpoint.
>>
>> Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
> 
> I remain not-a-fan of self-checkpoint, and think it needlessly
> complicates locking, but it isn't for this patch to change that :)

lol ... here's is an interesting use case for self checkpoint: "ASSURE:
Automatic Software Self-healing Using REscue points" (ASPLOS 2009).

Oren.

> 
> Acked-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
> 
>> ---
>>  checkpoint/checkpoint.c |    5 ++++-
>>  1 files changed, 4 insertions(+), 1 deletions(-)
>>
>> diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
>> index 9189abb..cfa6b4f 100644
>> --- a/checkpoint/checkpoint.c
>> +++ b/checkpoint/checkpoint.c
>> @@ -19,6 +19,7 @@
>>  #include <linux/mount.h>
>>  #include <linux/utsname.h>
>>  #include <linux/magic.h>
>> +#include <linux/freezer.h>
>>  #include <linux/checkpoint.h>
>>  #include <linux/checkpoint_hdr.h>
>>
>> @@ -280,7 +281,9 @@ static int cr_may_checkpoint_task(struct task_struct *t, struct cr_ctx *ctx)
>>  	if (!ptrace_may_access(t, PTRACE_MODE_READ))
>>  		return -EPERM;
>>
>> -	/* FIXME: verify that the task is frozen (unless self) */
>> +	/* verify that the task is frozen (unless self) */
>> +	if (t != current && !frozen(t))
>> +		return -EBUSY;
>>
>>  	/* FIXME: change this for nested containers */
>>  	if (task_nsproxy(t) != ctx->root_nsproxy)
>> -- 
>> 1.5.2.5
>>
>> _______________________________________________
>> Containers mailing list
>> Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
>> https://lists.linux-foundation.org/mailman/listinfo/containers
> _______________________________________________
> Containers mailing list
> Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> https://lists.linux-foundation.org/mailman/listinfo/containers
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 5/5] Deny external checkpoint unless task is frozen
       [not found]             ` <49B9D0BD.40701-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-03-13 13:54               ` Serge E. Hallyn
  0 siblings, 0 replies; 11+ messages in thread
From: Serge E. Hallyn @ 2009-03-13 13:54 UTC (permalink / raw)
  To: Oren Laadan; +Cc: Containers, Sukadev Bhattiprolu, David C. Hansen

Quoting Oren Laadan (orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org):
> 
> 
> Serge E. Hallyn wrote:
> > Quoting Sukadev Bhattiprolu (sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org):
> >> From: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
> >> Subject: [PATCH 5/5] Deny external checkpoint unless task is frozen
> >>
> >> Remove a 'FIXME' and ensure that the tasks we are checkpointing are
> >> frozen unless its a self-checkpoint.
> >>
> >> Signed-off-by: Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
> > 
> > I remain not-a-fan of self-checkpoint, and think it needlessly
> > complicates locking, but it isn't for this patch to change that :)
> 
> lol ... here's is an interesting use case for self checkpoint: "ASSURE:
> Automatic Software Self-healing Using REscue points" (ASPLOS 2009).
> 
> Oren.

Right, but we can always do self-checkpoint by sending a signal to
an external daemon which freezes us and then does the checkpoint, no?

Ok I haven't read the paper, so if the answer is "no that won't work,
dumbass" then i'll go and read it :)

thanks,
-serge

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2009-03-13 13:54 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-03-12  4:24 [PATCH 0/5] Missing patches in Dave Hansen's tree Sukadev Bhattiprolu
     [not found] ` <20090312042416.GC7733-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-12  4:25   ` [PATCH 1/5] Track in-kernel when we expect checkpoint/restart to work Sukadev Bhattiprolu
     [not found]     ` <20090312042503.GA8703-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-12 16:57       ` Serge E. Hallyn
2009-03-12  4:25   ` [PATCH 2/5] Checkpoint multiple processes Sukadev Bhattiprolu
2009-03-12  4:25   ` [PATCH 3/5] Restart " Sukadev Bhattiprolu
2009-03-12  4:25   ` [PATCH 4/5] Check 'may_checkpoint()' early Sukadev Bhattiprolu
     [not found]     ` <20090312042559.GD8703-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-12 17:33       ` Serge E. Hallyn
2009-03-12  4:26   ` [PATCH 5/5] Deny external checkpoint unless task is frozen Sukadev Bhattiprolu
     [not found]     ` <20090312042614.GE8703-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-12 17:37       ` Serge E. Hallyn
     [not found]         ` <20090312173707.GC17253-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-03-13  3:19           ` Oren Laadan
     [not found]             ` <49B9D0BD.40701-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-03-13 13:54               ` Serge E. Hallyn

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.