linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed
* [RFC/PATCH 0/3] checkpoint/restart for powerpc
@ 2009-01-28 22:41 Nathan Lynch
  2009-01-28 22:41 ` [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation Nathan Lynch
                   ` (2 more replies)
  0 siblings, 3 replies; 28+ messages in thread
From: Nathan Lynch @ 2009-01-28 22:41 UTC (permalink / raw)
  To: containers; +Cc: linuxppc-dev

This series is a first attempt at enabling checkpoint/restart for
powerpc.  It is based on Oren Laadan's v13 checkpoint/restart series:

http://lkml.org/lkml/2009/1/27/227

Nathan Lynch (3):
  powerpc: bare minimum checkpoint/restart implementation
  powerpc: wire up checkpoint and restart syscalls
  allow checkpoint/restart on powerpc

 arch/powerpc/include/asm/checkpoint_hdr.h |   40 +++++
 arch/powerpc/include/asm/systbl.h         |    2 +
 arch/powerpc/include/asm/unistd.h         |    4 +-
 arch/powerpc/mm/Makefile                  |    1 +
 arch/powerpc/mm/checkpoint.c              |  261 +++++++++++++++++++++++++++++
 checkpoint/Kconfig                        |    2 +-
 6 files changed, 308 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/include/asm/checkpoint_hdr.h
 create mode 100644 arch/powerpc/mm/checkpoint.c

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation
  2009-01-28 22:41 [RFC/PATCH 0/3] checkpoint/restart for powerpc Nathan Lynch
@ 2009-01-28 22:41 ` Nathan Lynch
  2009-01-29  6:41   ` Oren Laadan
                     ` (2 more replies)
  2009-01-28 22:41 ` [PATCH 2/3] powerpc: wire up checkpoint and restart syscalls Nathan Lynch
  2009-01-28 22:41 ` [PATCH 3/3] allow checkpoint/restart on powerpc Nathan Lynch
  2 siblings, 3 replies; 28+ messages in thread
From: Nathan Lynch @ 2009-01-28 22:41 UTC (permalink / raw)
  To: containers; +Cc: linuxppc-dev

The only thing of significance here is that the checkpointed task's
pt_regs and fp state are saved and restored (see cr_write_cpu and
cr_read_cpu); the rest of the code consists of dummy implementations
of the APIs the arch needs to provide to the checkpoint/restart core.

What works:
* self and external checkpoint of simple (single thread, one open
  file) 32- and 64-bit processes on a ppc64 kernel

What doesn't work:
* restarting a 32-bit task from a 64-bit task and vice versa

Untested:
* ppc32 (but it builds)

Signed-off-by: Nathan Lynch <ntl@pobox.com>
---
 arch/powerpc/include/asm/checkpoint_hdr.h |   40 +++++
 arch/powerpc/mm/Makefile                  |    1 +
 arch/powerpc/mm/checkpoint.c              |  261 +++++++++++++++++++++++++++++
 3 files changed, 302 insertions(+), 0 deletions(-)
 create mode 100644 arch/powerpc/include/asm/checkpoint_hdr.h
 create mode 100644 arch/powerpc/mm/checkpoint.c

diff --git a/arch/powerpc/include/asm/checkpoint_hdr.h b/arch/powerpc/include/asm/checkpoint_hdr.h
new file mode 100644
index 0000000..752c53f
--- /dev/null
+++ b/arch/powerpc/include/asm/checkpoint_hdr.h
@@ -0,0 +1,40 @@
+#ifndef __ASM_PPC_CKPT_HDR_H
+#define __ASM_PPC_CKPT_HDR_H
+/*
+ *  Checkpoint/restart - architecture specific headers ppc
+ *
+ *  Copyright (C) 2008 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#include <linux/types.h>
+#include <asm/ptrace.h>
+#include <asm/mmu.h>
+#include <asm/processor.h>
+
+struct cr_hdr_head_arch {
+	__u32 unimplemented;
+};
+
+struct cr_hdr_thread {
+	__u32 unimplemented;
+};
+
+struct cr_hdr_cpu {
+	struct pt_regs pt_regs;
+	/* relevant fields from thread_struct */
+	double fpr[32][TS_FPRWIDTH];
+	unsigned int fpscr;
+	int fpexc_mode;
+	/* unsigned int align_ctl; this is never updated? */
+	unsigned long dabr;
+};
+
+struct cr_hdr_mm_context {
+	__u32 unimplemented;
+};
+
+#endif /* __ASM_PPC_CKPT_HDR__H */
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index e7392b4..8a523a0 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -24,3 +24,4 @@ obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
 obj-$(CONFIG_PPC_MM_SLICES)	+= slice.o
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
 obj-$(CONFIG_PPC_SUBPAGE_PROT)	+= subpage-prot.o
+obj-$(CONFIG_CHECKPOINT_RESTART) += checkpoint.o
diff --git a/arch/powerpc/mm/checkpoint.c b/arch/powerpc/mm/checkpoint.c
new file mode 100644
index 0000000..8cdff24
--- /dev/null
+++ b/arch/powerpc/mm/checkpoint.c
@@ -0,0 +1,261 @@
+/*
+ *  Checkpoint/restart - architecture specific support for powerpc.
+ *  Based on x86 implementation.
+ *
+ *  Copyright (C) 2008 Oren Laadan
+ *  Copyright 2009 IBM Corp.
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#define DEBUG 1 /* for pr_debug */
+
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+#include <linux/kernel.h>
+#include <asm/processor.h>
+
+static void cr_hdr_init(struct cr_hdr *hdr, __s16 type, __s16 len, __u32 parent)
+{
+	hdr->type = type;
+	hdr->len = len;
+	hdr->parent = parent;
+}
+
+/* dump the thread_struct of a given task */
+int cr_write_thread(struct cr_ctx *ctx, struct task_struct *t)
+{
+	struct cr_hdr_thread *thread_hdr;
+	struct cr_hdr cr_hdr;
+	u32 parent;
+	int ret;
+
+	thread_hdr = cr_hbuf_get(ctx, sizeof(*thread_hdr));
+	if (!thread_hdr)
+		return -ENOMEM;
+
+	parent = task_pid_vnr(t);
+
+	cr_hdr_init(&cr_hdr, CR_HDR_THREAD, sizeof(*thread_hdr), parent);
+
+	thread_hdr->unimplemented = 0xdeadbeef;
+
+	ret = cr_write_obj(ctx, &cr_hdr, thread_hdr);
+	cr_hbuf_put(ctx, sizeof(*thread_hdr));
+
+	return ret;
+}
+
+/* dump the cpu state and registers of a given task */
+int cr_write_cpu(struct cr_ctx *ctx, struct task_struct *t)
+{
+	struct cr_hdr_cpu *cpu_hdr;
+	struct pt_regs *pt_regs;
+	struct cr_hdr cr_hdr;
+	u32 parent;
+	int ret;
+
+	cpu_hdr = cr_hbuf_get(ctx, sizeof(*cpu_hdr));
+	if (!cpu_hdr)
+		return -ENOMEM;
+
+	parent = task_pid_vnr(t);
+
+	cr_hdr_init(&cr_hdr, CR_HDR_CPU, sizeof(*cpu_hdr), parent);
+
+	/* pt_regs: GPRs, MSR, etc */
+	pt_regs = task_pt_regs(t);
+	cpu_hdr->pt_regs = *pt_regs;
+
+	/* FP state */
+	memcpy(cpu_hdr->fpr, t->thread.fpr, sizeof(cpu_hdr->fpr));
+	cpu_hdr->fpscr = t->thread.fpscr.val;
+	cpu_hdr->fpexc_mode = t->thread.fpexc_mode;
+
+	/* Handle DABR for now, dbcr[01] later */
+	cpu_hdr->dabr = t->thread.dabr;
+
+	/* ToDo: Altivec/VSX/SPE state */
+
+	ret = cr_write_obj(ctx, &cr_hdr, cpu_hdr);
+	cr_hbuf_put(ctx, sizeof(*cpu_hdr));
+
+	return ret;
+}
+
+int cr_write_head_arch(struct cr_ctx *ctx)
+{
+	struct cr_hdr_head_arch *arch_hdr;
+	struct cr_hdr cr_hdr;
+	int ret;
+
+	arch_hdr = cr_hbuf_get(ctx, sizeof(*arch_hdr));
+	if (!arch_hdr)
+		return -ENOMEM;
+
+	cr_hdr_init(&cr_hdr, CR_HDR_HEAD_ARCH, sizeof(*arch_hdr), 0);
+
+	arch_hdr->unimplemented = 0xdeadbeef;
+
+	ret = cr_write_obj(ctx, &cr_hdr, arch_hdr);
+	cr_hbuf_put(ctx, sizeof(*arch_hdr));
+
+	return ret;
+}
+
+/* dump the mm->context state */
+int cr_write_mm_context(struct cr_ctx *ctx, struct mm_struct *mm, int parent)
+{
+	struct cr_hdr_mm_context *mm_hdr;
+	struct cr_hdr cr_hdr;
+	size_t size;
+	int ret;
+
+	size = sizeof(*mm_hdr);
+
+	mm_hdr = cr_hbuf_get(ctx, size);
+	if (!mm_hdr)
+		return -ENOMEM;
+
+	cr_hdr_init(&cr_hdr, CR_HDR_MM_CONTEXT, size, parent);
+
+	mm_hdr->unimplemented = 0xdeadbeef;
+
+	ret = cr_write_obj(ctx, &cr_hdr, mm_hdr);
+	cr_hbuf_put(ctx, size);
+
+	return ret;
+}
+
+/* restart APIs */
+
+/* read the thread_struct into the current task */
+int cr_read_thread(struct cr_ctx *ctx)
+{
+	struct cr_hdr_thread *thread_hdr;
+	int ret;
+
+	thread_hdr = cr_hbuf_get(ctx, sizeof(*thread_hdr));
+	if (!thread_hdr)
+		return -ENOMEM;
+
+	ret = cr_read_obj_type(ctx, thread_hdr, sizeof(*thread_hdr),
+				  CR_HDR_THREAD);
+	if (ret < 0)
+		goto out;
+
+	ret = 0;
+
+	if (thread_hdr->unimplemented != 0xdeadbeef) {
+		pr_debug("%s: unexpected thread_hdr contents: 0x%lx\n",
+			 __func__, (unsigned long)thread_hdr->unimplemented);
+		ret = -EINVAL;
+	}
+out:
+	cr_hbuf_put(ctx, sizeof(*thread_hdr));
+	return ret;
+}
+
+/* Based on the MSR value from a checkpoint image, produce an MSR
+ * value that is appropriate for the restored task.  Right now we only
+ * check for MSR_SF (64-bit) for PPC64.
+ */
+static unsigned long sanitize_msr(unsigned long msr_ckpt)
+{
+#ifdef CONFIG_PPC32
+	return MSR_USER;
+#else
+	if (msr_ckpt & MSR_SF)
+		return MSR_USER64;
+	return MSR_USER32;
+#endif
+}
+
+int cr_read_cpu(struct cr_ctx *ctx)
+{
+	struct cr_hdr_cpu *cpu_hdr;
+	struct pt_regs *regs;
+	int ret;
+
+	cpu_hdr = cr_hbuf_get(ctx, sizeof(*cpu_hdr));
+	if (!cpu_hdr)
+		return -ENOMEM;
+
+	ret = cr_read_obj_type(ctx, cpu_hdr, sizeof(*cpu_hdr),
+				  CR_HDR_CPU);
+	if (ret < 0)
+		goto out;
+
+	ret = 0;
+
+	regs = task_pt_regs(current);
+	*regs = cpu_hdr->pt_regs;
+
+	regs->msr = sanitize_msr(regs->msr);
+
+	/* FP state */
+	memcpy(current->thread.fpr, cpu_hdr->fpr, sizeof(current->thread.fpr));
+	current->thread.fpscr.val = cpu_hdr->fpscr;
+	current->thread.fpexc_mode = cpu_hdr->fpexc_mode;
+
+	/* debug registers */
+	current->thread.dabr = cpu_hdr->dabr;
+out:
+	cr_hbuf_put(ctx, sizeof(*cpu_hdr));
+	return ret;
+}
+
+int cr_read_head_arch(struct cr_ctx *ctx)
+{
+	struct cr_hdr_head_arch *arch_hdr;
+	int ret;
+
+	arch_hdr = cr_hbuf_get(ctx, sizeof(*arch_hdr));
+	if (!arch_hdr)
+		return -ENOMEM;
+
+	ret = cr_read_obj_type(ctx, arch_hdr, sizeof(*arch_hdr),
+				  CR_HDR_HEAD_ARCH);
+	if (ret < 0)
+		goto out;
+
+	ret = 0;
+
+	if (arch_hdr->unimplemented != 0xdeadbeef) {
+		pr_debug("%s: unexpected arch_hdr contents: 0x%lx\n",
+			 __func__, (unsigned long)arch_hdr->unimplemented);
+		ret = -EINVAL;
+	}
+out:
+	cr_hbuf_put(ctx, sizeof(*arch_hdr));
+	return ret;
+}
+
+int cr_read_mm_context(struct cr_ctx *ctx, struct mm_struct *mm, int rparent)
+{
+	struct cr_hdr_mm_context *mm_hdr;
+	int ret;
+
+	mm_hdr = cr_hbuf_get(ctx, sizeof(*mm_hdr));
+	if (!mm_hdr)
+		return -ENOMEM;
+
+	ret = cr_read_obj_type(ctx, mm_hdr, sizeof(*mm_hdr),
+				  CR_HDR_MM_CONTEXT);
+	if (ret != rparent)
+		goto out;
+
+	ret = 0;
+
+	if (mm_hdr->unimplemented != 0xdeadbeef) {
+		pr_debug("%s: unexpected mm_hdr contents: 0x%lx\n",
+			 __func__, (unsigned long)mm_hdr->unimplemented);
+		ret = -EINVAL;
+	}
+
+out:
+	cr_hbuf_put(ctx, sizeof(*mm_hdr));
+	return ret;
+}
-- 
1.5.5

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 2/3] powerpc: wire up checkpoint and restart syscalls
  2009-01-28 22:41 [RFC/PATCH 0/3] checkpoint/restart for powerpc Nathan Lynch
  2009-01-28 22:41 ` [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation Nathan Lynch
@ 2009-01-28 22:41 ` Nathan Lynch
  2009-01-28 22:41 ` [PATCH 3/3] allow checkpoint/restart on powerpc Nathan Lynch
  2 siblings, 0 replies; 28+ messages in thread
From: Nathan Lynch @ 2009-01-28 22:41 UTC (permalink / raw)
  To: containers; +Cc: linuxppc-dev

Signed-off-by: Nathan Lynch <ntl@pobox.com>
---
 arch/powerpc/include/asm/systbl.h |    2 ++
 arch/powerpc/include/asm/unistd.h |    4 +++-
 2 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index 803def2..cdb60b4 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -322,3 +322,5 @@ SYSCALL_SPU(epoll_create1)
 SYSCALL_SPU(dup3)
 SYSCALL_SPU(pipe2)
 SYSCALL(inotify_init1)
+SYSCALL(checkpoint)
+SYSCALL(restart)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index e07d0c7..2e333a1 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -341,10 +341,12 @@
 #define __NR_dup3		316
 #define __NR_pipe2		317
 #define __NR_inotify_init1	318
+#define __NR_checkpoint		319
+#define __NR_restart		320
 
 #ifdef __KERNEL__
 
-#define __NR_syscalls		319
+#define __NR_syscalls		321
 
 #define __NR__exit __NR_exit
 #define NR_syscalls	__NR_syscalls
-- 
1.5.5

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 3/3] allow checkpoint/restart on powerpc
  2009-01-28 22:41 [RFC/PATCH 0/3] checkpoint/restart for powerpc Nathan Lynch
  2009-01-28 22:41 ` [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation Nathan Lynch
  2009-01-28 22:41 ` [PATCH 2/3] powerpc: wire up checkpoint and restart syscalls Nathan Lynch
@ 2009-01-28 22:41 ` Nathan Lynch
  2 siblings, 0 replies; 28+ messages in thread
From: Nathan Lynch @ 2009-01-28 22:41 UTC (permalink / raw)
  To: containers; +Cc: linuxppc-dev

Signed-off-by: Nathan Lynch <ntl@pobox.com>
---
 checkpoint/Kconfig |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/checkpoint/Kconfig b/checkpoint/Kconfig
index ffaa635..00036aa 100644
--- a/checkpoint/Kconfig
+++ b/checkpoint/Kconfig
@@ -1,7 +1,7 @@
 config CHECKPOINT_RESTART
 	prompt "Enable checkpoint/restart (EXPERIMENTAL)"
 	def_bool n
-	depends on X86_32 && EXPERIMENTAL
+	depends on (X86_32 || PPC) && EXPERIMENTAL
 	help
 	  Application checkpoint/restart is the ability to save the
 	  state of a running application so that it can later resume
-- 
1.5.5

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation
  2009-01-28 22:41 ` [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation Nathan Lynch
@ 2009-01-29  6:41   ` Oren Laadan
  2009-01-29 21:40     ` Nathan Lynch
  2009-01-30  4:01     ` Serge E. Hallyn
  2009-01-30  3:55   ` Serge E. Hallyn
  2009-02-04  3:39   ` Benjamin Herrenschmidt
  2 siblings, 2 replies; 28+ messages in thread
From: Oren Laadan @ 2009-01-29  6:41 UTC (permalink / raw)
  To: Nathan Lynch; +Cc: containers, linuxppc-dev

Nathan,

Thanks for the patch. Looks good, see some comments below.
(disclaimer: I'm not very familiar with ppc architecture)

Nathan Lynch wrote:
> The only thing of significance here is that the checkpointed task's
> pt_regs and fp state are saved and restored (see cr_write_cpu and
> cr_read_cpu); the rest of the code consists of dummy implementations
> of the APIs the arch needs to provide to the checkpoint/restart core.
> 
> What works:
> * self and external checkpoint of simple (single thread, one open
>   file) 32- and 64-bit processes on a ppc64 kernel
> 
> What doesn't work:
> * restarting a 32-bit task from a 64-bit task and vice versa

Is there a test to bail if we attempt to checkpoint such tasks ?

> 
> Untested:
> * ppc32 (but it builds)
> 
> Signed-off-by: Nathan Lynch <ntl@pobox.com>
> ---
>  arch/powerpc/include/asm/checkpoint_hdr.h |   40 +++++
>  arch/powerpc/mm/Makefile                  |    1 +
>  arch/powerpc/mm/checkpoint.c              |  261 +++++++++++++++++++++++++++++
>  3 files changed, 302 insertions(+), 0 deletions(-)
>  create mode 100644 arch/powerpc/include/asm/checkpoint_hdr.h
>  create mode 100644 arch/powerpc/mm/checkpoint.c
> 
> diff --git a/arch/powerpc/include/asm/checkpoint_hdr.h b/arch/powerpc/include/asm/checkpoint_hdr.h
> new file mode 100644
> index 0000000..752c53f
> --- /dev/null
> +++ b/arch/powerpc/include/asm/checkpoint_hdr.h
> @@ -0,0 +1,40 @@
> +#ifndef __ASM_PPC_CKPT_HDR_H
> +#define __ASM_PPC_CKPT_HDR_H
> +/*
> + *  Checkpoint/restart - architecture specific headers ppc
> + *
> + *  Copyright (C) 2008 Oren Laadan
> + *
> + *  This file is subject to the terms and conditions of the GNU General Public
> + *  License.  See the file COPYING in the main directory of the Linux
> + *  distribution for more details.
> + */
> +
> +#include <linux/types.h>
> +#include <asm/ptrace.h>
> +#include <asm/mmu.h>
> +#include <asm/processor.h>
> +
> +struct cr_hdr_head_arch {
> +	__u32 unimplemented;
> +};
> +
> +struct cr_hdr_thread {
> +	__u32 unimplemented;
> +};
> +
> +struct cr_hdr_cpu {
> +	struct pt_regs pt_regs;

It has been suggested (as done in x86/32 code) not to use 'struct pt_regs'
because it "can (and has) changed on x86" and because "it only container
the registers that the kernel trashes, not all usermode registers".

https://lists.linux-foundation.org/pipermail/containers/2008-August/012355.html

> +	/* relevant fields from thread_struct */
> +	double fpr[32][TS_FPRWIDTH];

Can TS_FPRWIDTH change between sub-archs or kernel versions ?  If so, it
needs to be stated explicitly.

> +	unsigned int fpscr;
> +	int fpexc_mode;
> +	/* unsigned int align_ctl; this is never updated? */
> +	unsigned long dabr;

Are these fields always guarantee to compile to the same number of bytes
regardless of 32/64 bit choice of compiler (or sub-arch?) ?

In the x86(32/64) architecture we use types with explicit size such as
__u32 and the like to ensure that it always compiled to the same size.

> +};
> +
> +struct cr_hdr_mm_context {
> +	__u32 unimplemented;
> +};
> +
> +#endif /* __ASM_PPC_CKPT_HDR__H */
> diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
> index e7392b4..8a523a0 100644
> --- a/arch/powerpc/mm/Makefile
> +++ b/arch/powerpc/mm/Makefile
> @@ -24,3 +24,4 @@ obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
>  obj-$(CONFIG_PPC_MM_SLICES)	+= slice.o
>  obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
>  obj-$(CONFIG_PPC_SUBPAGE_PROT)	+= subpage-prot.o
> +obj-$(CONFIG_CHECKPOINT_RESTART) += checkpoint.o
> diff --git a/arch/powerpc/mm/checkpoint.c b/arch/powerpc/mm/checkpoint.c
> new file mode 100644
> index 0000000..8cdff24
> --- /dev/null
> +++ b/arch/powerpc/mm/checkpoint.c
> @@ -0,0 +1,261 @@
> +/*
> + *  Checkpoint/restart - architecture specific support for powerpc.
> + *  Based on x86 implementation.
> + *
> + *  Copyright (C) 2008 Oren Laadan
> + *  Copyright 2009 IBM Corp.
> + *
> + *  This file is subject to the terms and conditions of the GNU General Public
> + *  License.  See the file COPYING in the main directory of the Linux
> + *  distribution for more details.
> + */
> +
> +#define DEBUG 1 /* for pr_debug */
> +
> +#include <linux/checkpoint.h>
> +#include <linux/checkpoint_hdr.h>
> +#include <linux/kernel.h>
> +#include <asm/processor.h>
> +
> +static void cr_hdr_init(struct cr_hdr *hdr, __s16 type, __s16 len, __u32 parent)
> +{
> +	hdr->type = type;
> +	hdr->len = len;
> +	hdr->parent = parent;
> +}
> +

This function is rather generic and useful to non-arch-dependent and other
architectures code. Perhaps put in a separate patch ?

> +/* dump the thread_struct of a given task */
> +int cr_write_thread(struct cr_ctx *ctx, struct task_struct *t)
> +{
> +	struct cr_hdr_thread *thread_hdr;
> +	struct cr_hdr cr_hdr;
> +	u32 parent;
> +	int ret;
> +
> +	thread_hdr = cr_hbuf_get(ctx, sizeof(*thread_hdr));
> +	if (!thread_hdr)
> +		return -ENOMEM;
> +
> +	parent = task_pid_vnr(t);
> +
> +	cr_hdr_init(&cr_hdr, CR_HDR_THREAD, sizeof(*thread_hdr), parent);
> +
> +	thread_hdr->unimplemented = 0xdeadbeef;
> +
> +	ret = cr_write_obj(ctx, &cr_hdr, thread_hdr);
> +	cr_hbuf_put(ctx, sizeof(*thread_hdr));
> +
> +	return ret;
> +}
> +
> +/* dump the cpu state and registers of a given task */
> +int cr_write_cpu(struct cr_ctx *ctx, struct task_struct *t)
> +{
> +	struct cr_hdr_cpu *cpu_hdr;
> +	struct pt_regs *pt_regs;
> +	struct cr_hdr cr_hdr;
> +	u32 parent;
> +	int ret;
> +
> +	cpu_hdr = cr_hbuf_get(ctx, sizeof(*cpu_hdr));
> +	if (!cpu_hdr)
> +		return -ENOMEM;
> +
> +	parent = task_pid_vnr(t);
> +
> +	cr_hdr_init(&cr_hdr, CR_HDR_CPU, sizeof(*cpu_hdr), parent);
> +
> +	/* pt_regs: GPRs, MSR, etc */
> +	pt_regs = task_pt_regs(t);
> +	cpu_hdr->pt_regs = *pt_regs;
> +
> +	/* FP state */
> +	memcpy(cpu_hdr->fpr, t->thread.fpr, sizeof(cpu_hdr->fpr));

As note above, is sizeof(cpu_hdr->fpr) the same on all chips ?

> +	cpu_hdr->fpscr = t->thread.fpscr.val;
> +	cpu_hdr->fpexc_mode = t->thread.fpexc_mode;
> +
> +	/* Handle DABR for now, dbcr[01] later */
> +	cpu_hdr->dabr = t->thread.dabr;
> +
> +	/* ToDo: Altivec/VSX/SPE state */
> +
> +	ret = cr_write_obj(ctx, &cr_hdr, cpu_hdr);
> +	cr_hbuf_put(ctx, sizeof(*cpu_hdr));
> +
> +	return ret;
> +}
> +
> +int cr_write_head_arch(struct cr_ctx *ctx)
> +{
> +	struct cr_hdr_head_arch *arch_hdr;
> +	struct cr_hdr cr_hdr;
> +	int ret;
> +
> +	arch_hdr = cr_hbuf_get(ctx, sizeof(*arch_hdr));
> +	if (!arch_hdr)
> +		return -ENOMEM;
> +
> +	cr_hdr_init(&cr_hdr, CR_HDR_HEAD_ARCH, sizeof(*arch_hdr), 0);
> +
> +	arch_hdr->unimplemented = 0xdeadbeef;
> +
> +	ret = cr_write_obj(ctx, &cr_hdr, arch_hdr);
> +	cr_hbuf_put(ctx, sizeof(*arch_hdr));
> +
> +	return ret;
> +}
> +
> +/* dump the mm->context state */
> +int cr_write_mm_context(struct cr_ctx *ctx, struct mm_struct *mm, int parent)
> +{
> +	struct cr_hdr_mm_context *mm_hdr;
> +	struct cr_hdr cr_hdr;
> +	size_t size;
> +	int ret;
> +
> +	size = sizeof(*mm_hdr);
> +
> +	mm_hdr = cr_hbuf_get(ctx, size);
> +	if (!mm_hdr)
> +		return -ENOMEM;
> +
> +	cr_hdr_init(&cr_hdr, CR_HDR_MM_CONTEXT, size, parent);
> +
> +	mm_hdr->unimplemented = 0xdeadbeef;
> +
> +	ret = cr_write_obj(ctx, &cr_hdr, mm_hdr);
> +	cr_hbuf_put(ctx, size);
> +
> +	return ret;
> +}
> +
> +/* restart APIs */
> +

The restart APIs belong in a separate file: arch/powerpc/mm/restart.c

> +/* read the thread_struct into the current task */
> +int cr_read_thread(struct cr_ctx *ctx)
> +{
> +	struct cr_hdr_thread *thread_hdr;
> +	int ret;
> +
> +	thread_hdr = cr_hbuf_get(ctx, sizeof(*thread_hdr));
> +	if (!thread_hdr)
> +		return -ENOMEM;
> +
> +	ret = cr_read_obj_type(ctx, thread_hdr, sizeof(*thread_hdr),
> +				  CR_HDR_THREAD);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = 0;
> +
> +	if (thread_hdr->unimplemented != 0xdeadbeef) {
> +		pr_debug("%s: unexpected thread_hdr contents: 0x%lx\n",
> +			 __func__, (unsigned long)thread_hdr->unimplemented);

Given the macro for 'pr_fmt' in include/linux/checkpoint.h, the use of
__func__ is redunant.

> +		ret = -EINVAL;
> +	}
> +out:
> +	cr_hbuf_put(ctx, sizeof(*thread_hdr));
> +	return ret;
> +}
> +
> +/* Based on the MSR value from a checkpoint image, produce an MSR
> + * value that is appropriate for the restored task.  Right now we only
> + * check for MSR_SF (64-bit) for PPC64.
> + */
> +static unsigned long sanitize_msr(unsigned long msr_ckpt)
> +{
> +#ifdef CONFIG_PPC32
> +	return MSR_USER;
> +#else
> +	if (msr_ckpt & MSR_SF)
> +		return MSR_USER64;
> +	return MSR_USER32;
> +#endif
> +}
> +
> +int cr_read_cpu(struct cr_ctx *ctx)
> +{
> +	struct cr_hdr_cpu *cpu_hdr;
> +	struct pt_regs *regs;
> +	int ret;
> +
> +	cpu_hdr = cr_hbuf_get(ctx, sizeof(*cpu_hdr));
> +	if (!cpu_hdr)
> +		return -ENOMEM;
> +
> +	ret = cr_read_obj_type(ctx, cpu_hdr, sizeof(*cpu_hdr),
> +				  CR_HDR_CPU);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = 0;
> +
> +	regs = task_pt_regs(current);
> +	*regs = cpu_hdr->pt_regs;
> +
> +	regs->msr = sanitize_msr(regs->msr);
> +
> +	/* FP state */
> +	memcpy(current->thread.fpr, cpu_hdr->fpr, sizeof(current->thread.fpr));
> +	current->thread.fpscr.val = cpu_hdr->fpscr;
> +	current->thread.fpexc_mode = cpu_hdr->fpexc_mode;
> +
> +	/* debug registers */
> +	current->thread.dabr = cpu_hdr->dabr;

I'm unfamiliar with powerpc; is it necessary to sanitize any of the registers
here ?  For instance, can the user cause harm with specially crafted values
of some registers ?

> +out:
> +	cr_hbuf_put(ctx, sizeof(*cpu_hdr));
> +	return ret;
> +}
> +
> +int cr_read_head_arch(struct cr_ctx *ctx)
> +{
> +	struct cr_hdr_head_arch *arch_hdr;
> +	int ret;
> +
> +	arch_hdr = cr_hbuf_get(ctx, sizeof(*arch_hdr));
> +	if (!arch_hdr)
> +		return -ENOMEM;
> +
> +	ret = cr_read_obj_type(ctx, arch_hdr, sizeof(*arch_hdr),
> +				  CR_HDR_HEAD_ARCH);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = 0;
> +
> +	if (arch_hdr->unimplemented != 0xdeadbeef) {
> +		pr_debug("%s: unexpected arch_hdr contents: 0x%lx\n",
> +			 __func__, (unsigned long)arch_hdr->unimplemented);

Same as above: __func__ is redundant.

> +		ret = -EINVAL;
> +	}
> +out:
> +	cr_hbuf_put(ctx, sizeof(*arch_hdr));
> +	return ret;
> +}
> +
> +int cr_read_mm_context(struct cr_ctx *ctx, struct mm_struct *mm, int rparent)
> +{
> +	struct cr_hdr_mm_context *mm_hdr;
> +	int ret;
> +
> +	mm_hdr = cr_hbuf_get(ctx, sizeof(*mm_hdr));
> +	if (!mm_hdr)
> +		return -ENOMEM;
> +
> +	ret = cr_read_obj_type(ctx, mm_hdr, sizeof(*mm_hdr),
> +				  CR_HDR_MM_CONTEXT);
> +	if (ret != rparent)
> +		goto out;

Seems like 'ret' isn't set to an error value if the 'goto' executes.

> +
> +	ret = 0;
> +
> +	if (mm_hdr->unimplemented != 0xdeadbeef) {
> +		pr_debug("%s: unexpected mm_hdr contents: 0x%lx\n",
> +			 __func__, (unsigned long)mm_hdr->unimplemented);

Again, __func__ redundant.

> +		ret = -EINVAL;
> +	}
> +
> +out:
> +	cr_hbuf_put(ctx, sizeof(*mm_hdr));
> +	return ret;
> +}

Thanks,

Oren.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation
  2009-01-29  6:41   ` Oren Laadan
@ 2009-01-29 21:40     ` Nathan Lynch
  2009-01-30  0:11       ` Oren Laadan
  2009-02-17  7:03       ` Nathan Lynch
  2009-01-30  4:01     ` Serge E. Hallyn
  1 sibling, 2 replies; 28+ messages in thread
From: Nathan Lynch @ 2009-01-29 21:40 UTC (permalink / raw)
  To: Oren Laadan; +Cc: containers, linuxppc-dev

Hey Oren, thanks for taking a look.

Oren Laadan wrote:
> 
> Nathan Lynch wrote:
> > 
> > What doesn't work:
> > * restarting a 32-bit task from a 64-bit task and vice versa
> 
> Is there a test to bail if we attempt to checkpoint such tasks ?

No, but I'll add one if it looks too hard to fix for the next round.


> > +struct cr_hdr_cpu {
> > +	struct pt_regs pt_regs;
> 
> It has been suggested (as done in x86/32 code) not to use 'struct pt_regs'
> because it "can (and has) changed on x86" and because "it only container
> the registers that the kernel trashes, not all usermode registers".
> 
> https://lists.linux-foundation.org/pipermail/containers/2008-August/012355.html

Yeah, I considered that discussion, but the situation is different for
powerpc (someone on linuxppc-dev smack me if I'm wrong here :)
pt_regs is part of the ABI, and it encompasses all user mode registers
except for floating point, which are handled separately.


> > +	/* relevant fields from thread_struct */
> > +	double fpr[32][TS_FPRWIDTH];
> 
> Can TS_FPRWIDTH change between sub-archs or kernel versions ?  If so, it
> needs to be stated explicitly.
> 
> > +	unsigned int fpscr;
> > +	int fpexc_mode;
> > +	/* unsigned int align_ctl; this is never updated? */
> > +	unsigned long dabr;
> 
> Are these fields always guarantee to compile to the same number of bytes
> regardless of 32/64 bit choice of compiler (or sub-arch?) ?
> 
> In the x86(32/64) architecture we use types with explicit size such as
> __u32 and the like to ensure that it always compiled to the same
> size.

Yeah, I'll have to fix these up.



> > +static void cr_hdr_init(struct cr_hdr *hdr, __s16 type, __s16 len, __u32 parent)
> > +{
> > +	hdr->type = type;
> > +	hdr->len = len;
> > +	hdr->parent = parent;
> > +}
> > +
> 
> This function is rather generic and useful to non-arch-dependent and other
> architectures code. Perhaps put in a separate patch ?

Alright.  By the way, why are cr_hdr->type and cr_hdr->len signed
types?


> > +int cr_write_cpu(struct cr_ctx *ctx, struct task_struct *t)
> > +{
> > +	struct cr_hdr_cpu *cpu_hdr;
> > +	struct pt_regs *pt_regs;
> > +	struct cr_hdr cr_hdr;
> > +	u32 parent;
> > +	int ret;
> > +
> > +	cpu_hdr = cr_hbuf_get(ctx, sizeof(*cpu_hdr));
> > +	if (!cpu_hdr)
> > +		return -ENOMEM;
> > +
> > +	parent = task_pid_vnr(t);
> > +
> > +	cr_hdr_init(&cr_hdr, CR_HDR_CPU, sizeof(*cpu_hdr), parent);
> > +
> > +	/* pt_regs: GPRs, MSR, etc */
> > +	pt_regs = task_pt_regs(t);
> > +	cpu_hdr->pt_regs = *pt_regs;
> > +
> > +	/* FP state */
> > +	memcpy(cpu_hdr->fpr, t->thread.fpr, sizeof(cpu_hdr->fpr));
> 
> As note above, is sizeof(cpu_hdr->fpr) the same on all chips ?

It can differ depending on kernel configuration.


> > +/* restart APIs */
> > +
> 
> The restart APIs belong in a separate file: arch/powerpc/mm/restart.c

Explain why, please?  This isn't a lot of code, and it seems likely
that checkpoint and restart paths will share data structures and tend
to be modified together over time.


> > +		pr_debug("%s: unexpected thread_hdr contents: 0x%lx\n",
> > +			 __func__, (unsigned long)thread_hdr->unimplemented);
> 
> Given the macro for 'pr_fmt' in include/linux/checkpoint.h, the use of
> __func__ is redunant.

It seems to me that defining your own pr_fmt in a "public" header like
that is inappropriate, or at least unconventional.  Any file that
happens to include linux/checkpoint.h will have any prior definitions
of pr_fmt overridden, no?


> > +	regs = task_pt_regs(current);
> > +	*regs = cpu_hdr->pt_regs;
> > +
> > +	regs->msr = sanitize_msr(regs->msr);
> > +
> > +	/* FP state */
> > +	memcpy(current->thread.fpr, cpu_hdr->fpr, sizeof(current->thread.fpr));
> > +	current->thread.fpscr.val = cpu_hdr->fpscr;
> > +	current->thread.fpexc_mode = cpu_hdr->fpexc_mode;
> > +
> > +	/* debug registers */
> > +	current->thread.dabr = cpu_hdr->dabr;
> 
> I'm unfamiliar with powerpc; is it necessary to sanitize any of the registers
> here ?  For instance, can the user cause harm with specially crafted values
> of some registers ?

I had this in mind with the treatment of MSR, but I'll check on the
others, thanks.


> > +int cr_read_mm_context(struct cr_ctx *ctx, struct mm_struct *mm, int rparent)
> > +{
> > +	struct cr_hdr_mm_context *mm_hdr;
> > +	int ret;
> > +
> > +	mm_hdr = cr_hbuf_get(ctx, sizeof(*mm_hdr));
> > +	if (!mm_hdr)
> > +		return -ENOMEM;
> > +
> > +	ret = cr_read_obj_type(ctx, mm_hdr, sizeof(*mm_hdr),
> > +				  CR_HDR_MM_CONTEXT);
> > +	if (ret != rparent)
> > +		goto out;
> 
> Seems like 'ret' isn't set to an error value if the 'goto' executes.

It returns whatever error value cr_read_obj_type() returns.  Hrm.  I
guess if the image is garbage, cr_read_obj_type can potentially return
a non-error value that still isn't the desired value, is that right?

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation
  2009-01-29 21:40     ` Nathan Lynch
@ 2009-01-30  0:11       ` Oren Laadan
  2009-01-30 20:25         ` Nathan Lynch
  2009-02-17  7:03       ` Nathan Lynch
  1 sibling, 1 reply; 28+ messages in thread
From: Oren Laadan @ 2009-01-30  0:11 UTC (permalink / raw)
  To: Nathan Lynch; +Cc: containers, linuxppc-dev


Nathan Lynch wrote:
> Hey Oren, thanks for taking a look.
> 
> Oren Laadan wrote:
>> Nathan Lynch wrote:
>>> What doesn't work:
>>> * restarting a 32-bit task from a 64-bit task and vice versa
>> Is there a test to bail if we attempt to checkpoint such tasks ?
> 
> No, but I'll add one if it looks too hard to fix for the next round.
> 
> 
>>> +struct cr_hdr_cpu {
>>> +	struct pt_regs pt_regs;
>> It has been suggested (as done in x86/32 code) not to use 'struct pt_regs'
>> because it "can (and has) changed on x86" and because "it only container
>> the registers that the kernel trashes, not all usermode registers".
>>
>> https://lists.linux-foundation.org/pipermail/containers/2008-August/012355.html
> 
> Yeah, I considered that discussion, but the situation is different for
> powerpc (someone on linuxppc-dev smack me if I'm wrong here :)
> pt_regs is part of the ABI, and it encompasses all user mode registers
> except for floating point, which are handled separately.
> 
> 
>>> +	/* relevant fields from thread_struct */
>>> +	double fpr[32][TS_FPRWIDTH];
>> Can TS_FPRWIDTH change between sub-archs or kernel versions ?  If so, it
>> needs to be stated explicitly.
>>
>>> +	unsigned int fpscr;
>>> +	int fpexc_mode;
>>> +	/* unsigned int align_ctl; this is never updated? */
>>> +	unsigned long dabr;
>> Are these fields always guarantee to compile to the same number of bytes
>> regardless of 32/64 bit choice of compiler (or sub-arch?) ?
>>
>> In the x86(32/64) architecture we use types with explicit size such as
>> __u32 and the like to ensure that it always compiled to the same
>> size.
> 
> Yeah, I'll have to fix these up.
> 
> 
> 
>>> +static void cr_hdr_init(struct cr_hdr *hdr, __s16 type, __s16 len, __u32 parent)
>>> +{
>>> +	hdr->type = type;
>>> +	hdr->len = len;
>>> +	hdr->parent = parent;
>>> +}
>>> +
>> This function is rather generic and useful to non-arch-dependent and other
>> architectures code. Perhaps put in a separate patch ?
> 
> Alright.  By the way, why are cr_hdr->type and cr_hdr->len signed
> types?
> 

No particular reason. I can change that in v14.

> 
>>> +int cr_write_cpu(struct cr_ctx *ctx, struct task_struct *t)
>>> +{
>>> +	struct cr_hdr_cpu *cpu_hdr;
>>> +	struct pt_regs *pt_regs;
>>> +	struct cr_hdr cr_hdr;
>>> +	u32 parent;
>>> +	int ret;
>>> +
>>> +	cpu_hdr = cr_hbuf_get(ctx, sizeof(*cpu_hdr));
>>> +	if (!cpu_hdr)
>>> +		return -ENOMEM;
>>> +
>>> +	parent = task_pid_vnr(t);
>>> +
>>> +	cr_hdr_init(&cr_hdr, CR_HDR_CPU, sizeof(*cpu_hdr), parent);
>>> +
>>> +	/* pt_regs: GPRs, MSR, etc */
>>> +	pt_regs = task_pt_regs(t);
>>> +	cpu_hdr->pt_regs = *pt_regs;
>>> +
>>> +	/* FP state */
>>> +	memcpy(cpu_hdr->fpr, t->thread.fpr, sizeof(cpu_hdr->fpr));
>> As note above, is sizeof(cpu_hdr->fpr) the same on all chips ?
> 
> It can differ depending on kernel configuration.

So the actual size needs to be explicitly indicated (and compared with).

> 
> 
>>> +/* restart APIs */
>>> +
>> The restart APIs belong in a separate file: arch/powerpc/mm/restart.c
> 
> Explain why, please?  This isn't a lot of code, and it seems likely
> that checkpoint and restart paths will share data structures and tend
> to be modified together over time.

This one has little code, but usually that isn't the case, and many of
the data structures shared are anyway exported. Since the split makes
sense in other cases, it makes sense to follow convention.

Personally I don't have a strong opinion on this. However one of the
initial feedbacks for the existing patchset requested that I split the
functionality between files (and to separate commits).

In other words, if nobody else cries, I won't spoil it ;)

> 
> 
>>> +		pr_debug("%s: unexpected thread_hdr contents: 0x%lx\n",
>>> +			 __func__, (unsigned long)thread_hdr->unimplemented);
>> Given the macro for 'pr_fmt' in include/linux/checkpoint.h, the use of
>> __func__ is redunant.
> 
> It seems to me that defining your own pr_fmt in a "public" header like
> that is inappropriate, or at least unconventional.  Any file that
> happens to include linux/checkpoint.h will have any prior definitions
> of pr_fmt overridden, no?
> 

Hmmm.. didn't think of it this way. Using the pr_debug() there was yet
another feedback from LKML, and it seemed reasonable to me. Can you
think of a case where linux/checkpoint.h will happen to be included
in checkpoint-related code ?

> 
>>> +	regs = task_pt_regs(current);
>>> +	*regs = cpu_hdr->pt_regs;
>>> +
>>> +	regs->msr = sanitize_msr(regs->msr);
>>> +
>>> +	/* FP state */
>>> +	memcpy(current->thread.fpr, cpu_hdr->fpr, sizeof(current->thread.fpr));
>>> +	current->thread.fpscr.val = cpu_hdr->fpscr;
>>> +	current->thread.fpexc_mode = cpu_hdr->fpexc_mode;
>>> +
>>> +	/* debug registers */
>>> +	current->thread.dabr = cpu_hdr->dabr;
>> I'm unfamiliar with powerpc; is it necessary to sanitize any of the registers
>> here ?  For instance, can the user cause harm with specially crafted values
>> of some registers ?
> 
> I had this in mind with the treatment of MSR, but I'll check on the
> others, thanks.
> 
> 
>>> +int cr_read_mm_context(struct cr_ctx *ctx, struct mm_struct *mm, int rparent)
>>> +{
>>> +	struct cr_hdr_mm_context *mm_hdr;
>>> +	int ret;
>>> +
>>> +	mm_hdr = cr_hbuf_get(ctx, sizeof(*mm_hdr));
>>> +	if (!mm_hdr)
>>> +		return -ENOMEM;
>>> +
>>> +	ret = cr_read_obj_type(ctx, mm_hdr, sizeof(*mm_hdr),
>>> +				  CR_HDR_MM_CONTEXT);
>>> +	if (ret != rparent)
>>> +		goto out;
>> Seems like 'ret' isn't set to an error value if the 'goto' executes.
> 
> It returns whatever error value cr_read_obj_type() returns.  Hrm.  I
> guess if the image is garbage, cr_read_obj_type can potentially return
> a non-error value that still isn't the desired value, is that right?
> 

True.

Thanks,

Oren.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation
  2009-01-28 22:41 ` [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation Nathan Lynch
  2009-01-29  6:41   ` Oren Laadan
@ 2009-01-30  3:55   ` Serge E. Hallyn
  2009-02-04  3:39   ` Benjamin Herrenschmidt
  2 siblings, 0 replies; 28+ messages in thread
From: Serge E. Hallyn @ 2009-01-30  3:55 UTC (permalink / raw)
  To: Nathan Lynch; +Cc: containers, linuxppc-dev

Quoting Nathan Lynch (ntl@pobox.com):
> The only thing of significance here is that the checkpointed task's
> pt_regs and fp state are saved and restored (see cr_write_cpu and
> cr_read_cpu); the rest of the code consists of dummy implementations
> of the APIs the arch needs to provide to the checkpoint/restart core.
> 
> What works:
> * self and external checkpoint of simple (single thread, one open
>   file) 32- and 64-bit processes on a ppc64 kernel
> 
> What doesn't work:
> * restarting a 32-bit task from a 64-bit task and vice versa
> 
> Untested:
> * ppc32 (but it builds)
> 
> Signed-off-by: Nathan Lynch <ntl@pobox.com>
> ---
>  arch/powerpc/include/asm/checkpoint_hdr.h |   40 +++++
>  arch/powerpc/mm/Makefile                  |    1 +
>  arch/powerpc/mm/checkpoint.c              |  261 +++++++++++++++++++++++++++++
>  3 files changed, 302 insertions(+), 0 deletions(-)
>  create mode 100644 arch/powerpc/include/asm/checkpoint_hdr.h
>  create mode 100644 arch/powerpc/mm/checkpoint.c
> 
> diff --git a/arch/powerpc/include/asm/checkpoint_hdr.h b/arch/powerpc/include/asm/checkpoint_hdr.h
> new file mode 100644
> index 0000000..752c53f
> --- /dev/null
> +++ b/arch/powerpc/include/asm/checkpoint_hdr.h
> @@ -0,0 +1,40 @@
> +#ifndef __ASM_PPC_CKPT_HDR_H
> +#define __ASM_PPC_CKPT_HDR_H
> +/*
> + *  Checkpoint/restart - architecture specific headers ppc
> + *
> + *  Copyright (C) 2008 Oren Laadan
> + *
> + *  This file is subject to the terms and conditions of the GNU General Public
> + *  License.  See the file COPYING in the main directory of the Linux
> + *  distribution for more details.
> + */
> +
> +#include <linux/types.h>
> +#include <asm/ptrace.h>
> +#include <asm/mmu.h>
> +#include <asm/processor.h>
> +
> +struct cr_hdr_head_arch {
> +	__u32 unimplemented;
> +};
> +
> +struct cr_hdr_thread {
> +	__u32 unimplemented;
> +};
> +
> +struct cr_hdr_cpu {
> +	struct pt_regs pt_regs;
> +	/* relevant fields from thread_struct */
> +	double fpr[32][TS_FPRWIDTH];
> +	unsigned int fpscr;
> +	int fpexc_mode;
> +	/* unsigned int align_ctl; this is never updated? */
> +	unsigned long dabr;
> +};
> +
> +struct cr_hdr_mm_context {
> +	__u32 unimplemented;
> +};
> +
> +#endif /* __ASM_PPC_CKPT_HDR__H */
> diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
> index e7392b4..8a523a0 100644
> --- a/arch/powerpc/mm/Makefile
> +++ b/arch/powerpc/mm/Makefile
> @@ -24,3 +24,4 @@ obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
>  obj-$(CONFIG_PPC_MM_SLICES)	+= slice.o
>  obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
>  obj-$(CONFIG_PPC_SUBPAGE_PROT)	+= subpage-prot.o
> +obj-$(CONFIG_CHECKPOINT_RESTART) += checkpoint.o
> diff --git a/arch/powerpc/mm/checkpoint.c b/arch/powerpc/mm/checkpoint.c
> new file mode 100644
> index 0000000..8cdff24
> --- /dev/null
> +++ b/arch/powerpc/mm/checkpoint.c
> @@ -0,0 +1,261 @@
> +/*
> + *  Checkpoint/restart - architecture specific support for powerpc.
> + *  Based on x86 implementation.
> + *
> + *  Copyright (C) 2008 Oren Laadan
> + *  Copyright 2009 IBM Corp.
> + *
> + *  This file is subject to the terms and conditions of the GNU General Public
> + *  License.  See the file COPYING in the main directory of the Linux
> + *  distribution for more details.
> + */
> +
> +#define DEBUG 1 /* for pr_debug */
> +
> +#include <linux/checkpoint.h>
> +#include <linux/checkpoint_hdr.h>
> +#include <linux/kernel.h>
> +#include <asm/processor.h>
> +
> +static void cr_hdr_init(struct cr_hdr *hdr, __s16 type, __s16 len, __u32 parent)
> +{
> +	hdr->type = type;
> +	hdr->len = len;
> +	hdr->parent = parent;
> +}
> +
> +/* dump the thread_struct of a given task */
> +int cr_write_thread(struct cr_ctx *ctx, struct task_struct *t)
> +{
> +	struct cr_hdr_thread *thread_hdr;
> +	struct cr_hdr cr_hdr;
> +	u32 parent;
> +	int ret;
> +
> +	thread_hdr = cr_hbuf_get(ctx, sizeof(*thread_hdr));
> +	if (!thread_hdr)
> +		return -ENOMEM;
> +
> +	parent = task_pid_vnr(t);
> +
> +	cr_hdr_init(&cr_hdr, CR_HDR_THREAD, sizeof(*thread_hdr), parent);

All you're writing here is the vpid, right?  The arch-independent
code already stores that?

> +
> +	thread_hdr->unimplemented = 0xdeadbeef;
> +
> +	ret = cr_write_obj(ctx, &cr_hdr, thread_hdr);
> +	cr_hbuf_put(ctx, sizeof(*thread_hdr));
> +
> +	return ret;
> +}
> +
> +/* dump the cpu state and registers of a given task */
> +int cr_write_cpu(struct cr_ctx *ctx, struct task_struct *t)
> +{
> +	struct cr_hdr_cpu *cpu_hdr;
> +	struct pt_regs *pt_regs;
> +	struct cr_hdr cr_hdr;
> +	u32 parent;
> +	int ret;
> +
> +	cpu_hdr = cr_hbuf_get(ctx, sizeof(*cpu_hdr));
> +	if (!cpu_hdr)
> +		return -ENOMEM;
> +
> +	parent = task_pid_vnr(t);
> +
> +	cr_hdr_init(&cr_hdr, CR_HDR_CPU, sizeof(*cpu_hdr), parent);
> +
> +	/* pt_regs: GPRs, MSR, etc */
> +	pt_regs = task_pt_regs(t);
> +	cpu_hdr->pt_regs = *pt_regs;
> +
> +	/* FP state */
> +	memcpy(cpu_hdr->fpr, t->thread.fpr, sizeof(cpu_hdr->fpr));
> +	cpu_hdr->fpscr = t->thread.fpscr.val;
> +	cpu_hdr->fpexc_mode = t->thread.fpexc_mode;
> +
> +	/* Handle DABR for now, dbcr[01] later */
> +	cpu_hdr->dabr = t->thread.dabr;
> +
> +	/* ToDo: Altivec/VSX/SPE state */
> +
> +	ret = cr_write_obj(ctx, &cr_hdr, cpu_hdr);
> +	cr_hbuf_put(ctx, sizeof(*cpu_hdr));
> +
> +	return ret;
> +}
> +
> +int cr_write_head_arch(struct cr_ctx *ctx)
> +{
> +	struct cr_hdr_head_arch *arch_hdr;
> +	struct cr_hdr cr_hdr;
> +	int ret;
> +
> +	arch_hdr = cr_hbuf_get(ctx, sizeof(*arch_hdr));
> +	if (!arch_hdr)
> +		return -ENOMEM;
> +
> +	cr_hdr_init(&cr_hdr, CR_HDR_HEAD_ARCH, sizeof(*arch_hdr), 0);
> +
> +	arch_hdr->unimplemented = 0xdeadbeef;

No need to write this just for the sake of writing something, imo.

> +	ret = cr_write_obj(ctx, &cr_hdr, arch_hdr);
> +	cr_hbuf_put(ctx, sizeof(*arch_hdr));
> +
> +	return ret;
> +}
> +
> +/* dump the mm->context state */
> +int cr_write_mm_context(struct cr_ctx *ctx, struct mm_struct *mm, int parent)
> +{
> +	struct cr_hdr_mm_context *mm_hdr;
> +	struct cr_hdr cr_hdr;
> +	size_t size;
> +	int ret;
> +
> +	size = sizeof(*mm_hdr);
> +
> +	mm_hdr = cr_hbuf_get(ctx, size);
> +	if (!mm_hdr)
> +		return -ENOMEM;
> +
> +	cr_hdr_init(&cr_hdr, CR_HDR_MM_CONTEXT, size, parent);
> +
> +	mm_hdr->unimplemented = 0xdeadbeef;

Again

> +	ret = cr_write_obj(ctx, &cr_hdr, mm_hdr);
> +	cr_hbuf_put(ctx, size);
> +
> +	return ret;
> +}
> +
> +/* restart APIs */
> +
> +/* read the thread_struct into the current task */
> +int cr_read_thread(struct cr_ctx *ctx)
> +{
> +	struct cr_hdr_thread *thread_hdr;
> +	int ret;
> +
> +	thread_hdr = cr_hbuf_get(ctx, sizeof(*thread_hdr));
> +	if (!thread_hdr)
> +		return -ENOMEM;
> +
> +	ret = cr_read_obj_type(ctx, thread_hdr, sizeof(*thread_hdr),
> +				  CR_HDR_THREAD);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = 0;
> +
> +	if (thread_hdr->unimplemented != 0xdeadbeef) {
> +		pr_debug("%s: unexpected thread_hdr contents: 0x%lx\n",
> +			 __func__, (unsigned long)thread_hdr->unimplemented);
> +		ret = -EINVAL;
> +	}

DROP

> +out:
> +	cr_hbuf_put(ctx, sizeof(*thread_hdr));
> +	return ret;
> +}
> +
> +/* Based on the MSR value from a checkpoint image, produce an MSR
> + * value that is appropriate for the restored task.  Right now we only
> + * check for MSR_SF (64-bit) for PPC64.
> + */
> +static unsigned long sanitize_msr(unsigned long msr_ckpt)
> +{
> +#ifdef CONFIG_PPC32
> +	return MSR_USER;
> +#else
> +	if (msr_ckpt & MSR_SF)
> +		return MSR_USER64;
> +	return MSR_USER32;
> +#endif
> +}
> +
> +int cr_read_cpu(struct cr_ctx *ctx)
> +{
> +	struct cr_hdr_cpu *cpu_hdr;
> +	struct pt_regs *regs;
> +	int ret;
> +
> +	cpu_hdr = cr_hbuf_get(ctx, sizeof(*cpu_hdr));
> +	if (!cpu_hdr)
> +		return -ENOMEM;
> +
> +	ret = cr_read_obj_type(ctx, cpu_hdr, sizeof(*cpu_hdr),
> +				  CR_HDR_CPU);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = 0;
> +
> +	regs = task_pt_regs(current);
> +	*regs = cpu_hdr->pt_regs;
> +
> +	regs->msr = sanitize_msr(regs->msr);
> +
> +	/* FP state */
> +	memcpy(current->thread.fpr, cpu_hdr->fpr, sizeof(current->thread.fpr));
> +	current->thread.fpscr.val = cpu_hdr->fpscr;
> +	current->thread.fpexc_mode = cpu_hdr->fpexc_mode;
> +
> +	/* debug registers */
> +	current->thread.dabr = cpu_hdr->dabr;
> +out:
> +	cr_hbuf_put(ctx, sizeof(*cpu_hdr));
> +	return ret;
> +}
> +
> +int cr_read_head_arch(struct cr_ctx *ctx)
> +{
> +	struct cr_hdr_head_arch *arch_hdr;
> +	int ret;
> +
> +	arch_hdr = cr_hbuf_get(ctx, sizeof(*arch_hdr));
> +	if (!arch_hdr)
> +		return -ENOMEM;
> +
> +	ret = cr_read_obj_type(ctx, arch_hdr, sizeof(*arch_hdr),
> +				  CR_HDR_HEAD_ARCH);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = 0;
> +
> +	if (arch_hdr->unimplemented != 0xdeadbeef) {
> +		pr_debug("%s: unexpected arch_hdr contents: 0x%lx\n",
> +			 __func__, (unsigned long)arch_hdr->unimplemented);
> +		ret = -EINVAL;

DROP

> +	}
> +out:
> +	cr_hbuf_put(ctx, sizeof(*arch_hdr));
> +	return ret;
> +}
> +
> +int cr_read_mm_context(struct cr_ctx *ctx, struct mm_struct *mm, int rparent)
> +{
> +	struct cr_hdr_mm_context *mm_hdr;
> +	int ret;
> +
> +	mm_hdr = cr_hbuf_get(ctx, sizeof(*mm_hdr));
> +	if (!mm_hdr)
> +		return -ENOMEM;
> +
> +	ret = cr_read_obj_type(ctx, mm_hdr, sizeof(*mm_hdr),
> +				  CR_HDR_MM_CONTEXT);
> +	if (ret != rparent)
> +		goto out;
> +
> +	ret = 0;
> +
> +	if (mm_hdr->unimplemented != 0xdeadbeef) {
> +		pr_debug("%s: unexpected mm_hdr contents: 0x%lx\n",
> +			 __func__, (unsigned long)mm_hdr->unimplemented);
> +		ret = -EINVAL;

DROP

> +	}
> +
> +out:
> +	cr_hbuf_put(ctx, sizeof(*mm_hdr));
> +	return ret;
> +}
> -- 
> 1.5.5
> 
> _______________________________________________
> Containers mailing list
> Containers@lists.linux-foundation.org
> https://lists.linux-foundation.org/mailman/listinfo/containers

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation
  2009-01-29  6:41   ` Oren Laadan
  2009-01-29 21:40     ` Nathan Lynch
@ 2009-01-30  4:01     ` Serge E. Hallyn
  1 sibling, 0 replies; 28+ messages in thread
From: Serge E. Hallyn @ 2009-01-30  4:01 UTC (permalink / raw)
  To: Oren Laadan; +Cc: containers, linuxppc-dev, Nathan Lynch

Quoting Oren Laadan (orenl@cs.columbia.edu):
> > +static void cr_hdr_init(struct cr_hdr *hdr, __s16 type, __s16 len, __u32 parent)
> > +{
> > +	hdr->type = type;
> > +	hdr->len = len;
> > +	hdr->parent = parent;
> > +}
> > +
> 
> This function is rather generic and useful to non-arch-dependent and other
> architectures code. Perhaps put in a separate patch ?

BTW I disagree here - looking through the patch I found
the use of this fn to be very distracting.  To replace 3
simple in-line assignments, I have to verify the order of
4 weird-looking arguments, and actually first try to
remember what 'cr_hdr_init' is supposed to be and do?

That's not meant to be a complaint, just explanation :)

I prefer dropping its use altogether.  Since I believe
most of the functions calling it in this patch shouldn't
exist anyway (just writing 0xdeadbeef into the file), it
all the more should just go away.

-serge

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation
  2009-01-30  0:11       ` Oren Laadan
@ 2009-01-30 20:25         ` Nathan Lynch
  0 siblings, 0 replies; 28+ messages in thread
From: Nathan Lynch @ 2009-01-30 20:25 UTC (permalink / raw)
  To: Oren Laadan; +Cc: containers, linuxppc-dev

Oren Laadan wrote:
> 
> Nathan Lynch wrote:
> > 
> > Oren Laadan wrote:
> >> Nathan Lynch wrote:
> >>> +		pr_debug("%s: unexpected thread_hdr contents: 0x%lx\n",
> >>> +			 __func__, (unsigned long)thread_hdr->unimplemented);
> >> Given the macro for 'pr_fmt' in include/linux/checkpoint.h, the use of
> >> __func__ is redunant.
> > 
> > It seems to me that defining your own pr_fmt in a "public" header like
> > that is inappropriate, or at least unconventional.  Any file that
> > happens to include linux/checkpoint.h will have any prior definitions
> > of pr_fmt overridden, no?
> > 
> 
> Hmmm.. didn't think of it this way. Using the pr_debug() there was yet
> another feedback from LKML, and it seemed reasonable to me. Can you
> think of a case where linux/checkpoint.h will happen to be included
> in checkpoint-related code ?

(Assume you meant "included in checkpoint-unrelated code")

I could see checkpoint.h being included by files that don't
exclusively deal with C/R.  If you want a uniform debug statement
format for C/R-related code, that's fine, but this isn't the way to do
it.  See the existing users (almost all in drivers/s390).

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation
  2009-01-28 22:41 ` [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation Nathan Lynch
  2009-01-29  6:41   ` Oren Laadan
  2009-01-30  3:55   ` Serge E. Hallyn
@ 2009-02-04  3:39   ` Benjamin Herrenschmidt
  2009-02-04 15:54     ` Serge E. Hallyn
  2 siblings, 1 reply; 28+ messages in thread
From: Benjamin Herrenschmidt @ 2009-02-04  3:39 UTC (permalink / raw)
  To: Nathan Lynch; +Cc: containers, linuxppc-dev


> +struct cr_hdr_cpu {
> +	struct pt_regs pt_regs;
> +	/* relevant fields from thread_struct */
> +	double fpr[32][TS_FPRWIDTH];
> +	unsigned int fpscr;
> +	int fpexc_mode;
> +	/* unsigned int align_ctl; this is never updated? */
> +	unsigned long dabr;
> +};

Is there some version or other identification somewhere ? If not there
should be. ie, we're going to add things here. For example, what about
the vector registers ? Also, some CPUs will have more HW debug registers
than just the DABR (we plan to add support for all the BookE architected
IACs and DACs for example), etc...

Ben.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation
  2009-02-04  3:39   ` Benjamin Herrenschmidt
@ 2009-02-04 15:54     ` Serge E. Hallyn
  2009-02-04 20:58       ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 28+ messages in thread
From: Serge E. Hallyn @ 2009-02-04 15:54 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: containers, linuxppc-dev, Nathan Lynch

Quoting Benjamin Herrenschmidt (benh@kernel.crashing.org):
> 
> > +struct cr_hdr_cpu {
> > +	struct pt_regs pt_regs;
> > +	/* relevant fields from thread_struct */
> > +	double fpr[32][TS_FPRWIDTH];
> > +	unsigned int fpscr;
> > +	int fpexc_mode;
> > +	/* unsigned int align_ctl; this is never updated? */
> > +	unsigned long dabr;
> > +};
> 
> Is there some version or other identification somewhere ? If not there
> should be. ie, we're going to add things here. For example, what about
> the vector registers ? Also, some CPUs will have more HW debug registers
> than just the DABR (we plan to add support for all the BookE architected
> IACs and DACs for example), etc...

The arch-independent checkpoint header does have kernel
maj:min:rev:patch info.  We expect to have to do more,
assuming that the .config can change the arch-dependent
cpu header (i.e. perhaps TS_FPRWIDTH could be changed).

-serge

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation
  2009-02-04 15:54     ` Serge E. Hallyn
@ 2009-02-04 20:58       ` Benjamin Herrenschmidt
  2009-02-04 23:44         ` Oren Laadan
  0 siblings, 1 reply; 28+ messages in thread
From: Benjamin Herrenschmidt @ 2009-02-04 20:58 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: containers, linuxppc-dev, Nathan Lynch

On Wed, 2009-02-04 at 09:54 -0600, Serge E. Hallyn wrote:
> Quoting Benjamin Herrenschmidt (benh@kernel.crashing.org):
> > 
> > > +struct cr_hdr_cpu {
> > > +	struct pt_regs pt_regs;
> > > +	/* relevant fields from thread_struct */
> > > +	double fpr[32][TS_FPRWIDTH];
> > > +	unsigned int fpscr;
> > > +	int fpexc_mode;
> > > +	/* unsigned int align_ctl; this is never updated? */
> > > +	unsigned long dabr;
> > > +};
> > 
> > Is there some version or other identification somewhere ? If not there
> > should be. ie, we're going to add things here. For example, what about
> > the vector registers ? Also, some CPUs will have more HW debug registers
> > than just the DABR (we plan to add support for all the BookE architected
> > IACs and DACs for example), etc...
> 
> The arch-independent checkpoint header does have kernel
> maj:min:rev:patch info.  We expect to have to do more,
> assuming that the .config can change the arch-dependent
> cpu header (i.e. perhaps TS_FPRWIDTH could be changed).

It could to a certain extent... things like VSX, VSR, or freescale SPE,
or even the Cell SPU state etc....

I wonder if we want a tagged structure so we can easily add things...

Ben.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation
  2009-02-04 20:58       ` Benjamin Herrenschmidt
@ 2009-02-04 23:44         ` Oren Laadan
  2009-02-05  0:16           ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 28+ messages in thread
From: Oren Laadan @ 2009-02-04 23:44 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: containers, Serge E. Hallyn, Nathan Lynch, linuxppc-dev



Benjamin Herrenschmidt wrote:
> On Wed, 2009-02-04 at 09:54 -0600, Serge E. Hallyn wrote:
>> Quoting Benjamin Herrenschmidt (benh@kernel.crashing.org):
>>>> +struct cr_hdr_cpu {
>>>> +	struct pt_regs pt_regs;
>>>> +	/* relevant fields from thread_struct */
>>>> +	double fpr[32][TS_FPRWIDTH];
>>>> +	unsigned int fpscr;
>>>> +	int fpexc_mode;
>>>> +	/* unsigned int align_ctl; this is never updated? */
>>>> +	unsigned long dabr;
>>>> +};
>>> Is there some version or other identification somewhere ? If not there
>>> should be. ie, we're going to add things here. For example, what about
>>> the vector registers ? Also, some CPUs will have more HW debug registers
>>> than just the DABR (we plan to add support for all the BookE architected
>>> IACs and DACs for example), etc...
>> The arch-independent checkpoint header does have kernel
>> maj:min:rev:patch info.  We expect to have to do more,
>> assuming that the .config can change the arch-dependent
>> cpu header (i.e. perhaps TS_FPRWIDTH could be changed).
> 
> It could to a certain extent... things like VSX, VSR, or freescale SPE,
> or even the Cell SPU state etc....
> 
> I wonder if we want a tagged structure so we can easily add things...

>From the little bit I read hear, I suspect that the sub-arch classification
is best done in an arch-dependent header. I'd follow the following rule
of thumb:

* Anything that is decided at compiled time should probably go to the arch-
dependent header.

* Anything that can change at boot time (e.g., for x86 that would include
the capabilities of the FPU), or even run time (is there any ?) should
be described to the letter (in fine print) in 'struct cr_hdr_cpu' and
friends.

Oren.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation
  2009-02-04 23:44         ` Oren Laadan
@ 2009-02-05  0:16           ` Benjamin Herrenschmidt
  2009-02-05  3:30             ` Oren Laadan
  2009-02-05 16:09             ` Serge E. Hallyn
  0 siblings, 2 replies; 28+ messages in thread
From: Benjamin Herrenschmidt @ 2009-02-05  0:16 UTC (permalink / raw)
  To: Oren Laadan; +Cc: containers, Serge E. Hallyn, Nathan Lynch, linuxppc-dev

On Wed, 2009-02-04 at 18:44 -0500, Oren Laadan wrote:
> * Anything that is decided at compiled time should probably go to the arch-
> dependent header.
> 
> * Anything that can change at boot time (e.g., for x86 that would include
> the capabilities of the FPU), or even run time (is there any ?) should
> be described to the letter (in fine print) in 'struct cr_hdr_cpu' and
> friends.

I think we should avoid compile time completely.

We mostly try to have kernels running on everything anyway, and there's
no reason not to be able to move a snapshot to a different CPU if it's
not using a feature of the CPU that is different.

Nathan, what about you start the structure with a 64 bit bitmask that
indicates what "records" are present followed by concatenated records ?

IE. The "main" state (pt_regs) wouldn't change, but then, you could have
a list of things:

 - FPRs
 - old style VSX
 - VSRs
 - Freescale SPE state
 - DABR
 - BookE IAC/DACs
 - tbd...

Then, when resuming a snapshot, we can use some bit masks trickery
indicating the validity on a given target. IE. If CPU has no VSX and
original program uses VSX then you can't resume. But if CPU has VSR you
can.. etc... We can keep it trivial at fist, especting the same
features, and try to be smart later.

Ben.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation
  2009-02-05  0:16           ` Benjamin Herrenschmidt
@ 2009-02-05  3:30             ` Oren Laadan
  2009-02-05 16:09             ` Serge E. Hallyn
  1 sibling, 0 replies; 28+ messages in thread
From: Oren Laadan @ 2009-02-05  3:30 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: containers, Serge E. Hallyn, Nathan Lynch, linuxppc-dev



Benjamin Herrenschmidt wrote:
> On Wed, 2009-02-04 at 18:44 -0500, Oren Laadan wrote:
>> * Anything that is decided at compiled time should probably go to the arch-
>> dependent header.
>>
>> * Anything that can change at boot time (e.g., for x86 that would include
>> the capabilities of the FPU), or even run time (is there any ?) should
>> be described to the letter (in fine print) in 'struct cr_hdr_cpu' and
>> friends.
> 
> I think we should avoid compile time completely.

For instance, TASK_COMM_LEN is currently defined as 16; but in future
(or custom) kernel it may be different; so in the task header I put a
field that explicitly indicates this length, just in case. I think it's
useful to be able to detect such inconsistencies.

(of course this example is not arch-specific; and it would be wiser to
have one such entry for the entire checkpoint image instead of one for
each process)

I concur with the rest below.

Oren

> 
> We mostly try to have kernels running on everything anyway, and there's
> no reason not to be able to move a snapshot to a different CPU if it's
> not using a feature of the CPU that is different.
> 
> Nathan, what about you start the structure with a 64 bit bitmask that
> indicates what "records" are present followed by concatenated records ?
> 
> IE. The "main" state (pt_regs) wouldn't change, but then, you could have
> a list of things:
> 
>  - FPRs
>  - old style VSX
>  - VSRs
>  - Freescale SPE state
>  - DABR
>  - BookE IAC/DACs
>  - tbd...
> 
> Then, when resuming a snapshot, we can use some bit masks trickery
> indicating the validity on a given target. IE. If CPU has no VSX and
> original program uses VSX then you can't resume. But if CPU has VSR you
> can.. etc... We can keep it trivial at fist, especting the same
> features, and try to be smart later.
> 
> Ben.
> 
> 
> 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation
  2009-02-05  0:16           ` Benjamin Herrenschmidt
  2009-02-05  3:30             ` Oren Laadan
@ 2009-02-05 16:09             ` Serge E. Hallyn
  2009-02-05 21:01               ` Benjamin Herrenschmidt
  1 sibling, 1 reply; 28+ messages in thread
From: Serge E. Hallyn @ 2009-02-05 16:09 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: containers, Oren Laadan, Nathan Lynch, linuxppc-dev

Quoting Benjamin Herrenschmidt (benh@kernel.crashing.org):
> On Wed, 2009-02-04 at 18:44 -0500, Oren Laadan wrote:
> > * Anything that is decided at compiled time should probably go to the arch-
> > dependent header.
> > 
> > * Anything that can change at boot time (e.g., for x86 that would include
> > the capabilities of the FPU), or even run time (is there any ?) should
> > be described to the letter (in fine print) in 'struct cr_hdr_cpu' and
> > friends.
> 
> I think we should avoid compile time completely.
> 
> We mostly try to have kernels running on everything anyway, and there's
> no reason not to be able to move a snapshot to a different CPU if it's
> not using a feature of the CPU that is different.

Absolutely, but the accepted way to handle that so far is that if
you want to run an "incompatible" checkpoint image on a new cpu,
a userspace program will rewrite the image to be correct for the target
cpu.

But what you list below seems more usable than trying to encapsulate
that info in some hokey version number system.

> Nathan, what about you start the structure with a 64 bit bitmask that
> indicates what "records" are present followed by concatenated records ?
> 
> IE. The "main" state (pt_regs) wouldn't change, but then, you could have
> a list of things:
> 
>  - FPRs
>  - old style VSX
>  - VSRs
>  - Freescale SPE state
>  - DABR
>  - BookE IAC/DACs
>  - tbd...
> 
> Then, when resuming a snapshot, we can use some bit masks trickery
> indicating the validity on a given target. IE. If CPU has no VSX and
> original program uses VSX then you can't resume. But if CPU has VSR you
> can.. etc... We can keep it trivial at fist, especting the same
> features, and try to be smart later.
> 
> Ben.
> 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation
  2009-02-05 16:09             ` Serge E. Hallyn
@ 2009-02-05 21:01               ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 28+ messages in thread
From: Benjamin Herrenschmidt @ 2009-02-05 21:01 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: containers, Oren Laadan, Nathan Lynch, linuxppc-dev


> Absolutely, but the accepted way to handle that so far is that if
> you want to run an "incompatible" checkpoint image on a new cpu,
> a userspace program will rewrite the image to be correct for the target
> cpu.

That doesn't sound nice and definitely not something we want to do on
PowerPC. There are lots of reasons for that including the fact that the
actual feature set may depend on what the FW enabled which itself can
depend on the kernel version .. or not, etc etc...

I'd rather keep all that logic in the kernel to be honest.

> But what you list below seems more usable than trying to encapsulate
> that info in some hokey version number system.

There are things however that I can't expose and for which we can't do
much about.

IE. The kernel exposes some features to userspace, such as the PowerPC
ISA level supported, the presence of some optional instructions, etc...

We don't know whether the user space program is using that stuff
though... ie, we could get into situations where userspace is trying to
use, let's say, 44x MAC instructions, and thus that program will fail to
resume on some other processor, and we have no way to know about it from
the kernel.

But I'll leave that problem for later... Maybe we should implement some
way, using personalities or something similar, to run programs so that
they see a limited set of CPU features, for people who explicitely want
them to be migrateable.. a bit similar to what our Hypervisor does if
you want a partition to be migrateable between different processors, it
only advertises the common subset of functionality.

Ben.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation
  2009-01-29 21:40     ` Nathan Lynch
  2009-01-30  0:11       ` Oren Laadan
@ 2009-02-17  7:03       ` Nathan Lynch
  2009-02-17 20:02         ` [PATCH 1/3 v2] powerpc: heckpoint/restart implementation Nathan Lynch
                           ` (2 more replies)
  1 sibling, 3 replies; 28+ messages in thread
From: Nathan Lynch @ 2009-02-17  7:03 UTC (permalink / raw)
  To: Oren Laadan; +Cc: containers, linuxppc-dev

Nathan Lynch <ntl@pobox.com> wrote:
>
> Oren Laadan wrote:
> > 
> > Nathan Lynch wrote:
> > > 
> > > What doesn't work:
> > > * restarting a 32-bit task from a 64-bit task and vice versa
> > 
> > Is there a test to bail if we attempt to checkpoint such tasks ?
> 
> No, but I'll add one if it looks too hard to fix for the next round.

Unfortunately, adding a check for this is hard.

The "point of no return" in the restart path is cr_read_mm, which tears
down current's address space.  cr_read_mm runs way before cr_read_cpu,
which is the only restart method I've implemented for powerpc so far.
So, checking for this condition in cr_read_cpu is too late if I want
restart(2) to return an error and leave the caller's memory map
intact.  (And I do want this: restart should be as robust as execve.)

Well okay then, cr_read_head_arch seems to be the right place in the
restart sequence for the architecture code to handle this.  However,
cr_write_head_arch (which produces the buffer that cr_read_head_arch
consumes) is not provided a reference to the task to be checkpointed,
nor can it assume that it's operating on current.  I need a reference
to a task before I can determine whether it's running in 32- or 64-bit
mode, or using the FPU, Altivec, SPE, whatever.

In any case, mixing 32- and 64-bit tasks across restart is something I
eventually want to support, not reject.  But the problem I've outlined
applies to FPU state and vector extensions (VMX, SPE), as well as
sanity-checking debug register (DABR) contents.  We'll need to be able
to error out gracefully from restart when a checkpoint image specifies a
feature unsupported by the current kernel or hardware.  But I don't see
how to do it with the current architecture.  Am I missing something?

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 1/3 v2] powerpc: heckpoint/restart implementation
  2009-02-17  7:03       ` Nathan Lynch
@ 2009-02-17 20:02         ` Nathan Lynch
  2009-02-24 19:58         ` [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation Serge E. Hallyn
  2009-03-13  3:31         ` Oren Laadan
  2 siblings, 0 replies; 28+ messages in thread
From: Nathan Lynch @ 2009-02-17 20:02 UTC (permalink / raw)
  To: containers; +Cc: linuxppc-dev, Oren Laadan

On Tue, 17 Feb 2009 01:03:55 -0600
Nathan Lynch <ntl@pobox.com> wrote:

> Nathan Lynch <ntl@pobox.com> wrote:
> >
> > Oren Laadan wrote:
> > > 
> > > Nathan Lynch wrote:
> > > > 
> > > > What doesn't work:
> > > > * restarting a 32-bit task from a 64-bit task and vice versa
> > > 
> > > Is there a test to bail if we attempt to checkpoint such tasks ?
> > 
> > No, but I'll add one if it looks too hard to fix for the next round.
> 
> Unfortunately, adding a check for this is hard.
> 
> The "point of no return" in the restart path is cr_read_mm, which tears
> down current's address space.  cr_read_mm runs way before cr_read_cpu,
> which is the only restart method I've implemented for powerpc so far.
> So, checking for this condition in cr_read_cpu is too late if I want
> restart(2) to return an error and leave the caller's memory map
> intact.  (And I do want this: restart should be as robust as execve.)
> 
> Well okay then, cr_read_head_arch seems to be the right place in the
> restart sequence for the architecture code to handle this.  However,
> cr_write_head_arch (which produces the buffer that cr_read_head_arch
> consumes) is not provided a reference to the task to be checkpointed,
> nor can it assume that it's operating on current.  I need a reference
> to a task before I can determine whether it's running in 32- or 64-bit
> mode, or using the FPU, Altivec, SPE, whatever.
> 
> In any case, mixing 32- and 64-bit tasks across restart is something I
> eventually want to support, not reject.  But the problem I've outlined
> applies to FPU state and vector extensions (VMX, SPE), as well as
> sanity-checking debug register (DABR) contents.  We'll need to be able
> to error out gracefully from restart when a checkpoint image specifies a
> feature unsupported by the current kernel or hardware.  But I don't see
> how to do it with the current architecture.  Am I missing something?

Anyway, here's what I have coded up in response to all the feedback
(thanks!)  But all the error/compatibility checking I added doesn't
seem that useful unless the above is addressed...

Support for checkpointing and restarting GPRs, FPU state, DABR, and
Altivec state.

The portion of the checkpoint image manipulated by this code begins
with a bitmask of features indicating the various contexts saved.
Fields in image that can vary depending on kernel configuration
(e.g. FP regs due to VSX) have their sizes explicitly recorded, except
for GPRS, so migrating between ppc32 and ppc64 won't work yet.

The restart code ensures that the task is not modified until the
checkpoint image is validated against the current kernel configuration
and hardware features (e.g. can't restart a task using Altivec on
non-Altivec systems).

What works:
* self and external checkpoint of simple (single thread, one open
  file) 32- and 64-bit processes on a ppc64 kernel

What doesn't work:
* restarting a 32-bit task from a 64-bit task and vice versa

Untested:
* ppc32 (but it builds)

Signed-off-by: Nathan Lynch <ntl@pobox.com>
---

This depends on "powerpc: provide APIs for validating and updating
DABR" which I posted to linuxppc-dev on 17 Feb:

http://patchwork.ozlabs.org/patch/23311/

v2 changelog:
- use feature bitmask in checkpoint image as suggested by Ben
- fail restart if checkpoint image specifies unsupported features
- handle Altivec/VMX and SPE register state
- validate DABR value from checkpoint image
- fail restart on differing FP register set sizes (can happen
  depending on CONFIG_VSX)
- fail restart on 32-/64-bit mismatch between image and restarting
  task
- don't write meaningless data in unimplemented arch callbacks
- kill cr_hdr_init helper


 arch/powerpc/include/asm/checkpoint_hdr.h |   15 +
 arch/powerpc/mm/Makefile                  |    1 +
 arch/powerpc/mm/checkpoint.c              |  482 +++++++++++++++++++++++++++++
 3 files changed, 498 insertions(+), 0 deletions(-)
 create mode 100644 arch/powerpc/include/asm/checkpoint_hdr.h
 create mode 100644 arch/powerpc/mm/checkpoint.c

diff --git a/arch/powerpc/include/asm/checkpoint_hdr.h b/arch/powerpc/include/asm/checkpoint_hdr.h
new file mode 100644
index 0000000..9f0d099
--- /dev/null
+++ b/arch/powerpc/include/asm/checkpoint_hdr.h
@@ -0,0 +1,15 @@
+#ifndef __ASM_PPC_CKPT_HDR_H
+#define __ASM_PPC_CKPT_HDR_H
+/*
+ *  Checkpoint/restart - architecture specific headers ppc
+ *
+ *  Copyright (C) 2008 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* nothing to see here */
+
+#endif /* __ASM_PPC_CKPT_HDR__H */
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index e7392b4..8a523a0 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -24,3 +24,4 @@ obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
 obj-$(CONFIG_PPC_MM_SLICES)	+= slice.o
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
 obj-$(CONFIG_PPC_SUBPAGE_PROT)	+= subpage-prot.o
+obj-$(CONFIG_CHECKPOINT_RESTART) += checkpoint.o
diff --git a/arch/powerpc/mm/checkpoint.c b/arch/powerpc/mm/checkpoint.c
new file mode 100644
index 0000000..afc2138
--- /dev/null
+++ b/arch/powerpc/mm/checkpoint.c
@@ -0,0 +1,482 @@
+/*
+ *  Checkpoint/restart - architecture specific support for powerpc.
+ *  Based on x86 implementation.
+ *
+ *  Copyright (C) 2008 Oren Laadan
+ *  Copyright 2009 IBM Corp.
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#define DEBUG 1 /* for pr_debug */
+
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+#include <linux/kernel.h>
+#include <asm/processor.h>
+#include <asm/ptrace.h>
+#include <asm/system.h>
+
+enum cr_cpu_feature {
+	CKPT_USED_FP,
+	CKPT_USED_DEBUG,
+	CKPT_USED_ALTIVEC,
+	CKPT_USED_SPE,
+	CKPT_USED_VSX,
+	CKPT_FTR_END = 31,
+};
+
+#define x(ftr) (1UL << ftr)
+
+/* features this kernel can handle for restart */
+enum {
+	CKPT_FTRS_POSSIBLE =
+#ifdef CONFIG_PPC_FPU
+	x(CKPT_USED_FP) |
+#endif
+	x(CKPT_USED_DEBUG) |
+#ifdef CONFIG_ALTIVEC
+	x(CKPT_USED_ALTIVEC) |
+#endif
+#ifdef CONFIG_SPE
+	x(CKPT_USED_SPE) |
+#endif
+#ifdef CONFIG_VSX
+	x(CKPT_USED_VSX)
+#endif
+	0,
+};
+
+#undef x
+
+struct cr_hdr_cpu {
+	u32 features_used;
+	u32 pt_regs_size;
+	u32 fpr_size;
+	struct pt_regs pt_regs;
+	/* relevant fields from thread_struct */
+	double fpr[32][TS_FPRWIDTH];
+	u32 fpscr;
+	s32 fpexc_mode;
+	u64 dabr;
+	/* Altivec/VMX state */
+	vector128 vr[32];
+	vector128 vscr;
+	u64 vrsave;
+	/* SPE state */
+	u32 evr[32];
+	u64 acc;
+	u32 spefscr;
+};
+
+static void cr_cpu_feature_set(struct cr_hdr_cpu *hdr, enum cr_cpu_feature ftr)
+{
+	hdr->features_used |= 1ULL << ftr;
+}
+
+static bool cr_cpu_feature_isset(const struct cr_hdr_cpu *hdr, enum cr_cpu_feature ftr)
+{
+	return hdr->features_used & (1ULL << ftr);
+}
+
+/* determine whether an image has feature bits set that this kernel
+ * does not support */
+static bool cr_cpu_features_unknown(const struct cr_hdr_cpu *hdr)
+{
+	return hdr->features_used & ~CKPT_FTRS_POSSIBLE;
+}
+
+static void checkpoint_gprs(struct cr_hdr_cpu *cpu_hdr, struct task_struct *task)
+{
+	struct pt_regs *pt_regs;
+
+	pr_debug("%s: saving GPRs\n", __func__);
+
+	cpu_hdr->pt_regs_size = sizeof(*pt_regs);
+	pt_regs = task_pt_regs(task);
+	cpu_hdr->pt_regs = *pt_regs;
+}
+
+#ifdef CONFIG_PPC_FPU
+static void checkpoint_fpu(struct cr_hdr_cpu *cpu_hdr, struct task_struct *task)
+{
+	/* easiest to save FP state unconditionally */
+
+	pr_debug("%s: saving FPU state\n", __func__);
+
+	if (task == current)
+		flush_fp_to_thread(task);
+
+	cpu_hdr->fpr_size = sizeof(cpu_hdr->fpr);
+	cpu_hdr->fpscr = task->thread.fpscr.val;
+	cpu_hdr->fpexc_mode = task->thread.fpexc_mode;
+
+	memcpy(cpu_hdr->fpr, task->thread.fpr, sizeof(cpu_hdr->fpr));
+
+	cr_cpu_feature_set(cpu_hdr, CKPT_USED_FP);
+}
+#else
+static void checkpoint_fpu(struct cr_hdr_cpu *cpu_hdr, struct task_struct *task)
+{
+	return;
+}
+#endif
+
+#ifdef CONFIG_ALTIVEC
+static void checkpoint_altivec(struct cr_hdr_cpu *cpu_hdr, struct task_struct *task)
+{
+	if (!cpu_has_feature(CPU_FTR_ALTIVEC))
+		return;
+
+	if (!task->thread.used_vr)
+		return;
+
+	pr_debug("%s: saving Altivec state\n", __func__);
+
+	if (task == current)
+		flush_altivec_to_thread(task);
+
+	cpu_hdr->vrsave = task->thread.vrsave;
+	memcpy(cpu_hdr->vr, task->thread.vr, sizeof(cpu_hdr->vr));
+	cr_cpu_feature_set(cpu_hdr, CKPT_USED_ALTIVEC);
+}
+#else
+static void checkpoint_altivec(struct cr_hdr_cpu *cpu_hdr, struct task_struct *task)
+{
+	return;
+}
+#endif
+
+#ifdef CONFIG_SPE
+static void checkpoint_spe(struct cr_hdr_cpu *cpu_hdr, struct task_struct *task)
+{
+	if (!cpu_has_feature(CPU_FTR_SPE))
+		return;
+
+	if (!task->thread.used_spe)
+		return;
+
+	pr_debug("%s: saving SPE state\n", __func__);
+
+	if (task == current)
+		flush_spe_to_thread(task);
+
+	cpu_hdr->acc = task->thread.acc;
+	cpu_hdr->spefscr = task->thread.spefscr;
+	memcpy(cpu_hdr->evr, task->thread.evr, sizeof(cpu_hdr->evr));
+	cr_cpu_feature_set(cpu_hdr, CKPT_USED_SPE);
+}
+#else
+static void checkpoint_spe(struct cr_hdr_cpu *cpu_hdr, struct task_struct *task)
+{
+	return;
+}
+#endif
+
+static void checkpoint_dabr(struct cr_hdr_cpu *cpu_hdr, const struct task_struct *task)
+{
+	if (!task->thread.dabr)
+		return;
+
+	cpu_hdr->dabr = task->thread.dabr;
+	cr_cpu_feature_set(cpu_hdr, CKPT_USED_DEBUG);
+}
+
+/* dump the thread_struct of a given task */
+int cr_write_thread(struct cr_ctx *ctx, struct task_struct *t)
+{
+	return 0;
+}
+
+/* dump the cpu state and registers of a given task */
+int cr_write_cpu(struct cr_ctx *ctx, struct task_struct *t)
+{
+	struct cr_hdr_cpu *cpu_hdr;
+	struct cr_hdr cr_hdr;
+	int rc;
+
+	cr_hdr.type = CR_HDR_CPU;
+	cr_hdr.len = sizeof(*cpu_hdr);
+	cr_hdr.parent = task_pid_vnr(t);
+
+	rc = -ENOMEM;
+	cpu_hdr = kzalloc(sizeof(*cpu_hdr), GFP_KERNEL);
+	if (!cpu_hdr)
+		goto err;
+
+	checkpoint_gprs(cpu_hdr, t);
+	checkpoint_fpu(cpu_hdr, t);
+	checkpoint_dabr(cpu_hdr, t);
+	checkpoint_altivec(cpu_hdr, t);
+	checkpoint_spe(cpu_hdr, t);
+
+	rc = cr_write_obj(ctx, &cr_hdr, cpu_hdr);
+err:
+	kfree(cpu_hdr);
+	return rc;
+}
+
+int cr_write_head_arch(struct cr_ctx *ctx)
+{
+	return 0;
+}
+
+/* dump the mm->context state */
+int cr_write_mm_context(struct cr_ctx *ctx, struct mm_struct *mm, int parent)
+{
+	return 0;
+}
+
+/* restart APIs */
+
+/* read the thread_struct into the current task */
+int cr_read_thread(struct cr_ctx *ctx)
+{
+	return 0;
+}
+
+/* Based on the MSR value from a checkpoint image, produce an MSR
+ * value that is appropriate for the restored task.  Right now we only
+ * check for MSR_SF (64-bit) for PPC64.
+ */
+static unsigned long sanitize_msr(unsigned long msr_ckpt)
+{
+#ifdef CONFIG_PPC32
+	return MSR_USER;
+#else
+	if (msr_ckpt & MSR_SF)
+		return MSR_USER64;
+	return MSR_USER32;
+#endif
+}
+
+static int restore_gprs(const struct cr_hdr_cpu *cpu_hdr, struct task_struct *task, bool update)
+{
+	struct pt_regs *regs;
+	int rc;
+
+	rc = -EINVAL;
+	if (cpu_hdr->pt_regs_size != sizeof(*regs))
+		goto out;
+
+	rc = 0;
+	if (!update)
+		goto out;
+
+	regs = task_pt_regs(task);
+	*regs = cpu_hdr->pt_regs;
+
+	regs->msr = sanitize_msr(regs->msr);
+out:
+	return rc;
+}
+
+#ifdef CONFIG_PPC_FPU
+static int restore_fpu(const struct cr_hdr_cpu *cpu_hdr, struct task_struct *task, bool update)
+{
+	int rc;
+
+	rc = -EINVAL;
+	if (cpu_hdr->fpr_size != sizeof(task->thread.fpr))
+		goto out;
+
+	rc = 0;
+	if (!update || !cr_cpu_feature_isset(cpu_hdr, CKPT_USED_FP))
+		goto out;
+
+	task->thread.fpscr.val = cpu_hdr->fpscr;
+	task->thread.fpexc_mode = cpu_hdr->fpexc_mode;
+
+	memcpy(task->thread.fpr, cpu_hdr->fpr, sizeof(task->thread.fpr));
+out:
+	return rc;
+}
+#else
+static int restore_fpu(const struct cr_hdr_cpu *cpu_hdr, struct task_struct *task, bool update)
+{
+	WARN_ON_ONCE(cr_cpu_feature_isset(cpu_hdr, CKPT_USED_FP));
+	return 0;
+}
+#endif
+
+static int restore_dabr(const struct cr_hdr_cpu *cpu_hdr, struct task_struct *task, bool update)
+{
+	int rc;
+
+	rc = 0;
+	if (!cr_cpu_feature_isset(cpu_hdr, CKPT_USED_DEBUG))
+		goto out;
+
+	rc = -EINVAL;
+	if (!debugreg_valid(cpu_hdr->dabr))
+		goto out;
+
+	rc = 0;
+	if (!update)
+		goto out;
+
+	debugreg_update(task, cpu_hdr->dabr);
+out:
+	return rc;
+}
+
+#ifdef CONFIG_ALTIVEC
+static int restore_altivec(const struct cr_hdr_cpu *cpu_hdr, struct task_struct *task, bool update)
+{
+	int rc;
+
+	rc = 0;
+	if (!cr_cpu_feature_isset(cpu_hdr, CKPT_USED_ALTIVEC))
+		goto out;
+
+	rc = -EINVAL;
+	if (!cpu_has_feature(CPU_FTR_ALTIVEC))
+		goto out;
+
+	rc = 0;
+	if (!update)
+		goto out;
+
+	task->thread.vrsave = cpu_hdr->vrsave;
+	task->thread.used_vr = 1;
+
+	memcpy(task->thread.vr, cpu_hdr->vr, sizeof(cpu_hdr->vr));
+out:
+	return rc;
+}
+#else
+static int restore_altivec(const struct cr_hdr_cpu *cpu_hdr, struct task_struct *task, bool update)
+{
+	WARN_ON_ONCE(cr_cpu_feature_isset(CKPT_USED_ALTIVEC));
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_SPE
+static int restore_spe(const struct cr_hdr_cpu *cpu_hdr, struct task_struct *task, bool update)
+{
+	int rc;
+
+	rc = 0;
+	if (!cr_cpu_feature_isset(cpu_hdr, CKPT_USED_SPE))
+		goto out;
+
+	rc = -EINVAL;
+	if (!cpu_has_feature(CPU_FTR_SPE))
+		goto out;
+
+	rc = 0;
+	if (!update)
+		goto out;
+
+	task->thread.acc = cpu_hdr->acc;
+	task->thread.spefscr = cpu_hdr->spefscr;
+	task->thread.used_spe = 1;
+
+	memcpy(task->thread.evr, cpu_hdr->evr, sizeof(cpu_hdr->evr));
+out:
+	return rc;
+}
+#else
+static int restore_spe(const struct cr_hdr_cpu *cpu_hdr, struct task_struct *task, bool update)
+{
+	WARN_ON_ONCE(cr_cpu_feature_isset(cpu_hdr, CKPT_USED_SPE));
+	return 0;
+}
+#endif
+
+struct restore_func_desc {
+	int (*func)(const struct cr_hdr_cpu *, struct task_struct *, bool);
+	const char *info;
+};
+
+typedef int (*restore_func_t)(const struct cr_hdr_cpu *, struct task_struct *, bool);
+
+static const restore_func_t restore_funcs[] = {
+	restore_gprs,
+	restore_fpu,
+	restore_dabr,
+	restore_altivec,
+	restore_spe,
+};
+
+static bool bitness_match(const struct cr_hdr_cpu *cpu_hdr, const struct task_struct *task)
+{
+	/* 64-bit image */
+	if (cpu_hdr->pt_regs.msr & MSR_SF) {
+		if (task->thread.regs->msr & MSR_SF)
+			return true;
+		else
+			return false;
+	}
+
+	/* 32-bit image */
+	if (task->thread.regs->msr & MSR_SF)
+		return false;
+
+	return true;
+}
+
+int cr_read_cpu(struct cr_ctx *ctx)
+{
+	struct cr_hdr_cpu *cpu_hdr;
+	bool update;
+	int rc;
+	int i;
+
+	rc = -ENOMEM;
+	cpu_hdr = kzalloc(sizeof(*cpu_hdr), GFP_KERNEL);
+	if (!cpu_hdr)
+		goto err;
+
+	rc = cr_read_obj_type(ctx, cpu_hdr, sizeof(*cpu_hdr), CR_HDR_CPU);
+	if (rc < 0)
+		goto err;
+
+	rc = -EINVAL;
+	if (cr_cpu_features_unknown(cpu_hdr))
+		goto err;
+
+	/* temporary: restoring a 32-bit image from a 64-bit task and
+	 * vice-versa is known not to work (probably not restoring
+	 * thread_info correctly); detect this and fail gracefully.
+	 */
+	if (!bitness_match(cpu_hdr, current))
+		goto err;
+
+	/* We want to determine whether there's anything wrong with
+	 * the checkpoint image before changing the task at all.  Run
+	 * a "check" phase (update = false) first.
+	 */
+	update = false;
+commit:
+	for (i = 0; i < ARRAY_SIZE(restore_funcs); i++) {
+		rc = restore_funcs[i](cpu_hdr, current, update);
+		if (rc == 0)
+			continue;
+		pr_debug("%s: restore_func[%i] failed\n", __func__, i);
+		WARN_ON_ONCE(update);
+		goto err;
+	}
+
+	if (!update) {
+		update = true;
+		goto commit;
+	}
+
+err:
+	kfree(cpu_hdr);
+	return rc;
+}
+
+int cr_read_head_arch(struct cr_ctx *ctx)
+{
+	return 0;
+}
+
+int cr_read_mm_context(struct cr_ctx *ctx, struct mm_struct *mm, int rparent)
+{
+	return 0;
+}
-- 
1.6.0.6

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation
  2009-02-17  7:03       ` Nathan Lynch
  2009-02-17 20:02         ` [PATCH 1/3 v2] powerpc: heckpoint/restart implementation Nathan Lynch
@ 2009-02-24 19:58         ` Serge E. Hallyn
  2009-02-24 21:11           ` Nathan Lynch
  2009-03-13  3:31         ` Oren Laadan
  2 siblings, 1 reply; 28+ messages in thread
From: Serge E. Hallyn @ 2009-02-24 19:58 UTC (permalink / raw)
  To: Nathan Lynch; +Cc: containers, Oren Laadan, linuxppc-dev

Quoting Nathan Lynch (ntl@pobox.com):
> Nathan Lynch <ntl@pobox.com> wrote:
> >
> > Oren Laadan wrote:
> > > 
> > > Nathan Lynch wrote:
> > > > 
> > > > What doesn't work:
> > > > * restarting a 32-bit task from a 64-bit task and vice versa
> > > 
> > > Is there a test to bail if we attempt to checkpoint such tasks ?
> > 
> > No, but I'll add one if it looks too hard to fix for the next round.
> 
> Unfortunately, adding a check for this is hard.
> 
> The "point of no return" in the restart path is cr_read_mm, which tears
> down current's address space.  cr_read_mm runs way before cr_read_cpu,
> which is the only restart method I've implemented for powerpc so far.
> So, checking for this condition in cr_read_cpu is too late if I want
> restart(2) to return an error and leave the caller's memory map
> intact.  (And I do want this: restart should be as robust as execve.)
> 
> Well okay then, cr_read_head_arch seems to be the right place in the
> restart sequence for the architecture code to handle this.  However,
> cr_write_head_arch (which produces the buffer that cr_read_head_arch
> consumes) is not provided a reference to the task to be checkpointed,
> nor can it assume that it's operating on current.  I need a reference
> to a task before I can determine whether it's running in 32- or 64-bit
> mode, or using the FPU, Altivec, SPE, whatever.
> 
> In any case, mixing 32- and 64-bit tasks across restart is something I
> eventually want to support, not reject.  But the problem I've outlined
> applies to FPU state and vector extensions (VMX, SPE), as well as
> sanity-checking debug register (DABR) contents.  We'll need to be able
> to error out gracefully from restart when a checkpoint image specifies a
> feature unsupported by the current kernel or hardware.  But I don't see
> how to do it with the current architecture.  Am I missing something?

I suspect I can guess the response to this suggestion, but how about we
accept that if sys_restart() fails due to something like this, the
task is lost and can't exit gracefully?  The end-user can then run
a user-space program to run through the checkpoint image ahead of
time (if they want) and verify whether restart can be expected to
succeed on the current hardware and os version?

If a minor difference is detected, the user-space program might
simply rewrite (a copy of) the checkpoint image into one which can
in fact succeed.

-serge

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation
  2009-02-24 19:58         ` [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation Serge E. Hallyn
@ 2009-02-24 21:11           ` Nathan Lynch
  2009-03-13  3:36             ` Oren Laadan
  0 siblings, 1 reply; 28+ messages in thread
From: Nathan Lynch @ 2009-02-24 21:11 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: containers, Oren Laadan, linuxppc-dev

On Tue, 24 Feb 2009 13:58:26 -0600
"Serge E. Hallyn" <serue@us.ibm.com> wrote:

> Quoting Nathan Lynch (ntl@pobox.com):
> > Nathan Lynch <ntl@pobox.com> wrote:
> > >
> > > Oren Laadan wrote:
> > > > 
> > > > Nathan Lynch wrote:
> > > > > 
> > > > > What doesn't work:
> > > > > * restarting a 32-bit task from a 64-bit task and vice versa
> > > > 
> > > > Is there a test to bail if we attempt to checkpoint such tasks ?
> > > 
> > > No, but I'll add one if it looks too hard to fix for the next round.
> > 
> > Unfortunately, adding a check for this is hard.
> > 
> > The "point of no return" in the restart path is cr_read_mm, which tears
> > down current's address space.  cr_read_mm runs way before cr_read_cpu,
> > which is the only restart method I've implemented for powerpc so far.
> > So, checking for this condition in cr_read_cpu is too late if I want
> > restart(2) to return an error and leave the caller's memory map
> > intact.  (And I do want this: restart should be as robust as execve.)
> > 
> > Well okay then, cr_read_head_arch seems to be the right place in the
> > restart sequence for the architecture code to handle this.  However,
> > cr_write_head_arch (which produces the buffer that cr_read_head_arch
> > consumes) is not provided a reference to the task to be checkpointed,
> > nor can it assume that it's operating on current.  I need a reference
> > to a task before I can determine whether it's running in 32- or 64-bit
> > mode, or using the FPU, Altivec, SPE, whatever.
> > 
> > In any case, mixing 32- and 64-bit tasks across restart is something I
> > eventually want to support, not reject.  But the problem I've outlined
> > applies to FPU state and vector extensions (VMX, SPE), as well as
> > sanity-checking debug register (DABR) contents.  We'll need to be able
> > to error out gracefully from restart when a checkpoint image specifies a
> > feature unsupported by the current kernel or hardware.  But I don't see
> > how to do it with the current architecture.  Am I missing something?
> 
> I suspect I can guess the response to this suggestion, but how about we
> accept that if sys_restart() fails due to something like this, the
> task is lost and can't exit gracefully?

In the short term it might be necessary.  But the restart code should
forcibly kill the task instead of returning an error back up to
userspace in this case.  Once the memory map of the process has been
altered, there is no point in allowing it to continue (and likely dump
a useless core).  Btw, this failure mode seems to apply when
cr_read_files() fails, too...

But in the long term, things need to be more robust (e.g. restart(2)
returns ENOEXEC without messing with current->mm).  I think it's worth
looking at how execve operates... if I understand correctly, it sets up
a new mm_struct disconnected from the current task and activates it at
the last moment.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation
  2009-02-17  7:03       ` Nathan Lynch
  2009-02-17 20:02         ` [PATCH 1/3 v2] powerpc: heckpoint/restart implementation Nathan Lynch
  2009-02-24 19:58         ` [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation Serge E. Hallyn
@ 2009-03-13  3:31         ` Oren Laadan
  2009-03-13 15:42           ` Cedric Le Goater
  2009-03-16 18:37           ` Nathan Lynch
  2 siblings, 2 replies; 28+ messages in thread
From: Oren Laadan @ 2009-03-13  3:31 UTC (permalink / raw)
  To: Nathan Lynch; +Cc: containers, linuxppc-dev



Nathan Lynch wrote:
> Nathan Lynch <ntl@pobox.com> wrote:
>> Oren Laadan wrote:
>>> Nathan Lynch wrote:
>>>> What doesn't work:
>>>> * restarting a 32-bit task from a 64-bit task and vice versa
>>> Is there a test to bail if we attempt to checkpoint such tasks ?
>> No, but I'll add one if it looks too hard to fix for the next round.
> 
> Unfortunately, adding a check for this is hard.
> 
> The "point of no return" in the restart path is cr_read_mm, which tears
> down current's address space.  cr_read_mm runs way before cr_read_cpu,
> which is the only restart method I've implemented for powerpc so far.
> So, checking for this condition in cr_read_cpu is too late if I want
> restart(2) to return an error and leave the caller's memory map
> intact.  (And I do want this: restart should be as robust as execve.)

In the case of restarting a container, I think it's ok if a restarting
tasks dies in an "ugly" way -- this will be observed and handled by the
initiating task outside the container, which will gracefully report to
the caller/user.

Even if you close this hole, then any other failure later on during
restart - even a failure to allocate kernel memory due to memory pressure,
will give that undesired effect that you are trying to avoid.

That said, any difference in the architecture that may cause restart to
fail is probably best placed in cr_write_head_arch.

> 
> Well okay then, cr_read_head_arch seems to be the right place in the
> restart sequence for the architecture code to handle this.  However,
> cr_write_head_arch (which produces the buffer that cr_read_head_arch
> consumes) is not provided a reference to the task to be checkpointed,
> nor can it assume that it's operating on current.  I need a reference
> to a task before I can determine whether it's running in 32- or 64-bit
> mode, or using the FPU, Altivec, SPE, whatever.
> 
> In any case, mixing 32- and 64-bit tasks across restart is something I
> eventually want to support, not reject.  But the problem I've outlined
> applies to FPU state and vector extensions (VMX, SPE), as well as
> sanity-checking debug register (DABR) contents.  We'll need to be able
> to error out gracefully from restart when a checkpoint image specifies a
> feature unsupported by the current kernel or hardware.  But I don't see
> how to do it with the current architecture.  Am I missing something?
> 

More specifically, I envision restart to work like this:

1) user invokes user-land utility (e.g. "cr --restart ..."
2) 'cr' will create a new container
3) 'cr' will start a child in that container
4) child will create rest of tree (in kernel or in user space - tbd)
5) each task in that tree will restore itself
6) 'cr' monitors this process
7) if all goes well - 'cr' report ok.
8) if something goes bad, 'cr' notices and notifies caller/user

so tasks that are restarting may just as well die badly - we don't care.

Does that make sense ?

Oren.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation
  2009-02-24 21:11           ` Nathan Lynch
@ 2009-03-13  3:36             ` Oren Laadan
  0 siblings, 0 replies; 28+ messages in thread
From: Oren Laadan @ 2009-03-13  3:36 UTC (permalink / raw)
  To: Nathan Lynch; +Cc: containers, Serge E. Hallyn, linuxppc-dev



Nathan Lynch wrote:
> On Tue, 24 Feb 2009 13:58:26 -0600
> "Serge E. Hallyn" <serue@us.ibm.com> wrote:
> 
>> Quoting Nathan Lynch (ntl@pobox.com):
>>> Nathan Lynch <ntl@pobox.com> wrote:
>>>> Oren Laadan wrote:
>>>>> Nathan Lynch wrote:
>>>>>> What doesn't work:
>>>>>> * restarting a 32-bit task from a 64-bit task and vice versa
>>>>> Is there a test to bail if we attempt to checkpoint such tasks ?
>>>> No, but I'll add one if it looks too hard to fix for the next round.
>>> Unfortunately, adding a check for this is hard.
>>>
>>> The "point of no return" in the restart path is cr_read_mm, which tears
>>> down current's address space.  cr_read_mm runs way before cr_read_cpu,
>>> which is the only restart method I've implemented for powerpc so far.
>>> So, checking for this condition in cr_read_cpu is too late if I want
>>> restart(2) to return an error and leave the caller's memory map
>>> intact.  (And I do want this: restart should be as robust as execve.)
>>>
>>> Well okay then, cr_read_head_arch seems to be the right place in the
>>> restart sequence for the architecture code to handle this.  However,
>>> cr_write_head_arch (which produces the buffer that cr_read_head_arch
>>> consumes) is not provided a reference to the task to be checkpointed,
>>> nor can it assume that it's operating on current.  I need a reference
>>> to a task before I can determine whether it's running in 32- or 64-bit
>>> mode, or using the FPU, Altivec, SPE, whatever.
>>>
>>> In any case, mixing 32- and 64-bit tasks across restart is something I
>>> eventually want to support, not reject.  But the problem I've outlined
>>> applies to FPU state and vector extensions (VMX, SPE), as well as
>>> sanity-checking debug register (DABR) contents.  We'll need to be able
>>> to error out gracefully from restart when a checkpoint image specifies a
>>> feature unsupported by the current kernel or hardware.  But I don't see
>>> how to do it with the current architecture.  Am I missing something?
>> I suspect I can guess the response to this suggestion, but how about we
>> accept that if sys_restart() fails due to something like this, the
>> task is lost and can't exit gracefully?
> 
> In the short term it might be necessary.  But the restart code should
> forcibly kill the task instead of returning an error back up to
> userspace in this case.  Once the memory map of the process has been
> altered, there is no point in allowing it to continue (and likely dump
> a useless core).  Btw, this failure mode seems to apply when
> cr_read_files() fails, too...
> 
> But in the long term, things need to be more robust (e.g. restart(2)
> returns ENOEXEC without messing with current->mm).  I think it's worth
> looking at how execve operates... if I understand correctly, it sets up
> a new mm_struct disconnected from the current task and activates it at
> the last moment.
> 

That's a good idea, and I have considered it in the past.

However, it is easier to restarti a task in its own, new, context,
including the MM. For instance, you can leverage all memory syscalls.

An in-between way would be to switch to the new MM but not tear down
the original one, but rather save it along side. If a failure occur -
restore it.

Then, you'll have to ask the same question about all other resources -
signal handlers, open files, etc. Either you make all changes atomic
at once, or none - if you want the operation to be non-intrusive in
the case of an error.

However, I do think that this is not necessary: the tasks that are
doing the restart have been created from scratch for that purpose,
so they need not return any specific value to the user. It is the
task that initiates the restart that needs to handle error gracefully.
The scheme I proposed in the previous email does exactly that.

(This does not apply to self-restart, for obvious reasons, but that
is a special case anyway).

Oren.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation
  2009-03-13  3:31         ` Oren Laadan
@ 2009-03-13 15:42           ` Cedric Le Goater
  2009-03-16 18:37           ` Nathan Lynch
  1 sibling, 0 replies; 28+ messages in thread
From: Cedric Le Goater @ 2009-03-13 15:42 UTC (permalink / raw)
  To: Oren Laadan; +Cc: containers, linuxppc-dev, Nathan Lynch


> More specifically, I envision restart to work like this:
> 
> 1) user invokes user-land utility (e.g. "cr --restart ..."
> 2) 'cr' will create a new container
> 3) 'cr' will start a child in that container

process 1 in its private namespaces.

> 4) child will create rest of tree (in kernel or in user space - tbd)
> 5) each task in that tree will restore itself
> 6) 'cr' monitors this process
> 7) if all goes well - 'cr' report ok.
> 8) if something goes bad, 'cr' notices and notifies caller/user

that's MCR implementation of restart. 
 
> so tasks that are restarting may just as well die badly - we don't care.

just sigkill them, but at end, before releasing the container from 
a frozen state, we have to make sure the right number of tasks have 
restarted ... so you need to track them along the way.

C.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation
  2009-03-13  3:31         ` Oren Laadan
  2009-03-13 15:42           ` Cedric Le Goater
@ 2009-03-16 18:37           ` Nathan Lynch
  2009-03-17  6:55             ` Cedric Le Goater
  1 sibling, 1 reply; 28+ messages in thread
From: Nathan Lynch @ 2009-03-16 18:37 UTC (permalink / raw)
  To: Oren Laadan; +Cc: containers, linuxppc-dev

Oren Laadan <orenl@cs.columbia.edu> wrote:
> 
> Nathan Lynch wrote:
> > Nathan Lynch <ntl@pobox.com> wrote:
> >> Oren Laadan wrote:
> >>> Nathan Lynch wrote:
> >>>> What doesn't work:
> >>>> * restarting a 32-bit task from a 64-bit task and vice versa
> >>> Is there a test to bail if we attempt to checkpoint such tasks ?
> >> No, but I'll add one if it looks too hard to fix for the next round.
> > 
> > Unfortunately, adding a check for this is hard.
> > 
> > The "point of no return" in the restart path is cr_read_mm, which tears
> > down current's address space.  cr_read_mm runs way before cr_read_cpu,
> > which is the only restart method I've implemented for powerpc so far.
> > So, checking for this condition in cr_read_cpu is too late if I want
> > restart(2) to return an error and leave the caller's memory map
> > intact.  (And I do want this: restart should be as robust as execve.)
> 
> In the case of restarting a container, I think it's ok if a restarting
> tasks dies in an "ugly" way -- this will be observed and handled by the
> initiating task outside the container, which will gracefully report to
> the caller/user.

How would task exit be observed?  Are all tasks in a restarted
container guaranteed to be children (in the sense that wait(2) would
work) of the initiating task?


> Even if you close this hole, then any other failure later on during
> restart - even a failure to allocate kernel memory due to memory pressure,
> will give that undesired effect that you are trying to avoid.

Kernel memory allocation failure is not the kind of problem I'm trying
to address.  I am trying to address the case of restarting a checkpoint
image that needs features that are not present, where the set of
features used by the checkpoint image can be compared against the set
of features the platform provides.


> That said, any difference in the architecture that may cause restart to
> fail is probably best placed in cr_write_head_arch.

I think I explained in my earlier mail why the current implementation's
cr_write_head_arch doesn't help in this case:

> > Well okay then, cr_read_head_arch seems to be the right place in the
> > restart sequence for the architecture code to handle this.  However,
> > cr_write_head_arch (which produces the buffer that cr_read_head_arch
> > consumes) is not provided a reference to the task to be checkpointed,
> > nor can it assume that it's operating on current.  I need a reference
> > to a task before I can determine whether it's running in 32- or 64-bit
> > mode, or using the FPU, Altivec, SPE, whatever.
> > 
> > In any case, mixing 32- and 64-bit tasks across restart is something I
> > eventually want to support, not reject.  But the problem I've outlined
> > applies to FPU state and vector extensions (VMX, SPE), as well as
> > sanity-checking debug register (DABR) contents.  We'll need to be able
> > to error out gracefully from restart when a checkpoint image specifies a
> > feature unsupported by the current kernel or hardware.  But I don't see
> > how to do it with the current architecture.  Am I missing something?
> > 
> 
> More specifically, I envision restart to work like this:
> 
> 1) user invokes user-land utility (e.g. "cr --restart ..."
> 2) 'cr' will create a new container
> 3) 'cr' will start a child in that container
> 4) child will create rest of tree (in kernel or in user space - tbd)
> 5) each task in that tree will restore itself
> 6) 'cr' monitors this process
> 7) if all goes well - 'cr' report ok.
> 8) if something goes bad, 'cr' notices and notifies caller/user

Again, how would 'cr' obtain exit status for these tasks, and how would
it distinguish failure from normal operation?

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation
  2009-03-16 18:37           ` Nathan Lynch
@ 2009-03-17  6:55             ` Cedric Le Goater
  2009-03-18  9:15               ` Oren Laadan
  0 siblings, 1 reply; 28+ messages in thread
From: Cedric Le Goater @ 2009-03-17  6:55 UTC (permalink / raw)
  To: Nathan Lynch; +Cc: containers, Oren Laadan, linuxppc-dev

> Again, how would 'cr' obtain exit status for these tasks, and how would
> it distinguish failure from normal operation?

Here's our solution to this issue.

mcr maintains in its kernel container object an exitcode attribute for 
the mcr-restart process. This process is detached from the fork tree of 
the restarted application.  

when the restart is finished, an mcr-wait command can be called to reap 
this exitcode. This make it possible to distinguish an exit of the 
application process from an exit of the mcr-restart process.

This is a must-have for batch managers in an HPC environment. 

Cheers,

C.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation
  2009-03-17  6:55             ` Cedric Le Goater
@ 2009-03-18  9:15               ` Oren Laadan
  0 siblings, 0 replies; 28+ messages in thread
From: Oren Laadan @ 2009-03-18  9:15 UTC (permalink / raw)
  To: Cedric Le Goater; +Cc: containers, linuxppc-dev, Nathan Lynch, Serge E. Hallyn


An alternative: the task that created the container namely, is the parent
(outside the container) of the container init(1). In turn, init(1) creates
a special 'monitor' thread that monitors the restart, and the outside task
reaps the exit status of that thread (and only that thread).

[Hmmm... thinking about this - what happens if the container init(1) calls
clone() with CLONE_PARENT ??  does it not generate sort of a competing
container init(1) ??!!

Oren.


Cedric Le Goater wrote:
>> Again, how would 'cr' obtain exit status for these tasks, and how would
>> it distinguish failure from normal operation?
> 
> Here's our solution to this issue.
> 
> mcr maintains in its kernel container object an exitcode attribute for 
> the mcr-restart process. This process is detached from the fork tree of 
> the restarted application.  
> 
> when the restart is finished, an mcr-wait command can be called to reap 
> this exitcode. This make it possible to distinguish an exit of the 
> application process from an exit of the mcr-restart process.
> 
> This is a must-have for batch managers in an HPC environment. 
> 
> Cheers,
> 
> C.
> 

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2009-03-18  9:15 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-01-28 22:41 [RFC/PATCH 0/3] checkpoint/restart for powerpc Nathan Lynch
2009-01-28 22:41 ` [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation Nathan Lynch
2009-01-29  6:41   ` Oren Laadan
2009-01-29 21:40     ` Nathan Lynch
2009-01-30  0:11       ` Oren Laadan
2009-01-30 20:25         ` Nathan Lynch
2009-02-17  7:03       ` Nathan Lynch
2009-02-17 20:02         ` [PATCH 1/3 v2] powerpc: heckpoint/restart implementation Nathan Lynch
2009-02-24 19:58         ` [PATCH 1/3] powerpc: bare minimum checkpoint/restart implementation Serge E. Hallyn
2009-02-24 21:11           ` Nathan Lynch
2009-03-13  3:36             ` Oren Laadan
2009-03-13  3:31         ` Oren Laadan
2009-03-13 15:42           ` Cedric Le Goater
2009-03-16 18:37           ` Nathan Lynch
2009-03-17  6:55             ` Cedric Le Goater
2009-03-18  9:15               ` Oren Laadan
2009-01-30  4:01     ` Serge E. Hallyn
2009-01-30  3:55   ` Serge E. Hallyn
2009-02-04  3:39   ` Benjamin Herrenschmidt
2009-02-04 15:54     ` Serge E. Hallyn
2009-02-04 20:58       ` Benjamin Herrenschmidt
2009-02-04 23:44         ` Oren Laadan
2009-02-05  0:16           ` Benjamin Herrenschmidt
2009-02-05  3:30             ` Oren Laadan
2009-02-05 16:09             ` Serge E. Hallyn
2009-02-05 21:01               ` Benjamin Herrenschmidt
2009-01-28 22:41 ` [PATCH 2/3] powerpc: wire up checkpoint and restart syscalls Nathan Lynch
2009-01-28 22:41 ` [PATCH 3/3] allow checkpoint/restart on powerpc Nathan Lynch

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).