linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH][CFT] kexec (rewrite) for 2.5.52
@ 2002-12-22 11:07 Eric W. Biederman
  2002-12-31 14:35 ` Suparna Bhattacharya
  0 siblings, 1 reply; 15+ messages in thread
From: Eric W. Biederman @ 2002-12-22 11:07 UTC (permalink / raw)
  To: linux-kernel
  Cc: Linus Torvalds, Andy Pfiffer, Suparna Bhattacharya, Dave Hansen


I have recently taken the time to dig through the internals of
kexec to see if I could make my code any simpler and have managed
to trim off about 100 lines, and have made the code much more
obviously correct.

The key realization was that I had a much simpler test to detect
if a page was a destination page.  Allowing me to remove the second
pass, and put all of the logic to avoid stepping on my own
toes in the page allocator.

I have also made the small changes necessary to allow using high
memory pages.  Since I cannot push the request for memory below 4GB
into alloc_pages, I simply keep push the unusable pages on a list, and
keep requesting memory.  This should be o.k. on a large memory machine
but I can also see it as being pathological.  The advantage to using
high memory is that I should be able to use most of the memory below
4GB for a kernel image, and depending on how the zones are setup this
keeps me from artificially limiting myself.  I get the feeling what I
really want is my own special zone, maybe later...

With all of the strange logic in the kimage_alloc_page the code 
is much more obviously correct, and in most cases should run in O(N)
time, though it can still get pathological and run in O(N^2).

I have also made allocation of the reboot code buffer a little less
clever, removing a very small pathological case that was previously
present.

Anyway, I would love to know in what entertaining ways this code blows
up, or if I get lucky and it doesn't.  I probably will not reply back
in a timely manner as I am off to visit my parents, for Christmas and
New Years.  

Eric

 MAINTAINERS                        |    8 
 arch/i386/Kconfig                  |   17 +
 arch/i386/kernel/Makefile          |    1 
 arch/i386/kernel/entry.S           |    1 
 arch/i386/kernel/machine_kexec.c   |  140 +++++++++
 arch/i386/kernel/relocate_kernel.S |  107 +++++++
 include/asm-i386/kexec.h           |   23 +
 include/asm-i386/unistd.h          |    1 
 include/linux/kexec.h              |   54 +++
 include/linux/reboot.h             |    2 
 kernel/Makefile                    |    1 
 kernel/kexec.c                     |  547 +++++++++++++++++++++++++++++++++++++
 kernel/sys.c                       |   23 +
 13 files changed, 925 insertions

diff -uNr linux-2.5.52/MAINTAINERS linux-2.5.52.x86kexec-2/MAINTAINERS
--- linux-2.5.52/MAINTAINERS	Thu Dec 12 07:41:16 2002
+++ linux-2.5.52.x86kexec-2/MAINTAINERS	Mon Dec 16 02:24:32 2002
@@ -997,6 +997,14 @@
 W:	http://www.cse.unsw.edu.au/~neilb/patches/linux-devel/
 S:	Maintained
 
+KEXEC
+P:	Eric Biederman
+M:	ebiederm@xmission.com
+M:	ebiederman@lnxi.com
+W:	http://www.xmission.com/~ebiederm/files/kexec/
+L:	linux-kernel@vger.kernel.org
+S:	Maintained
+
 LANMEDIA WAN CARD DRIVER
 P:	Andrew Stanley-Jones
 M:	asj@lanmedia.com
diff -uNr linux-2.5.52/arch/i386/Kconfig linux-2.5.52.x86kexec-2/arch/i386/Kconfig
--- linux-2.5.52/arch/i386/Kconfig	Mon Dec 16 02:18:32 2002
+++ linux-2.5.52.x86kexec-2/arch/i386/Kconfig	Mon Dec 16 02:23:00 2002
@@ -686,6 +686,23 @@
 	depends on (SMP || PREEMPT) && X86_CMPXCHG
 	default y
 
+config KEXEC
+	bool "kexec system call (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	help
+	  kexec is a system call that implements the ability to  shutdown your
+	  current kernel, and to start another kernel.  It is like a reboot
+	  but it is indepedent of the system firmware.   And like a reboot
+	  you can start any kernel with it not just Linux.  
+	
+	  The name comes from the similiarity to the exec system call. 
+	
+	  It is on an going process to be certain the hardware in a machine
+	  is properly shutdown, so do not be surprised if this code does not
+	  initially work for you.  It may help to enable device hotplugging
+	  support.  As of this writing the exact hardware interface is
+	  strongly in flux, so no good recommendation can be made.
+
 endmenu
 
 
diff -uNr linux-2.5.52/arch/i386/kernel/Makefile linux-2.5.52.x86kexec-2/arch/i386/kernel/Makefile
--- linux-2.5.52/arch/i386/kernel/Makefile	Mon Dec 16 02:18:32 2002
+++ linux-2.5.52.x86kexec-2/arch/i386/kernel/Makefile	Mon Dec 16 02:23:00 2002
@@ -24,6 +24,7 @@
 obj-$(CONFIG_X86_MPPARSE)	+= mpparse.o
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o nmi.o
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o
+obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o
 obj-$(CONFIG_SOFTWARE_SUSPEND)	+= suspend.o suspend_asm.o
 obj-$(CONFIG_X86_NUMAQ)		+= numaq.o
 obj-$(CONFIG_PROFILING)		+= profile.o
diff -uNr linux-2.5.52/arch/i386/kernel/entry.S linux-2.5.52.x86kexec-2/arch/i386/kernel/entry.S
--- linux-2.5.52/arch/i386/kernel/entry.S	Thu Dec 12 07:41:17 2002
+++ linux-2.5.52.x86kexec-2/arch/i386/kernel/entry.S	Sat Dec 21 23:36:10 2002
@@ -743,6 +743,7 @@
 	.long sys_epoll_wait
  	.long sys_remap_file_pages
  	.long sys_set_tid_address
+	.long sys_kexec_load
 
 
 	.rept NR_syscalls-(.-sys_call_table)/4
diff -uNr linux-2.5.52/arch/i386/kernel/machine_kexec.c linux-2.5.52.x86kexec-2/arch/i386/kernel/machine_kexec.c
--- linux-2.5.52/arch/i386/kernel/machine_kexec.c	Wed Dec 31 17:00:00 1969
+++ linux-2.5.52.x86kexec-2/arch/i386/kernel/machine_kexec.c	Sat Dec 21 16:07:05 2002
@@ -0,0 +1,140 @@
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+#include <asm/apic.h>
+
+
+/*
+ * machine_kexec
+ * =======================
+ */
+
+
+static void set_idt(void *newidt, __u16 limit)
+{
+	unsigned char curidt[6];
+
+	/* ia32 supports unaliged loads & stores */
+	(*(__u16 *)(curidt)) = limit;
+	(*(__u32 *)(curidt +2)) = (unsigned long)(newidt);
+
+	__asm__ __volatile__ (
+		"lidt %0\n" 
+		: "=m" (curidt)
+		);
+};
+
+
+static void set_gdt(void *newgdt, __u16 limit)
+{
+	unsigned char curgdt[6];
+
+	/* ia32 supports unaliged loads & stores */
+	(*(__u16 *)(curgdt)) = limit;
+	(*(__u32 *)(curgdt +2)) = (unsigned long)(newgdt);
+
+	__asm__ __volatile__ (
+		"lgdt %0\n" 
+		: "=m" (curgdt)
+		);
+};
+
+static void load_segments(void)
+{
+#define __STR(X) #X
+#define STR(X) __STR(X)
+
+	__asm__ __volatile__ (
+		"\tljmp $"STR(__KERNEL_CS)",$1f\n"
+		"\t1:\n"
+		"\tmovl $"STR(__KERNEL_DS)",%eax\n"
+		"\tmovl %eax,%ds\n"
+		"\tmovl %eax,%es\n"
+		"\tmovl %eax,%fs\n"
+		"\tmovl %eax,%gs\n"
+		"\tmovl %eax,%ss\n"
+		);
+#undef STR
+#undef __STR
+}
+
+static void identity_map_page(unsigned long address)
+{
+	/* This code is x86 specific...
+	 * general purpose code must be more carful 
+	 * of caches and tlbs...
+	 */
+	pgd_t *pgd;
+	pmd_t *pmd;
+	struct mm_struct *mm = current->mm;
+	spin_lock(&mm->page_table_lock);
+	
+	pgd = pgd_offset(mm, address);
+	pmd = pmd_alloc(mm, pgd, address);
+
+	if (pmd) {
+		pte_t *pte = pte_alloc_map(mm, pmd, address);
+		if (pte) {
+			set_pte(pte, 
+				mk_pte(pfn_to_page(address >> PAGE_SHIFT), PAGE_SHARED));
+			__flush_tlb_one(address);
+		}
+	}
+	spin_unlock(&mm->page_table_lock);
+}
+
+
+typedef void (*relocate_new_kernel_t)(
+	unsigned long indirection_page, unsigned long reboot_code_buffer,
+	unsigned long start_address);
+
+const extern unsigned char relocate_new_kernel[];
+extern void relocate_new_kernel_end(void);
+const extern unsigned int relocate_new_kernel_size;
+
+void machine_kexec(struct kimage *image)
+{
+	unsigned long indirection_page;
+	unsigned long reboot_code_buffer;
+	void *ptr;
+	relocate_new_kernel_t rnk;
+
+	/* Interrupts aren't acceptable while we reboot */
+	local_irq_disable();
+	reboot_code_buffer = page_to_pfn(image->reboot_code_pages) << PAGE_SHIFT;
+	indirection_page = image->head & PAGE_MASK;
+
+	identity_map_page(reboot_code_buffer);
+
+	/* copy it out */
+	memcpy((void *)reboot_code_buffer, relocate_new_kernel, relocate_new_kernel_size);
+
+	/* The segment registers are funny things, they are
+	 * automatically loaded from a table, in memory wherever you
+	 * set them to a specific selector, but this table is never
+	 * accessed again you set the segment to a different selector.
+	 *
+	 * The more common model is are caches where the behide
+	 * the scenes work is done, but is also dropped at arbitrary
+	 * times.
+	 *
+	 * I take advantage of this here by force loading the
+	 * segments, before I zap the gdt with an invalid value.
+	 */
+	load_segments();
+	/* The gdt & idt are now invalid.
+	 * If you want to load them you must set up your own idt & gdt.
+	 */
+	set_gdt(phys_to_virt(0),0);
+	set_idt(phys_to_virt(0),0);
+
+	/* now call it */
+	rnk = (relocate_new_kernel_t) reboot_code_buffer;
+	(*rnk)(indirection_page, reboot_code_buffer, image->start);
+}
+
diff -uNr linux-2.5.52/arch/i386/kernel/relocate_kernel.S linux-2.5.52.x86kexec-2/arch/i386/kernel/relocate_kernel.S
--- linux-2.5.52/arch/i386/kernel/relocate_kernel.S	Wed Dec 31 17:00:00 1969
+++ linux-2.5.52.x86kexec-2/arch/i386/kernel/relocate_kernel.S	Mon Dec 16 02:23:00 2002
@@ -0,0 +1,107 @@
+#include <linux/config.h>
+#include <linux/linkage.h>
+
+	/* Must be relocatable PIC code callable as a C function, that once
+	 * it starts can not use the previous processes stack.
+	 *
+	 */
+	.globl relocate_new_kernel
+relocate_new_kernel:
+	/* read the arguments and say goodbye to the stack */
+	movl  4(%esp), %ebx /* indirection_page */
+	movl  8(%esp), %ebp /* reboot_code_buffer */
+	movl  12(%esp), %edx /* start address */
+
+	/* zero out flags, and disable interrupts */
+	pushl $0
+	popfl
+
+	/* set a new stack at the bottom of our page... */
+	lea   4096(%ebp), %esp
+
+	/* store the parameters back on the stack */
+	pushl   %edx /* store the start address */
+
+	/* Set cr0 to a known state:
+	 * 31 0 == Paging disabled
+	 * 18 0 == Alignment check disabled
+	 * 16 0 == Write protect disabled
+	 * 3  0 == No task switch
+	 * 2  0 == Don't do FP software emulation.
+	 * 0  1 == Proctected mode enabled
+	 */
+	movl	%cr0, %eax
+	andl	$~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax
+	orl	$(1<<0), %eax
+	movl	%eax, %cr0
+	
+	/* Set cr4 to a known state:
+	 * Setting everything to zero seems safe.
+	 */
+	movl	%cr4, %eax
+	andl	$0, %eax
+	movl	%eax, %cr4
+	
+	jmp 1f
+1:	
+
+	/* Flush the TLB (needed?) */
+	xorl	%eax, %eax
+	movl	%eax, %cr3
+
+	/* Do the copies */
+	cld
+0:	/* top, read another word for the indirection page */
+	movl    %ebx, %ecx
+	movl	(%ebx), %ecx
+	addl	$4, %ebx
+	testl	$0x1,   %ecx  /* is it a destination page */
+	jz	1f
+	movl	%ecx,	%edi
+	andl	$0xfffff000, %edi
+	jmp     0b
+1:
+	testl	$0x2,	%ecx  /* is it an indirection page */
+	jz	1f
+	movl	%ecx,	%ebx
+	andl	$0xfffff000, %ebx
+	jmp     0b
+1:
+	testl   $0x4,   %ecx /* is it the done indicator */
+	jz      1f
+	jmp     2f
+1:
+	testl   $0x8,   %ecx /* is it the source indicator */
+	jz      0b	     /* Ignore it otherwise */
+	movl    %ecx,   %esi /* For every source page do a copy */
+	andl    $0xfffff000, %esi
+
+	movl    $1024, %ecx
+	rep ; movsl
+	jmp     0b
+
+2:
+
+	/* To be certain of avoiding problems with self modifying code
+	 * I need to execute a serializing instruction here.
+	 * So I flush the TLB, it's handy, and not processor dependent.
+	 */
+	xorl	%eax, %eax
+	movl	%eax, %cr3
+	
+	/* set all of the registers to known values */
+	/* leave %esp alone */
+	
+	xorl	%eax, %eax
+	xorl	%ebx, %ebx
+	xorl    %ecx, %ecx
+	xorl    %edx, %edx
+	xorl    %esi, %esi
+	xorl    %edi, %edi
+	xorl    %ebp, %ebp
+	ret
+relocate_new_kernel_end:
+
+	.globl relocate_new_kernel_size
+relocate_new_kernel_size:	
+	.long relocate_new_kernel_end - relocate_new_kernel
diff -uNr linux-2.5.52/include/asm-i386/kexec.h linux-2.5.52.x86kexec-2/include/asm-i386/kexec.h
--- linux-2.5.52/include/asm-i386/kexec.h	Wed Dec 31 17:00:00 1969
+++ linux-2.5.52.x86kexec-2/include/asm-i386/kexec.h	Sat Dec 21 14:18:31 2002
@@ -0,0 +1,23 @@
+#ifndef _I386_KEXEC_H
+#define _I386_KEXEC_H
+
+#include <asm/fixmap.h>
+
+/*
+ * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
+ * I.e. Maximum page that is mapped directly into kernel memory,
+ * and kmap is not required.
+ *
+ * Someone correct me if FIXADDR_START - PAGEOFFSET is not the correct
+ * calculation for the amount of memory directly mappable into the
+ * kernel memory space.
+ */
+
+/* Maximum physical address we can use pages from */
+#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
+/* Maximum address we can reach in physical address mode */
+#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
+
+#define KEXEC_REBOOT_CODE_SIZE	4096
+
+#endif /* _I386_KEXEC_H */
diff -uNr linux-2.5.52/include/asm-i386/unistd.h linux-2.5.52.x86kexec-2/include/asm-i386/unistd.h
--- linux-2.5.52/include/asm-i386/unistd.h	Thu Dec 12 07:41:35 2002
+++ linux-2.5.52.x86kexec-2/include/asm-i386/unistd.h	Sat Dec 21 23:36:55 2002
@@ -264,6 +264,7 @@
 #define __NR_epoll_wait		256
 #define __NR_remap_file_pages	257
 #define __NR_set_tid_address	258
+#define __NR_sys_kexec_load	259
 
 
 /* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
diff -uNr linux-2.5.52/include/linux/kexec.h linux-2.5.52.x86kexec-2/include/linux/kexec.h
--- linux-2.5.52/include/linux/kexec.h	Wed Dec 31 17:00:00 1969
+++ linux-2.5.52.x86kexec-2/include/linux/kexec.h	Sat Dec 21 15:27:17 2002
@@ -0,0 +1,54 @@
+#ifndef LINUX_KEXEC_H
+#define LINUX_KEXEC_H
+
+#if CONFIG_KEXEC
+#include <linux/types.h>
+#include <linux/list.h>
+#include <asm/kexec.h>
+
+/* 
+ * This structure is used to hold the arguments that are used when loading
+ * kernel binaries.
+ */
+
+typedef unsigned long kimage_entry_t;
+#define IND_DESTINATION  0x1
+#define IND_INDIRECTION  0x2
+#define IND_DONE         0x4
+#define IND_SOURCE       0x8
+
+#define KEXEC_SEGMENT_MAX 8
+struct kexec_segment {
+	void *buf;
+	size_t bufsz;
+	void *mem;
+	size_t memsz;
+};
+
+struct kimage {
+	kimage_entry_t head;
+	kimage_entry_t *entry;
+	kimage_entry_t *last_entry;
+
+	unsigned long destination;
+	unsigned long offset;
+
+	unsigned long start;
+	struct page *reboot_code_pages;
+
+	unsigned long nr_segments;
+	struct kexec_segment segment[KEXEC_SEGMENT_MAX+1];
+
+	struct list_head dest_pages;
+	struct list_head unuseable_pages;
+};
+
+
+/* kexec interface functions */
+extern void machine_kexec(struct kimage *image);
+extern asmlinkage long sys_kexec(unsigned long entry, long nr_segments, 
+	struct kexec_segment *segments);
+extern struct kimage *kexec_image;
+#endif
+#endif /* LINUX_KEXEC_H */
+
diff -uNr linux-2.5.52/include/linux/reboot.h linux-2.5.52.x86kexec-2/include/linux/reboot.h
--- linux-2.5.52/include/linux/reboot.h	Thu Dec 12 07:41:37 2002
+++ linux-2.5.52.x86kexec-2/include/linux/reboot.h	Mon Dec 16 02:23:00 2002
@@ -21,6 +21,7 @@
  * POWER_OFF   Stop OS and remove all power from system, if possible.
  * RESTART2    Restart system using given command string.
  * SW_SUSPEND  Suspend system using Software Suspend if compiled in
+ * KEXEC       Restart the system using a different kernel.
  */
 
 #define	LINUX_REBOOT_CMD_RESTART	0x01234567
@@ -30,6 +31,7 @@
 #define	LINUX_REBOOT_CMD_POWER_OFF	0x4321FEDC
 #define	LINUX_REBOOT_CMD_RESTART2	0xA1B2C3D4
 #define	LINUX_REBOOT_CMD_SW_SUSPEND	0xD000FCE2
+#define LINUX_REBOOT_CMD_KEXEC		0x45584543
 
 
 #ifdef __KERNEL__
diff -uNr linux-2.5.52/kernel/Makefile linux-2.5.52.x86kexec-2/kernel/Makefile
--- linux-2.5.52/kernel/Makefile	Mon Dec 16 02:19:15 2002
+++ linux-2.5.52.x86kexec-2/kernel/Makefile	Mon Dec 16 02:23:00 2002
@@ -21,6 +21,7 @@
 obj-$(CONFIG_CPU_FREQ) += cpufreq.o
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o
+obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_COMPAT) += compat.o
 
 ifneq ($(CONFIG_IA64),y)
diff -uNr linux-2.5.52/kernel/kexec.c linux-2.5.52.x86kexec-2/kernel/kexec.c
--- linux-2.5.52/kernel/kexec.c	Wed Dec 31 17:00:00 1969
+++ linux-2.5.52.x86kexec-2/kernel/kexec.c	Sun Dec 22 02:58:12 2002
@@ -0,0 +1,547 @@
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/version.h>
+#include <linux/compile.h>
+#include <linux/kexec.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <net/checksum.h>
+#include <asm/page.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/system.h>
+
+/* When kexec transitions to the new kernel there is a one to one
+ * mapping between physical and virtual addresses.  On processors
+ * where you can disable the MMU this is trivial, and easy.  For
+ * others it is still a simple predictable page table to setup.
+ *
+ * In that environment kexec copies the new kernel to it's final
+ * resting place.  This means I can only support memory whose
+ * physical address can fit in an unsigned long.  In particular
+ * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
+ * If the assembly stub has more restrictive requirements
+ * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
+ * defined more restrictively in <asm/kexec.h>.
+ *
+ * The code for the transition from the current kernel to the 
+ * the new kernel is placed in the reboot_code_buffer, whose size
+ * is given by KEXEC_REBOOT_CODE_SIZE.  In the best case only a single
+ * page of memory is necessary, but some architectures require more.
+ * Because this memory must be identity mapped in the transition from
+ * virtual to physical addresses it must live in the range
+ * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
+ * modifyable.
+ *
+ * The assembly stub in the reboot code buffer is passed a linked list
+ * of descriptor pages detailing the source pages of the new kernel,
+ * and the destination addresses of those source pages.  As this data
+ * structure is not used in the context of the current OS, it must
+ * be self contained.
+ *
+ * The code has been made to work with highmem pages and will use a
+ * destination page in it's final resting place (if it happens 
+ * to allocate it).  The end product of this is that most of the
+ * physical address space, and most of ram can be used.
+ *
+ * Future directions include:
+ *  - allocating a page table with the reboot code buffer identity
+ *    mapped, to simplify machine_kexec and make kexec_on_panic, more
+ *    reliable.  
+ *  - allocating the pages for a page table for machines that cannot
+ *    disable their MMUs.  (Hammer, Alpha...)
+ */
+
+/* KIMAGE_NO_DEST is an impossible destination address..., for
+ * allocating pages whose destination address we do not care about.
+ */
+#define KIMAGE_NO_DEST (-1UL)
+
+static int kimage_is_destination_range(
+	struct kimage *image, unsigned long start, unsigned long end);
+static struct page *kimage_alloc_reboot_code_pages(struct kimage *image);
+static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest);
+
+static int kimage_alloc(struct kimage **rimage, 
+	unsigned long nr_segments, struct kexec_segment *segments)
+{
+	int result;
+	struct kimage *image;
+	size_t segment_bytes;
+	struct page *reboot_pages;
+	unsigned long i;
+
+	/* Allocate a controlling structure */
+	result = -ENOMEM;
+	image = kmalloc(sizeof(*image), GFP_KERNEL);
+	if (!image) {
+		goto out;
+	}
+	memset(image, 0, sizeof(*image));
+	image->head = 0;
+	image->entry = &image->head;
+	image->last_entry = &image->head;
+
+	/* Initialize the list of destination pages */
+	INIT_LIST_HEAD(&image->dest_pages);
+
+	/* Initialize the list of unuseable pages */
+	INIT_LIST_HEAD(&image->unuseable_pages);
+
+	/* Read in the segments */
+	image->nr_segments = nr_segments;
+	segment_bytes = nr_segments * sizeof*segments;
+	result = copy_from_user(image->segment, segments, segment_bytes);
+	if (result) 
+		goto out;
+
+	/* Verify we have good destination addresses.  The caller is
+	 * responsible for making certain we don't attempt to load
+	 * the new image into invalid or reserved areas of RAM.  This
+	 * just verifies it is an address we can use. 
+	 */
+	result = -EADDRNOTAVAIL;
+	for(i = 0; i < nr_segments; i++) {
+		unsigned long mend;
+		mend = ((unsigned long)(image->segment[i].mem)) + 
+			image->segment[i].memsz;
+		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
+			goto out;
+	}
+
+	/* Find a location for the reboot code buffer, and add it
+	 * the vector of segments so that it's pages will also be
+	 * counted as destination pages.  
+	 */
+	result = -ENOMEM;
+	reboot_pages = kimage_alloc_reboot_code_pages(image);
+	if (!reboot_pages) {
+		printk(KERN_ERR "Could not allocate reboot_code_buffer\n");
+		goto out;
+	}
+	image->reboot_code_pages = reboot_pages;
+	image->segment[nr_segments].buf = 0;
+	image->segment[nr_segments].bufsz = 0;
+	image->segment[nr_segments].mem = (void *)(page_to_pfn(reboot_pages) << PAGE_SHIFT);
+	image->segment[nr_segments].memsz = KEXEC_REBOOT_CODE_SIZE;
+	image->nr_segments++;
+
+	result = 0;
+ out:
+	if (result == 0) {
+		*rimage = image;
+	} else {
+		kfree(image);
+	}
+	return result;
+}
+
+static int kimage_is_destination_range(
+	struct kimage *image, unsigned long start, unsigned long end)
+{
+	unsigned long i;
+	for(i = 0; i < image->nr_segments; i++) {
+		unsigned long mstart, mend;
+		mstart = (unsigned long)image->segment[i].mem;
+		mend   = mstart + image->segment[i].memsz;
+		if ((end > mstart) && (start < mend)) {
+			return 1;
+		}
+	}
+	return 0;
+}
+
+struct page *kimage_alloc_reboot_code_pages(struct kimage *image)
+{
+	/* The reboot code buffer is special.  It is the only set of
+	 * pages that must be allocated in their final resting place,
+	 * and the only set of pages whose final resting place we can
+	 * pick. 
+	 *
+	 * At worst this runs in O(N) of the image size.
+	 */
+	struct list_head extra_pages, *pos, *next;
+	struct page *pages;
+	unsigned long addr;
+	int order;
+	order = get_order(KEXEC_REBOOT_CODE_SIZE);
+	INIT_LIST_HEAD(&extra_pages);
+	do {
+		pages = alloc_pages(GFP_HIGHUSER, order);
+		addr = page_to_pfn(pages) << PAGE_SHIFT;
+		if ((page_to_pfn(pages) >= (TASK_SIZE >> PAGE_SHIFT)) ||
+			kimage_is_destination_range(image, addr, addr + KEXEC_REBOOT_CODE_SIZE)) {
+			list_add(&pages->list, &extra_pages);
+			pages = 0;
+		}
+	} while(!pages);
+	/* If I could convert a multi page allocation into a buch of
+	 * single page allocations I could add these pages to
+	 * image->dest_pages.  For now it is simpler to just free the
+	 * pages again.
+	 */
+	list_for_each_safe(pos, next, &extra_pages) {
+		struct page *page;
+		page = list_entry(pos, struct page, list);
+		list_del(&extra_pages);
+		__free_pages(page, order);
+	}
+	return pages;
+}
+
+static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+{
+	if (image->offset != 0) {
+		image->entry++;
+	}
+	if (image->entry == image->last_entry) {
+		kimage_entry_t *ind_page;
+		struct page *page;
+		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
+		if (!page) {
+			return -ENOMEM;
+		}
+		ind_page = page_address(page);
+		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+		image->entry = ind_page;
+		image->last_entry = 
+			ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+	}
+	*image->entry = entry;
+	image->entry++;
+	image->offset = 0;
+	return 0;
+}
+
+static int kimage_set_destination(
+	struct kimage *image, unsigned long destination) 
+{
+	int result;
+	destination &= PAGE_MASK;
+	result = kimage_add_entry(image, destination | IND_DESTINATION);
+	if (result == 0) {
+		image->destination = destination;
+	}
+	return result;
+}
+
+
+static int kimage_add_page(struct kimage *image, unsigned long page)
+{
+	int result;
+	page &= PAGE_MASK;
+	result = kimage_add_entry(image, page | IND_SOURCE);
+	if (result == 0) {
+		image->destination += PAGE_SIZE;
+	}
+	return result;
+}
+
+
+static void kimage_free_extra_pages(struct kimage *image)
+{
+	/* Walk through and free any extra destination pages I may have */
+	struct list_head *pos, *next;
+	list_for_each_safe(pos, next, &image->dest_pages) {
+		struct page *page;
+		page = list_entry(pos, struct page, list);
+		list_del(&page->list);
+		__free_page(page);
+	}
+	/* Walk through and free any unuseable pages I have cached */
+	list_for_each_safe(pos, next, &image->unuseable_pages) {
+		struct page *page;
+		page = list_entry(pos, struct page, list);
+		list_del(&page->list);
+		__free_page(page);
+	}
+
+}
+static int kimage_terminate(struct kimage *image)
+{
+	int result;
+	result = kimage_add_entry(image, IND_DONE);
+	if (result == 0) {
+		/* Point at the terminating element */
+		image->entry--;
+		kimage_free_extra_pages(image);
+	}
+	return result;
+}
+
+#define for_each_kimage_entry(image, ptr, entry) \
+	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+		ptr = (entry & IND_INDIRECTION)? \
+			phys_to_virt((entry & PAGE_MASK)): ptr +1)
+
+static void kimage_free(struct kimage *image)
+{
+	kimage_entry_t *ptr, entry;
+	kimage_entry_t ind = 0;
+	if (!image)
+		return;
+	kimage_free_extra_pages(image);
+	for_each_kimage_entry(image, ptr, entry) {
+		if (entry & IND_INDIRECTION) {
+			/* Free the previous indirection page */
+			if (ind & IND_INDIRECTION) {
+				free_page((unsigned long)phys_to_virt(ind & PAGE_MASK));
+			}
+			/* Save this indirection page until we are
+			 * done with it.
+			 */
+			ind = entry;
+		}
+		else if (entry & IND_SOURCE) {
+			free_page((unsigned long)phys_to_virt(entry & PAGE_MASK));
+		}
+	}
+	__free_pages(image->reboot_code_pages, get_order(KEXEC_REBOOT_CODE_SIZE));
+	kfree(image);
+}
+
+static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page)
+{
+	kimage_entry_t *ptr, entry;
+	unsigned long destination = 0;
+	for_each_kimage_entry(image, ptr, entry) {
+		if (entry & IND_DESTINATION) {
+			destination = entry & PAGE_MASK;
+		}
+		else if (entry & IND_SOURCE) {
+			if (page == destination) {
+				return ptr;
+			}
+			destination += PAGE_SIZE;
+		}
+	}
+	return 0;
+}
+
+static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination)
+{
+	/* Here we implment safe guards to ensure that a source page
+	 * is not copied to it's destination page before the data on
+	 * the destination page is no longer useful.
+	 *
+	 * To do this we maintain the invariant that a source page is
+	 * either it's own destination page, or it is not a
+	 * destination page at all.  
+	 *
+	 * That is slightly stronger than required, but the proof
+	 * that no problems will not occur is trivial, and the
+	 * implemenation is simply to verify.
+	 *
+	 * When allocating all pages normally this algorithm will run
+	 * in O(N) time, but in the worst case it will run in O(N^2)
+	 * time.   If the runtime is a problem the data structures can
+	 * be fixed.
+	 */
+	struct page *page;
+	unsigned long addr;
+
+	/* Walk through the list of destination pages, and see if I
+	 * have a match.
+	 */
+	list_for_each_entry(page, &image->dest_pages, list) {
+		addr = page_to_pfn(page) << PAGE_SHIFT;
+		if (addr == destination) {
+			list_del(&page->list);
+			return page;
+		}
+	}
+	page = 0;
+	while(1) {
+		kimage_entry_t *old;
+		/* Allocate a page, if we run out of memory give up */
+		page = alloc_page(gfp_mask);
+		if (!page) {
+			return 0;
+		}
+
+		/* If the page cannot be used file it away */
+		if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+			list_add(&page->list, &image->unuseable_pages);
+			continue;
+		}
+		addr = page_to_pfn(page) << PAGE_SHIFT;
+
+		/* If it is the destination page we want use it */
+		if (addr == destination)
+			break;
+
+		/* If the page is not a destination page use it */
+		if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE))
+			break;
+
+		/* I know that the page is someones destination page.
+		 * See if there is already a source page for this
+		 * destination page.  And if so swap the source pages.
+		 */
+		old = kimage_dst_used(image, addr);
+		if (old) {
+			/* If so move it */
+			unsigned long old_addr;
+			struct page *old_page;
+			
+			old_addr = *old & PAGE_MASK;
+			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+			copy_highpage(page, old_page);
+			*old = addr | (*old & ~PAGE_MASK);
+
+			/* The old page I have found cannot be a
+			 * destination page, so return it.
+			 */
+			addr = old_addr;
+			page = old_page;
+			break;
+		}
+		else {
+			/* Place the page on the destination list I
+			 * will use it later.
+			 */
+			list_add(&page->list, &image->dest_pages);
+		}
+	}
+	return page;
+}
+
+static int kimage_load_segment(struct kimage *image,
+	struct kexec_segment *segment)
+{	
+	unsigned long mstart;
+	int result;
+	unsigned long offset;
+	unsigned long offset_end;
+	unsigned char *buf;
+
+	result = 0;
+	buf = segment->buf;
+	mstart = (unsigned long)segment->mem;
+
+	offset_end = segment->memsz;
+
+	result = kimage_set_destination(image, mstart);
+	if (result < 0) {
+		goto out;
+	}
+	for(offset = 0;  offset < segment->memsz; offset += PAGE_SIZE) {
+		struct page *page;
+		char *ptr;
+		size_t size, leader;
+		page = kimage_alloc_page(image, GFP_HIGHUSER, mstart + offset);
+		if (page == 0) {
+			result  = -ENOMEM;
+			goto out;
+		}
+		result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT);
+		if (result < 0) {
+			goto out;
+		}
+		ptr = kmap(page);
+		if (segment->bufsz < offset) {
+			/* We are past the end zero the whole page */
+			memset(ptr, 0, PAGE_SIZE);
+			kunmap(page);
+			continue;
+		}
+		size = PAGE_SIZE;
+		leader = 0;
+		if ((offset == 0)) {
+			leader = mstart & ~PAGE_MASK;
+		}
+		if (leader) {
+			/* We are on the first page zero the unused portion */
+			memset(ptr, 0, leader);
+			size -= leader;
+			ptr += leader;
+		}
+		if (size > (segment->bufsz - offset)) {
+			size = segment->bufsz - offset;
+		}
+		if (size < (PAGE_SIZE - leader)) {
+			/* zero the trailing part of the page */
+			memset(ptr + size, 0, (PAGE_SIZE - leader) - size);
+		}
+		result = copy_from_user(ptr, buf + offset, size);
+		kunmap(page);
+		if (result) {
+			result = (result < 0)?result : -EIO;
+			goto out;
+		}
+	}
+ out:
+	return result;
+}
+
+/*
+ * Exec Kernel system call: for obvious reasons only root may call it.
+ * 
+ * This call breaks up into three pieces.  
+ * - A generic part which loads the new kernel from the current
+ *   address space, and very carefully places the data in the
+ *   allocated pages.
+ *
+ * - A generic part that interacts with the kernel and tells all of
+ *   the devices to shut down.  Preventing on-going dmas, and placing
+ *   the devices in a consistent state so a later kernel can
+ *   reinitialize them.
+ *
+ * - A machine specific part that includes the syscall number
+ *   and the copies the image to it's final destination.  And
+ *   jumps into the image at entry.
+ *
+ * kexec does not sync, or unmount filesystems so if you need
+ * that to happen you need to do that yourself.
+ */
+struct kimage *kexec_image = 0;
+
+asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, 
+	struct kexec_segment *segments, unsigned long flags)
+{
+	struct kimage *image;
+	int result;
+		
+	/* We only trust the superuser with rebooting the system. */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* In case we need just a little bit of special behavior for
+	 * reboot on panic 
+	 */
+	if (flags != 0)
+		return -EINVAL;
+
+	if (nr_segments > KEXEC_SEGMENT_MAX)
+		return -EINVAL;
+	image = 0;
+
+	result = 0;
+	if (nr_segments > 0) {
+		unsigned long i;
+		result = kimage_alloc(&image, nr_segments, segments);
+		if (result) {
+			goto out;
+		}
+		image->start = entry;
+		for(i = 0; i < nr_segments; i++) {
+			result = kimage_load_segment(image, &segments[i]);
+			if (result) {
+				goto out;
+			}
+		}
+		result = kimage_terminate(image);
+		if (result) {
+			goto out;
+		}
+	}
+
+	image = xchg(&kexec_image, image);
+
+ out:
+	kimage_free(image);
+	return result;
+}
diff -uNr linux-2.5.52/kernel/sys.c linux-2.5.52.x86kexec-2/kernel/sys.c
--- linux-2.5.52/kernel/sys.c	Thu Dec 12 07:41:37 2002
+++ linux-2.5.52.x86kexec-2/kernel/sys.c	Mon Dec 16 02:23:00 2002
@@ -16,6 +16,7 @@
 #include <linux/init.h>
 #include <linux/highuid.h>
 #include <linux/fs.h>
+#include <linux/kexec.h>
 #include <linux/workqueue.h>
 #include <linux/device.h>
 #include <linux/times.h>
@@ -207,6 +208,7 @@
 cond_syscall(sys_lookup_dcookie)
 cond_syscall(sys_swapon)
 cond_syscall(sys_swapoff)
+cond_syscall(sys_kexec_load)
 cond_syscall(sys_init_module)
 cond_syscall(sys_delete_module)
 
@@ -419,6 +421,27 @@
 		machine_restart(buffer);
 		break;
 
+#ifdef CONFIG_KEXEC
+	case LINUX_REBOOT_CMD_KEXEC:
+	{
+		struct kimage *image;
+		if (arg) {
+			unlock_kernel();
+			return -EINVAL;
+		}
+		image = xchg(&kexec_image, 0);
+		if (!image) {
+			unlock_kernel();
+			return -EINVAL;
+		}
+		notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
+		system_running = 0;
+		device_shutdown();
+		printk(KERN_EMERG "Starting new kernel\n");
+		machine_kexec(image);
+		break;
+	}
+#endif
 #ifdef CONFIG_SOFTWARE_SUSPEND
 	case LINUX_REBOOT_CMD_SW_SUSPEND:
 		if (!software_suspend_enabled) {

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH][CFT] kexec (rewrite) for 2.5.52
  2002-12-22 11:07 [PATCH][CFT] kexec (rewrite) for 2.5.52 Eric W. Biederman
@ 2002-12-31 14:35 ` Suparna Bhattacharya
  2003-01-03 10:37   ` Eric W. Biederman
                     ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Suparna Bhattacharya @ 2002-12-31 14:35 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: linux-kernel, Linus Torvalds, Andy Pfiffer, Dave Hansen, wa

On Sun, Dec 22, 2002 at 04:07:52AM -0700, Eric W. Biederman wrote:
> 
> I have recently taken the time to dig through the internals of
> kexec to see if I could make my code any simpler and have managed
> to trim off about 100 lines, and have made the code much more
> obviously correct.
> 
> Anyway, I would love to know in what entertaining ways this code blows
> up, or if I get lucky and it doesn't.  I probably will not reply back
> in a timely manner as I am off to visit my parents, for Christmas and
> New Years.  
> 

The good news is that it worked for me. Not only that, I have just 
managed to get lkcd to save a dump in memory and then write it out 
to disk after a kexec soft boot ! I haven't tried real panic cases yet 
(which probably won't work rightaway :) ) and have testing and 
tuning to do. But kexec seems to be looking good.

Have a wonderful new year.

Regards
Suparna

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH][CFT] kexec (rewrite) for 2.5.52
  2002-12-31 14:35 ` Suparna Bhattacharya
@ 2003-01-03 10:37   ` Eric W. Biederman
  2003-01-03 12:41     ` Suparna Bhattacharya
  2003-01-04  0:32   ` 2.5.54: Re: [PATCH][CFT] kexec (rewrite) for 2.5.52 Andy Pfiffer
       [not found]   ` <m11y2w557p.fsf@frodo.biederman.org>
  2 siblings, 1 reply; 15+ messages in thread
From: Eric W. Biederman @ 2003-01-03 10:37 UTC (permalink / raw)
  To: suparna; +Cc: linux-kernel, Linus Torvalds, Andy Pfiffer, Dave Hansen, wa

Suparna Bhattacharya <suparna@in.ibm.com> writes:

> The good news is that it worked for me. Not only that, I have just 
> managed to get lkcd to save a dump in memory and then write it out 
> to disk after a kexec soft boot ! I haven't tried real panic cases yet 
> (which probably won't work rightaway :) ) and have testing and 
> tuning to do. But kexec seems to be looking good.

Nice.  Any pointers besides lkcd.sourceforge.net

For the kexec on panic case there is a little code motion yet to be
done so that no memory allocations need to happen.  The big one is
setting up a page table with the reboot code buffer identity mapped.

I am tempted to do the identity mapping of the reboot code buffer in
init_mm, but for starters I will look at how complex it will be to
have a spare mm just sitting around for that purpose.  When I get
to dealing with the architectures like the hammer, and the alpha where
you always need page tables I will need to develop an architecture
specific hook for building the page tables needed by the
code residing in the reboot code buffer, (because virtual memory
cannot be disabled), but that should be straight forward.

My goal is to have no locks on the kexec part of the panic path.  And
the current memory allocations are the only really bad part of that.

A dump question.  Why doesn't the lkcd stuff use the normal ELF core
dump format?  allowing ``gdb vmlinux core'' to work?

Eric

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH][CFT] kexec (rewrite) for 2.5.52
  2003-01-03 10:37   ` Eric W. Biederman
@ 2003-01-03 12:41     ` Suparna Bhattacharya
  2003-01-04 20:34       ` Eric W. Biederman
                         ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Suparna Bhattacharya @ 2003-01-03 12:41 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: linux-kernel, Linus Torvalds, Andy Pfiffer, Dave Hansen, wa, lkcd-devel

On Fri, Jan 03, 2003 at 03:37:06AM -0700, Eric W. Biederman wrote:
> Suparna Bhattacharya <suparna@in.ibm.com> writes:
> 
> > The good news is that it worked for me. Not only that, I have just 
> > managed to get lkcd to save a dump in memory and then write it out 
> > to disk after a kexec soft boot ! I haven't tried real panic cases yet 
> > (which probably won't work rightaway :) ) and have testing and 
> > tuning to do. But kexec seems to be looking good.
> 
> Nice.  Any pointers besides lkcd.sourceforge.net

I haven't posted this code to lkcd as yet - so far I'd only
checked in the preparatory code reshuffle into lkcd cvs. There are
still some things to improve and think about, but am planning
to upgrade to the latest tree early next week and put things
out, and then work on it incrementally.

> 
> For the kexec on panic case there is a little code motion yet to be
> done so that no memory allocations need to happen.  The big one is
> setting up a page table with the reboot code buffer identity mapped.

I missed noticing that.
Bootimg avoided the allocation at this stage. It did something like 
this:

+static unsigned long get_identity_mapped_page(void)
+{
+       set_pgd(pgd_offset(current->active_mm, 
+ 	virt_to_phys(unity_page)), __pgd((_KERNPG_TABLE 
+ 	_PAGE_PSE + (virt_to_phys(unity_page)&PGDIR_MASK))));
+       return (unsigned long)unity_page;
+}

where unity page is within directly mapped memory (not highmem).

> 
> I am tempted to do the identity mapping of the reboot code buffer in
> init_mm, but for starters I will look at how complex it will be to
> have a spare mm just sitting around for that purpose.  When I get
> to dealing with the architectures like the hammer, and the alpha where
> you always need page tables I will need to develop an architecture
> specific hook for building the page tables needed by the
> code residing in the reboot code buffer, (because virtual memory
> cannot be disabled), but that should be straight forward.

A spare mm may be something which I could use for the crash dump
pages mapping possibly simpler than the way it is maintained
right now ... but haven't given enough thought to it yet.

> 
> My goal is to have no locks on the kexec part of the panic path.  And
> the current memory allocations are the only really bad part of that.

OK.

> 
> A dump question.  Why doesn't the lkcd stuff use the normal ELF core
> dump format?  allowing ``gdb vmlinux core'' to work?

I guess its ultimately a choice of format,  how much processing 
to do at dump time, vs afterwards prior to analysis, and whether
it captures all aspects relevant for the kind of analysis
intended. The lkcd dump format appears to designed in a way that 
makes it suitable for crash dumping kind of situations. It takes 
an approach of simplifying work at dump time (desirable). It 
enables pages to be dumped right away in any order with a header 
preceding the page dumped, which makes it easier to support extraction 
of information from truncated dumps. This also makes it easier to do 
selective dumping and placement of more critical data earlier 
in the dump.

Secondly, it retains the notion of pages being dumped by physical 
address, with interpretation/conversions from virt-to-phys on 
analysis being taken care of the analyser or convertor. For example
there has been work on a post processor that generates a core file 
from the lkcd dump corresponding to a given task/address space 
context for analysis via gdb. Similarly there is a capibility
in lcrash that lets one generate a (smaller) selected subset of 
dumped state from an existing dump, which can be mailed out from 
a remote site for analysis.

The tradeoff is that there is a bit of pre-processing that happens
prior to analysis for generation of an index, or conversion
depending on what analysis tool gets used. But that time is less
crucial than actual dump time.

Am cc'ing lkcd-devel on this one - there are experts who can
add to this or answer this question better than I can. 

Regards
Suparna

-- 
Suparna Bhattacharya (suparna@in.ibm.com)
Linux Technology Center
IBM Software Labs, India


^ permalink raw reply	[flat|nested] 15+ messages in thread

* 2.5.54: Re: [PATCH][CFT] kexec (rewrite) for 2.5.52
  2002-12-31 14:35 ` Suparna Bhattacharya
  2003-01-03 10:37   ` Eric W. Biederman
@ 2003-01-04  0:32   ` Andy Pfiffer
  2003-01-04 18:56     ` Eric W. Biederman
       [not found]   ` <m11y2w557p.fsf@frodo.biederman.org>
  2 siblings, 1 reply; 15+ messages in thread
From: Andy Pfiffer @ 2003-01-04  0:32 UTC (permalink / raw)
  To: Eric W. Biederman, suparna
  Cc: linux-kernel, Linus Torvalds, Dave Hansen, Werner Almesberger

On Tue, 2002-12-31 at 06:35, Suparna Bhattacharya wrote:
> On Sun, Dec 22, 2002 at 04:07:52AM -0700, Eric W. Biederman wrote:
> > 
> > I have recently taken the time to dig through the internals of
> > kexec to see if I could make my code any simpler and have managed
> > to trim off about 100 lines, and have made the code much more
> > obviously correct.
> > 
> > Anyway, I would love to know in what entertaining ways this code blows
> > up, or if I get lucky and it doesn't.  I probably will not reply back
> > in a timely manner as I am off to visit my parents, for Christmas and
> > New Years.  
> > 

Eric,

The patch applied cleanly to 2.5.54 for me.

The kexec portion works just fine and the reboot discovers all of the
memory on my system using kexec_tools 1.8.

However, something has recently changed in the 2.5.5x series that causes
the reboot to hang while calibrating the delay loop after a kexec
reboot:

setup16_end: 00091b2f
Synchronizing SCSI caches: 
Shutting down devices      
Starting new kernel  
Linux version 2.5.54 (andyp@joe) (gcc version 2.95.3 20010315 (SuSE)) #2
SMP Fri Jan 3 21:36:51 PST 2003
Video mode to be used for restore is ffff
BIOS-provided physical RAM map:          
 BIOS-e820: 0000000000000000 - 000000000009dc00 (usable)
 BIOS-e820: 000000000009dc00 - 00000000000a0000 (reserved)
 BIOS-e820: 0000000000100000 - 0000000027fed140 (usable)  
 BIOS-e820: 0000000027fed140 - 0000000027ff0000 (ACPI data)
 BIOS-e820: 0000000027ff0000 - 0000000028000000 (reserved) 
 BIOS-e820: 00000000fec00000 - 0000000100000000 (reserved)
639MB LOWMEM available.                                   
found SMP MP-table at 0009ddd0
hm, page 0009d000 reserved twice.
hm, page 0009e000 reserved twice.
hm, page 0009d000 reserved twice.
hm, page 0009e000 reserved twice.
WARNING: MP table in the EBDA can be UNSAFE, contact
linux-smp@vger.kernel.org if you experience SMP problems!
On node 0 totalpages: 163821  
  DMA zone: 4096 pages, LIFO batch:1
  Normal zone: 159725 pages, LIFO batch:16
  HighMem zone: 0 pages, LIFO batch:1     
Intel MultiProcessor Specification v1.4
    Virtual Wire compatibility mode.   
OEM ID: IBM ENSW Product ID: xSeries 220  APIC at: 0xFEE00000
Processor #0 6:8 APIC version 17                             
I/O APIC #14 Version 17 at 0xFEC00000.
I/O APIC #13 Version 17 at 0xFEC01000.
Enabling APIC mode:  Flat.  Using 2 I/O APICs
Processors: 1                                
IBM machine detected. Enabling interrupts during APM calls.
IBM machine detected. Disabling SMBus accesses.            
Building zonelist for node : 0                 
Kernel command line: auto BOOT_IMAGE=linux-2.5 ro root=805
console=ttyS0,9600n8
Initializing
CPU#0                                                             
Detected 799.578 MHz processor.
Console: colour VGA+ 80x25     
Calibrating delay loop... 

<wedged>

This happens with -and- without the separate "hwfixes" chunk of code
(that patch carries forward and continues to apply cleanly).

It would appear that clock interrupts are no long arriving (ticks always
equals jiffies).

You can download the kexec patches for 2.5.54 from OSDL's PLM service:
(apologies in advance for the long URL):
https://www.osdl.org/cgi-bin/plm?module=search&search_patch=kexec-rewrite&search_created=Anytime&search_format=detailed&action=run_patch_search&sort_field=idDESC

If the URL is mangled, go here:
https://www.osdl.org/cgi-bin/plm?module=search
and then put "kexec-rewrite" into the "Patch Name or ID" box,
and then press "Submit Query".

Key:
kexec-rewrite-2.5.54-1-of-3-1 == your rewrite from 2002-12-22
kexec-rewrite-2.5.54-2-of-3-1 == your "hwfixes" from 2.5.48ish
kexec-rewrite-2.5.54-3-of-3-1 == ignore it (changes CONFIG_KEXEC=y for PLM)

Regards,
Andy





^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: 2.5.54: Re: [PATCH][CFT] kexec (rewrite) for 2.5.52
  2003-01-04  0:32   ` 2.5.54: Re: [PATCH][CFT] kexec (rewrite) for 2.5.52 Andy Pfiffer
@ 2003-01-04 18:56     ` Eric W. Biederman
  0 siblings, 0 replies; 15+ messages in thread
From: Eric W. Biederman @ 2003-01-04 18:56 UTC (permalink / raw)
  To: Andy Pfiffer
  Cc: suparna, linux-kernel, Linus Torvalds, Dave Hansen, Werner Almesberger

Andy Pfiffer <andyp@osdl.org> writes:

> Eric,
> 
> The patch applied cleanly to 2.5.54 for me.
> 
> The kexec portion works just fine and the reboot discovers all of the
> memory on my system using kexec_tools 1.8.
> 
> However, something has recently changed in the 2.5.5x series that causes
> the reboot to hang while calibrating the delay loop after a kexec
> reboot:

Thanks I will take a look.  It looks like something is definitely having
interrupt problems...

BTW, Have you tried booting an older kernel?
That would help indicate where the problem is.  I am pretty certain
it is from somewhere in the kernels initialization path.

Eric

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH][CFT] kexec (rewrite) for 2.5.52
  2003-01-03 12:41     ` Suparna Bhattacharya
@ 2003-01-04 20:34       ` Eric W. Biederman
  2003-01-04 22:42       ` Eric W. Biederman
  2003-01-06  5:48       ` [PATCH] kexec for 2.5.54 Eric W. Biederman
  2 siblings, 0 replies; 15+ messages in thread
From: Eric W. Biederman @ 2003-01-04 20:34 UTC (permalink / raw)
  To: suparna
  Cc: linux-kernel, Linus Torvalds, Andy Pfiffer, Dave Hansen, wa, lkcd-devel

Suparna Bhattacharya <suparna@in.ibm.com> writes:

> On Fri, Jan 03, 2003 at 03:37:06AM -0700, Eric W. Biederman wrote:
> > Suparna Bhattacharya <suparna@in.ibm.com> writes:
> > 
> > > The good news is that it worked for me. Not only that, I have just 
> > > managed to get lkcd to save a dump in memory and then write it out 
> > > to disk after a kexec soft boot ! I haven't tried real panic cases yet 
> > > (which probably won't work rightaway :) ) and have testing and 
> > > tuning to do. But kexec seems to be looking good.
> > 
> > Nice.  Any pointers besides lkcd.sourceforge.net
> 
> I haven't posted this code to lkcd as yet - so far I'd only
> checked in the preparatory code reshuffle into lkcd cvs. There are
> still some things to improve and think about, but am planning
> to upgrade to the latest tree early next week and put things
> out, and then work on it incrementally.

O.k.

> > For the kexec on panic case there is a little code motion yet to be
> > done so that no memory allocations need to happen.  The big one is
> > setting up a page table with the reboot code buffer identity mapped.
> 
> I missed noticing that.
> Bootimg avoided the allocation at this stage. It did something like 
> this:
> 
> +static unsigned long get_identity_mapped_page(void)
> +{
> +       set_pgd(pgd_offset(current->active_mm, 
> + 	virt_to_phys(unity_page)), __pgd((_KERNPG_TABLE 
> + 	_PAGE_PSE + (virt_to_phys(unity_page)&PGDIR_MASK))));
> +       return (unsigned long)unity_page;
> +}
> 
> where unity page is within directly mapped memory (not highmem).

With unity_page being allocated ahead of time...
But there is some other trick it is pulling to make certain the
intermediate page table entries are present.  Spooky and I don't want
to go there.

> > I am tempted to do the identity mapping of the reboot code buffer in
> > init_mm, but for starters I will look at how complex it will be to
> > have a spare mm just sitting around for that purpose.  When I get
> > to dealing with the architectures like the hammer, and the alpha where
> > you always need page tables I will need to develop an architecture
> > specific hook for building the page tables needed by the
> > code residing in the reboot code buffer, (because virtual memory
> > cannot be disabled), but that should be straight forward.
> 
> A spare mm may be something which I could use for the crash dump
> pages mapping possibly simpler than the way it is maintained
> right now ... but haven't given enough thought to it yet.

Given that it is likely only to be a temporary thing I doubt it will
help.  A very interesting question along those lines is how do 
you get at all of the memory you are dumping, especially in PAE mode.
I have not seen the code that handles that part at all...

Eric

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH][CFT] kexec (rewrite) for 2.5.52
  2003-01-03 12:41     ` Suparna Bhattacharya
  2003-01-04 20:34       ` Eric W. Biederman
@ 2003-01-04 22:42       ` Eric W. Biederman
  2003-01-06  5:48       ` [PATCH] kexec for 2.5.54 Eric W. Biederman
  2 siblings, 0 replies; 15+ messages in thread
From: Eric W. Biederman @ 2003-01-04 22:42 UTC (permalink / raw)
  To: suparna
  Cc: linux-kernel, Linus Torvalds, Andy Pfiffer, Dave Hansen, wa, lkcd-devel

Suparna Bhattacharya <suparna@in.ibm.com> writes:

> On Fri, Jan 03, 2003 at 03:37:06AM -0700, Eric W. Biederman wrote:
> > A dump question.  Why doesn't the lkcd stuff use the normal ELF core
> > dump format?  allowing ``gdb vmlinux core'' to work?

Digesting....  Of the pieces I have no problem if a valid
ELF core dump is written but gdb does not know what to do with it out
of the box.  The piece that disturbed me most is that the file format
seemed to be mutating from release to release.

An ELF core dump consists of:
ELF header
ELF Program header
ELF Note segment
Data Segments...

All of the weird processor specific information can be stored as
various ELF note types.

Compression of pages that are zero can be handled by treating
them as BSS pages and not putting them in the image. 

I do admit it would likely take an extra pass to generate the ELF
program header if anything non-trivial like zero removal or
compression was going on.   But at the same time that should also
quite dramatically reduce the per page overhead.  A pure dump of ram
on x86 should take only 2 or 3 segments.

Using physical addresses is no problem in an ELF core dump.  The ELF
program header has both physical and virtual addresses, and you just
fill in the physical addresses.

I keep asking and thinking about ELF images, because they are 
simple, clean, extensible, and well documented.  With an added bonus
that using the allows a large degree of code reuse with existing
tools.


[snip a good description of the usefulness of the existing core dump format]

> Am cc'ing lkcd-devel on this one - there are experts who can
> add to this or answer this question better than I can. 

Thanks,

Eric

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH] kexec for 2.5.54
  2003-01-03 12:41     ` Suparna Bhattacharya
  2003-01-04 20:34       ` Eric W. Biederman
  2003-01-04 22:42       ` Eric W. Biederman
@ 2003-01-06  5:48       ` Eric W. Biederman
  2003-01-07 22:46         ` Andy Pfiffer
  2003-01-15 19:43         ` [2.5.58][KEXEC] Success! (using 2.5.54 version + kexec tools 1.8) Andy Pfiffer
  2 siblings, 2 replies; 15+ messages in thread
From: Eric W. Biederman @ 2003-01-06  5:48 UTC (permalink / raw)
  To: linux-kernel; +Cc: suparna, Linus Torvalds, Andy Pfiffer, Dave Hansen, wa


O.k.  I have switched to using the init_mm and premapping the reboot
code buffer.  I would have play with a fully private mm, but all of
the functions to allocate one are private to kernel/fork.c and so to
much of a pain to mess with, right now.

The code in machine_kexec now takes no locks and is drop dead simple,
so it should be safe to call from a panic handler.  

It is funny in making identity_map_page generic so it should work on
all architectures, and using more kernel prebuilt functions the code
actually got a little longer...

Linus if you would like to apply it, be my guest.

Suparna this should be a good base to build the kexec on panic code
upon.  Until I see it a little more in action this is as much as I can
do to help.

And if this week goes on schedule I can do an Itanium port...

Eric

 MAINTAINERS                        |    8 
 arch/i386/Kconfig                  |   17 +
 arch/i386/kernel/Makefile          |    1 
 arch/i386/kernel/entry.S           |    1 
 arch/i386/kernel/machine_kexec.c   |  115 ++++++
 arch/i386/kernel/relocate_kernel.S |  107 ++++++
 include/asm-i386/kexec.h           |   23 +
 include/asm-i386/unistd.h          |    1 
 include/linux/kexec.h              |   54 +++
 include/linux/reboot.h             |    2 
 kernel/Makefile                    |    1 
 kernel/kexec.c                     |  629 +++++++++++++++++++++++++++++++++++++
 kernel/sys.c                       |   23 +
 13 files changed, 982 insertions

diff -uNr linux-2.5.54/MAINTAINERS linux-2.5.54.x86kexec/MAINTAINERS
--- linux-2.5.54/MAINTAINERS	Sat Jan  4 12:00:56 2003
+++ linux-2.5.54.x86kexec/MAINTAINERS	Sat Jan  4 12:02:05 2003
@@ -1006,6 +1006,14 @@
 W:	http://www.cse.unsw.edu.au/~neilb/patches/linux-devel/
 S:	Maintained
 
+KEXEC
+P:	Eric Biederman
+M:	ebiederm@xmission.com
+M:	ebiederman@lnxi.com
+W:	http://www.xmission.com/~ebiederm/files/kexec/
+L:	linux-kernel@vger.kernel.org
+S:	Maintained
+
 LANMEDIA WAN CARD DRIVER
 P:	Andrew Stanley-Jones
 M:	asj@lanmedia.com
diff -uNr linux-2.5.54/arch/i386/Kconfig linux-2.5.54.x86kexec/arch/i386/Kconfig
--- linux-2.5.54/arch/i386/Kconfig	Sat Jan  4 12:00:56 2003
+++ linux-2.5.54.x86kexec/arch/i386/Kconfig	Sat Jan  4 12:02:05 2003
@@ -733,6 +733,23 @@
 	depends on (SMP || PREEMPT) && X86_CMPXCHG
 	default y
 
+config KEXEC
+	bool "kexec system call (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	help
+	  kexec is a system call that implements the ability to  shutdown your
+	  current kernel, and to start another kernel.  It is like a reboot
+	  but it is indepedent of the system firmware.   And like a reboot
+	  you can start any kernel with it not just Linux.  
+	
+	  The name comes from the similiarity to the exec system call. 
+	
+	  It is on an going process to be certain the hardware in a machine
+	  is properly shutdown, so do not be surprised if this code does not
+	  initially work for you.  It may help to enable device hotplugging
+	  support.  As of this writing the exact hardware interface is
+	  strongly in flux, so no good recommendation can be made.
+
 endmenu
 
 
diff -uNr linux-2.5.54/arch/i386/kernel/Makefile linux-2.5.54.x86kexec/arch/i386/kernel/Makefile
--- linux-2.5.54/arch/i386/kernel/Makefile	Sat Jan  4 12:00:56 2003
+++ linux-2.5.54.x86kexec/arch/i386/kernel/Makefile	Sat Jan  4 12:02:05 2003
@@ -25,6 +25,7 @@
 obj-$(CONFIG_X86_MPPARSE)	+= mpparse.o
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o nmi.o
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o
+obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o
 obj-$(CONFIG_SOFTWARE_SUSPEND)	+= suspend.o suspend_asm.o
 obj-$(CONFIG_X86_NUMAQ)		+= numaq.o
 obj-$(CONFIG_PROFILING)		+= profile.o
diff -uNr linux-2.5.54/arch/i386/kernel/entry.S linux-2.5.54.x86kexec/arch/i386/kernel/entry.S
--- linux-2.5.54/arch/i386/kernel/entry.S	Sat Jan  4 12:00:56 2003
+++ linux-2.5.54.x86kexec/arch/i386/kernel/entry.S	Sat Jan  4 12:02:05 2003
@@ -804,6 +804,7 @@
 	.long sys_epoll_wait
  	.long sys_remap_file_pages
  	.long sys_set_tid_address
+	.long sys_kexec_load
 
 
 	.rept NR_syscalls-(.-sys_call_table)/4
diff -uNr linux-2.5.54/arch/i386/kernel/machine_kexec.c linux-2.5.54.x86kexec/arch/i386/kernel/machine_kexec.c
--- linux-2.5.54/arch/i386/kernel/machine_kexec.c	Wed Dec 31 17:00:00 1969
+++ linux-2.5.54.x86kexec/arch/i386/kernel/machine_kexec.c	Sun Jan  5 16:12:28 2003
@@ -0,0 +1,115 @@
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/io.h>
+#include <asm/apic.h>
+
+
+/*
+ * machine_kexec
+ * =======================
+ */
+
+
+static void set_idt(void *newidt, __u16 limit)
+{
+	unsigned char curidt[6];
+
+	/* ia32 supports unaliged loads & stores */
+	(*(__u16 *)(curidt)) = limit;
+	(*(__u32 *)(curidt +2)) = (unsigned long)(newidt);
+
+	__asm__ __volatile__ (
+		"lidt %0\n" 
+		: "=m" (curidt)
+		);
+};
+
+
+static void set_gdt(void *newgdt, __u16 limit)
+{
+	unsigned char curgdt[6];
+
+	/* ia32 supports unaliged loads & stores */
+	(*(__u16 *)(curgdt)) = limit;
+	(*(__u32 *)(curgdt +2)) = (unsigned long)(newgdt);
+
+	__asm__ __volatile__ (
+		"lgdt %0\n" 
+		: "=m" (curgdt)
+		);
+};
+
+static void load_segments(void)
+{
+#define __STR(X) #X
+#define STR(X) __STR(X)
+
+	__asm__ __volatile__ (
+		"\tljmp $"STR(__KERNEL_CS)",$1f\n"
+		"\t1:\n"
+		"\tmovl $"STR(__KERNEL_DS)",%eax\n"
+		"\tmovl %eax,%ds\n"
+		"\tmovl %eax,%es\n"
+		"\tmovl %eax,%fs\n"
+		"\tmovl %eax,%gs\n"
+		"\tmovl %eax,%ss\n"
+		);
+#undef STR
+#undef __STR
+}
+
+typedef void (*relocate_new_kernel_t)(
+	unsigned long indirection_page, unsigned long reboot_code_buffer,
+	unsigned long start_address);
+
+const extern unsigned char relocate_new_kernel[];
+extern void relocate_new_kernel_end(void);
+const extern unsigned int relocate_new_kernel_size;
+
+void machine_kexec(struct kimage *image)
+{
+	unsigned long indirection_page;
+	unsigned long reboot_code_buffer;
+	void *ptr;
+	relocate_new_kernel_t rnk;
+
+	/* switch to an mm where the reboot_code_buffer is identity mapped */
+	switch_mm(current->active_mm, &init_mm, current, smp_processor_id());
+
+	/* Interrupts aren't acceptable while we reboot */
+	local_irq_disable();
+	reboot_code_buffer = page_to_pfn(image->reboot_code_pages) << PAGE_SHIFT;
+	indirection_page = image->head & PAGE_MASK;
+
+	/* copy it out */
+	memcpy((void *)reboot_code_buffer, relocate_new_kernel, relocate_new_kernel_size);
+
+	/* The segment registers are funny things, they are
+	 * automatically loaded from a table, in memory wherever you
+	 * set them to a specific selector, but this table is never
+	 * accessed again you set the segment to a different selector.
+	 *
+	 * The more common model is are caches where the behide
+	 * the scenes work is done, but is also dropped at arbitrary
+	 * times.
+	 *
+	 * I take advantage of this here by force loading the
+	 * segments, before I zap the gdt with an invalid value.
+	 */
+	load_segments();
+	/* The gdt & idt are now invalid.
+	 * If you want to load them you must set up your own idt & gdt.
+	 */
+	set_gdt(phys_to_virt(0),0);
+	set_idt(phys_to_virt(0),0);
+
+	/* now call it */
+	rnk = (relocate_new_kernel_t) reboot_code_buffer;
+	(*rnk)(indirection_page, reboot_code_buffer, image->start);
+}
diff -uNr linux-2.5.54/arch/i386/kernel/relocate_kernel.S linux-2.5.54.x86kexec/arch/i386/kernel/relocate_kernel.S
--- linux-2.5.54/arch/i386/kernel/relocate_kernel.S	Wed Dec 31 17:00:00 1969
+++ linux-2.5.54.x86kexec/arch/i386/kernel/relocate_kernel.S	Sat Jan  4 12:02:05 2003
@@ -0,0 +1,107 @@
+#include <linux/config.h>
+#include <linux/linkage.h>
+
+	/* Must be relocatable PIC code callable as a C function, that once
+	 * it starts can not use the previous processes stack.
+	 *
+	 */
+	.globl relocate_new_kernel
+relocate_new_kernel:
+	/* read the arguments and say goodbye to the stack */
+	movl  4(%esp), %ebx /* indirection_page */
+	movl  8(%esp), %ebp /* reboot_code_buffer */
+	movl  12(%esp), %edx /* start address */
+
+	/* zero out flags, and disable interrupts */
+	pushl $0
+	popfl
+
+	/* set a new stack at the bottom of our page... */
+	lea   4096(%ebp), %esp
+
+	/* store the parameters back on the stack */
+	pushl   %edx /* store the start address */
+
+	/* Set cr0 to a known state:
+	 * 31 0 == Paging disabled
+	 * 18 0 == Alignment check disabled
+	 * 16 0 == Write protect disabled
+	 * 3  0 == No task switch
+	 * 2  0 == Don't do FP software emulation.
+	 * 0  1 == Proctected mode enabled
+	 */
+	movl	%cr0, %eax
+	andl	$~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax
+	orl	$(1<<0), %eax
+	movl	%eax, %cr0
+	
+	/* Set cr4 to a known state:
+	 * Setting everything to zero seems safe.
+	 */
+	movl	%cr4, %eax
+	andl	$0, %eax
+	movl	%eax, %cr4
+	
+	jmp 1f
+1:	
+
+	/* Flush the TLB (needed?) */
+	xorl	%eax, %eax
+	movl	%eax, %cr3
+
+	/* Do the copies */
+	cld
+0:	/* top, read another word for the indirection page */
+	movl    %ebx, %ecx
+	movl	(%ebx), %ecx
+	addl	$4, %ebx
+	testl	$0x1,   %ecx  /* is it a destination page */
+	jz	1f
+	movl	%ecx,	%edi
+	andl	$0xfffff000, %edi
+	jmp     0b
+1:
+	testl	$0x2,	%ecx  /* is it an indirection page */
+	jz	1f
+	movl	%ecx,	%ebx
+	andl	$0xfffff000, %ebx
+	jmp     0b
+1:
+	testl   $0x4,   %ecx /* is it the done indicator */
+	jz      1f
+	jmp     2f
+1:
+	testl   $0x8,   %ecx /* is it the source indicator */
+	jz      0b	     /* Ignore it otherwise */
+	movl    %ecx,   %esi /* For every source page do a copy */
+	andl    $0xfffff000, %esi
+
+	movl    $1024, %ecx
+	rep ; movsl
+	jmp     0b
+
+2:
+
+	/* To be certain of avoiding problems with self modifying code
+	 * I need to execute a serializing instruction here.
+	 * So I flush the TLB, it's handy, and not processor dependent.
+	 */
+	xorl	%eax, %eax
+	movl	%eax, %cr3
+	
+	/* set all of the registers to known values */
+	/* leave %esp alone */
+	
+	xorl	%eax, %eax
+	xorl	%ebx, %ebx
+	xorl    %ecx, %ecx
+	xorl    %edx, %edx
+	xorl    %esi, %esi
+	xorl    %edi, %edi
+	xorl    %ebp, %ebp
+	ret
+relocate_new_kernel_end:
+
+	.globl relocate_new_kernel_size
+relocate_new_kernel_size:	
+	.long relocate_new_kernel_end - relocate_new_kernel
diff -uNr linux-2.5.54/include/asm-i386/kexec.h linux-2.5.54.x86kexec/include/asm-i386/kexec.h
--- linux-2.5.54/include/asm-i386/kexec.h	Wed Dec 31 17:00:00 1969
+++ linux-2.5.54.x86kexec/include/asm-i386/kexec.h	Sat Jan  4 12:02:05 2003
@@ -0,0 +1,23 @@
+#ifndef _I386_KEXEC_H
+#define _I386_KEXEC_H
+
+#include <asm/fixmap.h>
+
+/*
+ * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
+ * I.e. Maximum page that is mapped directly into kernel memory,
+ * and kmap is not required.
+ *
+ * Someone correct me if FIXADDR_START - PAGEOFFSET is not the correct
+ * calculation for the amount of memory directly mappable into the
+ * kernel memory space.
+ */
+
+/* Maximum physical address we can use pages from */
+#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
+/* Maximum address we can reach in physical address mode */
+#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
+
+#define KEXEC_REBOOT_CODE_SIZE	4096
+
+#endif /* _I386_KEXEC_H */
diff -uNr linux-2.5.54/include/asm-i386/unistd.h linux-2.5.54.x86kexec/include/asm-i386/unistd.h
--- linux-2.5.54/include/asm-i386/unistd.h	Sat Jan  4 12:01:05 2003
+++ linux-2.5.54.x86kexec/include/asm-i386/unistd.h	Sat Jan  4 12:02:05 2003
@@ -262,6 +262,7 @@
 #define __NR_epoll_wait		256
 #define __NR_remap_file_pages	257
 #define __NR_set_tid_address	258
+#define __NR_sys_kexec_load	259
 
 
 /* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
diff -uNr linux-2.5.54/include/linux/kexec.h linux-2.5.54.x86kexec/include/linux/kexec.h
--- linux-2.5.54/include/linux/kexec.h	Wed Dec 31 17:00:00 1969
+++ linux-2.5.54.x86kexec/include/linux/kexec.h	Sat Jan  4 16:17:20 2003
@@ -0,0 +1,54 @@
+#ifndef LINUX_KEXEC_H
+#define LINUX_KEXEC_H
+
+#if CONFIG_KEXEC
+#include <linux/types.h>
+#include <linux/list.h>
+#include <asm/kexec.h>
+
+/* 
+ * This structure is used to hold the arguments that are used when loading
+ * kernel binaries.
+ */
+
+typedef unsigned long kimage_entry_t;
+#define IND_DESTINATION  0x1
+#define IND_INDIRECTION  0x2
+#define IND_DONE         0x4
+#define IND_SOURCE       0x8
+
+#define KEXEC_SEGMENT_MAX 8
+struct kexec_segment {
+	void *buf;
+	size_t bufsz;
+	void *mem;
+	size_t memsz;
+};
+
+struct kimage {
+	kimage_entry_t head;
+	kimage_entry_t *entry;
+	kimage_entry_t *last_entry;
+
+	unsigned long destination;
+	unsigned long offset;
+
+	unsigned long start;
+	struct page *reboot_code_pages;
+
+	unsigned long nr_segments;
+	struct kexec_segment segment[KEXEC_SEGMENT_MAX+1];
+
+	struct list_head dest_pages;
+	struct list_head unuseable_pages;
+};
+
+
+/* kexec interface functions */
+extern void machine_kexec(struct kimage *image);
+extern asmlinkage long sys_kexec(unsigned long entry, long nr_segments, 
+	struct kexec_segment *segments);
+extern struct kimage *kexec_image;
+#endif
+#endif /* LINUX_KEXEC_H */
+
diff -uNr linux-2.5.54/include/linux/reboot.h linux-2.5.54.x86kexec/include/linux/reboot.h
--- linux-2.5.54/include/linux/reboot.h	Thu Dec 12 07:41:37 2002
+++ linux-2.5.54.x86kexec/include/linux/reboot.h	Sat Jan  4 12:02:05 2003
@@ -21,6 +21,7 @@
  * POWER_OFF   Stop OS and remove all power from system, if possible.
  * RESTART2    Restart system using given command string.
  * SW_SUSPEND  Suspend system using Software Suspend if compiled in
+ * KEXEC       Restart the system using a different kernel.
  */
 
 #define	LINUX_REBOOT_CMD_RESTART	0x01234567
@@ -30,6 +31,7 @@
 #define	LINUX_REBOOT_CMD_POWER_OFF	0x4321FEDC
 #define	LINUX_REBOOT_CMD_RESTART2	0xA1B2C3D4
 #define	LINUX_REBOOT_CMD_SW_SUSPEND	0xD000FCE2
+#define LINUX_REBOOT_CMD_KEXEC		0x45584543
 
 
 #ifdef __KERNEL__
diff -uNr linux-2.5.54/kernel/Makefile linux-2.5.54.x86kexec/kernel/Makefile
--- linux-2.5.54/kernel/Makefile	Mon Dec 16 02:19:15 2002
+++ linux-2.5.54.x86kexec/kernel/Makefile	Sat Jan  4 12:02:05 2003
@@ -21,6 +21,7 @@
 obj-$(CONFIG_CPU_FREQ) += cpufreq.o
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o
+obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_COMPAT) += compat.o
 
 ifneq ($(CONFIG_IA64),y)
diff -uNr linux-2.5.54/kernel/kexec.c linux-2.5.54.x86kexec/kernel/kexec.c
--- linux-2.5.54/kernel/kexec.c	Wed Dec 31 17:00:00 1969
+++ linux-2.5.54.x86kexec/kernel/kexec.c	Sun Jan  5 21:54:52 2003
@@ -0,0 +1,629 @@
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/version.h>
+#include <linux/compile.h>
+#include <linux/kexec.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <net/checksum.h>
+#include <asm/page.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/system.h>
+
+/* When kexec transitions to the new kernel there is a one to one
+ * mapping between physical and virtual addresses.  On processors
+ * where you can disable the MMU this is trivial, and easy.  For
+ * others it is still a simple predictable page table to setup.
+ *
+ * In that environment kexec copies the new kernel to it's final
+ * resting place.  This means I can only support memory whose
+ * physical address can fit in an unsigned long.  In particular
+ * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
+ * If the assembly stub has more restrictive requirements
+ * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
+ * defined more restrictively in <asm/kexec.h>.
+ *
+ * The code for the transition from the current kernel to the 
+ * the new kernel is placed in the reboot_code_buffer, whose size
+ * is given by KEXEC_REBOOT_CODE_SIZE.  In the best case only a single
+ * page of memory is necessary, but some architectures require more.
+ * Because this memory must be identity mapped in the transition from
+ * virtual to physical addresses it must live in the range
+ * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
+ * modifyable.
+ *
+ * The assembly stub in the reboot code buffer is passed a linked list
+ * of descriptor pages detailing the source pages of the new kernel,
+ * and the destination addresses of those source pages.  As this data
+ * structure is not used in the context of the current OS, it must
+ * be self contained.
+ *
+ * The code has been made to work with highmem pages and will use a
+ * destination page in it's final resting place (if it happens 
+ * to allocate it).  The end product of this is that most of the
+ * physical address space, and most of ram can be used.
+ *
+ * Future directions include:
+ *  - allocating a page table with the reboot code buffer identity
+ *    mapped, to simplify machine_kexec and make kexec_on_panic, more
+ *    reliable.  
+ *  - allocating the pages for a page table for machines that cannot
+ *    disable their MMUs.  (Hammer, Alpha...)
+ */
+
+/* KIMAGE_NO_DEST is an impossible destination address..., for
+ * allocating pages whose destination address we do not care about.
+ */
+#define KIMAGE_NO_DEST (-1UL)
+
+static int kimage_is_destination_range(
+	struct kimage *image, unsigned long start, unsigned long end);
+static struct page *kimage_alloc_reboot_code_pages(struct kimage *image);
+static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest);
+
+
+static int kimage_alloc(struct kimage **rimage, 
+	unsigned long nr_segments, struct kexec_segment *segments)
+{
+	int result;
+	struct kimage *image;
+	size_t segment_bytes;
+	struct page *reboot_pages;
+	unsigned long i;
+
+	/* Allocate a controlling structure */
+	result = -ENOMEM;
+	image = kmalloc(sizeof(*image), GFP_KERNEL);
+	if (!image) {
+		goto out;
+	}
+	memset(image, 0, sizeof(*image));
+	image->head = 0;
+	image->entry = &image->head;
+	image->last_entry = &image->head;
+
+	/* Initialize the list of destination pages */
+	INIT_LIST_HEAD(&image->dest_pages);
+
+	/* Initialize the list of unuseable pages */
+	INIT_LIST_HEAD(&image->unuseable_pages);
+
+	/* Read in the segments */
+	image->nr_segments = nr_segments;
+	segment_bytes = nr_segments * sizeof*segments;
+	result = copy_from_user(image->segment, segments, segment_bytes);
+	if (result) 
+		goto out;
+
+	/* Verify we have good destination addresses.  The caller is
+	 * responsible for making certain we don't attempt to load
+	 * the new image into invalid or reserved areas of RAM.  This
+	 * just verifies it is an address we can use. 
+	 */
+	result = -EADDRNOTAVAIL;
+	for(i = 0; i < nr_segments; i++) {
+		unsigned long mend;
+		mend = ((unsigned long)(image->segment[i].mem)) + 
+			image->segment[i].memsz;
+		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
+			goto out;
+	}
+
+	/* Find a location for the reboot code buffer, and add it
+	 * the vector of segments so that it's pages will also be
+	 * counted as destination pages.  
+	 */
+	result = -ENOMEM;
+	reboot_pages = kimage_alloc_reboot_code_pages(image);
+	if (!reboot_pages) {
+		printk(KERN_ERR "Could not allocate reboot_code_buffer\n");
+		goto out;
+	}
+	image->reboot_code_pages = reboot_pages;
+	image->segment[nr_segments].buf = 0;
+	image->segment[nr_segments].bufsz = 0;
+	image->segment[nr_segments].mem = (void *)(page_to_pfn(reboot_pages) << PAGE_SHIFT);
+	image->segment[nr_segments].memsz = KEXEC_REBOOT_CODE_SIZE;
+	image->nr_segments++;
+
+	result = 0;
+ out:
+	if (result == 0) {
+		*rimage = image;
+	} else {
+		kfree(image);
+	}
+	return result;
+}
+
+static int kimage_is_destination_range(
+	struct kimage *image, unsigned long start, unsigned long end)
+{
+	unsigned long i;
+	for(i = 0; i < image->nr_segments; i++) {
+		unsigned long mstart, mend;
+		mstart = (unsigned long)image->segment[i].mem;
+		mend   = mstart + image->segment[i].memsz;
+		if ((end > mstart) && (start < mend)) {
+			return 1;
+		}
+	}
+	return 0;
+}
+
+#ifdef CONFIG_MMU
+static int identity_map_pages(struct page *pages, int order)
+{
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	int error;
+	mm = &init_mm;
+	vma = 0;
+
+	down_write(&mm->mmap_sem);
+	error = -ENOMEM;
+	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!vma) {
+		goto out;
+	}
+
+	memset(vma, 0, sizeof(vma));
+	vma->vm_mm = mm;
+	vma->vm_start = page_to_pfn(pages) << PAGE_SHIFT;
+	vma->vm_end = vma->vm_start + (1 << (order + PAGE_SHIFT));
+	vma->vm_ops = 0;
+	vma->vm_flags = VM_SHARED \
+		| VM_READ | VM_WRITE | VM_EXEC \
+		| VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC \
+		| VM_DONTCOPY | VM_RESERVED;
+	vma->vm_page_prot = protection_map[vma->vm_flags & 0xf];
+	vma->vm_file = NULL;
+	vma->vm_private_data = NULL;
+	INIT_LIST_HEAD(&vma->shared);
+	insert_vm_struct(mm, vma);
+	
+	error = remap_page_range(vma, vma->vm_start, vma->vm_start,
+		vma->vm_end - vma->vm_start, vma->vm_page_prot);
+	if (error) {
+		goto out;
+	}
+
+	error = 0;
+ out:
+	if (error && vma) {
+		kmem_cache_free(vm_area_cachep, vma);
+		vma = 0;
+	}
+	up_write(&mm->mmap_sem);
+
+	return error;
+}
+#else
+#define identity_map_pages(pages, order) 0
+#endif
+
+struct page *kimage_alloc_reboot_code_pages(struct kimage *image)
+{
+	/* The reboot code buffer is special.  It is the only set of
+	 * pages that must be allocated in their final resting place,
+	 * and the only set of pages whose final resting place we can
+	 * pick. 
+	 *
+	 * At worst this runs in O(N) of the image size.
+	 */
+	struct list_head extra_pages, *pos, *next;
+	struct page *pages;
+	unsigned long addr;
+	int order, count;
+	order = get_order(KEXEC_REBOOT_CODE_SIZE);
+	count = 1 << order;
+	INIT_LIST_HEAD(&extra_pages);
+	do {
+		int i;
+		pages = alloc_pages(GFP_HIGHUSER, order);
+		if (!pages)
+			break;
+		for(i = 0; i < count; i++) {
+			SetPageReserved(pages +i);
+		}
+		addr = page_to_pfn(pages) << PAGE_SHIFT;
+		if ((page_to_pfn(pages) >= (TASK_SIZE >> PAGE_SHIFT)) ||
+			kimage_is_destination_range(image, addr, addr + KEXEC_REBOOT_CODE_SIZE)) {
+			list_add(&pages->list, &extra_pages);
+			pages = 0;
+		}
+	} while(!pages);
+	if (pages) {
+		int i, result;
+		result = identity_map_pages(pages, order);
+		if (result < 0) {
+			list_add(&pages->list, &extra_pages);
+			pages = 0;
+		}
+	}
+	/* If I could convert a multi page allocation into a buch of
+	 * single page allocations I could add these pages to
+	 * image->dest_pages.  For now it is simpler to just free the
+	 * pages again.
+	 */
+	list_for_each_safe(pos, next, &extra_pages) {
+		struct page *page;
+		int i;
+		page = list_entry(pos, struct page, list);
+		for(i = 0; i < count; i++) {
+			ClearPageReserved(pages +i);
+		}
+		list_del(&extra_pages);
+		__free_pages(page, order);
+	}
+	return pages;
+}
+
+static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+{
+	if (image->offset != 0) {
+		image->entry++;
+	}
+	if (image->entry == image->last_entry) {
+		kimage_entry_t *ind_page;
+		struct page *page;
+		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
+		if (!page) {
+			return -ENOMEM;
+		}
+		ind_page = page_address(page);
+		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+		image->entry = ind_page;
+		image->last_entry = 
+			ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+	}
+	*image->entry = entry;
+	image->entry++;
+	image->offset = 0;
+	return 0;
+}
+
+static int kimage_set_destination(
+	struct kimage *image, unsigned long destination) 
+{
+	int result;
+	destination &= PAGE_MASK;
+	result = kimage_add_entry(image, destination | IND_DESTINATION);
+	if (result == 0) {
+		image->destination = destination;
+	}
+	return result;
+}
+
+
+static int kimage_add_page(struct kimage *image, unsigned long page)
+{
+	int result;
+	page &= PAGE_MASK;
+	result = kimage_add_entry(image, page | IND_SOURCE);
+	if (result == 0) {
+		image->destination += PAGE_SIZE;
+	}
+	return result;
+}
+
+
+static void kimage_free_extra_pages(struct kimage *image)
+{
+	/* Walk through and free any extra destination pages I may have */
+	struct list_head *pos, *next;
+	list_for_each_safe(pos, next, &image->dest_pages) {
+		struct page *page;
+		page = list_entry(pos, struct page, list);
+		list_del(&page->list);
+		ClearPageReserved(page);
+		__free_page(page);
+	}
+	/* Walk through and free any unuseable pages I have cached */
+	list_for_each_safe(pos, next, &image->unuseable_pages) {
+		struct page *page;
+		page = list_entry(pos, struct page, list);
+		list_del(&page->list);
+		ClearPageReserved(page);
+		__free_page(page);
+	}
+
+}
+static int kimage_terminate(struct kimage *image)
+{
+	int result;
+	result = kimage_add_entry(image, IND_DONE);
+	if (result == 0) {
+		/* Point at the terminating element */
+		image->entry--;
+		kimage_free_extra_pages(image);
+	}
+	return result;
+}
+
+#define for_each_kimage_entry(image, ptr, entry) \
+	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+		ptr = (entry & IND_INDIRECTION)? \
+			phys_to_virt((entry & PAGE_MASK)): ptr +1)
+
+static void kimage_free(struct kimage *image)
+{
+	kimage_entry_t *ptr, entry;
+	kimage_entry_t ind = 0;
+	int i, count, order;
+	if (!image)
+		return;
+	kimage_free_extra_pages(image);
+	for_each_kimage_entry(image, ptr, entry) {
+		if (entry & IND_INDIRECTION) {
+			/* Free the previous indirection page */
+			if (ind & IND_INDIRECTION) {
+				free_page((unsigned long)phys_to_virt(ind & PAGE_MASK));
+			}
+			/* Save this indirection page until we are
+			 * done with it.
+			 */
+			ind = entry;
+		}
+		else if (entry & IND_SOURCE) {
+			free_page((unsigned long)phys_to_virt(entry & PAGE_MASK));
+		}
+	}
+	order = get_order(KEXEC_REBOOT_CODE_SIZE);
+	count = 1 << order;
+	do_munmap(&init_mm, 
+		page_to_pfn(image->reboot_code_pages) << PAGE_SHIFT, 
+		count << PAGE_SHIFT);
+	for(i = 0; i < count; i++) {
+		ClearPageReserved(image->reboot_code_pages + i);
+	}
+	__free_pages(image->reboot_code_pages, order);
+	kfree(image);
+}
+
+static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page)
+{
+	kimage_entry_t *ptr, entry;
+	unsigned long destination = 0;
+	for_each_kimage_entry(image, ptr, entry) {
+		if (entry & IND_DESTINATION) {
+			destination = entry & PAGE_MASK;
+		}
+		else if (entry & IND_SOURCE) {
+			if (page == destination) {
+				return ptr;
+			}
+			destination += PAGE_SIZE;
+		}
+	}
+	return 0;
+}
+
+static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination)
+{
+	/* Here we implment safe guards to ensure that a source page
+	 * is not copied to it's destination page before the data on
+	 * the destination page is no longer useful.
+	 *
+	 * To do this we maintain the invariant that a source page is
+	 * either it's own destination page, or it is not a
+	 * destination page at all.  
+	 *
+	 * That is slightly stronger than required, but the proof
+	 * that no problems will not occur is trivial, and the
+	 * implemenation is simply to verify.
+	 *
+	 * When allocating all pages normally this algorithm will run
+	 * in O(N) time, but in the worst case it will run in O(N^2)
+	 * time.   If the runtime is a problem the data structures can
+	 * be fixed.
+	 */
+	struct page *page;
+	unsigned long addr;
+
+	/* Walk through the list of destination pages, and see if I
+	 * have a match.
+	 */
+	list_for_each_entry(page, &image->dest_pages, list) {
+		addr = page_to_pfn(page) << PAGE_SHIFT;
+		if (addr == destination) {
+			list_del(&page->list);
+			return page;
+		}
+	}
+	page = 0;
+	while(1) {
+		kimage_entry_t *old;
+		/* Allocate a page, if we run out of memory give up */
+		page = alloc_page(gfp_mask);
+		if (!page) {
+			return 0;
+		}
+		SetPageReserved(page);
+		/* If the page cannot be used file it away */
+		if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+			list_add(&page->list, &image->unuseable_pages);
+			continue;
+		}
+		addr = page_to_pfn(page) << PAGE_SHIFT;
+
+		/* If it is the destination page we want use it */
+		if (addr == destination)
+			break;
+
+		/* If the page is not a destination page use it */
+		if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE))
+			break;
+
+		/* I know that the page is someones destination page.
+		 * See if there is already a source page for this
+		 * destination page.  And if so swap the source pages.
+		 */
+		old = kimage_dst_used(image, addr);
+		if (old) {
+			/* If so move it */
+			unsigned long old_addr;
+			struct page *old_page;
+			
+			old_addr = *old & PAGE_MASK;
+			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+			copy_highpage(page, old_page);
+			*old = addr | (*old & ~PAGE_MASK);
+
+			/* The old page I have found cannot be a
+			 * destination page, so return it.
+			 */
+			addr = old_addr;
+			page = old_page;
+			break;
+		}
+		else {
+			/* Place the page on the destination list I
+			 * will use it later.
+			 */
+			list_add(&page->list, &image->dest_pages);
+		}
+	}
+	return page;
+}
+
+static int kimage_load_segment(struct kimage *image,
+	struct kexec_segment *segment)
+{	
+	unsigned long mstart;
+	int result;
+	unsigned long offset;
+	unsigned long offset_end;
+	unsigned char *buf;
+
+	result = 0;
+	buf = segment->buf;
+	mstart = (unsigned long)segment->mem;
+
+	offset_end = segment->memsz;
+
+	result = kimage_set_destination(image, mstart);
+	if (result < 0) {
+		goto out;
+	}
+	for(offset = 0;  offset < segment->memsz; offset += PAGE_SIZE) {
+		struct page *page;
+		char *ptr;
+		size_t size, leader;
+		page = kimage_alloc_page(image, GFP_HIGHUSER, mstart + offset);
+		if (page == 0) {
+			result  = -ENOMEM;
+			goto out;
+		}
+		result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT);
+		if (result < 0) {
+			goto out;
+		}
+		ptr = kmap(page);
+		if (segment->bufsz < offset) {
+			/* We are past the end zero the whole page */
+			memset(ptr, 0, PAGE_SIZE);
+			kunmap(page);
+			continue;
+		}
+		size = PAGE_SIZE;
+		leader = 0;
+		if ((offset == 0)) {
+			leader = mstart & ~PAGE_MASK;
+		}
+		if (leader) {
+			/* We are on the first page zero the unused portion */
+			memset(ptr, 0, leader);
+			size -= leader;
+			ptr += leader;
+		}
+		if (size > (segment->bufsz - offset)) {
+			size = segment->bufsz - offset;
+		}
+		if (size < (PAGE_SIZE - leader)) {
+			/* zero the trailing part of the page */
+			memset(ptr + size, 0, (PAGE_SIZE - leader) - size);
+		}
+		result = copy_from_user(ptr, buf + offset, size);
+		kunmap(page);
+		if (result) {
+			result = (result < 0)?result : -EIO;
+			goto out;
+		}
+	}
+ out:
+	return result;
+}
+
+/*
+ * Exec Kernel system call: for obvious reasons only root may call it.
+ * 
+ * This call breaks up into three pieces.  
+ * - A generic part which loads the new kernel from the current
+ *   address space, and very carefully places the data in the
+ *   allocated pages.
+ *
+ * - A generic part that interacts with the kernel and tells all of
+ *   the devices to shut down.  Preventing on-going dmas, and placing
+ *   the devices in a consistent state so a later kernel can
+ *   reinitialize them.
+ *
+ * - A machine specific part that includes the syscall number
+ *   and the copies the image to it's final destination.  And
+ *   jumps into the image at entry.
+ *
+ * kexec does not sync, or unmount filesystems so if you need
+ * that to happen you need to do that yourself.
+ */
+struct kimage *kexec_image = 0;
+
+asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, 
+	struct kexec_segment *segments, unsigned long flags)
+{
+	struct kimage *image;
+	int result;
+		
+	/* We only trust the superuser with rebooting the system. */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* In case we need just a little bit of special behavior for
+	 * reboot on panic 
+	 */
+	if (flags != 0)
+		return -EINVAL;
+
+	if (nr_segments > KEXEC_SEGMENT_MAX)
+		return -EINVAL;
+	image = 0;
+
+	result = 0;
+	if (nr_segments > 0) {
+		unsigned long i;
+		result = kimage_alloc(&image, nr_segments, segments);
+		if (result) {
+			goto out;
+		}
+		image->start = entry;
+		for(i = 0; i < nr_segments; i++) {
+			result = kimage_load_segment(image, &segments[i]);
+			if (result) {
+				goto out;
+			}
+		}
+		result = kimage_terminate(image);
+		if (result) {
+			goto out;
+		}
+	}
+
+	image = xchg(&kexec_image, image);
+
+ out:
+	kimage_free(image);
+	return result;
+}
diff -uNr linux-2.5.54/kernel/sys.c linux-2.5.54.x86kexec/kernel/sys.c
--- linux-2.5.54/kernel/sys.c	Thu Dec 12 07:41:37 2002
+++ linux-2.5.54.x86kexec/kernel/sys.c	Sat Jan  4 12:02:05 2003
@@ -16,6 +16,7 @@
 #include <linux/init.h>
 #include <linux/highuid.h>
 #include <linux/fs.h>
+#include <linux/kexec.h>
 #include <linux/workqueue.h>
 #include <linux/device.h>
 #include <linux/times.h>
@@ -207,6 +208,7 @@
 cond_syscall(sys_lookup_dcookie)
 cond_syscall(sys_swapon)
 cond_syscall(sys_swapoff)
+cond_syscall(sys_kexec_load)
 cond_syscall(sys_init_module)
 cond_syscall(sys_delete_module)
 
@@ -419,6 +421,27 @@
 		machine_restart(buffer);
 		break;
 
+#ifdef CONFIG_KEXEC
+	case LINUX_REBOOT_CMD_KEXEC:
+	{
+		struct kimage *image;
+		if (arg) {
+			unlock_kernel();
+			return -EINVAL;
+		}
+		image = xchg(&kexec_image, 0);
+		if (!image) {
+			unlock_kernel();
+			return -EINVAL;
+		}
+		notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
+		system_running = 0;
+		device_shutdown();
+		printk(KERN_EMERG "Starting new kernel\n");
+		machine_kexec(image);
+		break;
+	}
+#endif
 #ifdef CONFIG_SOFTWARE_SUSPEND
 	case LINUX_REBOOT_CMD_SW_SUSPEND:
 		if (!software_suspend_enabled) {







^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kexec for 2.5.54
  2003-01-06  5:48       ` [PATCH] kexec for 2.5.54 Eric W. Biederman
@ 2003-01-07 22:46         ` Andy Pfiffer
  2003-01-07 23:01           ` Dave Hansen
  2003-01-15 19:43         ` [2.5.58][KEXEC] Success! (using 2.5.54 version + kexec tools 1.8) Andy Pfiffer
  1 sibling, 1 reply; 15+ messages in thread
From: Andy Pfiffer @ 2003-01-07 22:46 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: linux-kernel, suparna, Linus Torvalds, Dave Hansen, Werner Almesberger

On Sun, 2003-01-05 at 21:48, Eric W. Biederman wrote:
> 
> O.k.  I have switched to using the init_mm and premapping the reboot
> code buffer.
<snip>
> The code in machine_kexec now takes no locks and is drop dead simple,
> so it should be safe to call from a panic handler.  

Eric,

The patch applies cleanly to 2.5.54 for me.  Current behavior matches
the version of kexec for 2.5.48 that I carried forward into 2.5.52 and
2.5.54 (and kexec_tools 1.8): 

- the kexec-ed kernel starts rebooting and finds all of my system's
memory, so the generic kexec machinery is working as expected.

- the kexec-ed kernel hangs while calibrating the delay loop.  The list
of kernels I attempted to reboot includes permutations of 2.5.48 +/-
kexec, 2.5.52 +/- kexec(from 2.5.48), 2.5.54, 2.5.54 + kexec(from
2.5.48), and 2.5.54 + kexec (recent patch from you).

- Whatever it is that SuSE supplies in 8.0 (2.4.x +) panics near/during
frame buffer initialization when rebooted via kexec for 2.5.54:
	.
	.
	.
	Initializing CPU#0
	Detected 799.665 MHz Processor
	Console: colour VGA+ 80x25
	invalid operand: 0000
	CPU: 0
	EIP: 0010[<00000007>] Not tainted
	EFLAGS 00010002
	.
	.
	.

Something has definitely changed in the 2.5.5x series, and the symptoms
indicate that at least the clock interrupt is not being received.

kexec for 2.5.48 worked for me (with some limits), so I should be able
to walk the tree forwards and poke at it some more.

For those that have had success w/ recent vintage kernels and kexec (>
2.5.48), could I get a roll-call of your machine's hardware?  Uniproc,
SMP, AGP, chipset, BIOS version, that kind of thing.  lspci -v,
/cat/proc/cpuinfo, and maybe the boot-up messages would all be
appreciated.

Regards,
Andy



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kexec for 2.5.54
  2003-01-07 22:46         ` Andy Pfiffer
@ 2003-01-07 23:01           ` Dave Hansen
  2003-01-07 23:11             ` Martin J. Bligh
  0 siblings, 1 reply; 15+ messages in thread
From: Dave Hansen @ 2003-01-07 23:01 UTC (permalink / raw)
  To: Andy Pfiffer; +Cc: Eric W. Biederman, linux-kernel, suparna, Werner Almesberger

... taking poor Linus off the cc list
Andy Pfiffer wrote:
> For those that have had success w/ recent vintage kernels and kexec (>
> 2.5.48), could I get a roll-call of your machine's hardware?  Uniproc,
> SMP, AGP, chipset, BIOS version, that kind of thing.  lspci -v,
> /cat/proc/cpuinfo, and maybe the boot-up messages would all be
> appreciated.

I've had it work on 2 IBM x86 boxes.
4/8-way SMP
1/4/16 GB RAM
no AGP
Intel Profusion Chipset and some funky IBM one

It failed on the NUMA-Q's I tried it on.  I haven't investigated any 
more thoroughly.

If you want more details, let me know.  But, I've never seen your 
"Calibrating delay loop..." problem.  The last time I saw problems 
there was when I broke the interrupt stack patches.  But, since those 
aren't in mainline, you shouldn't be seeing it.
-- 
Dave Hansen
haveblue@us.ibm.com


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] kexec for 2.5.54
  2003-01-07 23:01           ` Dave Hansen
@ 2003-01-07 23:11             ` Martin J. Bligh
  0 siblings, 0 replies; 15+ messages in thread
From: Martin J. Bligh @ 2003-01-07 23:11 UTC (permalink / raw)
  To: Dave Hansen, Andy Pfiffer
  Cc: Eric W. Biederman, linux-kernel, suparna, Werner Almesberger

> ... taking poor Linus off the cc list
> Andy Pfiffer wrote:
>> For those that have had success w/ recent vintage kernels and kexec (>
>> 2.5.48), could I get a roll-call of your machine's hardware?  Uniproc,
>> SMP, AGP, chipset, BIOS version, that kind of thing.  lspci -v,
>> /cat/proc/cpuinfo, and maybe the boot-up messages would all be
>> appreciated.
> 
> I've had it work on 2 IBM x86 boxes.
> 4/8-way SMP
> 1/4/16 GB RAM
> no AGP
> Intel Profusion Chipset and some funky IBM one
> 
> It failed on the NUMA-Q's I tried it on.  I haven't investigated any more thoroughly.
> 
> If you want more details, let me know.  But, I've never seen your "Calibrating delay loop..." problem.  The last time I saw problems there was when I broke the interrupt stack patches.  But, since those aren't in mainline, you shouldn't be seeing it.


Last time I saw calibrating delay loop problems, it just mean the other CPUs
weren't getting / acting upon IPIs. I might expect that on NUMA-Q, but the
INIT, INIT, STARTUP sequence on normal machines should kick the remote proc
pretty damned hard and reset it. You might want to add more APIC resetting
things (I think there are some in there that only NUMA-Q does right now ..
try turning those on).

M.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* [2.5.58][KEXEC] Success! (using 2.5.54 version + kexec tools 1.8)
  2003-01-06  5:48       ` [PATCH] kexec for 2.5.54 Eric W. Biederman
  2003-01-07 22:46         ` Andy Pfiffer
@ 2003-01-15 19:43         ` Andy Pfiffer
  1 sibling, 0 replies; 15+ messages in thread
From: Andy Pfiffer @ 2003-01-15 19:43 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: linux-kernel, suparna, Linus Torvalds, Dave Hansen, Werner Almesberger

Eric,

Success!

I've been carrying your kexec for 2.5.54 patch (and the hwfixes patch)
forward through subsequent kernels, and have had good luck (kexec works
fine for me) with them in 2.5.58 on my troublesome system (UP, P3-800,
640MB, Adaptec AIC7XXX SCSI).

I haven't had to change a thing with kexec.  For reference, the code I'm
currently using is downloadable from OSDL's patch-manager:

The "kexec-hwfixes" for to 2.5.58:
http://www.osdl.org/cgi-bin/plm?module=patch_info&patch_id=1432

kexec patch for 2.5.58 (from the 2.5.54 version):
http://www.osdl.org/cgi-bin/plm?module=patch_info&patch_id=1424

Regards,
Andy



^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH][WIP] Using kexec for crash dumps in LKCD
       [not found]       ` <m1d6m81ttu.fsf@frodo.biederman.org>
@ 2003-02-06 15:56         ` Suparna Bhattacharya
  2003-02-07 15:39           ` Suparna Bhattacharya
  0 siblings, 1 reply; 15+ messages in thread
From: Suparna Bhattacharya @ 2003-02-06 15:56 UTC (permalink / raw)
  To: Eric W. Biederman, lkcd-devel; +Cc: fastboot, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 4597 bytes --]

This is an extension to LKCD to make use of Eric
Biederman's kexec implementation to delay the actual
writeout of a crashdump to disk to happen after a 
memory preserving reboot of a new kernel.

The real thanks for this goes to Dave Winchell and the 
rest of the Mission Critical Linux folks for first
implementing such an approach in mcore using Werner
Alamesberger's bootimg, and letting us learn and borrow 
ideas from it.

There is a subtle but crucial difference in the design 
of the scheme we use to get spare pages to save the dump 
which potentially enables us to save a complete memory 
snapshot (not just kernel pages) if we can get a good 
compression efficiency (i.e. theoretically limited 
only by the degree of compressability of the memory 
state and working memory space that must be left for the 
dump and kernel bootup code).

This code is still somewhat raw and there's a list of 
todo's and improvements in my mind, and loopholes to fix, 
but I decided it was high time to put this out for a start, 
so anyone who is interested could start taking a look and 
playing with it, and maybe help out if they like.

I plan to fold it into lkcd cvs tomorrow if possible unless 
anyone notices a major regression of existing lkcd 
functionality (i.e.  without CONFIG_CRASHDUMP_MEMDEV and 
CRASH_DUMP_SOFT_BOOT). I have  tried out Alt+Sysrq+d and a 
simple panic from a module as a sanity check.

(I haven't tried it out for a true panic yet - going there
bit by bit :))

In any case, I'll tag the cvs tree before checking in.

Merging and testing has been rather time consuming, so 
would appreciate if anyone planning to check in any changes 
before I do would let me know ahead of time.

I'm considering also checkin in a TODO file at the
top of the 2.5 directory in CVS to keep track of what
needs to be done. Would that be a good idea ?
I'll probably also post the TODOs on the mailing list.

OK, going ahead:

Steps to use:
--------------

A. Patching the kernel:
1) Patch vanilla 2.5.59 kernel with the kexec patches for
   2.5.59.
   I picked the ones from the OSDL site which Andy Pfiffer had
   mentioned in an earlier post
 	kexec for 2.5.59 (based upon the version for 2.5.54)
	http://www.osdl.org/cgi-bin/plm?module=patch_info&patch_id=1442

	hwfixes that makes it work for me (same as for 2.5.58):
	http://www.osdl.org/cgi-bin/plm?module=patch_info&patch_id=1444

2) Apply the latest dump patches from lkcd cvs
	i.e. apply the kernel patches under 2.5/patches
    (expect to see one reject in the 2nd hunk for reboot.c
     when applying notify_die.patch - you could ignore it for
     now) 
	and  copy the dump driver files at the appropriate
	places

3) Apply the attached patch (kexecdump.patch)

B. Kernel Build Configuration settings 
   You'll need CRASH_DUMP to be built into the kernel (not
   as a module) to be able to dump across a kexec boot
   CRASH_DUMP_BLOCKDEV, CRASH_DUMP_COMPRESS_GZIP are needed
   as we use them today
   New options you'll need CRASH_DUMP_MEMDEV (memory dump 
   driver) and CRASH_DUMP_SOFTBOOT (kexec based dumping) 

C. Run-time setup
   A new dump flag for memory-save-and-dump-after-boot 
   DUMP_FLAGS_SOFTBOOT has been introduced (0x2), which
   would need to be turned on in the dump flags.

   After running lkcd config as usual, there is one
   extra step needed to load the kernel to be kexec'ed
   This involves executing "kexec -l" with the regular
   command line options (derived from you /proc/cmdline)
   and one extra boot parameter, obtained as follows:
   crashdump=`cat /proc/sys/kernel/dump/addr`
   (This tells the new kernel where to find a saved
   in-memory crash dump from previous boot)

   e.g.
   kexec -l --command-line="root=806 console=tty0 console=
   ttyS0,38400 crashdump=`cat /proc/sys/kernel/dump/addr`"	
   <kernel bzImage>

D. On panic, the dump is saved in memory and then kexec is
   used to boot up a new kernel (instead of a regular reboot)
   If Alt+Sysrq+d is pressed then the dump is just saved
   in memory without rebooting

   [Note: The first few times you try it, it might be a 
   good idea to drop into "init 1" and unmount most filesystems 
   or remount them as read-only , before you force the panic
   - thanks to Andy Pfiffer for the tip ]

E. After running "lkcd config" triggers a writeout
   to the dump disk of the previously saved dump in memory.

F. From here on, one can run "lkcd save" as usual to generate
   the /var/log/dump/* files for analysis.


Regards
Suparna



-- 
Suparna Bhattacharya (suparna@in.ibm.com)
Linux Technology Center
IBM Software Labs, India


[-- Attachment #2: kexecdump.patch --]
[-- Type: text/plain, Size: 63747 bytes --]

diff -urN -X ../dontdiff linux-2.5.59/arch/i386/Kconfig linux-2.5.59-kexecdump/arch/i386/Kconfig
--- linux-2.5.59/arch/i386/Kconfig	Thu Feb  6 16:42:29 2003
+++ linux-2.5.59-kexecdump/arch/i386/Kconfig	Thu Feb  6 10:30:48 2003
@@ -1577,6 +1577,23 @@
 	help
 	  Say Y to allow saving crash dumps over a network device.
 
+config CRASH_DUMP_MEMDEV
+	bool "Crash dump staged memory driver"
+	depends on CRASH_DUMP
+	help
+	  Say Y to allow intermediate saving crash dumps in spare 
+	  memory pages which would then be written out to disk
+	  later.
+
+config CRASH_DUMP_SOFTBOOT
+	bool "Save crash dump across a soft reboot"
+	depends on CRASH_DUMP_MEMDEV
+	help
+	  Say Y to allow a crash dump to be preserved in memory
+	  pages across a soft reboot and written out to disk
+	  thereafter. For this to work, CRASH_DUMP must be 
+	  configured as part of the kernel (not as a module).
+
 config CRASH_DUMP_COMPRESS_RLE
 	tristate "Crash dump RLE compression"
 	depends on CRASH_DUMP
diff -urN -X ../dontdiff linux-2.5.59/arch/i386/kernel/setup.c linux-2.5.59-kexecdump/arch/i386/kernel/setup.c
--- linux-2.5.59/arch/i386/kernel/setup.c	Fri Jan 17 07:52:08 2003
+++ linux-2.5.59-kexecdump/arch/i386/kernel/setup.c	Thu Feb  6 13:04:11 2003
@@ -503,6 +503,7 @@
 	print_memory_map(who);
 } /* setup_memory_region */
 
+unsigned long crashdump_addr = 0xdeadbeef;
 
 static void __init parse_cmdline_early (char ** cmdline_p)
 {
@@ -565,6 +566,9 @@
 		if (c == ' ' && !memcmp(from, "highmem=", 8))
 			highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT;
 	
+		if (c == ' ' && !memcmp(from, "crashdump=", 10))
+			crashdump_addr = memparse(from+10, &from); 
+			
 		c = *(from++);
 		if (!c)
 			break;
@@ -839,6 +843,8 @@
 		pci_mem_start = low_mem_size;
 }
 
+extern void crashdump_reserve(void);
+
 void __init setup_arch(char **cmdline_p)
 {
 	unsigned long max_low_pfn;
@@ -895,6 +901,10 @@
 	smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
 #endif
 	paging_init();
+
+#ifdef CONFIG_CRASH_DUMP_SOFTBOOT
+	crashdump_reserve(); /* Preserve crash dump state from prev boot */
+#endif
 #ifdef CONFIG_ACPI_BOOT
 	/*
 	 * Parse the ACPI tables for possible boot-time SMP configuration.
diff -urN -X ../dontdiff linux-2.5.59/drivers/dump/Makefile linux-2.5.59-kexecdump/drivers/dump/Makefile
--- linux-2.5.59/drivers/dump/Makefile	Wed Dec 18 13:47:50 2002
+++ linux-2.5.59-kexecdump/drivers/dump/Makefile	Thu Feb  6 07:49:15 2003
@@ -5,6 +5,7 @@
 
 dump-y					:= dump_setup.o dump_fmt.o dump_filters.o dump_scheme.o dump_execute.o
 dump-$(CONFIG_X86)			+= dump_i386.o
+dump-$(CONFIG_CRASH_DUMP_MEMDEV)	+= dump_memdev.o dump_overlay.o
 dump-objs				+= $(dump-y)
 
 obj-$(CONFIG_CRASH_DUMP)		+= dump.o
diff -urN -X ../dontdiff linux-2.5.59/drivers/dump/dump_blockdev.c linux-2.5.59-kexecdump/drivers/dump/dump_blockdev.c
--- linux-2.5.59/drivers/dump/dump_blockdev.c	Mon Feb  3 23:56:59 2003
+++ linux-2.5.59-kexecdump/drivers/dump/dump_blockdev.c	Thu Feb  6 07:49:15 2003
@@ -61,14 +61,19 @@
 	int len) 
 {
 	struct bio *bio = dev->bio;
+	unsigned long bsize = 0;
 
 	if (!bio->bi_vcnt)
 		return 0; /* first time, not mapped */
 
 
-	if ((bio_page(bio) != page) || (len != bio->bi_vcnt << PAGE_SHIFT)) 
+	if ((bio_page(bio) != page) || (len > bio->bi_vcnt << PAGE_SHIFT))
 		return 0; /* buffer not mapped */
 
+	bsize = bdev_hardsect_size(bio->bi_bdev);
+	if ((len & (PAGE_SIZE - 1)) || (len & bsize))
+		return 0; /* alignment checks needed */
+
 	/* quick check to decide if we need to redo bio_add_page */
 	if (bdev_get_queue(bio->bi_bdev)->merge_bvec_fn)
 		return 0; /* device may have other restrictions */
diff -urN -X ../dontdiff linux-2.5.59/drivers/dump/dump_execute.c linux-2.5.59-kexecdump/drivers/dump/dump_execute.c
--- linux-2.5.59/drivers/dump/dump_execute.c	Mon Feb  3 23:58:01 2003
+++ linux-2.5.59-kexecdump/drivers/dump/dump_execute.c	Thu Feb  6 07:49:15 2003
@@ -22,8 +22,6 @@
 #include <linux/dump.h>
 #include "dump_methods.h"
 
-extern int dump_device;
-
 struct notifier_block *dump_notifier_list; /* dump started/ended callback */
 
 /* Dump progress indicator */
@@ -84,7 +82,7 @@
 /* Saves all dump data */
 int dump_execute_savedump(void)
 {
-	int ret = 0;
+	int ret = 0, err = 0;
 
 	if ((ret = dump_begin()))  {
 		return ret;
@@ -93,7 +91,9 @@
 	if (dump_config.level != DUMP_LEVEL_HEADER) { 
 		ret = dump_sequencer();
 	}
-	dump_complete();
+	if ((err = dump_complete())) {
+		printk("Dump complete failed. Error %d\n", err);
+	}
 
 	return ret;
 }
@@ -109,7 +109,8 @@
 	}
 
 	/* tell interested parties that a dump is about to start */
-	notifier_call_chain(&dump_notifier_list, DUMP_BEGIN, &dump_device);
+	notifier_call_chain(&dump_notifier_list, DUMP_BEGIN, 
+		&dump_config.dump_device);
 
 	if (dump_config.level != DUMP_LEVEL_NONE)
 		ret = dump_execute_savedump();
@@ -118,7 +119,8 @@
 		dump_config.dumper->count, DUMP_BUFFER_SIZE);
 	
 	/* tell interested parties that a dump has completed */
-	notifier_call_chain(&dump_notifier_list, DUMP_END, &dump_device);
+	notifier_call_chain(&dump_notifier_list, DUMP_END, 
+		&dump_config.dump_device);
 
 	return ret;
 }
diff -urN -X ../dontdiff linux-2.5.59/drivers/dump/dump_fmt.c linux-2.5.59-kexecdump/drivers/dump/dump_fmt.c
--- linux-2.5.59/drivers/dump/dump_fmt.c	Mon Feb  3 23:56:59 2003
+++ linux-2.5.59-kexecdump/drivers/dump/dump_fmt.c	Thu Feb  6 07:49:15 2003
@@ -88,6 +88,7 @@
 /*
  *  Set up common header fields (mainly the arch indep section) 
  *  Per-cpu state is handled by lcrash_save_context
+ *  Returns the size of the header in bytes.
  */
 static int lcrash_init_dump_header(const char *panic_str)
 {
@@ -154,7 +155,7 @@
 
 	dump_header_asm.dha_dumping_cpu = smp_processor_id();
 	
-	return 0;
+	return sizeof(dump_header) + sizeof(dump_header_asm);
 }
 
 
@@ -163,8 +164,7 @@
 {
 	int retval = 0;
 
-	if ((retval = lcrash_init_dump_header(panic_str))) 
-		return retval;
+	dump_config.dumper->header_len = lcrash_init_dump_header(panic_str);
 
 	/* capture register states for all processors */
 	dump_save_this_cpu(regs);
@@ -212,10 +212,10 @@
 	size = sizeof(dump_header);
 	memcpy(buf + size, (void *)&dump_header_asm, sizeof(dump_header_asm));
 	size += sizeof(dump_header_asm);
-	/* assuming header is dump buffer size always ? */
-	retval = dump_ll_write(buf , DUMP_BUFFER_SIZE);
+	size = PAGE_ALIGN(size);
+	retval = dump_ll_write(buf , size);
 
-	if (retval < DUMP_BUFFER_SIZE) 
+	if (retval < size) 
 		return (retval >= 0) ? ENOSPC : retval;
 
 	return 0;
diff -urN -X ../dontdiff linux-2.5.59/drivers/dump/dump_memdev.c linux-2.5.59-kexecdump/drivers/dump/dump_memdev.c
--- linux-2.5.59/drivers/dump/dump_memdev.c	Thu Jan  1 05:30:00 1970
+++ linux-2.5.59-kexecdump/drivers/dump/dump_memdev.c	Thu Feb  6 13:38:30 2003
@@ -0,0 +1,640 @@
+/*
+ * Implements the dump driver interface for saving a dump in available
+ * memory areas. The saved pages may be written out to persistent storage  
+ * after a soft reboot.
+ *
+ * Started: Oct 2002 -  Suparna Bhattacharya <suparna@in.ibm.com>
+ *
+ * Copyright (C) 2002 International Business Machines Corp. 
+ *
+ * This code is released under version 2 of the GNU GPL.
+ *
+ * The approach of tracking pages containing saved dump using map pages 
+ * allocated as needed has been derived from the Mission Critical Linux 
+ * mcore dump implementation. 
+ *
+ * Credits and a big thanks for letting the lkcd project make use of 
+ * the excellent piece of work and also helping with clarifications 
+ * and tips along the way are due to:
+ * 	Dave Winchell <winchell@mclx.com> (primary author of mcore)
+ * 	Jeff Moyer <moyer@mclx.com>
+ * 	Josh Huber <huber@mclx.com>
+ *
+ * For those familiar with the mcore code, the main differences worth
+ * noting here (besides the dump device abstraction) result from enabling 
+ * "high" memory pages (pages not permanently mapped in the kernel 
+ * address space) to be used for saving dump data (because of which a 
+ * simple virtual address based linked list cannot be used anymore for 
+ * managing free pages), an added level of indirection for faster 
+ * lookups during the post-boot stage, and the idea of pages being 
+ * made available as they get freed up while dump to memory progresses 
+ * rather than one time before starting the dump. The last point enables 
+ * a full memory snapshot to be saved starting with an initial set of 
+ * bootstrap pages given a good compression ratio. (See dump_overlay.c)
+ *
+ */
+
+/*
+ * -----------------MEMORY LAYOUT ------------------
+ * The memory space consists of a set of discontiguous pages, and
+ * discontiguous map pages as well, rooted in a chain of indirect
+ * map pages (also discontiguous). Except for the indirect maps 
+ * (which must be preallocated in advance), the rest of the pages 
+ * could be in high memory.
+ *
+ * root
+ *  |    ---------    --------        --------
+ *  -->  | .  . +|--->|  .  +|------->| . .  |       indirect 
+ *       --|--|---    ---|----        --|-|---	     maps
+ *         |  |          |     	        | |	
+ *    ------  ------   -------     ------ -------
+ *    | .  |  | .  |   | .  . |    | .  | |  . . |   maps 
+ *    --|---  --|---   --|--|--    --|--- ---|-|--
+ *     page    page    page page   page   page page  data
+ *                                                   pages
+ *
+ * Writes to the dump device happen sequentially in append mode.
+ * The main reason for the existence of the indirect map is
+ * to enable a quick way to lookup a specific logical offset in
+ * the saved data post-soft-boot, e.g. to writeout pages
+ * with more critical data first, even though such pages
+ * would have been compressed and copied last, being the lowest
+ * ranked candidates for reuse due to their criticality.
+ * (See dump_overlay.c)
+ */
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/bootmem.h>
+#include <linux/dump.h>
+#include "dump_methods.h"
+
+#define DUMP_MAP_SZ (PAGE_SIZE / sizeof(unsigned long)) /* direct map size */
+#define DUMP_IND_MAP_SZ	DUMP_MAP_SZ - 1  /* indirect map size */
+#define DUMP_NR_BOOTSTRAP	64  /* no of bootstrap pages */
+
+
+/* check if the next entry crosses a page boundary */
+static inline int is_last_map_entry(unsigned long *map)
+{
+	unsigned long addr = (unsigned long)(map + 1);
+
+	return (!(addr & (PAGE_SIZE - 1)));
+}
+
+/* Todo: should have some validation checks */
+/* The last entry in the indirect map points to the next indirect map */
+/* Indirect maps are referred to directly by virtual address */
+static inline unsigned long *next_indirect_map(unsigned long *map)
+{
+	return (unsigned long *)map[DUMP_IND_MAP_SZ];
+}
+
+#ifdef CONFIG_CRASH_DUMP_SOFTBOOT
+/* Called during early bootup - fixme: make this __init */
+void dump_early_reserve_map(struct dump_memdev *dev)
+{
+	unsigned long *map1, *map2;
+	loff_t off = 0, last = dev->last_used_offset >> PAGE_SHIFT;
+	int i, j;
+	
+	printk("Reserve bootmap space holding previous dump of %lld pages\n",
+			last);
+	map1= (unsigned long *)dev->indirect_map_root;
+
+	while (map1 && (off < last)) {
+		reserve_bootmem(virt_to_phys((void *)map1), PAGE_SIZE);
+		for (i=0;  (i < DUMP_MAP_SZ - 1) && map1[i] && (off < last); 
+			i++, off += DUMP_MAP_SZ) {
+			pr_debug("indirect map[%d] = 0x%lx\n", i, map1[i]);
+			if (map1[i] >= max_low_pfn)
+				continue;
+			reserve_bootmem(map1[i] << PAGE_SHIFT, PAGE_SIZE);
+			map2 = pfn_to_kaddr(map1[i]);
+			for (j = 0 ; (j < DUMP_MAP_SZ) && map2[j] && 
+				(off + j < last); j++) {
+				pr_debug("\t map[%d][%d] = 0x%lx\n", i, j, 
+					map2[j]);
+				if (map2[j] < max_low_pfn) {
+					reserve_bootmem(map2[j] << PAGE_SHIFT,
+						PAGE_SIZE);
+				}
+			}
+		}
+		map1 = next_indirect_map(map1);
+	}
+	dev->nr_free = 0; /* these pages don't belong to this boot */
+}
+#endif
+
+/* mark dump pages so that they aren't used by this kernel */
+void dump_mark_map(struct dump_memdev *dev)
+{
+	unsigned long *map1, *map2;
+	loff_t off = 0, last = dev->last_used_offset >> PAGE_SHIFT;
+	struct page *page;
+	int i, j;
+	
+	printk("Dump: marking pages in use by previous dump\n");
+	map1= (unsigned long *)dev->indirect_map_root;
+
+	while (map1 && (off < last)) {
+		page = virt_to_page(map1);	
+		SetPageInuse(page);
+		for (i=0;  (i < DUMP_MAP_SZ - 1) && map1[i] && (off < last); 
+			i++, off += DUMP_MAP_SZ) {
+			pr_debug("indirect map[%d] = 0x%lx\n", i, map1[i]);
+			page = pfn_to_page(map1[i]);
+			SetPageInuse(page);
+			map2 = kmap_atomic(page, KM_DUMP);
+			for (j = 0 ; (j < DUMP_MAP_SZ) && map2[j] && 
+				(off + j < last); j++) {
+				pr_debug("\t map[%d][%d] = 0x%lx\n", i, j, 
+					map2[j]);
+				page = pfn_to_page(map2[j]);
+				SetPageInuse(page);
+			}
+		}
+		map1 = next_indirect_map(map1);
+	}
+}
+	
+
+/* 
+ * Given a logical offset into the mem device lookup the 
+ * corresponding page 
+ * 	loc is specified in units of pages 
+ * Note: affects curr_map (even in the case where lookup fails)
+ */
+struct page *dump_mem_lookup(struct dump_memdev *dump_mdev, unsigned long loc)
+{
+	unsigned long *map;
+	unsigned long i, index = loc / DUMP_MAP_SZ;
+	struct page *page = NULL;
+	unsigned long curr_pfn, curr_map, *curr_map_ptr = NULL;
+
+	map = (unsigned long *)dump_mdev->indirect_map_root;
+	if (!map)
+		return NULL;
+
+	if (loc > dump_mdev->last_offset >> PAGE_SHIFT)
+		return NULL;
+
+	/* 
+	 * first locate the right indirect map 
+	 * in the chain of indirect maps 
+	 */
+	for (i = 0; i + DUMP_IND_MAP_SZ < index ; i += DUMP_IND_MAP_SZ) {
+		if (!(map = next_indirect_map(map)))
+			return NULL;
+	}
+	/* then the right direct map */
+	/* map entries are referred to by page index */
+	if ((curr_map = map[index - i])) {
+		page = pfn_to_page(curr_map);
+		/* update the current traversal index */
+		/* dump_mdev->curr_map = &map[index - i];*/
+		curr_map_ptr = &map[index - i];
+	}
+
+	if (page)
+		map = kmap_atomic(page, KM_DUMP);
+	else 
+		return NULL;
+
+	/* and finally the right entry therein */
+	/* data pages are referred to by page index */
+	i = index * DUMP_MAP_SZ;
+	if ((curr_pfn = map[loc - i])) {
+		page = pfn_to_page(curr_pfn);
+		dump_mdev->curr_map = curr_map_ptr;
+		dump_mdev->curr_map_offset = loc - i;
+		dump_mdev->ddev.curr_offset = loc << PAGE_SHIFT;
+	} else {
+		page = NULL;
+	}
+	kunmap_atomic(map, KM_DUMP);
+
+	return page;
+}
+			
+/* 
+ * Retrieves a pointer to the next page in the dump device 
+ * Used during the lookup pass post-soft-reboot 
+ */
+struct page *dump_mem_next_page(struct dump_memdev *dev)
+{
+	unsigned long i; 
+	unsigned long *map;	
+	struct page *page = NULL;
+
+	if (dev->ddev.curr_offset + PAGE_SIZE >= dev->last_offset) {
+		return NULL;
+	}
+
+	if ((i = (unsigned long)(++dev->curr_map_offset)) >= DUMP_MAP_SZ) {
+		/* move to next map */	
+		if (is_last_map_entry(++dev->curr_map)) {
+			/* move to the next indirect map page */
+			printk("dump_mem_next_page: go to next indirect map\n");
+			dev->curr_map = (unsigned long *)*dev->curr_map;
+			if (!dev->curr_map)
+				return NULL;
+		}
+		i = dev->curr_map_offset = 0;
+		pr_debug("dump_mem_next_page: next map 0x%lx, entry 0x%lx\n",
+				dev->curr_map, *dev->curr_map);
+
+	};
+	
+	if (*dev->curr_map) {
+		map = kmap_atomic(pfn_to_page(*dev->curr_map), KM_DUMP);
+		if (map[i])
+			page = pfn_to_page(map[i]);
+		kunmap_atomic(map, KM_DUMP);
+		dev->ddev.curr_offset += PAGE_SIZE;
+	};
+
+	return page;
+}
+
+
+/* Set up the initial maps and bootstrap space  */
+/* Must be called only after any previous dump is written out */
+int dump_mem_open(struct dump_dev *dev, unsigned long devid)
+{
+	struct dump_memdev *dump_mdev = DUMP_MDEV(dev);
+	unsigned long nr_maps, *map, *prev_map = &dump_mdev->indirect_map_root;
+	void *addr;
+	struct page *page;
+	unsigned long i = 0;
+
+	/* Todo: sanity check for unwritten previous dump */
+
+	/* allocate pages for indirect map (non highmem area) */
+	nr_maps = num_physpages / DUMP_MAP_SZ; /* maps to cover entire mem */
+	for (i = 0; i < nr_maps; i += DUMP_IND_MAP_SZ) {
+		if (!(map = (unsigned long *)dump_alloc_mem(PAGE_SIZE))) {
+			printk("Unable to alloc indirect map %ld\n", 
+				i / DUMP_IND_MAP_SZ);
+			return -ENOMEM;
+		}
+		clear_page(map);
+		*prev_map = (unsigned long)map;
+		prev_map = &map[DUMP_IND_MAP_SZ];
+	};
+		
+	dump_mdev->curr_map = (unsigned long *)dump_mdev->indirect_map_root;
+	dump_mdev->curr_map_offset = 0;	
+
+	/* 
+	 * allocate a few bootstrap pages: at least 1 map and 1 data page
+	 * plus enough to save the dump header
+	 */
+	i = 0;
+	do {
+		if (!(addr = dump_alloc_mem(PAGE_SIZE))) {
+			printk("Unable to alloc bootstrap page %ld\n", i);
+			return -ENOMEM;
+		}
+		page = virt_to_page(addr);
+		ClearPageInuse(page); /* bypass kernel page check */
+		if (dump_check_and_free_page(dump_mdev, page))
+			i++;
+		SetPageInuse(page); 
+	} while (i < DUMP_NR_BOOTSTRAP);
+
+
+	printk("dump memdev init: %ld maps, %ld bootstrap pgs, %ld free pgs\n",
+		nr_maps, i, dump_mdev->last_offset >> PAGE_SHIFT);
+	
+	dump_mdev->last_bs_offset = dump_mdev->last_offset;
+
+	return 0;
+}
+
+/* Releases all pre-alloc'd pages */
+int dump_mem_release(struct dump_dev *dev)
+{
+	struct dump_memdev *dump_mdev = DUMP_MDEV(dev);
+	struct page *page, *map_page;
+	unsigned long *map, *prev_map;
+	void *addr;
+	int i;
+
+	if (!dump_mdev->nr_free)
+		return 0;
+
+	pr_debug("dump_mem_release\n");
+	page = dump_mem_lookup(dump_mdev, 0);
+	for (i = 0; page && (i < DUMP_NR_BOOTSTRAP - 1); i++) {
+		if (PageHighMem(page))
+			break;
+		addr = page_address(page);
+		if (!addr) {
+			printk("page_address(%p) = NULL\n", page);
+			break;
+		}
+		pr_debug("Freeing page at 0x%lx\n", addr); 
+		dump_free_mem(addr);
+		if (dump_mdev->curr_map_offset >= DUMP_MAP_SZ - 1) {
+			map_page = pfn_to_page(*dump_mdev->curr_map);
+			if (PageHighMem(map_page))
+				break;
+			page = dump_mem_next_page(dump_mdev);
+			addr = page_address(map_page);
+			if (!addr) {
+				printk("page_address(%p) = NULL\n", 
+					map_page);
+				break;
+			}
+			pr_debug("Freeing map page at 0x%lx\n", addr);
+			dump_free_mem(addr);
+			i++;
+		} else {
+			page = dump_mem_next_page(dump_mdev);
+		}
+	}
+
+	/* now for the last used bootstrap page used as a map page */
+	if ((i < DUMP_NR_BOOTSTRAP) && (*dump_mdev->curr_map)) {
+		map_page = pfn_to_page(*dump_mdev->curr_map);
+		if ((map_page) && !PageHighMem(map_page)) {
+			addr = page_address(map_page);
+			if (!addr) {
+				printk("page_address(%p) = NULL\n", map_page);
+			} else {
+				pr_debug("Freeing map page at 0x%lx\n", addr);
+				dump_free_mem(addr);
+				i++;
+			}
+		}
+	}
+
+	printk("Freed %d bootstrap pages\n", i);
+
+	/* free the indirect maps */
+	map = (unsigned long *)dump_mdev->indirect_map_root;
+
+	i = 0;
+	while (map) {
+		prev_map = map;
+		map = next_indirect_map(map);
+		dump_free_mem(prev_map);
+		i++;
+	}
+
+	printk("Freed %d indirect map(s)\n", i);
+
+	/* Reset the indirect map */
+	dump_mdev->indirect_map_root = 0;
+	dump_mdev->curr_map = 0;
+
+	/* Reset the free list */
+	dump_mdev->nr_free = 0;
+
+	dump_mdev->last_offset = dump_mdev->ddev.curr_offset = 0;
+	dump_mdev->last_used_offset = 0;
+	dump_mdev->curr_map = NULL;
+	dump_mdev->curr_map_offset = 0;
+	return 0;
+}
+
+/*
+ * Long term:
+ * It is critical for this to be very strict. Cannot afford
+ * to have anything running and accessing memory while we overwrite 
+ * memory (potential risk of data corruption).
+ * If in doubt (e.g if a cpu is hung and not responding) just give
+ * up and refuse to proceed with this scheme.
+ *
+ * Note: I/O will only happen after soft-boot/switchover, so we can 
+ * safely disable interrupts and force stop other CPUs if this is
+ * going to be a disruptive dump, no matter what they
+ * are in the middle of.
+ */
+/* 
+ * ATM Most of this is already taken care of in the nmi handler 
+ * We may halt the cpus rightaway if we know this is going to be disruptive 
+ * For now, since we've limited ourselves to overwriting free pages we
+ * aren't doing much here. Eventually, we'd have to wait to make sure other
+ * cpus aren't using memory we could be overwriting
+ */
+int dump_mem_silence(struct dump_dev *dev)
+{
+	struct dump_memdev *dump_mdev = DUMP_MDEV(dev);
+
+	if (dump_mdev->last_offset > dump_mdev->last_bs_offset) {
+		/* prefer to run lkcd config & start with a clean slate */
+		return -EEXIST;
+	}
+	return 0;
+}
+
+extern int dump_overlay_resume(void);
+
+/* Trigger the next stage of dumping */
+int dump_mem_resume(struct dump_dev *dev)
+{
+	dump_overlay_resume(); 
+	return 0;
+}
+
+/* 
+ * Allocate mem dev pages as required and copy buffer contents into it.
+ * Fails if the no free pages are available
+ * Keeping it simple and limited for starters (can modify this over time)
+ *  Does not handle holes or a sparse layout
+ *  Data must be in multiples of PAGE_SIZE
+ */
+int dump_mem_write(struct dump_dev *dev, void *buf, unsigned long len)
+{
+	struct dump_memdev *dump_mdev = DUMP_MDEV(dev);
+	struct page *page;
+	unsigned long n = 0;
+	void *addr;
+	unsigned long *saved_curr_map, saved_map_offset;
+	int ret = 0;
+
+	pr_debug("dump_mem_write: offset 0x%llx, size %ld\n", 
+		dev->curr_offset, len);
+
+	if (dev->curr_offset + len > dump_mdev->last_offset)  {
+		printk("Out of space to write\n");
+		return -ENOSPC;
+	}
+	
+	if ((len & (PAGE_SIZE - 1)) || (dev->curr_offset & (PAGE_SIZE - 1)))
+		return -EINVAL; /* not aligned in units of page size */
+
+	saved_curr_map = dump_mdev->curr_map;
+	saved_map_offset = dump_mdev->curr_map_offset;
+	page = dump_mem_lookup(dump_mdev, dev->curr_offset >> PAGE_SHIFT);
+
+	for (n = len; (n > 0) && page; n -= PAGE_SIZE, buf += PAGE_SIZE ) {
+		addr = kmap_atomic(page, KM_DUMP);
+		/* memset(addr, 'x', PAGE_SIZE); */
+		memcpy(addr, buf, PAGE_SIZE);
+		kunmap_atomic(addr, KM_DUMP);
+		/* dev->curr_offset += PAGE_SIZE; */
+		page = dump_mem_next_page(dump_mdev);
+	}
+
+	dump_mdev->curr_map = saved_curr_map;
+	dump_mdev->curr_map_offset = saved_map_offset;
+
+	if (dump_mdev->last_used_offset < dev->curr_offset)
+		dump_mdev->last_used_offset = dev->curr_offset;
+
+	return (len - n) ? (len - n) : ret ;
+}
+
+/* dummy - always ready */
+int dump_mem_ready(struct dump_dev *dev, void *buf)
+{
+	return 0;
+}
+
+/* 
+ * Should check for availability of space to write upto the offset 
+ * affects only the curr_offset; last_offset untouched 
+ * Keep it simple: Only allow multiples of PAGE_SIZE for now 
+ */
+int dump_mem_seek(struct dump_dev *dev, loff_t offset)
+{
+	struct dump_memdev *dump_mdev = DUMP_MDEV(dev);
+
+	if (offset & (PAGE_SIZE - 1))
+		return -EINVAL; /* allow page size units only for now */
+	
+	/* Are we exceeding available space ? */
+	if (offset > dump_mdev->last_offset) {
+		printk("dump_mem_seek failed for offset 0x%llx\n",
+			offset);
+		return -ENOSPC;	
+	}
+
+	dump_mdev->ddev.curr_offset = offset;
+	return 0;
+}
+
+/* Copied from dump_filters.c */
+static inline int kernel_page(struct page *p)
+{
+	if (PageReserved(p) || (!PageLRU(p) && PageInuse(p)))
+		return 1;
+	else
+		return 0;
+}
+
+static inline int user_page(struct page *p)
+{
+	if (PageInuse(p)) {
+		if (!PageReserved(p) && PageLRU(p))
+			return 1;
+	}
+	return 0;
+}
+
+
+extern int dump_low_page(struct page *);
+
+int dump_reused_by_boot(struct page *page)
+{
+	/* Todo
+	 * Checks:
+	 * if PageReserved 
+	 * if < __end + bootmem_bootmap_pages for this boot + allowance 
+	 * if overwritten by initrd (how to check ?)
+	 * Also, add more checks in early boot code
+	 * e.g. bootmem bootmap alloc verify not overwriting dump, and if
+	 * so then realloc or move the dump pages out accordingly.
+	 */
+
+	/* Temporary proof of concept hack, avoid overwriting kern pages */
+
+	return (kernel_page(page) || dump_low_page(page) || user_page(page));
+}
+
+
+/* Uses the free page passed in to expand available space */
+int dump_mem_add_space(struct dump_memdev *dev, struct page *page)
+{
+	struct page *map_page;
+	unsigned long *map;	
+	unsigned long i; 
+
+	if (!dev->curr_map)
+		return -ENOMEM; /* must've exhausted indirect map */
+
+	if (!*dev->curr_map || dev->curr_map_offset >= DUMP_MAP_SZ) {
+		/* add map space */
+		*dev->curr_map = page_to_pfn(page);
+		dev->curr_map_offset = 0;
+		return 0;
+	}
+
+	/* add data space */
+	i = dev->curr_map_offset;
+	map_page = pfn_to_page(*dev->curr_map);
+	map = (unsigned long *)kmap_atomic(map_page, KM_DUMP);
+	map[i] = page_to_pfn(page);
+	kunmap_atomic(map, KM_DUMP);
+	dev->curr_map_offset = ++i;
+	dev->last_offset += PAGE_SIZE;
+	if (i >= DUMP_MAP_SZ) {
+		/* move to next map */
+		if (is_last_map_entry(++dev->curr_map)) {
+			/* move to the next indirect map page */
+			pr_debug("dump_mem_add_space: using next"
+			"indirect map\n");
+			dev->curr_map = (unsigned long *)*dev->curr_map;
+		}
+	}		
+	return 0;
+}
+
+
+/* Caution: making a dest page invalidates existing contents of the page */
+int dump_check_and_free_page(struct dump_memdev *dev, struct page *page)
+{
+	int err = 0;
+
+	/* 
+	 * the page can be used as a destination only if we are sure
+	 * it won't get overwritten by the soft-boot, and is not
+	 * critical for us right now.
+	 */
+	if (dump_reused_by_boot(page))
+		return 0;
+
+	if ((err = dump_mem_add_space(dev, page))) {
+		printk("Warning: Unable to extend memdev space. Err %d\n",
+		err);
+		return 0;
+	}
+
+	dev->nr_free++;
+	return 1;
+}
+
+
+struct dump_dev_ops dump_memdev_ops = {
+	.open 		= dump_mem_open,
+	.release	= dump_mem_release,
+	.silence	= dump_mem_silence,
+	.resume 	= dump_mem_resume,
+	.seek		= dump_mem_seek,
+	.write		= dump_mem_write,
+	.read		= NULL, /* not implemented at the moment */
+	.ready		= dump_mem_ready
+};
+
+static struct dump_memdev default_dump_memdev = {
+	.ddev = {.type_name = "memdev", .ops = &dump_memdev_ops,
+        	 .device_id = 0x14}
+	/* assume the rest of the fields are zeroed by default */
+};	
+	
+/* may be overwritten if a previous dump exists */
+struct dump_memdev *dump_memdev = &default_dump_memdev;
+
diff -urN -X ../dontdiff linux-2.5.59/drivers/dump/dump_methods.h linux-2.5.59-kexecdump/drivers/dump/dump_methods.h
--- linux-2.5.59/drivers/dump/dump_methods.h	Mon Feb  3 23:56:59 2003
+++ linux-2.5.59-kexecdump/drivers/dump/dump_methods.h	Thu Feb  6 14:05:52 2003
@@ -139,6 +139,7 @@
 	void *curr_buf; /* current position in the dump buffer */
 	void *dump_buf; /* starting addr of dump buffer */
 	int header_dirty; /* whether the header needs to be written out */
+	int header_len; 
 	struct list_head dumper_list; /* links to other dumpers */
 };	
 
@@ -147,11 +148,32 @@
 	ulong level;
 	ulong flags;
 	struct dumper *dumper;
+	unsigned long dump_device;
+	unsigned long dump_addr; /* relevant only for in-memory dumps */
 	struct list_head dump_dev_list;
 };	
 
 extern struct dump_config dump_config;
 
+/* Used to save the dump config across a reboot for 2-stage dumps: 
+ * 
+ * Note: The scheme, format, compression and device type should be 
+ * registered at bootup, for this config to be sharable across soft-boot. 
+ * The function addresses could have changed and become invalid, and
+ * need to be set up again.
+ */
+struct dump_config_block {
+	u64 magic; /* for a quick sanity check after reboot */
+	struct dump_memdev memdev; /* handle to dump stored in memory */
+	struct dump_config config;
+	struct dumper dumper;
+	struct dump_scheme scheme;
+	struct dump_fmt fmt;
+	struct __dump_compress compress;
+	struct dump_data_filter filter_table[MAX_PASSES];
+	struct dump_anydev dev[MAX_DEVS]; /* target dump device */
+};
+
 
 /* Wrappers that invoke the methods for the current (active) dumper */
 
@@ -275,6 +297,8 @@
 
 /* Some pre-defined dumpers */
 extern struct dumper dumper_singlestage;
+extern struct dumper dumper_stage1;
+extern struct dumper dumper_stage2;
 
 /* These are temporary */
 #define DUMP_MASK_HEADER	DUMP_LEVEL_HEADER
@@ -287,6 +311,7 @@
 
 int dump_generic_execute(const char *panic_str, const struct pt_regs *regs);
 extern int dump_ll_write(void *buf, unsigned long len); 
+int dump_check_and_free_page(struct dump_memdev *dev, struct page *page);
 
 static inline void dumper_reset(void)
 {
@@ -308,7 +333,16 @@
 
 static inline void dump_free_mem(void *buf)
 {
+	struct page *page;
+
+	/* ignore reserved pages (e.g. post soft boot stage) */
+	if (buf && (page = virt_to_page(buf))) {
+		if (PageReserved(page))
+			return;
+	}
+
 	kfree(buf);
 }
 
+
 #endif /*  _LINUX_DUMP_METHODS_H */
diff -urN -X ../dontdiff linux-2.5.59/drivers/dump/dump_overlay.c linux-2.5.59-kexecdump/drivers/dump/dump_overlay.c
--- linux-2.5.59/drivers/dump/dump_overlay.c	Thu Jan  1 05:30:00 1970
+++ linux-2.5.59-kexecdump/drivers/dump/dump_overlay.c	Thu Feb  6 14:06:43 2003
@@ -0,0 +1,848 @@
+/*
+ * Two-stage soft-boot based dump scheme methods (memory overlay
+ * with post soft-boot writeout)
+ *
+ * Started: Oct 2002 -  Suparna Bhattacharya <suparna@in.ibm.com>
+ *
+ * This approach of saving the dump in memory and writing it 
+ * out after a softboot without clearing memory is derived from the 
+ * Mission Critical Linux dump implementation. Credits and a big
+ * thanks for letting the lkcd project make use of the excellent 
+ * piece of work and also for helping with clarifications and 
+ * tips along the way are due to:
+ * 	Dave Winchell <winchell@mclx.com> (primary author of mcore)
+ * 	and also to
+ * 	Jeff Moyer <moyer@mclx.com>
+ * 	Josh Huber <huber@mclx.com>
+ * 
+ * For those familiar with the mcore implementation, the key 
+ * differences/extensions here are in allowing entire memory to be 
+ * saved (in compressed form) through a careful ordering scheme 
+ * on both the way down as well on the way up after boot, the latter
+ * for supporting the LKCD notion of passes in which most critical 
+ * data is the first to be saved to the dump device. Also the post 
+ * boot writeout happens from within the kernel rather than driven 
+ * from userspace.
+ *
+ * The sequence is orchestrated through the abstraction of "dumpers",
+ * one for the first stage which then sets up the dumper for the next 
+ * stage, providing for a smooth and flexible reuse of the singlestage 
+ * dump scheme methods and a handle to pass dump device configuration 
+ * information across the soft boot. 
+ *
+ * Copyright (C) 2002 International Business Machines Corp. 
+ *
+ * This code is released under version 2 of the GNU GPL.
+ */
+
+/*
+ * Disruptive dumping using the second kernel soft-boot option
+ * for issuing dump i/o operates in 2 stages:
+ * 
+ * (1) - Saves the (compressed & formatted) dump in memory using a 
+ *       carefully ordered overlay scheme designed to capture the 
+ *       entire physical memory or selective portions depending on 
+ *       dump config settings, 
+ *     - Registers the stage 2 dumper and 
+ *     - Issues a soft reboot w/o clearing memory. 
+ *
+ *     The overlay scheme starts with a small bootstrap free area
+ *     and follows a reverse ordering of passes wherein it 
+ *     compresses and saves data starting with the least critical 
+ *     areas first, thus freeing up the corresponding pages to 
+ *     serve as destination for subsequent data to be saved, and
+ *     so on. With a good compression ratio, this makes it feasible
+ *     to capture an entire physical memory dump without significantly
+ *     reducing memory available during regular operation.
+ *
+ * (2) Post soft-reboot, runs through the saved memory dump and
+ *     writes it out to disk, this time around, taking care to
+ *     save the more critical data first (i.e. pages which figure 
+ *     in early passes for a regular dump). Finally issues a 
+ *     clean reboot.
+ *     
+ *     Since the data was saved in memory after selection/filtering
+ *     and formatted as per the chosen output dump format, at this 
+ *     stage the filter and format actions are just dummy (or
+ *     passthrough) actions, except for influence on ordering of
+ *     passes.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/highmem.h>
+#include <linux/bootmem.h>
+#include <linux/dump.h>
+#include "dump_methods.h"
+
+extern struct list_head dumper_list_head;
+extern struct dump_memdev *dump_memdev;
+extern struct dumper dumper_stage2;
+struct dump_config_block *dump_saved_config = NULL;
+extern struct dump_blockdev *dump_blockdev;
+static struct dump_memdev *saved_dump_memdev = NULL;
+static struct dumper *saved_dumper = NULL;
+
+/* For testing 
+extern void dump_display_map(struct dump_memdev *);
+*/
+
+struct dumper *dumper_by_name(char *name)
+{
+#ifdef LATER
+	struct dumper *dumper;
+	list_for_each_entry(dumper, &dumper_list_head, dumper_list)
+		if (!strncmp(dumper->name, name, 32))
+			return dumper;
+
+	/* not found */
+	return NULL; 
+#endif
+	/* Temporary proof of concept */
+	if (!strncmp(dumper_stage2.name, name, 32))
+		return &dumper_stage2;
+	else
+		return NULL;
+}
+
+#ifdef CONFIG_CRASH_DUMP_SOFTBOOT
+extern void dump_early_reserve_map(struct dump_memdev *);
+
+void crashdump_reserve(void)
+{
+	extern unsigned long crashdump_addr;
+
+	if (crashdump_addr == 0xdeadbeef) 
+		return;
+
+	/* reserve dump config and saved dump pages */
+	dump_saved_config = (struct dump_config_block *)crashdump_addr;
+	/* magic verification */
+	if (dump_saved_config->magic != DUMP_MAGIC_LIVE) {
+		printk("Invalid dump magic. Ignoring dump\n");
+		dump_saved_config = NULL;
+		return;
+	}
+			
+	printk("Dump may be available from previous boot\n");
+
+	reserve_bootmem(virt_to_phys((void *)crashdump_addr), 
+		PAGE_ALIGN(sizeof(struct dump_config_block)));
+	dump_early_reserve_map(&dump_saved_config->memdev);
+
+}
+#endif
+
+/* 
+ * Loads the dump configuration from a memory block saved across soft-boot
+ * The ops vectors need fixing up as the corresp. routines may have 
+ * relocated in the new soft-booted kernel.
+ */
+int dump_load_config(struct dump_config_block *config)
+{
+	struct dumper *dumper;
+	struct dump_data_filter *filter_table, *filter;
+	struct dump_dev *dev;
+	int i;
+
+	if (config->magic != DUMP_MAGIC_LIVE)
+		return -ENOENT; /* not a valid config */
+
+	/* initialize generic config data */
+	memcpy(&dump_config, &config->config, sizeof(dump_config));
+
+	/* initialize dumper state */
+	if (!(dumper = dumper_by_name(config->dumper.name)))  {
+		printk("dumper name mismatch\n");
+		return -ENOENT; /* dumper mismatch */
+	}
+	
+	/* verify and fixup schema */
+	if (strncmp(dumper->scheme->name, config->scheme.name, 32)) {
+		printk("dumper scheme mismatch\n");
+		return -ENOENT; /* mismatch */
+	}
+	config->scheme.ops = dumper->scheme->ops;
+	config->dumper.scheme = &config->scheme;
+	
+	/* verify and fixup filter operations */
+	filter_table = dumper->filter;
+	for (i = 0, filter = config->filter_table; 
+		((i < MAX_PASSES) && filter_table[i].selector); 
+		i++, filter++) {
+		if (strncmp(filter_table[i].name, filter->name, 32)) {
+			printk("dump filter mismatch\n");
+			return -ENOENT; /* filter name mismatch */
+		}
+		filter->selector = filter_table[i].selector;
+	}
+	config->dumper.filter = config->filter_table;
+
+	/* fixup format */
+	if (strncmp(dumper->fmt->name, config->fmt.name, 32)) {
+		printk("dump format mismatch\n");
+		return -ENOENT; /* mismatch */
+	}
+	config->fmt.ops = dumper->fmt->ops;
+	config->dumper.fmt = &config->fmt;
+
+	/* fixup target device */
+	dev = (struct dump_dev *)(&config->dev[0]);
+	if (dumper->dev == NULL) {
+		pr_debug("Vanilla dumper - assume default\n");
+		if (dump_dev == NULL)
+			return -ENODEV;
+		dumper->dev = dump_dev;
+	}
+
+	if (strncmp(dumper->dev->type_name, dev->type_name, 32)) { 
+		printk("dump dev type mismatch %s instead of %s\n",
+				dev->type_name, dumper->dev->type_name);
+		return -ENOENT; /* mismatch */
+	}
+	dev->ops = dumper->dev->ops; 
+	config->dumper.dev = dev;
+	
+	/* fixup memory device containing saved dump pages */
+	/* assume statically init'ed dump_memdev */
+	config->memdev.ddev.ops = dump_memdev->ddev.ops; 
+	/* switch to memdev from prev boot */
+	saved_dump_memdev = dump_memdev; /* remember current */
+	dump_memdev = &config->memdev;
+
+	/* Make this the current primary dumper */
+	dump_config.dumper = &config->dumper;
+
+	return 0;
+}
+
+/* Saves the dump configuration in a memory block for use across a soft-boot */
+int dump_save_config(struct dump_config_block *config)
+{
+	printk("saving dump config settings\n");
+
+	/* dump config settings */
+	memcpy(&config->config, &dump_config, sizeof(dump_config));
+
+	/* dumper state */
+	memcpy(&config->dumper, dump_config.dumper, sizeof(struct dumper));
+	memcpy(&config->scheme, dump_config.dumper->scheme, 
+		sizeof(struct dump_scheme));
+	memcpy(&config->fmt, dump_config.dumper->fmt, sizeof(struct dump_fmt));
+	memcpy(&config->dev[0], dump_config.dumper->dev, 
+		sizeof(struct dump_anydev));
+	memcpy(&config->filter_table, dump_config.dumper->filter, 
+		sizeof(struct dump_data_filter)*MAX_PASSES);
+
+	/* handle to saved mem pages */
+	memcpy(&config->memdev, dump_memdev, sizeof(struct dump_memdev));
+
+	config->magic = DUMP_MAGIC_LIVE;
+	
+	return 0;
+}
+
+int dump_init_stage2(struct dump_config_block *saved_config)
+{
+	int err = 0;
+
+	pr_debug("dump_init_stage2\n");
+	/* Check if dump from previous boot exists */
+	if (saved_config) {
+		printk("loading dumper from previous boot \n");
+		/* load and configure dumper from previous boot */
+		if ((err = dump_load_config(saved_config)))
+			return err;
+
+		if (!dump_oncpu) {
+			if ((err = dump_configure(dump_config.dump_device))) {
+				printk("Stage 2 dump configure failed\n");
+				return err;
+			}
+		}
+
+		dumper_reset();
+		dump_dev = dump_config.dumper->dev;
+		/* write out the dump */
+		err = dump_generic_execute(NULL, NULL);
+		
+		dump_saved_config = NULL;
+
+		if (!dump_oncpu) {
+			dump_unconfigure(); 
+		}
+		
+		return err;
+
+	} else {
+		/* no dump to write out */
+		printk("no dumper from previous boot \n");
+		return 0;
+	}
+}
+
+extern void dump_mem_markpages(struct dump_memdev *);
+
+int dump_switchover_stage(void)
+{
+	int ret = 0;
+
+	/* trigger stage 2 rightaway - in real life would be after soft-boot */
+	/* dump_saved_config would be a boot param */
+	saved_dump_memdev = dump_memdev;
+	saved_dumper = dump_config.dumper;
+	ret = dump_init_stage2(dump_saved_config);
+	dump_memdev = saved_dump_memdev;
+	dump_config.dumper = saved_dumper;
+	return ret;
+}
+
+int dump_activate_softboot(void) 
+{
+	int err = 0;
+
+	/* temporary - switchover to writeout previously saved dump */
+	err = dump_switchover_stage(); /* non-disruptive case */
+	if (dump_oncpu) 
+		dump_config.dumper = &dumper_stage1; /* set things back */
+
+	return err;
+
+	dump_silence_level = DUMP_HALT_CPUS;
+	/* wait till we become the only cpu */
+	/* maybe by checking for online cpus ? */
+
+	/* now call into kexec */
+
+	/* TBD/Fixme: 
+	 * should we call reboot notifiers ? inappropriate for panic ?  
+	 * what about device_shutdown() ? 
+	 * is explicit bus master disabling needed or can we do that
+	 * through driverfs ? 
+	 */
+	return 0;
+}
+
+/* --- DUMP SCHEME ROUTINES  --- */
+
+static inline int dump_buf_pending(struct dumper *dumper)
+{
+	return (dumper->curr_buf - dumper->dump_buf);
+}
+
+/* Invoked during stage 1 of soft-reboot based dumping */
+int dump_overlay_sequencer(void)
+{
+	struct dump_data_filter *filter = dump_config.dumper->filter;
+	struct dump_data_filter *filter2 = dumper_stage2.filter;
+	int pass = 0, err = 0, save = 0;
+	int (*action)(unsigned long, unsigned long);
+
+	/* Make sure gzip compression is being used */
+	if (dump_config.dumper->compress->compress_type != DUMP_COMPRESS_GZIP) {
+		printk(" Please set GZIP compression \n");
+		return -EINVAL;
+	}
+
+	/* start filling in dump data right after the header */
+	dump_config.dumper->curr_offset = 
+		PAGE_ALIGN(dump_config.dumper->header_len);
+
+	/* Locate the last pass */
+	for (;filter->selector; filter++, pass++);
+	
+	/* 
+	 * Start from the end backwards: overlay involves a reverse 
+	 * ordering of passes, since less critical pages are more
+	 * likely to be reusable as scratch space once we are through
+	 * with them. 
+	 */
+	for (--pass, --filter; pass >= 0; pass--, filter--)
+	{
+		/* Assumes passes are exclusive (even across dumpers) */
+		/* Requires care when coding the selection functions */
+		if ((save = filter->level_mask & dump_config.level))
+			action = dump_save_data;
+		else
+			action = dump_skip_data;
+
+		/* Remember the offset where this pass started */
+		/* The second stage dumper would use this */
+		if (dump_buf_pending(dump_config.dumper) & (PAGE_SIZE - 1)) {
+			pr_debug("Starting pass %d with pending data\n", pass);
+			pr_debug("filling dummy data to page-align it\n");
+			dump_config.dumper->curr_buf = (void *)PAGE_ALIGN(
+				(unsigned long)dump_config.dumper->curr_buf);
+		}
+		
+		filter2[pass].start = dump_config.dumper->curr_offset
+			+ dump_buf_pending(dump_config.dumper);
+
+		err = dump_iterator(pass, action, filter);
+
+		filter2[pass].end = dump_config.dumper->curr_offset
+			+ dump_buf_pending(dump_config.dumper);
+
+		if (err < 0) {
+			printk("dump_overlay_seq: failure %d in pass %d\n", 
+				err, pass);
+			break;
+		}	
+		printk("\n %d overlay pages %s of %d each in pass %d\n", 
+		err, save ? "saved" : "skipped", DUMP_PAGE_SIZE, pass);
+	}
+
+	return err;
+}
+
+/* from dump_memdev.c */
+extern struct page *dump_mem_lookup(struct dump_memdev *dev, unsigned long loc);
+extern struct page *dump_mem_next_page(struct dump_memdev *dev);
+
+static inline struct page *dump_get_saved_page(loff_t loc)
+{
+	return (dump_mem_lookup(dump_memdev, loc >> PAGE_SHIFT));
+}
+
+static inline struct page *dump_next_saved_page(void)
+{
+	return (dump_mem_next_page(dump_memdev));
+}
+
+/* 
+ * Iterates over list of saved dump pages. Invoked during second stage of 
+ * soft boot dumping
+ *
+ * Observation: If additional selection is desired at this stage then
+ * a different iterator could be written which would advance 
+ * to the next page header everytime instead of blindly picking up
+ * the data. In such a case loc would be interpreted differently. 
+ * At this moment however a blind pass seems sufficient, cleaner and
+ * faster.
+ */
+int dump_saved_data_iterator(int pass, int (*action)(unsigned long, 
+	unsigned long), struct dump_data_filter *filter)
+{
+	loff_t loc = filter->start;
+	struct page *page;
+	unsigned long count = 0;
+	int err = 0;
+	unsigned long sz;
+
+	printk("pass %d, start off 0x%llx end offset 0x%llx\n", pass,
+			filter->start, filter->end);
+
+	/* loc will get treated as logical offset into stage 1 */
+	page = dump_get_saved_page(loc);
+			
+	for (; loc < filter->end; loc += PAGE_SIZE) {
+		dump_config.dumper->curr_loc = loc;
+		if (!page) {
+			printk("no more saved data for pass %d\n", pass);
+			break;
+		}
+		sz = (loc + PAGE_SIZE > filter->end) ? filter->end - loc :
+			PAGE_SIZE;
+
+		if (page && filter->selector(pass, (unsigned long)page, 
+			PAGE_SIZE))  {
+			pr_debug("mem offset 0x%llx\n", loc);
+			if ((err = action((unsigned long)page, sz))) 
+				break;
+			else
+				count++;
+			/* clear the contents of page */
+			/* fixme: consider using KM_DUMP instead */
+			clear_highpage(page);
+			
+		}
+		page = dump_next_saved_page();
+	}
+
+	return err ? err : count;
+}
+
+static inline int dump_overlay_pages_done(struct page *page, int nr)
+{
+	int ret=0;
+
+	for (; nr ; page++, nr--) {
+		if (dump_check_and_free_page(dump_memdev, page))
+			ret++;
+	}
+	return ret;
+}
+
+int dump_overlay_save_data(unsigned long loc, unsigned long len)
+{
+	int err = 0;
+	struct page *page = (struct page *)loc;
+	static unsigned long cnt = 0;
+
+	if ((err = dump_generic_save_data(loc, len)))
+		return err;
+
+	if (dump_overlay_pages_done(page, len >> PAGE_SHIFT)) {
+		cnt++;
+		if (!(cnt & 0x7f))
+			pr_debug("released page 0x%lx\n", page_to_pfn(page));
+	}
+	
+	return err;
+}
+
+
+int dump_overlay_skip_data(unsigned long loc, unsigned long len)
+{
+	struct page *page = (struct page *)loc;
+
+	dump_overlay_pages_done(page, len >> PAGE_SHIFT);
+	return 0;
+}
+
+int dump_overlay_resume(void)
+{
+	int err = 0;
+
+	/* 
+	 * switch to stage 2 dumper, save dump_config_block
+	 * and then trigger a soft-boot
+	 */
+	dumper_stage2.header_len = dump_config.dumper->header_len;
+	dump_config.dumper = &dumper_stage2;
+	if ((err = dump_save_config(dump_saved_config)))
+		return err;
+
+	dump_dev = dump_config.dumper->dev;
+
+	return err;
+	err = dump_switchover_stage();  /* plugs into soft boot mechanism */
+	dump_config.dumper = &dumper_stage1; /* set things back */
+	return err;
+}
+
+int dump_overlay_configure(unsigned long devid)
+{
+	struct dump_dev *dev;
+	struct dump_config_block *saved_config = dump_saved_config;
+	int err = 0;
+
+	/* If there is a previously saved dump, write it out first */
+	if (saved_config) {
+		printk("Processing old dump pending writeout\n");
+		err = dump_switchover_stage();
+		if (err) {
+			printk("failed to writeout saved dump\n");
+			return err;
+		}
+		dump_free_mem(saved_config); /* testing only: not after boot */
+	}
+
+	dev = dumper_stage2.dev = dump_config.dumper->dev;
+	/* From here on the intermediate dump target is memory-only */
+	dump_dev = dump_config.dumper->dev = &dump_memdev->ddev;
+	if ((err = dump_generic_configure(0))) {
+		printk("dump generic configure failed: err %d\n", err);
+		return err;
+	}
+	/* temporary */
+	dumper_stage2.dump_buf = dump_config.dumper->dump_buf;
+
+	/* Sanity check on the actual target dump device */
+	if (!dev || (err = dev->ops->open(dev, devid))) {
+		return err;
+	}
+	/* TBD: should we release the target if this is soft-boot only ? */
+
+	/* alloc a dump config block area to save across reboot */
+	if (!(dump_saved_config = dump_alloc_mem(sizeof(struct 
+		dump_config_block)))) {
+		printk("dump config block alloc failed\n");
+		/* undo configure */
+		dump_generic_unconfigure();
+		return -ENOMEM;
+	}
+	dump_config.dump_addr = (unsigned long)dump_saved_config;
+	printk("Dump config block of size %d set up at 0x%lx\n", 
+		sizeof(*dump_saved_config), (unsigned long)dump_saved_config);
+	return 0;
+}
+
+int dump_overlay_unconfigure(void)
+{
+	struct dump_dev *dev = dumper_stage2.dev;
+	int err = 0;
+
+	pr_debug("dump_overlay_unconfigure\n");
+	/* Close the secondary device */
+	dev->ops->release(dev); 
+	pr_debug("released secondary device\n");
+
+	err = dump_generic_unconfigure();
+	pr_debug("Unconfigured generic portions\n");
+	dump_free_mem(dump_saved_config);
+	dump_saved_config = NULL;
+	pr_debug("Freed saved config block\n");
+	dump_dev = dump_config.dumper->dev = dumper_stage2.dev;
+
+	printk("Unconfigured overlay dumper\n");
+	return err;
+}
+
+int dump_staged_unconfigure(void)
+{
+	int err = 0;
+	struct dump_config_block *saved_config = dump_saved_config;
+	struct dump_dev *dev;
+
+	pr_debug("dump_staged_unconfigure\n");
+	err = dump_generic_unconfigure();
+
+	/* now check if there is a saved dump waiting to be written out */
+	if (saved_config) {
+		printk("Processing saved dump pending writeout\n");
+		if ((err = dump_switchover_stage())) {
+			printk("Error in commiting saved dump at 0x%lx\n", 
+				(unsigned long)saved_config);
+			printk("Old dump may hog memory\n");
+		} else {
+			dump_free_mem(saved_config);
+			pr_debug("Freed saved config block\n");
+		}
+		dump_saved_config = NULL;
+	} else {
+		dev = &dump_memdev->ddev;
+		dev->ops->release(dev);
+	}
+	printk("Unconfigured second stage dumper\n");
+
+	return 0;
+}
+
+/* ----- PASSTHRU FILTER ROUTINE --------- */
+
+/* transparent - passes everything through */
+int dump_passthru_filter(int pass, unsigned long loc, unsigned long sz)
+{
+	return 1;
+}
+
+/* ----- PASSTRU FORMAT ROUTINES ---- */
+
+
+int dump_passthru_configure_header(const char *panic_str, const struct pt_regs *regs)
+{
+	dump_config.dumper->header_dirty++;
+	return 0;
+}
+
+/* Copies bytes of data from page(s) to the specified buffer */
+int dump_copy_pages(void *buf, struct page *page, unsigned long sz)
+{
+	unsigned long len = 0, bytes;
+	void *addr;
+
+	while (len < sz) {
+		addr = kmap_atomic(page, KM_DUMP);
+		bytes = (sz > len + PAGE_SIZE) ? PAGE_SIZE : sz - len;	
+		memcpy(buf, addr, bytes); 
+		kunmap_atomic(addr, KM_DUMP);
+		buf += bytes;
+		len += bytes;
+		page++;
+	}
+	/* memset(dump_config.dumper->curr_buf, 0x57, len); temporary */
+
+	return sz - len;
+}
+
+int dump_passthru_update_header(void)
+{
+	long len = dump_config.dumper->header_len;
+	struct page *page;
+	void *buf = dump_config.dumper->dump_buf;
+	int err = 0;
+
+	if (!dump_config.dumper->header_dirty)
+		return 0;
+
+	pr_debug("Copying header of size %ld bytes from memory\n", len);
+	if (len > DUMP_BUFFER_SIZE) 
+		return -E2BIG;
+
+	page = dump_mem_lookup(dump_memdev, 0);
+	for (; (len > 0) && page; buf += PAGE_SIZE, len -= PAGE_SIZE) {
+		if ((err = dump_copy_pages(buf, page, PAGE_SIZE)))
+			return err;
+		page = dump_mem_next_page(dump_memdev);
+	}
+	if (len > 0) {
+		printk("Incomplete header saved in mem\n");
+		return -ENOENT;
+	}
+
+	if ((err = dump_dev_seek(0))) {
+		printk("Unable to seek to dump header offset\n");
+		return err;
+	}
+	err = dump_ll_write(dump_config.dumper->dump_buf, 
+		buf - dump_config.dumper->dump_buf);
+	if (err < dump_config.dumper->header_len)
+		return (err < 0) ? err : -ENOSPC;
+
+	dump_config.dumper->header_dirty = 0;
+	return 0;
+}
+
+static loff_t next_dph_offset = 0;
+
+static int dph_valid(struct __dump_page *dph)
+{
+	if ((dph->dp_address & (PAGE_SIZE - 1)) || (dph->dp_flags 
+	      > DUMP_DH_COMPRESSED) || (!dph->dp_flags) ||
+		(dph->dp_size > PAGE_SIZE)) {
+	printk("dp->address = 0x%llx, dp->size = 0x%x, dp->flag = 0x%x\n",
+		dph->dp_address, dph->dp_size, dph->dp_flags);
+		return 0;
+	}
+	return 1;
+}
+
+int dump_verify_lcrash_data(void *buf, unsigned long sz)
+{
+	struct __dump_page *dph;
+
+	/* sanity check for page headers */
+	while (next_dph_offset + sizeof(*dph) < sz) {
+		dph = (struct __dump_page *)(buf + next_dph_offset);
+		if (!dph_valid(dph)) {
+			printk("Invalid page hdr at offset 0x%llx\n",
+				next_dph_offset);
+			return -EINVAL;
+		}
+		next_dph_offset += dph->dp_size + sizeof(*dph);
+	}
+
+	next_dph_offset -= sz;	
+	return 0;
+}
+
+/* 
+ * TBD/Later: Consider avoiding the copy by using a scatter/gather 
+ * vector representation for the dump buffer
+ */
+int dump_passthru_add_data(unsigned long loc, unsigned long sz)
+{
+	struct page *page = (struct page *)loc;
+	void *buf = dump_config.dumper->curr_buf;
+	int err = 0;
+
+	if ((err = dump_copy_pages(buf, page, sz))) {
+		printk("dump_copy_pages failed");
+		return err;
+	}
+
+	if ((err = dump_verify_lcrash_data(buf, sz))) {
+		printk("dump_verify_lcrash_data failed\n");
+		printk("Invalid data for pfn 0x%lx\n", page_to_pfn(page));
+		printk("Page flags 0x%lx\n", page->flags);
+		printk("Page count 0x%x\n", atomic_read(&page->count));
+		return err;
+	}
+
+	dump_config.dumper->curr_buf = buf + sz;
+
+	return 0;
+}
+
+
+/* Stage 1 dumper: Saves compressed dump in memory and soft-boots system */
+
+/* Scheme to overlay saved data in memory for writeout after a soft-boot */
+struct dump_scheme_ops dump_scheme_overlay_ops = {
+	.configure	= dump_overlay_configure,
+	.unconfigure	= dump_overlay_unconfigure,
+	.sequencer	= dump_overlay_sequencer,
+	.iterator	= dump_page_iterator,
+	.save_data	= dump_overlay_save_data,
+	.skip_data	= dump_overlay_skip_data,
+	.write_buffer	= dump_generic_write_buffer
+};
+
+struct dump_scheme dump_scheme_overlay = {
+	.name		= "overlay",
+	.ops		= &dump_scheme_overlay_ops
+};
+
+
+/* Stage 1 must use a good compression scheme - default to gzip */
+extern struct __dump_compress dump_gzip_compression;
+
+struct dumper dumper_stage1 = {
+	.name		= "stage1",
+	.scheme		= &dump_scheme_overlay,
+	.fmt		= &dump_fmt_lcrash,
+	.compress 	= &dump_none_compression, /* needs to be gzip */
+	.filter		= dump_filter_table,
+	.dev		= NULL,
+};		
+
+/* Stage 2 dumper: Activated after softboot to write out saved dump to device */
+
+/* Formatter that transfers data as is (transparent) w/o further conversion */
+struct dump_fmt_ops dump_fmt_passthru_ops = {
+	.configure_header	= dump_passthru_configure_header,
+	.update_header		= dump_passthru_update_header,
+	.save_context		= NULL, /* unused */
+	.add_data		= dump_passthru_add_data,
+	.update_end_marker	= dump_lcrash_update_end_marker
+};
+
+struct dump_fmt dump_fmt_passthru = {
+	.name	= "passthru",
+	.ops	= &dump_fmt_passthru_ops
+};
+
+/* Filter that simply passes along any data within the range (transparent)*/
+/* Note: The start and end ranges in the table are filled in at run-time */
+
+extern int dump_filter_none(int pass, unsigned long loc, unsigned long sz);
+
+struct dump_data_filter dump_passthru_filtertable[MAX_PASSES] = {
+{.name = "passkern", .selector = dump_passthru_filter, 
+	.level_mask = DUMP_MASK_KERN },
+{.name = "passuser", .selector = dump_passthru_filter, 
+	.level_mask = DUMP_MASK_USED },
+{.name = "passunused", .selector = dump_passthru_filter, 
+	.level_mask = DUMP_MASK_UNUSED },
+{.name = "none", .selector = dump_filter_none, 
+	.level_mask = DUMP_MASK_REST }
+};
+
+
+/* Scheme to handle data staged / preserved across a soft-boot */
+struct dump_scheme_ops dump_scheme_staged_ops = {
+	.configure	= dump_generic_configure,
+	.unconfigure	= dump_staged_unconfigure,
+	.sequencer	= dump_generic_sequencer,
+	.iterator	= dump_saved_data_iterator,
+	.save_data	= dump_generic_save_data,
+	.skip_data	= dump_generic_skip_data,
+	.write_buffer	= dump_generic_write_buffer
+};
+
+struct dump_scheme dump_scheme_staged = {
+	.name		= "staged",
+	.ops		= &dump_scheme_staged_ops
+};
+
+/* The stage 2 dumper comprising all these */
+struct dumper dumper_stage2 = {
+	.name		= "stage2",
+	.scheme		= &dump_scheme_staged,
+	.fmt		= &dump_fmt_passthru,
+	.compress 	= &dump_none_compression,
+	.filter		= dump_passthru_filtertable,
+	.dev		= NULL,
+};		
+
diff -urN -X ../dontdiff linux-2.5.59/drivers/dump/dump_scheme.c linux-2.5.59-kexecdump/drivers/dump/dump_scheme.c
--- linux-2.5.59/drivers/dump/dump_scheme.c	Mon Feb  3 23:56:59 2003
+++ linux-2.5.59-kexecdump/drivers/dump/dump_scheme.c	Thu Feb  6 07:49:15 2003
@@ -151,7 +151,7 @@
 
 		if (!(dump_config.dumper->count & 0x3f)) {
 			/* Update the header every one in a while */
-			/* memset((void *)dump_buf, 'b', DUMP_BUFFER_SIZE);*/
+			memset((void *)dump_buf, 'b', DUMP_BUFFER_SIZE);
 			if ((ret = dump_update_header()) < 0) {
 				/* issue warning */
 				return ret;
@@ -167,6 +167,12 @@
 
 		/* --- Done with periodic chores -- */
 
+		/* 
+		 * extra bit of copying to simplify verification  
+		 * in the second kernel boot based scheme
+		 */
+		memcpy(dump_buf - DUMP_PAGE_SIZE, dump_buf + 
+			DUMP_BUFFER_SIZE - DUMP_PAGE_SIZE, DUMP_PAGE_SIZE);
 
 		/* now adjust the leftover bits back to the top of the page */
 		/* this case would not arise during stage 2 (passthru) */
@@ -267,7 +273,7 @@
 
 	/* Allocate the dump buffer and initialize dumper state */
 	/* Assume that we get aligned addresses */
-	if (!(buf = dump_alloc_mem(DUMP_BUFFER_SIZE + 2 * DUMP_PAGE_SIZE)))
+	if (!(buf = dump_alloc_mem(DUMP_BUFFER_SIZE + 3 * DUMP_PAGE_SIZE)))
 		return -ENOMEM;
 
 	if ((unsigned long)buf & (PAGE_SIZE - 1)) {
@@ -277,7 +283,7 @@
 	}
 
 	/* Initialize the rest of the fields */
-	dump_config.dumper->dump_buf = buf;
+	dump_config.dumper->dump_buf = buf + DUMP_PAGE_SIZE;
 	dumper_reset();
 
 	/* Open the dump device */
@@ -305,14 +311,19 @@
 	void *buf = dump_config.dumper->dump_buf;
 	int ret = 0;
 
+	pr_debug("Generic unconfigure\n");
 	/* Close the dump device */
 	if (dev && (ret = dev->ops->release(dev)))
 		return ret;
+
+	printk("Closed dump device\n");
 	
 	if (buf)
-		dump_free_mem(buf);
+		dump_free_mem((buf - DUMP_PAGE_SIZE));
 
 	dump_config.dumper->curr_buf = dump_config.dumper->dump_buf = NULL;
+	pr_debug("Released dump buffer\n");
+
 	return 0;
 }
 
diff -urN -X ../dontdiff linux-2.5.59/drivers/dump/dump_setup.c linux-2.5.59-kexecdump/drivers/dump/dump_setup.c
--- linux-2.5.59/drivers/dump/dump_setup.c	Mon Feb  3 23:57:00 2003
+++ linux-2.5.59-kexecdump/drivers/dump/dump_setup.c	Thu Feb  6 16:52:08 2003
@@ -129,6 +129,8 @@
 struct dump_config dump_config = {
 	.level 		= 0,
 	.flags 		= 0,
+	.dump_device	= 0,
+	.dump_addr	= 0,
 	.dumper 	= NULL
 };
 
@@ -140,7 +142,6 @@
 /* Other global fields */
 extern struct __dump_header dump_header; 
 struct dump_dev *dump_dev = NULL;  /* Active dump device                   */
-int dump_device = 0;
 static int dump_compress = 0;
 
 static u16 dump_compress_none(const u8 *old, u16 oldsize, u8 *new, u16 newsize);
@@ -191,6 +192,8 @@
 static int proc_dump_device(ctl_table *ctl, int write, struct file *f,
 			    void *buffer, size_t *lenp);
 
+static int proc_doulonghex(ctl_table *ctl, int write, struct file *f,
+			    void *buffer, size_t *lenp);
 /*
  * sysctl-tuning infrastructure.
  */
@@ -200,14 +203,14 @@
 	  .data = &dump_config.level, 	 
 	  .maxlen = sizeof(int),
 	  .mode = 0644,
-	  .proc_handler = proc_dointvec, },
+	  .proc_handler = proc_doulonghex, },
 
 	{ .ctl_name = CTL_DUMP_FLAGS,
 	  .procname = DUMP_FLAGS_NAME,
 	  .data = &dump_config.flags,	
 	  .maxlen = sizeof(int),
 	  .mode = 0644,
-	  .proc_handler = proc_dointvec, },
+	  .proc_handler = proc_doulonghex, },
 
 	{ .ctl_name = CTL_DUMP_COMPRESS,
 	  .procname = DUMP_COMPRESS_NAME,
@@ -219,10 +222,19 @@
 	{ .ctl_name = CTL_DUMP_DEVICE,
 	  .procname = DUMP_DEVICE_NAME,
 	  .mode = 0644,
-	  .data = &dump_device, /* FIXME */
+	  .data = &dump_config.dump_device, /* FIXME */
 	  .maxlen = sizeof(int),
 	  .proc_handler = proc_dump_device },
 
+#ifdef CONFIG_CRASH_DUMP_MEMDEV
+	{ .ctl_name = CTL_DUMP_ADDR,
+	  .procname = DUMP_ADDR_NAME,
+	  .mode = 0444,
+	  .data = &dump_config.dump_addr,
+	  .maxlen = sizeof(unsigned long),
+	  .proc_handler = proc_doulonghex },
+#endif
+
 	{ 0, }
 };
 
@@ -392,7 +404,16 @@
 		dump_unconfigure();
 	}
 	/* set up new dumper */
-	dump_config.dumper = &dumper_singlestage;
+	if (dump_config.flags & DUMP_FLAGS_SOFTBOOT) {
+		printk("Configuring softboot based dump \n");
+#ifdef CONFIG_CRASH_DUMP_MEMDEV
+		dump_config.dumper = &dumper_stage1; 
+#else
+		printk("Requires CONFIG_CRASHDUMP_MEMDEV. Can't proceed.\n");
+#endif
+	} else {
+		dump_config.dumper = &dumper_singlestage;
+	}	
 	dump_config.dumper->dev = dump_dev;
 
 	ret = dump_configure(devid);
@@ -400,10 +421,11 @@
 		dump_okay = 1;
 		pr_debug("%s dumper set up for dev 0x%lx\n", 
 			dump_config.dumper->name, devid);
-		dump_device = devid;
+ 		dump_config.dump_device = devid;
 	} else {
 		printk("%s dumper set up failed for dev 0x%lx\n", 
 		       dump_config.dumper->name, devid);
+ 		dump_config.dumper = NULL;
 	}
 	return ret;
 }
@@ -473,7 +495,7 @@
 
 		
 	case DIOGDUMPDEV:	/* get dump_device */
-		return put_user((long)dump_device, (long *)arg);
+		return put_user((long)dump_config.dump_device, (long *)arg);
 
 	case DIOSDUMPLEVEL:	/* set dump_level */
 		if (!(f->f_flags & O_RDWR))
@@ -563,10 +585,10 @@
 
 	/* same permission checks as ioctl */
 	if (capable(CAP_SYS_ADMIN)) {
-		ret = proc_dointvec(ctl, write, f, buffer, lenp);
+		ret = proc_doulonghex(ctl, write, f, buffer, lenp);
 		if (ret == 0 && write && *valp != oval) {
 			/* need to restore old value to close properly */
-			dump_device = (dev_t) oval;
+			dump_config.dump_device = (dev_t) oval;
 			__dump_open();
 			ret = dumper_setup(dump_config.flags, (dev_t) *valp);
 		}
@@ -575,6 +597,37 @@
 	return ret;
 }
 
+/* All for the want of a proc_do_xxx routine which prints values in hex */
+static int 
+proc_doulonghex(ctl_table *ctl, int write, struct file *f,
+		 void *buffer, size_t *lenp)
+{
+#define TMPBUFLEN 20
+	unsigned long *i;
+	size_t len, left;
+	char buf[TMPBUFLEN];
+
+	if (!ctl->data || !ctl->maxlen || !*lenp || (f->f_pos)) {
+		*lenp = 0;
+		return 0;
+	}
+	
+	i = (unsigned long *) ctl->data;
+	left = *lenp;
+	
+	sprintf(buf, "0x%lx\n", (*i));
+	len = strlen(buf);
+	if (len > left)
+		len = left;
+	if(copy_to_user(buffer, buf, len))
+		return -EFAULT;
+	
+	left -= len;
+	*lenp -= left;
+	f->f_pos += *lenp;
+	return 0;
+}
+
 /*
  * -----------------------------------------------------------------------
  *                     I N I T   F U N C T I O N S
diff -urN -X ../dontdiff linux-2.5.59/include/linux/dump.h linux-2.5.59-kexecdump/include/linux/dump.h
--- linux-2.5.59/include/linux/dump.h	Mon Feb  3 23:57:01 2003
+++ linux-2.5.59-kexecdump/include/linux/dump.h	Thu Feb  6 13:56:35 2003
@@ -87,6 +87,7 @@
 
 /* dump flags - any dump-type specific flags -- add as necessary */
 #define DUMP_FLAGS_NONE		0x0	/* no flags are set for this dump   */
+#define DUMP_FLAGS_SOFTBOOT	0x2	/* 2 stage soft-boot based dump	    */
 
 #define DUMP_FLAGS_TARGETMASK	0xf0000000 /* handle special case targets   */
 #define DUMP_FLAGS_DISKDUMP	0x80000000 /* dump to local disk 	    */
@@ -107,6 +108,7 @@
 #define DUMP_COMPRESS_NAME	"compress"
 #define DUMP_LEVEL_NAME		"level"
 #define DUMP_FLAGS_NAME		"flags"
+#define DUMP_ADDR_NAME		"addr"
 
 #define DUMP_SYSRQ_KEY		'd'	/* key to use for MAGIC_SYSRQ key   */
 
@@ -117,7 +119,8 @@
 	CTL_DUMP_COMPRESS=3,
 	CTL_DUMP_LEVEL=3,
 	CTL_DUMP_FLAGS=4,
-	CTL_DUMP_TEST=5,
+	CTL_DUMP_ADDR=5,
+	CTL_DUMP_TEST=6,
 };
 
 
diff -urN -X ../dontdiff linux-2.5.59/include/linux/dumpdev.h linux-2.5.59-kexecdump/include/linux/dumpdev.h
--- linux-2.5.59/include/linux/dumpdev.h	Mon Feb  3 23:57:01 2003
+++ linux-2.5.59-kexecdump/include/linux/dumpdev.h	Thu Feb  6 07:49:15 2003
@@ -70,6 +70,44 @@
 	return container_of(dev, struct dump_blockdev, ddev);
 }
 
+
+/* mem  - for internal use by soft-boot based dumper */
+struct dump_memdev {
+	struct dump_dev ddev;
+	unsigned long indirect_map_root;
+	unsigned long nr_free;
+	struct page *curr_page;
+	unsigned long *curr_map;
+	unsigned long curr_map_offset;
+	unsigned long last_offset;
+	unsigned long last_used_offset;
+	unsigned long last_bs_offset;
+};	
+
+static inline struct dump_memdev *DUMP_MDEV(struct dump_dev *dev)
+{
+	return container_of(dev, struct dump_memdev, ddev);
+}
+
+/* Todo/future - meant for raw dedicated interfaces e.g. mini-ide driver */
+struct dump_rdev {
+	struct dump_dev ddev;
+	char name[32];
+	int (*reset)(struct dump_rdev *, unsigned int, 
+		unsigned long);
+	/* ... to do ... */
+};
+
+/* just to get the size right when saving config across a soft-reboot */
+struct dump_anydev {
+	union {
+		struct dump_blockdev bddev;
+		/* .. add other types here .. */
+	};
+};
+
+
+
 /* Dump device / target operation wrappers */
 /* These assume that dump_dev is initiatized to dump_config.dumper->dev */
 
diff -urN -X ../dontdiff linux-2.5.59/kernel/panic.c linux-2.5.59-kexecdump/kernel/panic.c
--- linux-2.5.59/kernel/panic.c	Thu Feb  6 16:42:29 2003
+++ linux-2.5.59-kexecdump/kernel/panic.c	Thu Feb  6 16:18:10 2003
@@ -16,6 +16,7 @@
 #include <linux/init.h>
 #include <linux/sysrq.h>
 #include <linux/interrupt.h>
+#include <linux/kexec.h>
 
 asmlinkage void sys_sync(void);	/* it's really int */
 
@@ -74,13 +75,22 @@
 	 	 * Delay timeout seconds before rebooting the machine. 
 		 * We can't use the "normal" timers since we just panicked..
 	 	 */
+		struct kimage *image;
 		printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout);
-		mdelay(panic_timeout*1000);
 		/*
 		 *	Should we run the reboot notifier. For the moment Im
 		 *	choosing not too. It might crash, be corrupt or do
 		 *	more harm than good for other reasons.
 		 */
+#ifdef CONFIG_KEXEC
+		image = xchg(&kexec_image, 0);
+		if (image) {
+			printk(KERN_EMERG "by starting a new kernel ..\n");
+			mdelay(panic_timeout*1000);
+			machine_kexec(image);
+		}
+#endif
+		mdelay(panic_timeout*1000);
 		machine_restart(NULL);
 	}
 #ifdef __sparc__

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH][WIP] Using kexec for crash dumps in LKCD
  2003-02-06 15:56         ` [PATCH][WIP] Using kexec for crash dumps in LKCD Suparna Bhattacharya
@ 2003-02-07 15:39           ` Suparna Bhattacharya
  0 siblings, 0 replies; 15+ messages in thread
From: Suparna Bhattacharya @ 2003-02-07 15:39 UTC (permalink / raw)
  To: Eric W. Biederman, lkcd-devel; +Cc: linux-kernel, fastboot

I just checked in the changes for kexec based dumping
into lkcd cvs after tagging the earlier code as "linux2559".

Brief Description/Recap from CVS log:
------------------------------------
Initial release of code to implement an option to save dump
in memory and write it out later (CONFIG_CRASH_DUMP_MEMDEV).

In case of a panic dump, if CONFIG_CRASH_DUMP_SOFTBOOT is
enabled and CONFIG_KEXEC is on, this would use Eric Biederman's
kexec implementation to delay the actual writeout of the dump
to disk to happen after a memory preserving reboot of a new
kernel (along the lines of Mission Critical Linux's mcore
implementation). 
-------------------------------------

The first call to lkcd config after a boot would trigger 
the actual writeout to the dump disk/partition. And then 
lkcd save works as usual to copy the dump into /var/log/dump/<n>/

This is still in a somewhat raw/experimental form, made
available for anyone who'd like to play with it early.
To use it you'd need to apply the kexec patches along
with lkcd. I'll try to put out a TODO list soon for
some things that need to be done to stabilize, complete
and simplify this.

The following patches to sbin.lkcd script and the 
sysconfig.dump file should be used with this.
(kexec preloading gets automatically handled 
once you specify the image and kernel command line 
to use).

I've held back on checking in these particular changes 
for now as I wanted to be sure not to break anything 
for 2.4 lkcd users.  

(BTW, is that also why the lkcd script still uses 227 
as the dump device number rather than 221 ?)

Any suggestions on how to handle this best would be
appreciated.

Regards
Suparna

-- 
Suparna Bhattacharya (suparna@in.ibm.com)
Linux Technology Center
IBM Software Labs, India


--- sbin.lkcd	Mon Dec 16 15:52:51 2002
+++ /home/suparna/suparna/ras/dump/sbin.lkcd	Fri Feb  7 19:28:16 2003
@@ -14,6 +14,7 @@
 KERNTYPES=/boot/Kerntypes
 DUMPCONFIG=/sbin/lkcd_config
 DUMPSYSDEVICE=/dev/dump
+KEXEC=/sbin/kexec
 
 ###########################################################################
 # Functions
@@ -65,8 +66,37 @@
 
 	if [ $? -ne 0 ] ; then
 		echo "$DUMPCONFIG failed!" >&2
+		exit 1
+	fi
+
+	# Set things up for kexec based dumping if applicable
+
+	if [ ! -e /proc/sys/kernel/dump/addr ] ; then
+		return
+	fi
+
+	DUMP_ADDR=`cat /proc/sys/kernel/dump/addr`
+
+	if [ $DUMP_ADDR != 0 ] ; then
+		echo "Preloading kernel for kexec based dumping"
+	else
+		return
 	fi
 
+	if [ ! -e "$KEXEC_IMAGE" ] ; then
+		echo "KEXECIMAGE does not exist!" >&2
+		exit 1
+	fi
+
+	# Preload the kernel image to switch to on panic
+	echo $KEXEC -l \
+	--command-line="$KEXEC_CMDLINE crashdump=$DUMP_ADDR" \
+	$KEXEC_IMAGE
+
+	$KEXEC -l \
+	--command-line="$KEXEC_CMDLINE crashdump=$DUMP_ADDR" \
+	$KEXEC_IMAGE
+
 	return
 }
 
@@ -256,7 +286,7 @@
 
 # make sure system dump device exists -- otherwise, make it
 if [ ! -e $DUMPSYSDEVICE ] ; then
-	mknod $DUMPSYSDEVICE c 227 0
+	mknod $DUMPSYSDEVICE c 221 0
 	chmod 644 $DUMPSYSDEVICE
 fi
 
--- sysconfig.dump	Wed Dec  4 15:20:45 2002
+++ /home/suparna/suparna/ras/dump/sysconfig.dump	Fri Feb  7 20:32:39 2003
@@ -117,11 +117,15 @@
 DUMPDEV=/dev/vmdump
 DUMPDIR=/var/log/dump
 DUMP_SAVE=1
-DUMP_LEVEL=8
-DUMP_FLAGS=0x80000000 # disruptive disk dump is default
-DUMP_COMPRESS=0
+DUMP_LEVEL=2
+DUMP_FLAGS=0x80000002 # disruptive disk dump is default
+DUMP_COMPRESS=2
 PANIC_TIMEOUT=5
 
+# Only relevant for dumping via kexec
+KEXEC_IMAGE=/boot/kexec	
+KEXEC_CMDLINE="root=806 console=tty0 console=ttyS0,38400"
+
 # Network dump configuration parameters
 TARGET_HOST=hostname # set this to vaild hostname/IP
 TARGET_PORT=6666

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2003-02-07 15:24 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2002-12-22 11:07 [PATCH][CFT] kexec (rewrite) for 2.5.52 Eric W. Biederman
2002-12-31 14:35 ` Suparna Bhattacharya
2003-01-03 10:37   ` Eric W. Biederman
2003-01-03 12:41     ` Suparna Bhattacharya
2003-01-04 20:34       ` Eric W. Biederman
2003-01-04 22:42       ` Eric W. Biederman
2003-01-06  5:48       ` [PATCH] kexec for 2.5.54 Eric W. Biederman
2003-01-07 22:46         ` Andy Pfiffer
2003-01-07 23:01           ` Dave Hansen
2003-01-07 23:11             ` Martin J. Bligh
2003-01-15 19:43         ` [2.5.58][KEXEC] Success! (using 2.5.54 version + kexec tools 1.8) Andy Pfiffer
2003-01-04  0:32   ` 2.5.54: Re: [PATCH][CFT] kexec (rewrite) for 2.5.52 Andy Pfiffer
2003-01-04 18:56     ` Eric W. Biederman
     [not found]   ` <m11y2w557p.fsf@frodo.biederman.org>
     [not found]     ` <20030204142426.A1950@in.ibm.com>
     [not found]       ` <m1d6m81ttu.fsf@frodo.biederman.org>
2003-02-06 15:56         ` [PATCH][WIP] Using kexec for crash dumps in LKCD Suparna Bhattacharya
2003-02-07 15:39           ` Suparna Bhattacharya

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).