All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
@ 2017-03-20  5:50 ` Xunlei Pang
  0 siblings, 0 replies; 36+ messages in thread
From: Xunlei Pang @ 2017-03-20  5:50 UTC (permalink / raw)
  To: linux-kernel, kexec
  Cc: akpm, Eric Biederman, Dave Young, Baoquan He, Petr Tesarik,
	Mahesh Salgaonkar, Xunlei Pang

As Eric said,
"what we need to do is move the variable vmcoreinfo_note out
of the kernel's .bss section.  And modify the code to regenerate
and keep this information in something like the control page.

Definitely something like this needs a page all to itself, and ideally
far away from any other kernel data structures.  I clearly was not
watching closely the data someone decided to keep this silly thing
in the kernel's .bss section."

This patch allocates extra pages for these vmcoreinfo_XXX variables.
One advantage is that it improves the safety of vmcoreinfo, because
vmcoreinfo is now kept far away from other kernel data structures.

Suggested-by: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Xunlei Pang <xlpang@redhat.com>
---
 arch/ia64/kernel/machine_kexec.c |  5 -----
 arch/x86/kernel/crash.c          |  2 +-
 include/linux/kexec.h            |  2 +-
 kernel/kexec_core.c              | 29 ++++++++++++++++++++++++-----
 kernel/ksysfs.c                  |  2 +-
 5 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c
index 599507b..c14815d 100644
--- a/arch/ia64/kernel/machine_kexec.c
+++ b/arch/ia64/kernel/machine_kexec.c
@@ -163,8 +163,3 @@ void arch_crash_save_vmcoreinfo(void)
 #endif
 }
 
-phys_addr_t paddr_vmcoreinfo_note(void)
-{
-	return ia64_tpa((unsigned long)(char *)&vmcoreinfo_note);
-}
-
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 3741461..4d35fbb 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -456,7 +456,7 @@ static int prepare_elf64_headers(struct crash_elf_data *ced,
 	bufp += sizeof(Elf64_Phdr);
 	phdr->p_type = PT_NOTE;
 	phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
-	phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note);
+	phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
 	(ehdr->e_phnum)++;
 
 #ifdef CONFIG_X86_64
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index e98e546..f1c601b 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -317,7 +317,7 @@ extern void *kexec_purgatory_get_symbol_addr(struct kimage *image,
 extern struct resource crashk_low_res;
 typedef u32 note_buf_t[KEXEC_NOTE_BYTES/4];
 extern note_buf_t __percpu *crash_notes;
-extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
+extern u32 *vmcoreinfo_note;
 extern size_t vmcoreinfo_size;
 extern size_t vmcoreinfo_max_size;
 
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index bfe62d5..e3a4bda 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -52,10 +52,10 @@
 note_buf_t __percpu *crash_notes;
 
 /* vmcoreinfo stuff */
-static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
-u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
+static unsigned char *vmcoreinfo_data;
 size_t vmcoreinfo_size;
-size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
+size_t vmcoreinfo_max_size = VMCOREINFO_BYTES;
+u32 *vmcoreinfo_note;
 
 /* Flag to indicate we are going to kexec a new kernel */
 bool kexec_in_progress = false;
@@ -1369,6 +1369,9 @@ static void update_vmcoreinfo_note(void)
 
 void crash_save_vmcoreinfo(void)
 {
+	if (!vmcoreinfo_note)
+		return;
+
 	vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
 	update_vmcoreinfo_note();
 }
@@ -1397,13 +1400,29 @@ void vmcoreinfo_append_str(const char *fmt, ...)
 void __weak arch_crash_save_vmcoreinfo(void)
 {}
 
-phys_addr_t __weak paddr_vmcoreinfo_note(void)
+phys_addr_t paddr_vmcoreinfo_note(void)
 {
-	return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note);
+	return __pa(vmcoreinfo_note);
 }
 
 static int __init crash_save_vmcoreinfo_init(void)
 {
+	/* One page should be enough for VMCOREINFO_BYTES under all archs */
+	vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
+	if (!vmcoreinfo_data) {
+		pr_warn("Memory allocation for vmcoreinfo_data failed\n");
+		return -ENOMEM;
+	}
+
+	vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
+						GFP_KERNEL | __GFP_ZERO);
+	if (!vmcoreinfo_note) {
+		free_page((unsigned long)vmcoreinfo_data);
+		vmcoreinfo_data = NULL;
+		pr_warn("Memory allocation for vmcoreinfo_note failed\n");
+		return -ENOMEM;
+	}
+
 	VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
 	VMCOREINFO_PAGESIZE(PAGE_SIZE);
 
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index ee1bc1b..9de6fcc 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -130,7 +130,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
 {
 	phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
 	return sprintf(buf, "%pa %x\n", &vmcore_base,
-		       (unsigned int)sizeof(vmcoreinfo_note));
+			(unsigned int)VMCOREINFO_NOTE_SIZE);
 }
 KERNEL_ATTR_RO(vmcoreinfo);
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
@ 2017-03-20  5:50 ` Xunlei Pang
  0 siblings, 0 replies; 36+ messages in thread
From: Xunlei Pang @ 2017-03-20  5:50 UTC (permalink / raw)
  To: linux-kernel, kexec
  Cc: Baoquan He, Petr Tesarik, Eric Biederman, Xunlei Pang, akpm, Dave Young

As Eric said,
"what we need to do is move the variable vmcoreinfo_note out
of the kernel's .bss section.  And modify the code to regenerate
and keep this information in something like the control page.

Definitely something like this needs a page all to itself, and ideally
far away from any other kernel data structures.  I clearly was not
watching closely the data someone decided to keep this silly thing
in the kernel's .bss section."

This patch allocates extra pages for these vmcoreinfo_XXX variables.
One advantage is that it improves the safety of vmcoreinfo, because
vmcoreinfo is now kept far away from other kernel data structures.

Suggested-by: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Xunlei Pang <xlpang@redhat.com>
---
 arch/ia64/kernel/machine_kexec.c |  5 -----
 arch/x86/kernel/crash.c          |  2 +-
 include/linux/kexec.h            |  2 +-
 kernel/kexec_core.c              | 29 ++++++++++++++++++++++++-----
 kernel/ksysfs.c                  |  2 +-
 5 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c
index 599507b..c14815d 100644
--- a/arch/ia64/kernel/machine_kexec.c
+++ b/arch/ia64/kernel/machine_kexec.c
@@ -163,8 +163,3 @@ void arch_crash_save_vmcoreinfo(void)
 #endif
 }
 
-phys_addr_t paddr_vmcoreinfo_note(void)
-{
-	return ia64_tpa((unsigned long)(char *)&vmcoreinfo_note);
-}
-
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 3741461..4d35fbb 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -456,7 +456,7 @@ static int prepare_elf64_headers(struct crash_elf_data *ced,
 	bufp += sizeof(Elf64_Phdr);
 	phdr->p_type = PT_NOTE;
 	phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
-	phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note);
+	phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
 	(ehdr->e_phnum)++;
 
 #ifdef CONFIG_X86_64
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index e98e546..f1c601b 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -317,7 +317,7 @@ extern void *kexec_purgatory_get_symbol_addr(struct kimage *image,
 extern struct resource crashk_low_res;
 typedef u32 note_buf_t[KEXEC_NOTE_BYTES/4];
 extern note_buf_t __percpu *crash_notes;
-extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
+extern u32 *vmcoreinfo_note;
 extern size_t vmcoreinfo_size;
 extern size_t vmcoreinfo_max_size;
 
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index bfe62d5..e3a4bda 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -52,10 +52,10 @@
 note_buf_t __percpu *crash_notes;
 
 /* vmcoreinfo stuff */
-static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
-u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
+static unsigned char *vmcoreinfo_data;
 size_t vmcoreinfo_size;
-size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
+size_t vmcoreinfo_max_size = VMCOREINFO_BYTES;
+u32 *vmcoreinfo_note;
 
 /* Flag to indicate we are going to kexec a new kernel */
 bool kexec_in_progress = false;
@@ -1369,6 +1369,9 @@ static void update_vmcoreinfo_note(void)
 
 void crash_save_vmcoreinfo(void)
 {
+	if (!vmcoreinfo_note)
+		return;
+
 	vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
 	update_vmcoreinfo_note();
 }
@@ -1397,13 +1400,29 @@ void vmcoreinfo_append_str(const char *fmt, ...)
 void __weak arch_crash_save_vmcoreinfo(void)
 {}
 
-phys_addr_t __weak paddr_vmcoreinfo_note(void)
+phys_addr_t paddr_vmcoreinfo_note(void)
 {
-	return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note);
+	return __pa(vmcoreinfo_note);
 }
 
 static int __init crash_save_vmcoreinfo_init(void)
 {
+	/* One page should be enough for VMCOREINFO_BYTES under all archs */
+	vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
+	if (!vmcoreinfo_data) {
+		pr_warn("Memory allocation for vmcoreinfo_data failed\n");
+		return -ENOMEM;
+	}
+
+	vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
+						GFP_KERNEL | __GFP_ZERO);
+	if (!vmcoreinfo_note) {
+		free_page((unsigned long)vmcoreinfo_data);
+		vmcoreinfo_data = NULL;
+		pr_warn("Memory allocation for vmcoreinfo_note failed\n");
+		return -ENOMEM;
+	}
+
 	VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
 	VMCOREINFO_PAGESIZE(PAGE_SIZE);
 
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index ee1bc1b..9de6fcc 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -130,7 +130,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
 {
 	phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
 	return sprintf(buf, "%pa %x\n", &vmcore_base,
-		       (unsigned int)sizeof(vmcoreinfo_note));
+			(unsigned int)VMCOREINFO_NOTE_SIZE);
 }
 KERNEL_ATTR_RO(vmcoreinfo);
 
-- 
1.8.3.1


_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH v3 2/3] powerpc/fadump: Use the correct VMCOREINFO_NOTE_SIZE for phdr
  2017-03-20  5:50 ` Xunlei Pang
@ 2017-03-20  5:50   ` Xunlei Pang
  -1 siblings, 0 replies; 36+ messages in thread
From: Xunlei Pang @ 2017-03-20  5:50 UTC (permalink / raw)
  To: linux-kernel, kexec
  Cc: akpm, Eric Biederman, Dave Young, Baoquan He, Petr Tesarik,
	Mahesh Salgaonkar, Xunlei Pang

vmcoreinfo_max_size describes the size of vmcoreinfo_data, but the
program header should describe vmcoreinfo_note, whose total
size is VMCOREINFO_NOTE_SIZE.

As explained in commit 77019967f06b ("kdump: fix exported
size of vmcoreinfo note"), this does not affect the actual
functionality, but it is better to fix it. The change should
also be safe and backward compatible.

After this, we can get rid of variable vmcoreinfo_max_size,
let's use the macro VMCOREINFO_BYTES instead, fewer variables
means more safety for vmcoreinfo operation.

Signed-off-by: Xunlei Pang <xlpang@redhat.com>
---
 arch/powerpc/kernel/fadump.c | 3 +--
 include/linux/kexec.h        | 1 -
 kernel/kexec_core.c          | 3 +--
 3 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 8ff0dd4..b8e15cf 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -906,8 +906,7 @@ static int fadump_create_elfcore_headers(char *bufp)
 
 	phdr->p_paddr	= fadump_relocate(paddr_vmcoreinfo_note());
 	phdr->p_offset	= phdr->p_paddr;
-	phdr->p_memsz	= vmcoreinfo_max_size;
-	phdr->p_filesz	= vmcoreinfo_max_size;
+	phdr->p_memsz	= phdr->p_filesz = VMCOREINFO_NOTE_SIZE;
 
 	/* Increment number of program headers. */
 	(elf->e_phnum)++;
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index f1c601b..6918fda 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -319,7 +319,6 @@ extern void *kexec_purgatory_get_symbol_addr(struct kimage *image,
 extern note_buf_t __percpu *crash_notes;
 extern u32 *vmcoreinfo_note;
 extern size_t vmcoreinfo_size;
-extern size_t vmcoreinfo_max_size;
 
 /* flag to track if kexec reboot is in progress */
 extern bool kexec_in_progress;
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index e3a4bda..e503b48 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -54,7 +54,6 @@
 /* vmcoreinfo stuff */
 static unsigned char *vmcoreinfo_data;
 size_t vmcoreinfo_size;
-size_t vmcoreinfo_max_size = VMCOREINFO_BYTES;
 u32 *vmcoreinfo_note;
 
 /* Flag to indicate we are going to kexec a new kernel */
@@ -1386,7 +1385,7 @@ void vmcoreinfo_append_str(const char *fmt, ...)
 	r = vscnprintf(buf, sizeof(buf), fmt, args);
 	va_end(args);
 
-	r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
+	r = min(r, VMCOREINFO_BYTES - vmcoreinfo_size);
 
 	memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH v3 2/3] powerpc/fadump: Use the correct VMCOREINFO_NOTE_SIZE for phdr
@ 2017-03-20  5:50   ` Xunlei Pang
  0 siblings, 0 replies; 36+ messages in thread
From: Xunlei Pang @ 2017-03-20  5:50 UTC (permalink / raw)
  To: linux-kernel, kexec
  Cc: Baoquan He, Petr Tesarik, Eric Biederman, Xunlei Pang, akpm, Dave Young

vmcoreinfo_max_size describes the size of vmcoreinfo_data, but the
program header should describe vmcoreinfo_note, whose total
size is VMCOREINFO_NOTE_SIZE.

As explained in commit 77019967f06b ("kdump: fix exported
size of vmcoreinfo note"), this does not affect the actual
functionality, but it is better to fix it. The change should
also be safe and backward compatible.

After this, we can get rid of variable vmcoreinfo_max_size,
let's use the macro VMCOREINFO_BYTES instead, fewer variables
means more safety for vmcoreinfo operation.

Signed-off-by: Xunlei Pang <xlpang@redhat.com>
---
 arch/powerpc/kernel/fadump.c | 3 +--
 include/linux/kexec.h        | 1 -
 kernel/kexec_core.c          | 3 +--
 3 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 8ff0dd4..b8e15cf 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -906,8 +906,7 @@ static int fadump_create_elfcore_headers(char *bufp)
 
 	phdr->p_paddr	= fadump_relocate(paddr_vmcoreinfo_note());
 	phdr->p_offset	= phdr->p_paddr;
-	phdr->p_memsz	= vmcoreinfo_max_size;
-	phdr->p_filesz	= vmcoreinfo_max_size;
+	phdr->p_memsz	= phdr->p_filesz = VMCOREINFO_NOTE_SIZE;
 
 	/* Increment number of program headers. */
 	(elf->e_phnum)++;
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index f1c601b..6918fda 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -319,7 +319,6 @@ extern void *kexec_purgatory_get_symbol_addr(struct kimage *image,
 extern note_buf_t __percpu *crash_notes;
 extern u32 *vmcoreinfo_note;
 extern size_t vmcoreinfo_size;
-extern size_t vmcoreinfo_max_size;
 
 /* flag to track if kexec reboot is in progress */
 extern bool kexec_in_progress;
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index e3a4bda..e503b48 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -54,7 +54,6 @@
 /* vmcoreinfo stuff */
 static unsigned char *vmcoreinfo_data;
 size_t vmcoreinfo_size;
-size_t vmcoreinfo_max_size = VMCOREINFO_BYTES;
 u32 *vmcoreinfo_note;
 
 /* Flag to indicate we are going to kexec a new kernel */
@@ -1386,7 +1385,7 @@ void vmcoreinfo_append_str(const char *fmt, ...)
 	r = vscnprintf(buf, sizeof(buf), fmt, args);
 	va_end(args);
 
-	r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
+	r = min(r, VMCOREINFO_BYTES - vmcoreinfo_size);
 
 	memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
 
-- 
1.8.3.1


_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH v3 3/3] kdump: Relocate vmcoreinfo to the crash memory range
  2017-03-20  5:50 ` Xunlei Pang
@ 2017-03-20  5:50   ` Xunlei Pang
  -1 siblings, 0 replies; 36+ messages in thread
From: Xunlei Pang @ 2017-03-20  5:50 UTC (permalink / raw)
  To: linux-kernel, kexec
  Cc: akpm, Eric Biederman, Dave Young, Baoquan He, Petr Tesarik,
	Mahesh Salgaonkar, Xunlei Pang

Currently vmcoreinfo data is updated at boot time in a subsys_initcall(),
so it is at risk of being modified by buggy code while the system
is running.

As a result, vmcore dumped may contain the wrong vmcoreinfo. Later on,
when using "crash", "makedumpfile", etc utility to parse this vmcore,
we probably will get "Segmentation fault" or other unexpected errors.

E.g. 1) wrong code overwrites vmcoreinfo_data; 2) further crashes the
system; 3) trigger kdump, then we obviously will fail to recognize the
crash context correctly due to the corrupted vmcoreinfo.

Now except for vmcoreinfo, all the crash data is well protected(including
the cpu note which is fully updated in the crash path, thus its correctness
is guaranteed). Given that vmcoreinfo data is a large chunk, we better
protect it as well.

To solve this, we relocate and copy vmcoreinfo_data to the crash memory
when kdump is loading via kexec syscalls. Because the whole crash memory
will be protected by existing arch_kexec_protect_crashkres() mechanism,
we naturally protect vmcoreinfo_data from write(even read) access under
kernel direct mapping after kdump is loaded.

Since kdump is usually loaded at the very early stage after boot, we can
trust the correctness of the vmcoreinfo data copied.

On the other hand, we still need to operate the vmcoreinfo safe copy when
crash happens to generate vmcoreinfo_note again, we rely on vmap() to map
out a new kernel virtual address and update to use this new one instead in
the following crash_save_vmcoreinfo().

BTW, we do not touch vmcoreinfo_note, because it will be fully updated
using the protected vmcoreinfo_data after crash which is surely correct
just like the cpu crash note.

Signed-off-by: Xunlei Pang <xlpang@redhat.com>
---
 include/linux/kexec.h |  3 +++
 kernel/kexec.c        |  3 +++
 kernel/kexec_core.c   | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/kexec_file.c   |  3 +++
 4 files changed, 61 insertions(+)

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 6918fda..fae2fc6 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -187,6 +187,8 @@ struct kimage {
 	unsigned long start;
 	struct page *control_code_page;
 	struct page *swap_page;
+	void *vmcoreinfo_data_copy; /* locates in the crash memory */
+	size_t vmcoreinfo_size_copy;
 
 	unsigned long nr_segments;
 	struct kexec_segment segment[KEXEC_SEGMENT_MAX];
@@ -243,6 +245,7 @@ extern asmlinkage long sys_kexec_load(unsigned long entry,
 extern int kernel_kexec(void);
 extern struct page *kimage_alloc_control_pages(struct kimage *image,
 						unsigned int order);
+extern int kimage_crash_copy_vmcoreinfo(struct kimage *image);
 extern int kexec_load_purgatory(struct kimage *image, unsigned long min,
 				unsigned long max, int top_down,
 				unsigned long *load_addr);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 980936a..e0c4dea 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -93,6 +93,9 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
 			pr_err("Could not allocate swap buffer\n");
 			goto out_free_control_pages;
 		}
+	} else {
+		if (kimage_crash_copy_vmcoreinfo(image) < 0)
+			goto out_free_image;
 	}
 
 	*rimage = image;
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index e503b48..7fad9f6 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -486,6 +486,45 @@ struct page *kimage_alloc_control_pages(struct kimage *image,
 	return pages;
 }
 
+int kimage_crash_copy_vmcoreinfo(struct kimage *image)
+{
+	struct page *vmcoreinfo_page;
+	void *safecopy;
+
+	WARN_ON(image->type != KEXEC_TYPE_CRASH);
+
+	if (!vmcoreinfo_size) {
+		pr_err("empty vmcoreinfo data\n");
+		return -ENOMEM;
+	}
+
+	/*
+	 * For kdump, allocate one vmcoreinfo safe copy from the
+	 * crash memory. as we have arch_kexec_protect_crashkres()
+	 * after kexec syscall, we naturally protect it from write
+	 * (even read) access under kernel direct mapping. But on
+	 * the other hand, we still need to operate it when crash
+	 * happens to generate vmcoreinfo note, hereby we rely on
+	 * vmap for this purpose.
+	 */
+	vmcoreinfo_page = kimage_alloc_control_pages(image, 0);
+	if (!vmcoreinfo_page) {
+		pr_err("could not allocate vmcoreinfo buffer\n");
+		return -ENOMEM;
+	}
+	safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL);
+	if (!safecopy) {
+		pr_err("cound not vmap vmcoreinfo buffer\n");
+		return -ENOMEM;
+	}
+
+	memcpy(safecopy, vmcoreinfo_data, vmcoreinfo_size);
+	image->vmcoreinfo_data_copy = safecopy;
+	image->vmcoreinfo_size_copy = vmcoreinfo_size;
+
+	return 0;
+}
+
 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 {
 	if (*image->entry != 0)
@@ -603,6 +642,9 @@ void kimage_free(struct kimage *image)
 	if (image->file_mode)
 		kimage_file_post_load_cleanup(image);
 
+	if (image->vmcoreinfo_data_copy)
+		vunmap(image->vmcoreinfo_data_copy);
+
 	kfree(image);
 }
 
@@ -1370,6 +1412,16 @@ void crash_save_vmcoreinfo(void)
 {
 	if (!vmcoreinfo_note)
 		return;
+	/*
+	 * Always use the safe copy to generate vmcoreinfo note.
+	 * Check kexec_crash_image, fadump does not use kexec.
+	 */
+	if (kexec_crash_image &&
+	    kexec_crash_image->vmcoreinfo_data_copy &&
+	    kexec_crash_image->vmcoreinfo_size_copy) {
+		vmcoreinfo_data = kexec_crash_image->vmcoreinfo_data_copy;
+		vmcoreinfo_size = kexec_crash_image->vmcoreinfo_size_copy;
+	}
 
 	vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
 	update_vmcoreinfo_note();
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index b56a558..6bb3e4d 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -244,6 +244,9 @@ void kimage_file_post_load_cleanup(struct kimage *image)
 			pr_err("Could not allocate swap buffer\n");
 			goto out_free_control_pages;
 		}
+	} else {
+		if (kimage_crash_copy_vmcoreinfo(image) < 0)
+			goto out_free_post_load_bufs;
 	}
 
 	*rimage = image;
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH v3 3/3] kdump: Relocate vmcoreinfo to the crash memory range
@ 2017-03-20  5:50   ` Xunlei Pang
  0 siblings, 0 replies; 36+ messages in thread
From: Xunlei Pang @ 2017-03-20  5:50 UTC (permalink / raw)
  To: linux-kernel, kexec
  Cc: Baoquan He, Petr Tesarik, Eric Biederman, Xunlei Pang, akpm, Dave Young

Currently vmcoreinfo data is updated at boot time in a subsys_initcall(),
so it is at risk of being modified by buggy code while the system
is running.

As a result, vmcore dumped may contain the wrong vmcoreinfo. Later on,
when using "crash", "makedumpfile", etc utility to parse this vmcore,
we probably will get "Segmentation fault" or other unexpected errors.

E.g. 1) wrong code overwrites vmcoreinfo_data; 2) further crashes the
system; 3) trigger kdump, then we obviously will fail to recognize the
crash context correctly due to the corrupted vmcoreinfo.

Now except for vmcoreinfo, all the crash data is well protected(including
the cpu note which is fully updated in the crash path, thus its correctness
is guaranteed). Given that vmcoreinfo data is a large chunk, we better
protect it as well.

To solve this, we relocate and copy vmcoreinfo_data to the crash memory
when kdump is loading via kexec syscalls. Because the whole crash memory
will be protected by existing arch_kexec_protect_crashkres() mechanism,
we naturally protect vmcoreinfo_data from write(even read) access under
kernel direct mapping after kdump is loaded.

Since kdump is usually loaded at the very early stage after boot, we can
trust the correctness of the vmcoreinfo data copied.

On the other hand, we still need to operate the vmcoreinfo safe copy when
crash happens to generate vmcoreinfo_note again, we rely on vmap() to map
out a new kernel virtual address and update to use this new one instead in
the following crash_save_vmcoreinfo().

BTW, we do not touch vmcoreinfo_note, because it will be fully updated
using the protected vmcoreinfo_data after crash which is surely correct
just like the cpu crash note.

Signed-off-by: Xunlei Pang <xlpang@redhat.com>
---
 include/linux/kexec.h |  3 +++
 kernel/kexec.c        |  3 +++
 kernel/kexec_core.c   | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/kexec_file.c   |  3 +++
 4 files changed, 61 insertions(+)

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 6918fda..fae2fc6 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -187,6 +187,8 @@ struct kimage {
 	unsigned long start;
 	struct page *control_code_page;
 	struct page *swap_page;
+	void *vmcoreinfo_data_copy; /* locates in the crash memory */
+	size_t vmcoreinfo_size_copy;
 
 	unsigned long nr_segments;
 	struct kexec_segment segment[KEXEC_SEGMENT_MAX];
@@ -243,6 +245,7 @@ extern asmlinkage long sys_kexec_load(unsigned long entry,
 extern int kernel_kexec(void);
 extern struct page *kimage_alloc_control_pages(struct kimage *image,
 						unsigned int order);
+extern int kimage_crash_copy_vmcoreinfo(struct kimage *image);
 extern int kexec_load_purgatory(struct kimage *image, unsigned long min,
 				unsigned long max, int top_down,
 				unsigned long *load_addr);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 980936a..e0c4dea 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -93,6 +93,9 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
 			pr_err("Could not allocate swap buffer\n");
 			goto out_free_control_pages;
 		}
+	} else {
+		if (kimage_crash_copy_vmcoreinfo(image) < 0)
+			goto out_free_image;
 	}
 
 	*rimage = image;
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index e503b48..7fad9f6 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -486,6 +486,45 @@ struct page *kimage_alloc_control_pages(struct kimage *image,
 	return pages;
 }
 
+int kimage_crash_copy_vmcoreinfo(struct kimage *image)
+{
+	struct page *vmcoreinfo_page;
+	void *safecopy;
+
+	WARN_ON(image->type != KEXEC_TYPE_CRASH);
+
+	if (!vmcoreinfo_size) {
+		pr_err("empty vmcoreinfo data\n");
+		return -ENOMEM;
+	}
+
+	/*
+	 * For kdump, allocate one vmcoreinfo safe copy from the
+	 * crash memory. as we have arch_kexec_protect_crashkres()
+	 * after kexec syscall, we naturally protect it from write
+	 * (even read) access under kernel direct mapping. But on
+	 * the other hand, we still need to operate it when crash
+	 * happens to generate vmcoreinfo note, hereby we rely on
+	 * vmap for this purpose.
+	 */
+	vmcoreinfo_page = kimage_alloc_control_pages(image, 0);
+	if (!vmcoreinfo_page) {
+		pr_err("could not allocate vmcoreinfo buffer\n");
+		return -ENOMEM;
+	}
+	safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL);
+	if (!safecopy) {
+		pr_err("cound not vmap vmcoreinfo buffer\n");
+		return -ENOMEM;
+	}
+
+	memcpy(safecopy, vmcoreinfo_data, vmcoreinfo_size);
+	image->vmcoreinfo_data_copy = safecopy;
+	image->vmcoreinfo_size_copy = vmcoreinfo_size;
+
+	return 0;
+}
+
 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 {
 	if (*image->entry != 0)
@@ -603,6 +642,9 @@ void kimage_free(struct kimage *image)
 	if (image->file_mode)
 		kimage_file_post_load_cleanup(image);
 
+	if (image->vmcoreinfo_data_copy)
+		vunmap(image->vmcoreinfo_data_copy);
+
 	kfree(image);
 }
 
@@ -1370,6 +1412,16 @@ void crash_save_vmcoreinfo(void)
 {
 	if (!vmcoreinfo_note)
 		return;
+	/*
+	 * Always use the safe copy to generate vmcoreinfo note.
+	 * Check kexec_crash_image, fadump does not use kexec.
+	 */
+	if (kexec_crash_image &&
+	    kexec_crash_image->vmcoreinfo_data_copy &&
+	    kexec_crash_image->vmcoreinfo_size_copy) {
+		vmcoreinfo_data = kexec_crash_image->vmcoreinfo_data_copy;
+		vmcoreinfo_size = kexec_crash_image->vmcoreinfo_size_copy;
+	}
 
 	vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
 	update_vmcoreinfo_note();
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index b56a558..6bb3e4d 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -244,6 +244,9 @@ void kimage_file_post_load_cleanup(struct kimage *image)
 			pr_err("Could not allocate swap buffer\n");
 			goto out_free_control_pages;
 		}
+	} else {
+		if (kimage_crash_copy_vmcoreinfo(image) < 0)
+			goto out_free_post_load_bufs;
 	}
 
 	*rimage = image;
-- 
1.8.3.1


_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
  2017-03-20  5:50 ` Xunlei Pang
@ 2017-03-21  3:33   ` Eric W. Biederman
  -1 siblings, 0 replies; 36+ messages in thread
From: Eric W. Biederman @ 2017-03-21  3:33 UTC (permalink / raw)
  To: Xunlei Pang
  Cc: linux-kernel, kexec, akpm, Dave Young, Baoquan He, Petr Tesarik,
	Mahesh Salgaonkar

Xunlei Pang <xlpang@redhat.com> writes:

> As Eric said,
> "what we need to do is move the variable vmcoreinfo_note out
> of the kernel's .bss section.  And modify the code to regenerate
> and keep this information in something like the control page.
>
> Definitely something like this needs a page all to itself, and ideally
> far away from any other kernel data structures.  I clearly was not
> watching closely the data someone decided to keep this silly thing
> in the kernel's .bss section."
>
> This patch allocates extra pages for these vmcoreinfo_XXX variables,
> one advantage is that it enhances some safety of vmcoreinfo, because
> vmcoreinfo now is kept far away from other kernel data structures.

Can you precede this patch with a patch that removes CRASHTIME from
vmcoreinfo?  If someone actually cares we can add a separate note that holds
a 64bit crashtime in the per cpu notes.  

As we are looking at reliability concerns removing CRASHTIME should make
everything in vmcoreinfo a boot time constant.  Which should simplify
everything considerably.

Which means we only need to worry about the per-cpu notes being written
at the time of a crash.

> Suggested-by: Eric Biederman <ebiederm@xmission.com>
> Signed-off-by: Xunlei Pang <xlpang@redhat.com>
> ---
>  arch/ia64/kernel/machine_kexec.c |  5 -----
>  arch/x86/kernel/crash.c          |  2 +-
>  include/linux/kexec.h            |  2 +-
>  kernel/kexec_core.c              | 29 ++++++++++++++++++++++++-----
>  kernel/ksysfs.c                  |  2 +-
>  5 files changed, 27 insertions(+), 13 deletions(-)
>
> diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c
> index 599507b..c14815d 100644
> --- a/arch/ia64/kernel/machine_kexec.c
> +++ b/arch/ia64/kernel/machine_kexec.c
> @@ -163,8 +163,3 @@ void arch_crash_save_vmcoreinfo(void)
>  #endif
>  }
>  
> -phys_addr_t paddr_vmcoreinfo_note(void)
> -{
> -	return ia64_tpa((unsigned long)(char *)&vmcoreinfo_note);
> -}
> -
> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
> index 3741461..4d35fbb 100644
> --- a/arch/x86/kernel/crash.c
> +++ b/arch/x86/kernel/crash.c
> @@ -456,7 +456,7 @@ static int prepare_elf64_headers(struct crash_elf_data *ced,
>  	bufp += sizeof(Elf64_Phdr);
>  	phdr->p_type = PT_NOTE;
>  	phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
> -	phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note);
> +	phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
>  	(ehdr->e_phnum)++;
>  
>  #ifdef CONFIG_X86_64
> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
> index e98e546..f1c601b 100644
> --- a/include/linux/kexec.h
> +++ b/include/linux/kexec.h
> @@ -317,7 +317,7 @@ extern void *kexec_purgatory_get_symbol_addr(struct kimage *image,
>  extern struct resource crashk_low_res;
>  typedef u32 note_buf_t[KEXEC_NOTE_BYTES/4];
>  extern note_buf_t __percpu *crash_notes;
> -extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
> +extern u32 *vmcoreinfo_note;
>  extern size_t vmcoreinfo_size;
>  extern size_t vmcoreinfo_max_size;
>  
> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
> index bfe62d5..e3a4bda 100644
> --- a/kernel/kexec_core.c
> +++ b/kernel/kexec_core.c
> @@ -52,10 +52,10 @@
>  note_buf_t __percpu *crash_notes;
>  
>  /* vmcoreinfo stuff */
> -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
> -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
> +static unsigned char *vmcoreinfo_data;
>  size_t vmcoreinfo_size;
> -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
> +size_t vmcoreinfo_max_size = VMCOREINFO_BYTES;
> +u32 *vmcoreinfo_note;
>  
>  /* Flag to indicate we are going to kexec a new kernel */
>  bool kexec_in_progress = false;
> @@ -1369,6 +1369,9 @@ static void update_vmcoreinfo_note(void)
>  
>  void crash_save_vmcoreinfo(void)
>  {
> +	if (!vmcoreinfo_note)
> +		return;
> +
>  	vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
>  	update_vmcoreinfo_note();
>  }
> @@ -1397,13 +1400,29 @@ void vmcoreinfo_append_str(const char *fmt, ...)
>  void __weak arch_crash_save_vmcoreinfo(void)
>  {}
>  
> -phys_addr_t __weak paddr_vmcoreinfo_note(void)
> +phys_addr_t paddr_vmcoreinfo_note(void)
>  {
> -	return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note);
> +	return __pa(vmcoreinfo_note);
>  }
>  
>  static int __init crash_save_vmcoreinfo_init(void)
>  {
> +	/* One page should be enough for VMCOREINFO_BYTES under all archs */
> +	vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
> +	if (!vmcoreinfo_data) {
> +		pr_warn("Memory allocation for vmcoreinfo_data failed\n");
> +		return -ENOMEM;
> +	}
> +
> +	vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
> +						GFP_KERNEL | __GFP_ZERO);
> +	if (!vmcoreinfo_note) {
> +		free_page((unsigned long)vmcoreinfo_data);
> +		vmcoreinfo_data = NULL;
> +		pr_warn("Memory allocation for vmcoreinfo_note failed\n");
> +		return -ENOMEM;
> +	}
> +
>  	VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
>  	VMCOREINFO_PAGESIZE(PAGE_SIZE);
>  
> diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
> index ee1bc1b..9de6fcc 100644
> --- a/kernel/ksysfs.c
> +++ b/kernel/ksysfs.c
> @@ -130,7 +130,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
>  {
>  	phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
>  	return sprintf(buf, "%pa %x\n", &vmcore_base,
> -		       (unsigned int)sizeof(vmcoreinfo_note));
> +			(unsigned int)VMCOREINFO_NOTE_SIZE);
>  }
>  KERNEL_ATTR_RO(vmcoreinfo);

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
@ 2017-03-21  3:33   ` Eric W. Biederman
  0 siblings, 0 replies; 36+ messages in thread
From: Eric W. Biederman @ 2017-03-21  3:33 UTC (permalink / raw)
  To: Xunlei Pang
  Cc: Baoquan He, Petr Tesarik, kexec, linux-kernel, akpm, Dave Young

Xunlei Pang <xlpang@redhat.com> writes:

> As Eric said,
> "what we need to do is move the variable vmcoreinfo_note out
> of the kernel's .bss section.  And modify the code to regenerate
> and keep this information in something like the control page.
>
> Definitely something like this needs a page all to itself, and ideally
> far away from any other kernel data structures.  I clearly was not
> watching closely the data someone decided to keep this silly thing
> in the kernel's .bss section."
>
> This patch allocates extra pages for these vmcoreinfo_XXX variables,
> one advantage is that it enhances some safety of vmcoreinfo, because
> vmcoreinfo now is kept far away from other kernel data structures.

Can you precede this patch with a patch that removes CRASHTIME from
vmcoreinfo?  If someone actually cares we can add a separate note that holds
a 64bit crashtime in the per cpu notes.  

As we are looking at reliability concerns removing CRASHTIME should make
everything in vmcoreinfo a boot time constant.  Which should simplify
everything considerably.

Which means we only need to worry about the per-cpu notes being written
at the time of a crash.

> Suggested-by: Eric Biederman <ebiederm@xmission.com>
> Signed-off-by: Xunlei Pang <xlpang@redhat.com>
> ---
>  arch/ia64/kernel/machine_kexec.c |  5 -----
>  arch/x86/kernel/crash.c          |  2 +-
>  include/linux/kexec.h            |  2 +-
>  kernel/kexec_core.c              | 29 ++++++++++++++++++++++++-----
>  kernel/ksysfs.c                  |  2 +-
>  5 files changed, 27 insertions(+), 13 deletions(-)
>
> diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c
> index 599507b..c14815d 100644
> --- a/arch/ia64/kernel/machine_kexec.c
> +++ b/arch/ia64/kernel/machine_kexec.c
> @@ -163,8 +163,3 @@ void arch_crash_save_vmcoreinfo(void)
>  #endif
>  }
>  
> -phys_addr_t paddr_vmcoreinfo_note(void)
> -{
> -	return ia64_tpa((unsigned long)(char *)&vmcoreinfo_note);
> -}
> -
> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
> index 3741461..4d35fbb 100644
> --- a/arch/x86/kernel/crash.c
> +++ b/arch/x86/kernel/crash.c
> @@ -456,7 +456,7 @@ static int prepare_elf64_headers(struct crash_elf_data *ced,
>  	bufp += sizeof(Elf64_Phdr);
>  	phdr->p_type = PT_NOTE;
>  	phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
> -	phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note);
> +	phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
>  	(ehdr->e_phnum)++;
>  
>  #ifdef CONFIG_X86_64
> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
> index e98e546..f1c601b 100644
> --- a/include/linux/kexec.h
> +++ b/include/linux/kexec.h
> @@ -317,7 +317,7 @@ extern void *kexec_purgatory_get_symbol_addr(struct kimage *image,
>  extern struct resource crashk_low_res;
>  typedef u32 note_buf_t[KEXEC_NOTE_BYTES/4];
>  extern note_buf_t __percpu *crash_notes;
> -extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
> +extern u32 *vmcoreinfo_note;
>  extern size_t vmcoreinfo_size;
>  extern size_t vmcoreinfo_max_size;
>  
> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
> index bfe62d5..e3a4bda 100644
> --- a/kernel/kexec_core.c
> +++ b/kernel/kexec_core.c
> @@ -52,10 +52,10 @@
>  note_buf_t __percpu *crash_notes;
>  
>  /* vmcoreinfo stuff */
> -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
> -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
> +static unsigned char *vmcoreinfo_data;
>  size_t vmcoreinfo_size;
> -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
> +size_t vmcoreinfo_max_size = VMCOREINFO_BYTES;
> +u32 *vmcoreinfo_note;
>  
>  /* Flag to indicate we are going to kexec a new kernel */
>  bool kexec_in_progress = false;
> @@ -1369,6 +1369,9 @@ static void update_vmcoreinfo_note(void)
>  
>  void crash_save_vmcoreinfo(void)
>  {
> +	if (!vmcoreinfo_note)
> +		return;
> +
>  	vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
>  	update_vmcoreinfo_note();
>  }
> @@ -1397,13 +1400,29 @@ void vmcoreinfo_append_str(const char *fmt, ...)
>  void __weak arch_crash_save_vmcoreinfo(void)
>  {}
>  
> -phys_addr_t __weak paddr_vmcoreinfo_note(void)
> +phys_addr_t paddr_vmcoreinfo_note(void)
>  {
> -	return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note);
> +	return __pa(vmcoreinfo_note);
>  }
>  
>  static int __init crash_save_vmcoreinfo_init(void)
>  {
> +	/* One page should be enough for VMCOREINFO_BYTES under all archs */
> +	vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
> +	if (!vmcoreinfo_data) {
> +		pr_warn("Memory allocation for vmcoreinfo_data failed\n");
> +		return -ENOMEM;
> +	}
> +
> +	vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
> +						GFP_KERNEL | __GFP_ZERO);
> +	if (!vmcoreinfo_note) {
> +		free_page((unsigned long)vmcoreinfo_data);
> +		vmcoreinfo_data = NULL;
> +		pr_warn("Memory allocation for vmcoreinfo_note failed\n");
> +		return -ENOMEM;
> +	}
> +
>  	VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
>  	VMCOREINFO_PAGESIZE(PAGE_SIZE);
>  
> diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
> index ee1bc1b..9de6fcc 100644
> --- a/kernel/ksysfs.c
> +++ b/kernel/ksysfs.c
> @@ -130,7 +130,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
>  {
>  	phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
>  	return sprintf(buf, "%pa %x\n", &vmcore_base,
> -		       (unsigned int)sizeof(vmcoreinfo_note));
> +			(unsigned int)VMCOREINFO_NOTE_SIZE);
>  }
>  KERNEL_ATTR_RO(vmcoreinfo);

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
  2017-03-20  5:50 ` Xunlei Pang
@ 2017-03-21  9:27   ` Petr Tesarik
  -1 siblings, 0 replies; 36+ messages in thread
From: Petr Tesarik @ 2017-03-21  9:27 UTC (permalink / raw)
  To: Xunlei Pang
  Cc: linux-kernel, kexec, Baoquan He, Eric Biederman, akpm, Dave Young

On Mon, 20 Mar 2017 13:50:31 +0800
Xunlei Pang <xlpang@redhat.com> wrote:

> As Eric said,
> "what we need to do is move the variable vmcoreinfo_note out
> of the kernel's .bss section.  And modify the code to regenerate
> and keep this information in something like the control page.
> 
> Definitely something like this needs a page all to itself, and ideally
> far away from any other kernel data structures.  I clearly was not
> watching closely the data someone decided to keep this silly thing
> in the kernel's .bss section."
> 
> This patch allocates extra pages for these vmcoreinfo_XXX variables,
> one advantage is that it enhances some safety of vmcoreinfo, because
> vmcoreinfo now is kept far away from other kernel data structures.

Yes, I like this patch set very much now. Thank you!

Petr T

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
@ 2017-03-21  9:27   ` Petr Tesarik
  0 siblings, 0 replies; 36+ messages in thread
From: Petr Tesarik @ 2017-03-21  9:27 UTC (permalink / raw)
  To: Xunlei Pang
  Cc: Baoquan He, kexec, linux-kernel, Eric Biederman, akpm, Dave Young

On Mon, 20 Mar 2017 13:50:31 +0800
Xunlei Pang <xlpang@redhat.com> wrote:

> As Eric said,
> "what we need to do is move the variable vmcoreinfo_note out
> of the kernel's .bss section.  And modify the code to regenerate
> and keep this information in something like the control page.
> 
> Definitely something like this needs a page all to itself, and ideally
> far away from any other kernel data structures.  I clearly was not
> watching closely the data someone decided to keep this silly thing
> in the kernel's .bss section."
> 
> This patch allocates extra pages for these vmcoreinfo_XXX variables,
> one advantage is that it enhances some safety of vmcoreinfo, because
> vmcoreinfo now is kept far away from other kernel data structures.

Yes, I like this patch set very much now. Thank you!

Petr T

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
  2017-03-21  3:33   ` Eric W. Biederman
@ 2017-03-22  2:55     ` Dave Young
  -1 siblings, 0 replies; 36+ messages in thread
From: Dave Young @ 2017-03-22  2:55 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Xunlei Pang, Baoquan He, Petr Tesarik, kexec, linux-kernel, akpm,
	Atsushi Kumagai

On 03/20/17 at 10:33pm, Eric W. Biederman wrote:
> Xunlei Pang <xlpang@redhat.com> writes:
> 
> > As Eric said,
> > "what we need to do is move the variable vmcoreinfo_note out
> > of the kernel's .bss section.  And modify the code to regenerate
> > and keep this information in something like the control page.
> >
> > Definitely something like this needs a page all to itself, and ideally
> > far away from any other kernel data structures.  I clearly was not
> > watching closely the data someone decided to keep this silly thing
> > in the kernel's .bss section."
> >
> > This patch allocates extra pages for these vmcoreinfo_XXX variables,
> > one advantage is that it enhances some safety of vmcoreinfo, because
> > vmcoreinfo now is kept far away from other kernel data structures.
> 
> Can you precede this patch with a patch that removes CRASHTIME from
> vmcoreinfo?  If someone actually cares we can add a separate note that holds
> a 64bit crashtime in the per cpu notes.  

I think makedumpfile is using it, but I also vote to remove the
CRASHTIME. It is better not to do this while crashing and a makedumpfile
userspace patch is needed to drop the use of it.

> 
> As we are looking at reliability concerns removing CRASHTIME should make
> everything in vmcoreinfo a boot time constant.  Which should simplify
> everything considerably.

It is a nice improvement..

> 
> Which means we only need to worry about the per-cpu notes being written
> at the time of a crash.
> 
> > Suggested-by: Eric Biederman <ebiederm@xmission.com>
> > Signed-off-by: Xunlei Pang <xlpang@redhat.com>
> > ---
> >  arch/ia64/kernel/machine_kexec.c |  5 -----
> >  arch/x86/kernel/crash.c          |  2 +-
> >  include/linux/kexec.h            |  2 +-
> >  kernel/kexec_core.c              | 29 ++++++++++++++++++++++++-----
> >  kernel/ksysfs.c                  |  2 +-
> >  5 files changed, 27 insertions(+), 13 deletions(-)
> >
> > diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c
> > index 599507b..c14815d 100644
> > --- a/arch/ia64/kernel/machine_kexec.c
> > +++ b/arch/ia64/kernel/machine_kexec.c
> > @@ -163,8 +163,3 @@ void arch_crash_save_vmcoreinfo(void)
> >  #endif
> >  }
> >  
> > -phys_addr_t paddr_vmcoreinfo_note(void)
> > -{
> > -	return ia64_tpa((unsigned long)(char *)&vmcoreinfo_note);
> > -}
> > -
> > diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
> > index 3741461..4d35fbb 100644
> > --- a/arch/x86/kernel/crash.c
> > +++ b/arch/x86/kernel/crash.c
> > @@ -456,7 +456,7 @@ static int prepare_elf64_headers(struct crash_elf_data *ced,
> >  	bufp += sizeof(Elf64_Phdr);
> >  	phdr->p_type = PT_NOTE;
> >  	phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
> > -	phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note);
> > +	phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
> >  	(ehdr->e_phnum)++;
> >  
> >  #ifdef CONFIG_X86_64
> > diff --git a/include/linux/kexec.h b/include/linux/kexec.h
> > index e98e546..f1c601b 100644
> > --- a/include/linux/kexec.h
> > +++ b/include/linux/kexec.h
> > @@ -317,7 +317,7 @@ extern void *kexec_purgatory_get_symbol_addr(struct kimage *image,
> >  extern struct resource crashk_low_res;
> >  typedef u32 note_buf_t[KEXEC_NOTE_BYTES/4];
> >  extern note_buf_t __percpu *crash_notes;
> > -extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
> > +extern u32 *vmcoreinfo_note;
> >  extern size_t vmcoreinfo_size;
> >  extern size_t vmcoreinfo_max_size;
> >  
> > diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
> > index bfe62d5..e3a4bda 100644
> > --- a/kernel/kexec_core.c
> > +++ b/kernel/kexec_core.c
> > @@ -52,10 +52,10 @@
> >  note_buf_t __percpu *crash_notes;
> >  
> >  /* vmcoreinfo stuff */
> > -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
> > -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
> > +static unsigned char *vmcoreinfo_data;
> >  size_t vmcoreinfo_size;
> > -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
> > +size_t vmcoreinfo_max_size = VMCOREINFO_BYTES;
> > +u32 *vmcoreinfo_note;
> >  
> >  /* Flag to indicate we are going to kexec a new kernel */
> >  bool kexec_in_progress = false;
> > @@ -1369,6 +1369,9 @@ static void update_vmcoreinfo_note(void)
> >  
> >  void crash_save_vmcoreinfo(void)
> >  {
> > +	if (!vmcoreinfo_note)
> > +		return;
> > +
> >  	vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
> >  	update_vmcoreinfo_note();
> >  }
> > @@ -1397,13 +1400,29 @@ void vmcoreinfo_append_str(const char *fmt, ...)
> >  void __weak arch_crash_save_vmcoreinfo(void)
> >  {}
> >  
> > -phys_addr_t __weak paddr_vmcoreinfo_note(void)
> > +phys_addr_t paddr_vmcoreinfo_note(void)
> >  {
> > -	return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note);
> > +	return __pa(vmcoreinfo_note);
> >  }
> >  
> >  static int __init crash_save_vmcoreinfo_init(void)
> >  {
> > +	/* One page should be enough for VMCOREINFO_BYTES under all archs */
> > +	vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
> > +	if (!vmcoreinfo_data) {
> > +		pr_warn("Memory allocation for vmcoreinfo_data failed\n");
> > +		return -ENOMEM;
> > +	}
> > +
> > +	vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
> > +						GFP_KERNEL | __GFP_ZERO);
> > +	if (!vmcoreinfo_note) {
> > +		free_page((unsigned long)vmcoreinfo_data);
> > +		vmcoreinfo_data = NULL;
> > +		pr_warn("Memory allocation for vmcoreinfo_note failed\n");
> > +		return -ENOMEM;
> > +	}
> > +
> >  	VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
> >  	VMCOREINFO_PAGESIZE(PAGE_SIZE);
> >  
> > diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
> > index ee1bc1b..9de6fcc 100644
> > --- a/kernel/ksysfs.c
> > +++ b/kernel/ksysfs.c
> > @@ -130,7 +130,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
> >  {
> >  	phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
> >  	return sprintf(buf, "%pa %x\n", &vmcore_base,
> > -		       (unsigned int)sizeof(vmcoreinfo_note));
> > +			(unsigned int)VMCOREINFO_NOTE_SIZE);
> >  }
> >  KERNEL_ATTR_RO(vmcoreinfo);
> 
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec

Thanks
Dave

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
@ 2017-03-22  2:55     ` Dave Young
  0 siblings, 0 replies; 36+ messages in thread
From: Dave Young @ 2017-03-22  2:55 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Baoquan He, Xunlei Pang, Atsushi Kumagai, Petr Tesarik,
	linux-kernel, akpm, kexec

On 03/20/17 at 10:33pm, Eric W. Biederman wrote:
> Xunlei Pang <xlpang@redhat.com> writes:
> 
> > As Eric said,
> > "what we need to do is move the variable vmcoreinfo_note out
> > of the kernel's .bss section.  And modify the code to regenerate
> > and keep this information in something like the control page.
> >
> > Definitely something like this needs a page all to itself, and ideally
> > far away from any other kernel data structures.  I clearly was not
> > watching closely the data someone decided to keep this silly thing
> > in the kernel's .bss section."
> >
> > This patch allocates extra pages for these vmcoreinfo_XXX variables,
> > one advantage is that it enhances some safety of vmcoreinfo, because
> > vmcoreinfo now is kept far away from other kernel data structures.
> 
> Can you precede this patch with a patch that removes CRASHTIME from
> vmcoreinfo?  If someone actually cares we can add a separate note that holds
> a 64bit crashtime in the per cpu notes.  

I think makedumpfile is using it, but I also vote to remove the
CRASHTIME. It is better not to do this while crashing and a makedumpfile
userspace patch is needed to drop the use of it.

> 
> As we are looking at reliability concerns removing CRASHTIME should make
> everything in vmcoreinfo a boot time constant.  Which should simplify
> everything considerably.

It is a nice improvement..

> 
> Which means we only need to worry about the per-cpu notes being written
> at the time of a crash.
> 
> > Suggested-by: Eric Biederman <ebiederm@xmission.com>
> > Signed-off-by: Xunlei Pang <xlpang@redhat.com>
> > ---
> >  arch/ia64/kernel/machine_kexec.c |  5 -----
> >  arch/x86/kernel/crash.c          |  2 +-
> >  include/linux/kexec.h            |  2 +-
> >  kernel/kexec_core.c              | 29 ++++++++++++++++++++++++-----
> >  kernel/ksysfs.c                  |  2 +-
> >  5 files changed, 27 insertions(+), 13 deletions(-)
> >
> > diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c
> > index 599507b..c14815d 100644
> > --- a/arch/ia64/kernel/machine_kexec.c
> > +++ b/arch/ia64/kernel/machine_kexec.c
> > @@ -163,8 +163,3 @@ void arch_crash_save_vmcoreinfo(void)
> >  #endif
> >  }
> >  
> > -phys_addr_t paddr_vmcoreinfo_note(void)
> > -{
> > -	return ia64_tpa((unsigned long)(char *)&vmcoreinfo_note);
> > -}
> > -
> > diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
> > index 3741461..4d35fbb 100644
> > --- a/arch/x86/kernel/crash.c
> > +++ b/arch/x86/kernel/crash.c
> > @@ -456,7 +456,7 @@ static int prepare_elf64_headers(struct crash_elf_data *ced,
> >  	bufp += sizeof(Elf64_Phdr);
> >  	phdr->p_type = PT_NOTE;
> >  	phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
> > -	phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note);
> > +	phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
> >  	(ehdr->e_phnum)++;
> >  
> >  #ifdef CONFIG_X86_64
> > diff --git a/include/linux/kexec.h b/include/linux/kexec.h
> > index e98e546..f1c601b 100644
> > --- a/include/linux/kexec.h
> > +++ b/include/linux/kexec.h
> > @@ -317,7 +317,7 @@ extern void *kexec_purgatory_get_symbol_addr(struct kimage *image,
> >  extern struct resource crashk_low_res;
> >  typedef u32 note_buf_t[KEXEC_NOTE_BYTES/4];
> >  extern note_buf_t __percpu *crash_notes;
> > -extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
> > +extern u32 *vmcoreinfo_note;
> >  extern size_t vmcoreinfo_size;
> >  extern size_t vmcoreinfo_max_size;
> >  
> > diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
> > index bfe62d5..e3a4bda 100644
> > --- a/kernel/kexec_core.c
> > +++ b/kernel/kexec_core.c
> > @@ -52,10 +52,10 @@
> >  note_buf_t __percpu *crash_notes;
> >  
> >  /* vmcoreinfo stuff */
> > -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
> > -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
> > +static unsigned char *vmcoreinfo_data;
> >  size_t vmcoreinfo_size;
> > -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
> > +size_t vmcoreinfo_max_size = VMCOREINFO_BYTES;
> > +u32 *vmcoreinfo_note;
> >  
> >  /* Flag to indicate we are going to kexec a new kernel */
> >  bool kexec_in_progress = false;
> > @@ -1369,6 +1369,9 @@ static void update_vmcoreinfo_note(void)
> >  
> >  void crash_save_vmcoreinfo(void)
> >  {
> > +	if (!vmcoreinfo_note)
> > +		return;
> > +
> >  	vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
> >  	update_vmcoreinfo_note();
> >  }
> > @@ -1397,13 +1400,29 @@ void vmcoreinfo_append_str(const char *fmt, ...)
> >  void __weak arch_crash_save_vmcoreinfo(void)
> >  {}
> >  
> > -phys_addr_t __weak paddr_vmcoreinfo_note(void)
> > +phys_addr_t paddr_vmcoreinfo_note(void)
> >  {
> > -	return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note);
> > +	return __pa(vmcoreinfo_note);
> >  }
> >  
> >  static int __init crash_save_vmcoreinfo_init(void)
> >  {
> > +	/* One page should be enough for VMCOREINFO_BYTES under all archs */
> > +	vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
> > +	if (!vmcoreinfo_data) {
> > +		pr_warn("Memory allocation for vmcoreinfo_data failed\n");
> > +		return -ENOMEM;
> > +	}
> > +
> > +	vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
> > +						GFP_KERNEL | __GFP_ZERO);
> > +	if (!vmcoreinfo_note) {
> > +		free_page((unsigned long)vmcoreinfo_data);
> > +		vmcoreinfo_data = NULL;
> > +		pr_warn("Memory allocation for vmcoreinfo_note failed\n");
> > +		return -ENOMEM;
> > +	}
> > +
> >  	VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
> >  	VMCOREINFO_PAGESIZE(PAGE_SIZE);
> >  
> > diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
> > index ee1bc1b..9de6fcc 100644
> > --- a/kernel/ksysfs.c
> > +++ b/kernel/ksysfs.c
> > @@ -130,7 +130,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
> >  {
> >  	phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
> >  	return sprintf(buf, "%pa %x\n", &vmcore_base,
> > -		       (unsigned int)sizeof(vmcoreinfo_note));
> > +			(unsigned int)VMCOREINFO_NOTE_SIZE);
> >  }
> >  KERNEL_ATTR_RO(vmcoreinfo);
> 
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec

Thanks
Dave

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
  2017-03-22  2:55     ` Dave Young
@ 2017-03-22  3:18       ` Eric W. Biederman
  -1 siblings, 0 replies; 36+ messages in thread
From: Eric W. Biederman @ 2017-03-22  3:18 UTC (permalink / raw)
  To: Dave Young
  Cc: Xunlei Pang, Baoquan He, Petr Tesarik, kexec, linux-kernel, akpm,
	Atsushi Kumagai

Dave Young <dyoung@redhat.com> writes:

> On 03/20/17 at 10:33pm, Eric W. Biederman wrote:
>> Xunlei Pang <xlpang@redhat.com> writes:
>> 
>> > As Eric said,
>> > "what we need to do is move the variable vmcoreinfo_note out
>> > of the kernel's .bss section.  And modify the code to regenerate
>> > and keep this information in something like the control page.
>> >
>> > Definitely something like this needs a page all to itself, and ideally
>> > far away from any other kernel data structures.  I clearly was not
>> > watching closely the data someone decided to keep this silly thing
>> > in the kernel's .bss section."
>> >
>> > This patch allocates extra pages for these vmcoreinfo_XXX variables,
>> > one advantage is that it enhances some safety of vmcoreinfo, because
>> > vmcoreinfo now is kept far away from other kernel data structures.
>> 
>> Can you precede this patch with a patch that removes CRASHTIME from
>> vmcoreinfo?  If someone actually cares we can add a separate note that holds
>> a 64bit crashtime in the per cpu notes.  
>
> I think makedumpfile is using it, but I also vote to remove the
> CRASHTIME. It is better not to do this while crashing and a makedumpfile
> userspace patch is needed to drop the use of it.
>
>> 
>> As we are looking at reliability concerns removing CRASHTIME should make
>> everything in vmcoreinfo a boot time constant.  Which should simplify
>> everything considerably.
>
> It is a nice improvement..

We also need to take a close look at what s390 is doing with vmcoreinfo.
As apparently it is reading it in a different kind of crashdump process.

Eric

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
@ 2017-03-22  3:18       ` Eric W. Biederman
  0 siblings, 0 replies; 36+ messages in thread
From: Eric W. Biederman @ 2017-03-22  3:18 UTC (permalink / raw)
  To: Dave Young
  Cc: Baoquan He, Xunlei Pang, Atsushi Kumagai, Petr Tesarik,
	linux-kernel, akpm, kexec

Dave Young <dyoung@redhat.com> writes:

> On 03/20/17 at 10:33pm, Eric W. Biederman wrote:
>> Xunlei Pang <xlpang@redhat.com> writes:
>> 
>> > As Eric said,
>> > "what we need to do is move the variable vmcoreinfo_note out
>> > of the kernel's .bss section.  And modify the code to regenerate
>> > and keep this information in something like the control page.
>> >
>> > Definitely something like this needs a page all to itself, and ideally
>> > far away from any other kernel data structures.  I clearly was not
>> > watching closely the data someone decided to keep this silly thing
>> > in the kernel's .bss section."
>> >
>> > This patch allocates extra pages for these vmcoreinfo_XXX variables,
>> > one advantage is that it enhances some safety of vmcoreinfo, because
>> > vmcoreinfo now is kept far away from other kernel data structures.
>> 
>> Can you precede this patch with a patch that removes CRASHTIME from
>> vmcoreinfo?  If someone actually cares we can add a separate note that holds
>> a 64bit crashtime in the per cpu notes.  
>
> I think makedumpfile is using it, but I also vote to remove the
> CRASHTIME. It is better not to do this while crashing and a makedumpfile
> userspace patch is needed to drop the use of it.
>
>> 
>> As we are looking at reliability concerns removing CRASHTIME should make
>> everything in vmcoreinfo a boot time constant.  Which should simplify
>> everything considerably.
>
> It is a nice improvement..

We also need to take a close look at what s390 is doing with vmcoreinfo.
As apparently it is reading it in a different kind of crashdump process.

Eric

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
  2017-03-22  3:18       ` Eric W. Biederman
@ 2017-03-22  4:30         ` Dave Young
  -1 siblings, 0 replies; 36+ messages in thread
From: Dave Young @ 2017-03-22  4:30 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Baoquan He, Xunlei Pang, Atsushi Kumagai, Petr Tesarik,
	linux-kernel, akpm, kexec, holzheu, mahesh, hbathini

On 03/21/17 at 10:18pm, Eric W. Biederman wrote:
> Dave Young <dyoung@redhat.com> writes:
> 
> > On 03/20/17 at 10:33pm, Eric W. Biederman wrote:
> >> Xunlei Pang <xlpang@redhat.com> writes:
> >> 
> >> > As Eric said,
> >> > "what we need to do is move the variable vmcoreinfo_note out
> >> > of the kernel's .bss section.  And modify the code to regenerate
> >> > and keep this information in something like the control page.
> >> >
> >> > Definitely something like this needs a page all to itself, and ideally
> >> > far away from any other kernel data structures.  I clearly was not
> >> > watching closely the data someone decided to keep this silly thing
> >> > in the kernel's .bss section."
> >> >
> >> > This patch allocates extra pages for these vmcoreinfo_XXX variables,
> >> > one advantage is that it enhances some safety of vmcoreinfo, because
> >> > vmcoreinfo now is kept far away from other kernel data structures.
> >> 
> >> Can you precede this patch with a patch that removes CRASHTIME from
> >> vmcoreinfo?  If someone actually cares we can add a separate note that holds
> >> a 64bit crashtime in the per cpu notes.  
> >
> > I think makedumpfile is using it, but I also vote to remove the
> > CRASHTIME. It is better not to do this while crashing and a makedumpfile
> > userspace patch is needed to drop the use of it.
> >
> >> 
> >> As we are looking at reliability concerns removing CRASHTIME should make
> >> everything in vmcoreinfo a boot time constant.  Which should simplify
> >> everything considerably.
> >
> > It is a nice improvement..
> 
> We also need to take a close look at what s390 is doing with vmcoreinfo.
> As apparently it is reading it in a different kind of crashdump process.

Yes, need careful review from s390 and maybe ppc64 especially about
patch 2/3, better to have comments from IBM about s390 dump tool and ppc
fadump. Added more cc.

Thanks
Dave

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
@ 2017-03-22  4:30         ` Dave Young
  0 siblings, 0 replies; 36+ messages in thread
From: Dave Young @ 2017-03-22  4:30 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Baoquan He, Xunlei Pang, Atsushi Kumagai, Petr Tesarik,
	linux-kernel, hbathini, akpm, holzheu, kexec

On 03/21/17 at 10:18pm, Eric W. Biederman wrote:
> Dave Young <dyoung@redhat.com> writes:
> 
> > On 03/20/17 at 10:33pm, Eric W. Biederman wrote:
> >> Xunlei Pang <xlpang@redhat.com> writes:
> >> 
> >> > As Eric said,
> >> > "what we need to do is move the variable vmcoreinfo_note out
> >> > of the kernel's .bss section.  And modify the code to regenerate
> >> > and keep this information in something like the control page.
> >> >
> >> > Definitely something like this needs a page all to itself, and ideally
> >> > far away from any other kernel data structures.  I clearly was not
> >> > watching closely the data someone decided to keep this silly thing
> >> > in the kernel's .bss section."
> >> >
> >> > This patch allocates extra pages for these vmcoreinfo_XXX variables,
> >> > one advantage is that it enhances some safety of vmcoreinfo, because
> >> > vmcoreinfo now is kept far away from other kernel data structures.
> >> 
> >> Can you precede this patch with a patch that removes CRASHTIME from
> >> vmcoreinfo?  If someone actually cares we can add a separate note that holds
> >> a 64bit crashtime in the per cpu notes.  
> >
> > I think makedumpfile is using it, but I also vote to remove the
> > CRASHTIME. It is better not to do this while crashing and a makedumpfile
> > userspace patch is needed to drop the use of it.
> >
> >> 
> >> As we are looking at reliability concerns removing CRASHTIME should make
> >> everything in vmcoreinfo a boot time constant.  Which should simplify
> >> everything considerably.
> >
> > It is a nice improvement..
> 
> We also need to take a close look at what s390 is doing with vmcoreinfo.
> As apparently it is reading it in a different kind of crashdump process.

Yes, need careful review from s390 and maybe ppc64 especially about
patch 2/3, better to have comments from IBM about s390 dump tool and ppc
fadump. Added more cc.

Thanks
Dave

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
  2017-03-21  3:33   ` Eric W. Biederman
@ 2017-03-22  8:55     ` Xunlei Pang
  -1 siblings, 0 replies; 36+ messages in thread
From: Xunlei Pang @ 2017-03-22  8:55 UTC (permalink / raw)
  To: Eric W. Biederman, Xunlei Pang
  Cc: linux-kernel, kexec, akpm, Dave Young, Baoquan He, Petr Tesarik,
	Mahesh Salgaonkar

On 03/21/2017 at 11:33 AM, Eric W. Biederman wrote:
> Xunlei Pang <xlpang@redhat.com> writes:
>
>> As Eric said,
>> "what we need to do is move the variable vmcoreinfo_note out
>> of the kernel's .bss section.  And modify the code to regenerate
>> and keep this information in something like the control page.
>>
>> Definitely something like this needs a page all to itself, and ideally
>> far away from any other kernel data structures.  I clearly was not
>> watching closely the data someone decided to keep this silly thing
>> in the kernel's .bss section."
>>
>> This patch allocates extra pages for these vmcoreinfo_XXX variables,
>> one advantage is that it enhances some safety of vmcoreinfo, because
>> vmcoreinfo now is kept far away from other kernel data structures.
> Can you precede this patch with a patch that removes CRASHTIME from
> vmcoreinfo?  If someone actually cares we can add a separate note that holds
> a 64bit crashtime in the per cpu notes.  

Hi Eric,

Thanks for your review, I took some time and did some investigation.

Removing "CRASHTIME=X" from vmcoreinfo_note will break user-space tools.
For example, makedumpfile obtains the physical address (PA) of the
vmcoreinfo note by reading "/sys/kernel/vmcoreinfo", computes its
"VA = PA | PAGE_OFFSET", and then reads the timestamp. This happens in the
first kernel, even before kdump is loaded.

Actually, even moving vmcoreinfo_note[] into the crash memory may
cause problems; for example, on s390 systems the crash memory
range will be unmapped, so I guess it carries some risk.

Additionally, there is no available way for us to allocate a page from the
crash memory during kernel initialization, we only can achieve this during
the kexec syscalls. There is not a neat way to implement a function to
allocate pages from the crash memory during kernel initialization without
some hack code added, because user-space tools(like kexec-tools) can
allocate the crash segment by their own ways from the crash memory.

That's why I only copy vmcoreinfo_data[] into the crash memory, and
not touch vmcoreinfo_note, so vmcoreinfo_data is well protected in
the crash memory copy, then in crash_save_vmcoreinfo(), we copy
this guaranteed copy into vmcoreinfo_note[], so the correctness of
vmcoreinfo_note[] is guaranteed. This is what [PATCH v3 3/3] does.

The current crash_save_vmcoreinfo() only involves memory (memcpy)
operations, even for get_seconds() (no locks); the only risk I can think
of now is that the vmcoreinfo_note pointer may be corrupted. If it is a concern,
I guess we can put it into "struct kimage" just like vmcoreinfo_XXX_copy
in this patch. After all, if the kimage structure is corrupted when the crash
happens, we can do nothing but accept our fate.

So is it really worth eliminating crash_save_vmcoreinfo()?

Regards,
Xunlei

>
> As we are looking at reliability concerns removing CRASHTIME should make
> everything in vmcoreinfo a boot time constant.  Which should simplify
> everything considerably.
>
> Which means we only need to worry about the per-cpu notes being written
> at the time of a crash.
>
>> Suggested-by: Eric Biederman <ebiederm@xmission.com>
>> Signed-off-by: Xunlei Pang <xlpang@redhat.com>
>> ---
>>  arch/ia64/kernel/machine_kexec.c |  5 -----
>>  arch/x86/kernel/crash.c          |  2 +-
>>  include/linux/kexec.h            |  2 +-
>>  kernel/kexec_core.c              | 29 ++++++++++++++++++++++++-----
>>  kernel/ksysfs.c                  |  2 +-
>>  5 files changed, 27 insertions(+), 13 deletions(-)
>>
>> diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c
>> index 599507b..c14815d 100644
>> --- a/arch/ia64/kernel/machine_kexec.c
>> +++ b/arch/ia64/kernel/machine_kexec.c
>> @@ -163,8 +163,3 @@ void arch_crash_save_vmcoreinfo(void)
>>  #endif
>>  }
>>  
>> -phys_addr_t paddr_vmcoreinfo_note(void)
>> -{
>> -	return ia64_tpa((unsigned long)(char *)&vmcoreinfo_note);
>> -}
>> -
>> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
>> index 3741461..4d35fbb 100644
>> --- a/arch/x86/kernel/crash.c
>> +++ b/arch/x86/kernel/crash.c
>> @@ -456,7 +456,7 @@ static int prepare_elf64_headers(struct crash_elf_data *ced,
>>  	bufp += sizeof(Elf64_Phdr);
>>  	phdr->p_type = PT_NOTE;
>>  	phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
>> -	phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note);
>> +	phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
>>  	(ehdr->e_phnum)++;
>>  
>>  #ifdef CONFIG_X86_64
>> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
>> index e98e546..f1c601b 100644
>> --- a/include/linux/kexec.h
>> +++ b/include/linux/kexec.h
>> @@ -317,7 +317,7 @@ extern void *kexec_purgatory_get_symbol_addr(struct kimage *image,
>>  extern struct resource crashk_low_res;
>>  typedef u32 note_buf_t[KEXEC_NOTE_BYTES/4];
>>  extern note_buf_t __percpu *crash_notes;
>> -extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
>> +extern u32 *vmcoreinfo_note;
>>  extern size_t vmcoreinfo_size;
>>  extern size_t vmcoreinfo_max_size;
>>  
>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
>> index bfe62d5..e3a4bda 100644
>> --- a/kernel/kexec_core.c
>> +++ b/kernel/kexec_core.c
>> @@ -52,10 +52,10 @@
>>  note_buf_t __percpu *crash_notes;
>>  
>>  /* vmcoreinfo stuff */
>> -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
>> -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
>> +static unsigned char *vmcoreinfo_data;
>>  size_t vmcoreinfo_size;
>> -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
>> +size_t vmcoreinfo_max_size = VMCOREINFO_BYTES;
>> +u32 *vmcoreinfo_note;
>>  
>>  /* Flag to indicate we are going to kexec a new kernel */
>>  bool kexec_in_progress = false;
>> @@ -1369,6 +1369,9 @@ static void update_vmcoreinfo_note(void)
>>  
>>  void crash_save_vmcoreinfo(void)
>>  {
>> +	if (!vmcoreinfo_note)
>> +		return;
>> +
>>  	vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
>>  	update_vmcoreinfo_note();
>>  }
>> @@ -1397,13 +1400,29 @@ void vmcoreinfo_append_str(const char *fmt, ...)
>>  void __weak arch_crash_save_vmcoreinfo(void)
>>  {}
>>  
>> -phys_addr_t __weak paddr_vmcoreinfo_note(void)
>> +phys_addr_t paddr_vmcoreinfo_note(void)
>>  {
>> -	return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note);
>> +	return __pa(vmcoreinfo_note);
>>  }
>>  
>>  static int __init crash_save_vmcoreinfo_init(void)
>>  {
>> +	/* One page should be enough for VMCOREINFO_BYTES under all archs */
>> +	vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
>> +	if (!vmcoreinfo_data) {
>> +		pr_warn("Memory allocation for vmcoreinfo_data failed\n");
>> +		return -ENOMEM;
>> +	}
>> +
>> +	vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
>> +						GFP_KERNEL | __GFP_ZERO);
>> +	if (!vmcoreinfo_note) {
>> +		free_page((unsigned long)vmcoreinfo_data);
>> +		vmcoreinfo_data = NULL;
>> +		pr_warn("Memory allocation for vmcoreinfo_note failed\n");
>> +		return -ENOMEM;
>> +	}
>> +
>>  	VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
>>  	VMCOREINFO_PAGESIZE(PAGE_SIZE);
>>  
>> diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
>> index ee1bc1b..9de6fcc 100644
>> --- a/kernel/ksysfs.c
>> +++ b/kernel/ksysfs.c
>> @@ -130,7 +130,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
>>  {
>>  	phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
>>  	return sprintf(buf, "%pa %x\n", &vmcore_base,
>> -		       (unsigned int)sizeof(vmcoreinfo_note));
>> +			(unsigned int)VMCOREINFO_NOTE_SIZE);
>>  }
>>  KERNEL_ATTR_RO(vmcoreinfo);

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
@ 2017-03-22  8:55     ` Xunlei Pang
  0 siblings, 0 replies; 36+ messages in thread
From: Xunlei Pang @ 2017-03-22  8:55 UTC (permalink / raw)
  To: Eric W. Biederman, Xunlei Pang
  Cc: Baoquan He, Petr Tesarik, kexec, linux-kernel, akpm, Dave Young

On 03/21/2017 at 11:33 AM, Eric W. Biederman wrote:
> Xunlei Pang <xlpang@redhat.com> writes:
>
>> As Eric said,
>> "what we need to do is move the variable vmcoreinfo_note out
>> of the kernel's .bss section.  And modify the code to regenerate
>> and keep this information in something like the control page.
>>
>> Definitely something like this needs a page all to itself, and ideally
>> far away from any other kernel data structures.  I clearly was not
>> watching closely the data someone decided to keep this silly thing
>> in the kernel's .bss section."
>>
>> This patch allocates extra pages for these vmcoreinfo_XXX variables,
>> one advantage is that it enhances some safety of vmcoreinfo, because
>> vmcoreinfo now is kept far away from other kernel data structures.
> Can you precede this patch with a patch that removes CRASHTIME from
> vmcoreinfo?  If someone actually cares we can add a separate note that holds
> a 64bit crashtime in the per cpu notes.  

Hi Eric,

Thanks for your review, I took some time and did some investigation.

Removing "CRASHTIME=X" from vmcoreinfo_note will break user-space tools.
For example, makedumpfile obtains the physical address (PA) of the
vmcoreinfo note by reading "/sys/kernel/vmcoreinfo", computes its
"VA = PA | PAGE_OFFSET", and then reads the timestamp. This happens in the
first kernel, even before kdump is loaded.

Actually, even moving vmcoreinfo_note[] into the crash memory may
cause problems; for example, on s390 systems the crash memory
range will be unmapped, so I guess it carries some risk.

Additionally, there is no available way for us to allocate a page from the
crash memory during kernel initialization, we only can achieve this during
the kexec syscalls. There is not a neat way to implement a function to
allocate pages from the crash memory during kernel initialization without
some hack code added, because user-space tools(like kexec-tools) can
allocate the crash segment by their own ways from the crash memory.

That's why I only copy vmcoreinfo_data[] into the crash memory, and
not touch vmcoreinfo_note, so vmcoreinfo_data is well protected in
the crash memory copy, then in crash_save_vmcoreinfo(), we copy
this guaranteed copy into vmcoreinfo_note[], so the correctness of
vmcoreinfo_note[] is guaranteed. This is what [PATCH v3 3/3] does.

The current crash_save_vmcoreinfo() only involves memory (memcpy)
operations, even for get_seconds() (no locks); the only risk I can think
of now is that the vmcoreinfo_note pointer may be corrupted. If it is a concern,
I guess we can put it into "struct kimage" just like vmcoreinfo_XXX_copy
in this patch. After all, if the kimage structure is corrupted when the crash
happens, we can do nothing but accept our fate.

So is it really worth eliminating crash_save_vmcoreinfo()?

Regards,
Xunlei

>
> As we are looking at reliability concerns removing CRASHTIME should make
> everything in vmcoreinfo a boot time constant.  Which should simplify
> everything considerably.
>
> Which means we only need to worry about the per-cpu notes being written
> at the time of a crash.
>
>> Suggested-by: Eric Biederman <ebiederm@xmission.com>
>> Signed-off-by: Xunlei Pang <xlpang@redhat.com>
>> ---
>>  arch/ia64/kernel/machine_kexec.c |  5 -----
>>  arch/x86/kernel/crash.c          |  2 +-
>>  include/linux/kexec.h            |  2 +-
>>  kernel/kexec_core.c              | 29 ++++++++++++++++++++++++-----
>>  kernel/ksysfs.c                  |  2 +-
>>  5 files changed, 27 insertions(+), 13 deletions(-)
>>
>> diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c
>> index 599507b..c14815d 100644
>> --- a/arch/ia64/kernel/machine_kexec.c
>> +++ b/arch/ia64/kernel/machine_kexec.c
>> @@ -163,8 +163,3 @@ void arch_crash_save_vmcoreinfo(void)
>>  #endif
>>  }
>>  
>> -phys_addr_t paddr_vmcoreinfo_note(void)
>> -{
>> -	return ia64_tpa((unsigned long)(char *)&vmcoreinfo_note);
>> -}
>> -
>> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
>> index 3741461..4d35fbb 100644
>> --- a/arch/x86/kernel/crash.c
>> +++ b/arch/x86/kernel/crash.c
>> @@ -456,7 +456,7 @@ static int prepare_elf64_headers(struct crash_elf_data *ced,
>>  	bufp += sizeof(Elf64_Phdr);
>>  	phdr->p_type = PT_NOTE;
>>  	phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
>> -	phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note);
>> +	phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
>>  	(ehdr->e_phnum)++;
>>  
>>  #ifdef CONFIG_X86_64
>> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
>> index e98e546..f1c601b 100644
>> --- a/include/linux/kexec.h
>> +++ b/include/linux/kexec.h
>> @@ -317,7 +317,7 @@ extern void *kexec_purgatory_get_symbol_addr(struct kimage *image,
>>  extern struct resource crashk_low_res;
>>  typedef u32 note_buf_t[KEXEC_NOTE_BYTES/4];
>>  extern note_buf_t __percpu *crash_notes;
>> -extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
>> +extern u32 *vmcoreinfo_note;
>>  extern size_t vmcoreinfo_size;
>>  extern size_t vmcoreinfo_max_size;
>>  
>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
>> index bfe62d5..e3a4bda 100644
>> --- a/kernel/kexec_core.c
>> +++ b/kernel/kexec_core.c
>> @@ -52,10 +52,10 @@
>>  note_buf_t __percpu *crash_notes;
>>  
>>  /* vmcoreinfo stuff */
>> -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
>> -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
>> +static unsigned char *vmcoreinfo_data;
>>  size_t vmcoreinfo_size;
>> -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
>> +size_t vmcoreinfo_max_size = VMCOREINFO_BYTES;
>> +u32 *vmcoreinfo_note;
>>  
>>  /* Flag to indicate we are going to kexec a new kernel */
>>  bool kexec_in_progress = false;
>> @@ -1369,6 +1369,9 @@ static void update_vmcoreinfo_note(void)
>>  
>>  void crash_save_vmcoreinfo(void)
>>  {
>> +	if (!vmcoreinfo_note)
>> +		return;
>> +
>>  	vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
>>  	update_vmcoreinfo_note();
>>  }
>> @@ -1397,13 +1400,29 @@ void vmcoreinfo_append_str(const char *fmt, ...)
>>  void __weak arch_crash_save_vmcoreinfo(void)
>>  {}
>>  
>> -phys_addr_t __weak paddr_vmcoreinfo_note(void)
>> +phys_addr_t paddr_vmcoreinfo_note(void)
>>  {
>> -	return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note);
>> +	return __pa(vmcoreinfo_note);
>>  }
>>  
>>  static int __init crash_save_vmcoreinfo_init(void)
>>  {
>> +	/* One page should be enough for VMCOREINFO_BYTES under all archs */
>> +	vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
>> +	if (!vmcoreinfo_data) {
>> +		pr_warn("Memory allocation for vmcoreinfo_data failed\n");
>> +		return -ENOMEM;
>> +	}
>> +
>> +	vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
>> +						GFP_KERNEL | __GFP_ZERO);
>> +	if (!vmcoreinfo_note) {
>> +		free_page((unsigned long)vmcoreinfo_data);
>> +		vmcoreinfo_data = NULL;
>> +		pr_warn("Memory allocation for vmcoreinfo_note failed\n");
>> +		return -ENOMEM;
>> +	}
>> +
>>  	VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
>>  	VMCOREINFO_PAGESIZE(PAGE_SIZE);
>>  
>> diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
>> index ee1bc1b..9de6fcc 100644
>> --- a/kernel/ksysfs.c
>> +++ b/kernel/ksysfs.c
>> @@ -130,7 +130,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
>>  {
>>  	phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
>>  	return sprintf(buf, "%pa %x\n", &vmcore_base,
>> -		       (unsigned int)sizeof(vmcoreinfo_note));
>> +			(unsigned int)VMCOREINFO_NOTE_SIZE);
>>  }
>>  KERNEL_ATTR_RO(vmcoreinfo);


_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
  2017-03-22  8:55     ` Xunlei Pang
@ 2017-03-22  9:16       ` Xunlei Pang
  -1 siblings, 0 replies; 36+ messages in thread
From: Xunlei Pang @ 2017-03-22  9:16 UTC (permalink / raw)
  To: Eric W. Biederman, Xunlei Pang
  Cc: Baoquan He, Petr Tesarik, kexec, linux-kernel, akpm, Dave Young

On 03/22/2017 at 04:55 PM, Xunlei Pang wrote:
> On 03/21/2017 at 11:33 AM, Eric W. Biederman wrote:
>> Xunlei Pang <xlpang@redhat.com> writes:
>>
>>> As Eric said,
>>> "what we need to do is move the variable vmcoreinfo_note out
>>> of the kernel's .bss section.  And modify the code to regenerate
>>> and keep this information in something like the control page.
>>>
>>> Definitely something like this needs a page all to itself, and ideally
>>> far away from any other kernel data structures.  I clearly was not
>>> watching closely the data someone decided to keep this silly thing
>>> in the kernel's .bss section."
>>>
>>> This patch allocates extra pages for these vmcoreinfo_XXX variables,
>>> one advantage is that it enhances some safety of vmcoreinfo, because
>>> vmcoreinfo now is kept far away from other kernel data structures.
>> Can you precede this patch with a patch that removes CRASHTIME from
>> vmcoreinfo?  If someone actually cares we can add a separate note that holds
>> a 64bit crashtime in the per cpu notes.  
> Hi Eric,
>
> Thanks for your review, I took some time and did some investigation.
>
> Removing "CRASHTIME=X" from vmcoreinfo_note will break user-space tools.
> For example, makedumpfile gets vmcoreinfo note information by reading
> "/sys/kernel/vmcoreinfo"  its PA, then get its "VA = PA | PAGE_OFFSET",
> and then get the timestamp. This operates in the first kernel even before
> kdump is loaded.

Thinking about it more, this is not a problem for "makedumpfile --mem-usage",
as the system doesn't have "CRASHTIME" before a crash. But we may still
have the following concerns.

>
> Actually, even moving vmcoreinfo_note[] into the crash memory, it
> may have problems, for example, on s390 system the crash memory
> range will be unmapped, so I guess it may cause some risks.
>
> Additionally, there is no available way for us to allocate a page from the
> crash memory during kernel initialization, we only can achieve this during
> the kexec syscalls. There is not a neat way to implement a function to
> allocate pages from the crash memory during kernel initialization without
> some hack code added, because user-space tools(like kexec-tools) can
> allocate the crash segment by their own ways from the crash memory.
>
> That's why I only copy vmcoreinfo_data[] into the crash memory, and
> not touch vmcoreinfo_note, so vmcoreinfo_data is well protected in
> the crash memory copy, then in crash_save_vmcoreinfo(), we copy
> this guaranteed copy into vmcoreinfo_note[], so the correctness of
> vmcoreinfo_note[] is guaranteed. This is what [PATCH v3 3/3] does.
>
> The current crash_save_vmcoreinfo() only involves memory(memcpy)
> operations even for get_seconds(no locks), the only risk I can think
> of now is that vmcoreinfo_note pointer may be corrupted. If it is a concern,
> I guess we can put it into struct kimage" just like vmcoreinfo_XXX_copy
> in this patch. After all if kimage structure was corrupted when crash happens,
> we can do nothing but have to accept the fate.
>
> So does it really deserve to eliminate crash_save_vmcoreinfo()?
>
> Regards,
> Xunlei
>
>> As we are looking at reliability concerns removing CRASHTIME should make
>> everything in vmcoreinfo a boot time constant.  Which should simplify
>> everything considerably.
>>
>> Which means we only need to worry about the per-cpu notes being written
>> at the time of a crash.
>>
>>> Suggested-by: Eric Biederman <ebiederm@xmission.com>
>>> Signed-off-by: Xunlei Pang <xlpang@redhat.com>
>>> ---
>>>  arch/ia64/kernel/machine_kexec.c |  5 -----
>>>  arch/x86/kernel/crash.c          |  2 +-
>>>  include/linux/kexec.h            |  2 +-
>>>  kernel/kexec_core.c              | 29 ++++++++++++++++++++++++-----
>>>  kernel/ksysfs.c                  |  2 +-
>>>  5 files changed, 27 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c
>>> index 599507b..c14815d 100644
>>> --- a/arch/ia64/kernel/machine_kexec.c
>>> +++ b/arch/ia64/kernel/machine_kexec.c
>>> @@ -163,8 +163,3 @@ void arch_crash_save_vmcoreinfo(void)
>>>  #endif
>>>  }
>>>  
>>> -phys_addr_t paddr_vmcoreinfo_note(void)
>>> -{
>>> -	return ia64_tpa((unsigned long)(char *)&vmcoreinfo_note);
>>> -}
>>> -
>>> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
>>> index 3741461..4d35fbb 100644
>>> --- a/arch/x86/kernel/crash.c
>>> +++ b/arch/x86/kernel/crash.c
>>> @@ -456,7 +456,7 @@ static int prepare_elf64_headers(struct crash_elf_data *ced,
>>>  	bufp += sizeof(Elf64_Phdr);
>>>  	phdr->p_type = PT_NOTE;
>>>  	phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
>>> -	phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note);
>>> +	phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
>>>  	(ehdr->e_phnum)++;
>>>  
>>>  #ifdef CONFIG_X86_64
>>> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
>>> index e98e546..f1c601b 100644
>>> --- a/include/linux/kexec.h
>>> +++ b/include/linux/kexec.h
>>> @@ -317,7 +317,7 @@ extern void *kexec_purgatory_get_symbol_addr(struct kimage *image,
>>>  extern struct resource crashk_low_res;
>>>  typedef u32 note_buf_t[KEXEC_NOTE_BYTES/4];
>>>  extern note_buf_t __percpu *crash_notes;
>>> -extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
>>> +extern u32 *vmcoreinfo_note;
>>>  extern size_t vmcoreinfo_size;
>>>  extern size_t vmcoreinfo_max_size;
>>>  
>>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
>>> index bfe62d5..e3a4bda 100644
>>> --- a/kernel/kexec_core.c
>>> +++ b/kernel/kexec_core.c
>>> @@ -52,10 +52,10 @@
>>>  note_buf_t __percpu *crash_notes;
>>>  
>>>  /* vmcoreinfo stuff */
>>> -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
>>> -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
>>> +static unsigned char *vmcoreinfo_data;
>>>  size_t vmcoreinfo_size;
>>> -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
>>> +size_t vmcoreinfo_max_size = VMCOREINFO_BYTES;
>>> +u32 *vmcoreinfo_note;
>>>  
>>>  /* Flag to indicate we are going to kexec a new kernel */
>>>  bool kexec_in_progress = false;
>>> @@ -1369,6 +1369,9 @@ static void update_vmcoreinfo_note(void)
>>>  
>>>  void crash_save_vmcoreinfo(void)
>>>  {
>>> +	if (!vmcoreinfo_note)
>>> +		return;
>>> +
>>>  	vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
>>>  	update_vmcoreinfo_note();
>>>  }
>>> @@ -1397,13 +1400,29 @@ void vmcoreinfo_append_str(const char *fmt, ...)
>>>  void __weak arch_crash_save_vmcoreinfo(void)
>>>  {}
>>>  
>>> -phys_addr_t __weak paddr_vmcoreinfo_note(void)
>>> +phys_addr_t paddr_vmcoreinfo_note(void)
>>>  {
>>> -	return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note);
>>> +	return __pa(vmcoreinfo_note);
>>>  }
>>>  
>>>  static int __init crash_save_vmcoreinfo_init(void)
>>>  {
>>> +	/* One page should be enough for VMCOREINFO_BYTES under all archs */
>>> +	vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
>>> +	if (!vmcoreinfo_data) {
>>> +		pr_warn("Memory allocation for vmcoreinfo_data failed\n");
>>> +		return -ENOMEM;
>>> +	}
>>> +
>>> +	vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
>>> +						GFP_KERNEL | __GFP_ZERO);
>>> +	if (!vmcoreinfo_note) {
>>> +		free_page((unsigned long)vmcoreinfo_data);
>>> +		vmcoreinfo_data = NULL;
>>> +		pr_warn("Memory allocation for vmcoreinfo_note failed\n");
>>> +		return -ENOMEM;
>>> +	}
>>> +
>>>  	VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
>>>  	VMCOREINFO_PAGESIZE(PAGE_SIZE);
>>>  
>>> diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
>>> index ee1bc1b..9de6fcc 100644
>>> --- a/kernel/ksysfs.c
>>> +++ b/kernel/ksysfs.c
>>> @@ -130,7 +130,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
>>>  {
>>>  	phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
>>>  	return sprintf(buf, "%pa %x\n", &vmcore_base,
>>> -		       (unsigned int)sizeof(vmcoreinfo_note));
>>> +			(unsigned int)VMCOREINFO_NOTE_SIZE);
>>>  }
>>>  KERNEL_ATTR_RO(vmcoreinfo);
>
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
@ 2017-03-22  9:16       ` Xunlei Pang
  0 siblings, 0 replies; 36+ messages in thread
From: Xunlei Pang @ 2017-03-22  9:16 UTC (permalink / raw)
  To: Eric W. Biederman, Xunlei Pang
  Cc: Baoquan He, Petr Tesarik, kexec, linux-kernel, akpm, Dave Young

On 03/22/2017 at 04:55 PM, Xunlei Pang wrote:
> On 03/21/2017 at 11:33 AM, Eric W. Biederman wrote:
>> Xunlei Pang <xlpang@redhat.com> writes:
>>
>>> As Eric said,
>>> "what we need to do is move the variable vmcoreinfo_note out
>>> of the kernel's .bss section.  And modify the code to regenerate
>>> and keep this information in something like the control page.
>>>
>>> Definitely something like this needs a page all to itself, and ideally
>>> far away from any other kernel data structures.  I clearly was not
>>> watching closely the data someone decided to keep this silly thing
>>> in the kernel's .bss section."
>>>
>>> This patch allocates extra pages for these vmcoreinfo_XXX variables,
>>> one advantage is that it enhances some safety of vmcoreinfo, because
>>> vmcoreinfo now is kept far away from other kernel data structures.
>> Can you precede this patch with a patch that removes CRASHTIME from
>> vmcoreinfo?  If someone actually cares we can add a separate note that holds
>> a 64bit crashtime in the per cpu notes.  
> Hi Eric,
>
> Thanks for your review, I took some time and did some investigation.
>
> Removing "CRASHTIME=X" from vmcoreinfo_note will break user-space tools.
> For example, makedumpfile gets vmcoreinfo note information by reading
> "/sys/kernel/vmcoreinfo"  its PA, then get its "VA = PA | PAGE_OFFSET",
> and then get the timestamp. This operates in the first kernel even before
> kdump is loaded.

Thinking about it more, this is not a problem for "makedumpfile --mem-usage",
as the system doesn't have "CRASHTIME" before a crash. But we may still
have the following concerns.

>
> Actually, even moving vmcoreinfo_note[] into the crash memory, it
> may have problems, for example, on s390 system the crash memory
> range will be unmapped, so I guess it may cause some risks.
>
> Additionally, there is no available way for us to allocate a page from the
> crash memory during kernel initialization, we only can achieve this during
> the kexec syscalls. There is not a neat way to implement a function to
> allocate pages from the crash memory during kernel initialization without
> some hack code added, because user-space tools(like kexec-tools) can
> allocate the crash segment by their own ways from the crash memory.
>
> That's why I only copy vmcoreinfo_data[] into the crash memory, and
> not touch vmcoreinfo_note, so vmcoreinfo_data is well protected in
> the crash memory copy, then in crash_save_vmcoreinfo(), we copy
> this guaranteed copy into vmcoreinfo_note[], so the correctness of
> vmcoreinfo_note[] is guaranteed. This is what [PATCH v3 3/3] does.
>
> The current crash_save_vmcoreinfo() only involves memory(memcpy)
> operations even for get_seconds(no locks), the only risk I can think
> of now is that vmcoreinfo_note pointer may be corrupted. If it is a concern,
> I guess we can put it into struct kimage just like vmcoreinfo_XXX_copy
> in this patch. After all if kimage structure was corrupted when crash happens,
> we can do nothing but have to accept the fate.
>
> So does it really deserve to eliminate crash_save_vmcoreinfo()?
>
> Regards,
> Xunlei
>
>> As we are looking at reliability concerns removing CRASHTIME should make
>> everything in vmcoreinfo a boot time constant.  Which should simplify
>> everything considerably.
>>
>> Which means we only need to worry about the per-cpu notes being written
>> at the time of a crash.
>>
>>> Suggested-by: Eric Biederman <ebiederm@xmission.com>
>>> Signed-off-by: Xunlei Pang <xlpang@redhat.com>
>>> ---
>>>  arch/ia64/kernel/machine_kexec.c |  5 -----
>>>  arch/x86/kernel/crash.c          |  2 +-
>>>  include/linux/kexec.h            |  2 +-
>>>  kernel/kexec_core.c              | 29 ++++++++++++++++++++++++-----
>>>  kernel/ksysfs.c                  |  2 +-
>>>  5 files changed, 27 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c
>>> index 599507b..c14815d 100644
>>> --- a/arch/ia64/kernel/machine_kexec.c
>>> +++ b/arch/ia64/kernel/machine_kexec.c
>>> @@ -163,8 +163,3 @@ void arch_crash_save_vmcoreinfo(void)
>>>  #endif
>>>  }
>>>  
>>> -phys_addr_t paddr_vmcoreinfo_note(void)
>>> -{
>>> -	return ia64_tpa((unsigned long)(char *)&vmcoreinfo_note);
>>> -}
>>> -
>>> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
>>> index 3741461..4d35fbb 100644
>>> --- a/arch/x86/kernel/crash.c
>>> +++ b/arch/x86/kernel/crash.c
>>> @@ -456,7 +456,7 @@ static int prepare_elf64_headers(struct crash_elf_data *ced,
>>>  	bufp += sizeof(Elf64_Phdr);
>>>  	phdr->p_type = PT_NOTE;
>>>  	phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
>>> -	phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note);
>>> +	phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
>>>  	(ehdr->e_phnum)++;
>>>  
>>>  #ifdef CONFIG_X86_64
>>> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
>>> index e98e546..f1c601b 100644
>>> --- a/include/linux/kexec.h
>>> +++ b/include/linux/kexec.h
>>> @@ -317,7 +317,7 @@ extern void *kexec_purgatory_get_symbol_addr(struct kimage *image,
>>>  extern struct resource crashk_low_res;
>>>  typedef u32 note_buf_t[KEXEC_NOTE_BYTES/4];
>>>  extern note_buf_t __percpu *crash_notes;
>>> -extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
>>> +extern u32 *vmcoreinfo_note;
>>>  extern size_t vmcoreinfo_size;
>>>  extern size_t vmcoreinfo_max_size;
>>>  
>>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
>>> index bfe62d5..e3a4bda 100644
>>> --- a/kernel/kexec_core.c
>>> +++ b/kernel/kexec_core.c
>>> @@ -52,10 +52,10 @@
>>>  note_buf_t __percpu *crash_notes;
>>>  
>>>  /* vmcoreinfo stuff */
>>> -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
>>> -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
>>> +static unsigned char *vmcoreinfo_data;
>>>  size_t vmcoreinfo_size;
>>> -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
>>> +size_t vmcoreinfo_max_size = VMCOREINFO_BYTES;
>>> +u32 *vmcoreinfo_note;
>>>  
>>>  /* Flag to indicate we are going to kexec a new kernel */
>>>  bool kexec_in_progress = false;
>>> @@ -1369,6 +1369,9 @@ static void update_vmcoreinfo_note(void)
>>>  
>>>  void crash_save_vmcoreinfo(void)
>>>  {
>>> +	if (!vmcoreinfo_note)
>>> +		return;
>>> +
>>>  	vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
>>>  	update_vmcoreinfo_note();
>>>  }
>>> @@ -1397,13 +1400,29 @@ void vmcoreinfo_append_str(const char *fmt, ...)
>>>  void __weak arch_crash_save_vmcoreinfo(void)
>>>  {}
>>>  
>>> -phys_addr_t __weak paddr_vmcoreinfo_note(void)
>>> +phys_addr_t paddr_vmcoreinfo_note(void)
>>>  {
>>> -	return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note);
>>> +	return __pa(vmcoreinfo_note);
>>>  }
>>>  
>>>  static int __init crash_save_vmcoreinfo_init(void)
>>>  {
>>> +	/* One page should be enough for VMCOREINFO_BYTES under all archs */
>>> +	vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
>>> +	if (!vmcoreinfo_data) {
>>> +		pr_warn("Memory allocation for vmcoreinfo_data failed\n");
>>> +		return -ENOMEM;
>>> +	}
>>> +
>>> +	vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
>>> +						GFP_KERNEL | __GFP_ZERO);
>>> +	if (!vmcoreinfo_note) {
>>> +		free_page((unsigned long)vmcoreinfo_data);
>>> +		vmcoreinfo_data = NULL;
>>> +		pr_warn("Memory allocation for vmcoreinfo_note failed\n");
>>> +		return -ENOMEM;
>>> +	}
>>> +
>>>  	VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
>>>  	VMCOREINFO_PAGESIZE(PAGE_SIZE);
>>>  
>>> diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
>>> index ee1bc1b..9de6fcc 100644
>>> --- a/kernel/ksysfs.c
>>> +++ b/kernel/ksysfs.c
>>> @@ -130,7 +130,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
>>>  {
>>>  	phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
>>>  	return sprintf(buf, "%pa %x\n", &vmcore_base,
>>> -		       (unsigned int)sizeof(vmcoreinfo_note));
>>> +			(unsigned int)VMCOREINFO_NOTE_SIZE);
>>>  }
>>>  KERNEL_ATTR_RO(vmcoreinfo);
>
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec


_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
  2017-03-22  8:55     ` Xunlei Pang
@ 2017-03-22  9:17       ` Xunlei Pang
  -1 siblings, 0 replies; 36+ messages in thread
From: Xunlei Pang @ 2017-03-22  9:17 UTC (permalink / raw)
  To: Eric W. Biederman, Xunlei Pang
  Cc: Baoquan He, Petr Tesarik, kexec, linux-kernel, akpm, Dave Young

On 03/22/2017 at 04:55 PM, Xunlei Pang wrote:
> On 03/21/2017 at 11:33 AM, Eric W. Biederman wrote:
>> Xunlei Pang <xlpang@redhat.com> writes:
>>
>>> As Eric said,
>>> "what we need to do is move the variable vmcoreinfo_note out
>>> of the kernel's .bss section.  And modify the code to regenerate
>>> and keep this information in something like the control page.
>>>
>>> Definitely something like this needs a page all to itself, and ideally
>>> far away from any other kernel data structures.  I clearly was not
>>> watching closely the data someone decided to keep this silly thing
>>> in the kernel's .bss section."
>>>
>>> This patch allocates extra pages for these vmcoreinfo_XXX variables,
>>> one advantage is that it enhances some safety of vmcoreinfo, because
>>> vmcoreinfo now is kept far away from other kernel data structures.
>> Can you precede this patch with a patch that removes CRASHTIME from
>> vmcoreinfo?  If someone actually cares we can add a separate note that holds
>> a 64bit crashtime in the per cpu notes.  
> Hi Eric,
>
> Thanks for your review, I took some time and did some investigation.
>
> Removing "CRASHTIME=X" from vmcoreinfo_note will break user-space tools.
> For example, makedumpfile gets vmcoreinfo note information by reading
> "/sys/kernel/vmcoreinfo"  its PA, then get its "VA = PA | PAGE_OFFSET",
> and then get the timestamp. This operates in the first kernel even before
> kdump is loaded.

Thinking about it more, this is not a problem for "makedumpfile --mem-usage",
as the system doesn't have "CRASHTIME" before a crash. But we may still
have the following concerns.

>
> Actually, even moving vmcoreinfo_note[] into the crash memory, it
> may have problems, for example, on s390 system the crash memory
> range will be unmapped, so I guess it may cause some risks.
>
> Additionally, there is no available way for us to allocate a page from the
> crash memory during kernel initialization, we only can achieve this during
> the kexec syscalls. There is not a neat way to implement a function to
> allocate pages from the crash memory during kernel initialization without
> some hack code added, because user-space tools(like kexec-tools) can
> allocate the crash segment by their own ways from the crash memory.
>
> That's why I only copy vmcoreinfo_data[] into the crash memory, and
> not touch vmcoreinfo_note, so vmcoreinfo_data is well protected in
> the crash memory copy, then in crash_save_vmcoreinfo(), we copy
> this guaranteed copy into vmcoreinfo_note[], so the correctness of
> vmcoreinfo_note[] is guaranteed. This is what [PATCH v3 3/3] does.
>
> The current crash_save_vmcoreinfo() only involves memory(memcpy)
> operations even for get_seconds(no locks), the only risk I can think
> of now is that vmcoreinfo_note pointer may be corrupted. If it is a concern,
> I guess we can put it into struct kimage just like vmcoreinfo_XXX_copy
> in this patch. After all if kimage structure was corrupted when crash happens,
> we can do nothing but have to accept the fate.
>
> So does it really deserve to eliminate crash_save_vmcoreinfo()?
>
> Regards,
> Xunlei
>
>> As we are looking at reliability concerns removing CRASHTIME should make
>> everything in vmcoreinfo a boot time constant.  Which should simplify
>> everything considerably.
>>
>> Which means we only need to worry about the per-cpu notes being written
>> at the time of a crash.
>>
>>> Suggested-by: Eric Biederman <ebiederm@xmission.com>
>>> Signed-off-by: Xunlei Pang <xlpang@redhat.com>
>>> ---
>>>  arch/ia64/kernel/machine_kexec.c |  5 -----
>>>  arch/x86/kernel/crash.c          |  2 +-
>>>  include/linux/kexec.h            |  2 +-
>>>  kernel/kexec_core.c              | 29 ++++++++++++++++++++++++-----
>>>  kernel/ksysfs.c                  |  2 +-
>>>  5 files changed, 27 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c
>>> index 599507b..c14815d 100644
>>> --- a/arch/ia64/kernel/machine_kexec.c
>>> +++ b/arch/ia64/kernel/machine_kexec.c
>>> @@ -163,8 +163,3 @@ void arch_crash_save_vmcoreinfo(void)
>>>  #endif
>>>  }
>>>  
>>> -phys_addr_t paddr_vmcoreinfo_note(void)
>>> -{
>>> -	return ia64_tpa((unsigned long)(char *)&vmcoreinfo_note);
>>> -}
>>> -
>>> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
>>> index 3741461..4d35fbb 100644
>>> --- a/arch/x86/kernel/crash.c
>>> +++ b/arch/x86/kernel/crash.c
>>> @@ -456,7 +456,7 @@ static int prepare_elf64_headers(struct crash_elf_data *ced,
>>>  	bufp += sizeof(Elf64_Phdr);
>>>  	phdr->p_type = PT_NOTE;
>>>  	phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
>>> -	phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note);
>>> +	phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
>>>  	(ehdr->e_phnum)++;
>>>  
>>>  #ifdef CONFIG_X86_64
>>> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
>>> index e98e546..f1c601b 100644
>>> --- a/include/linux/kexec.h
>>> +++ b/include/linux/kexec.h
>>> @@ -317,7 +317,7 @@ extern void *kexec_purgatory_get_symbol_addr(struct kimage *image,
>>>  extern struct resource crashk_low_res;
>>>  typedef u32 note_buf_t[KEXEC_NOTE_BYTES/4];
>>>  extern note_buf_t __percpu *crash_notes;
>>> -extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
>>> +extern u32 *vmcoreinfo_note;
>>>  extern size_t vmcoreinfo_size;
>>>  extern size_t vmcoreinfo_max_size;
>>>  
>>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
>>> index bfe62d5..e3a4bda 100644
>>> --- a/kernel/kexec_core.c
>>> +++ b/kernel/kexec_core.c
>>> @@ -52,10 +52,10 @@
>>>  note_buf_t __percpu *crash_notes;
>>>  
>>>  /* vmcoreinfo stuff */
>>> -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
>>> -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
>>> +static unsigned char *vmcoreinfo_data;
>>>  size_t vmcoreinfo_size;
>>> -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
>>> +size_t vmcoreinfo_max_size = VMCOREINFO_BYTES;
>>> +u32 *vmcoreinfo_note;
>>>  
>>>  /* Flag to indicate we are going to kexec a new kernel */
>>>  bool kexec_in_progress = false;
>>> @@ -1369,6 +1369,9 @@ static void update_vmcoreinfo_note(void)
>>>  
>>>  void crash_save_vmcoreinfo(void)
>>>  {
>>> +	if (!vmcoreinfo_note)
>>> +		return;
>>> +
>>>  	vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
>>>  	update_vmcoreinfo_note();
>>>  }
>>> @@ -1397,13 +1400,29 @@ void vmcoreinfo_append_str(const char *fmt, ...)
>>>  void __weak arch_crash_save_vmcoreinfo(void)
>>>  {}
>>>  
>>> -phys_addr_t __weak paddr_vmcoreinfo_note(void)
>>> +phys_addr_t paddr_vmcoreinfo_note(void)
>>>  {
>>> -	return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note);
>>> +	return __pa(vmcoreinfo_note);
>>>  }
>>>  
>>>  static int __init crash_save_vmcoreinfo_init(void)
>>>  {
>>> +	/* One page should be enough for VMCOREINFO_BYTES under all archs */
>>> +	vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
>>> +	if (!vmcoreinfo_data) {
>>> +		pr_warn("Memory allocation for vmcoreinfo_data failed\n");
>>> +		return -ENOMEM;
>>> +	}
>>> +
>>> +	vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
>>> +						GFP_KERNEL | __GFP_ZERO);
>>> +	if (!vmcoreinfo_note) {
>>> +		free_page((unsigned long)vmcoreinfo_data);
>>> +		vmcoreinfo_data = NULL;
>>> +		pr_warn("Memory allocation for vmcoreinfo_note failed\n");
>>> +		return -ENOMEM;
>>> +	}
>>> +
>>>  	VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
>>>  	VMCOREINFO_PAGESIZE(PAGE_SIZE);
>>>  
>>> diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
>>> index ee1bc1b..9de6fcc 100644
>>> --- a/kernel/ksysfs.c
>>> +++ b/kernel/ksysfs.c
>>> @@ -130,7 +130,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
>>>  {
>>>  	phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
>>>  	return sprintf(buf, "%pa %x\n", &vmcore_base,
>>> -		       (unsigned int)sizeof(vmcoreinfo_note));
>>> +			(unsigned int)VMCOREINFO_NOTE_SIZE);
>>>  }
>>>  KERNEL_ATTR_RO(vmcoreinfo);
>
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
@ 2017-03-22  9:17       ` Xunlei Pang
  0 siblings, 0 replies; 36+ messages in thread
From: Xunlei Pang @ 2017-03-22  9:17 UTC (permalink / raw)
  To: Eric W. Biederman, Xunlei Pang
  Cc: Baoquan He, Petr Tesarik, kexec, linux-kernel, akpm, Dave Young

On 03/22/2017 at 04:55 PM, Xunlei Pang wrote:
> On 03/21/2017 at 11:33 AM, Eric W. Biederman wrote:
>> Xunlei Pang <xlpang@redhat.com> writes:
>>
>>> As Eric said,
>>> "what we need to do is move the variable vmcoreinfo_note out
>>> of the kernel's .bss section.  And modify the code to regenerate
>>> and keep this information in something like the control page.
>>>
>>> Definitely something like this needs a page all to itself, and ideally
>>> far away from any other kernel data structures.  I clearly was not
>>> watching closely the data someone decided to keep this silly thing
>>> in the kernel's .bss section."
>>>
>>> This patch allocates extra pages for these vmcoreinfo_XXX variables,
>>> one advantage is that it enhances some safety of vmcoreinfo, because
>>> vmcoreinfo now is kept far away from other kernel data structures.
>> Can you precede this patch with a patch that removes CRASHTIME from
>> vmcoreinfo?  If someone actually cares we can add a separate note that holds
>> a 64bit crashtime in the per cpu notes.  
> Hi Eric,
>
> Thanks for your review, I took some time and did some investigation.
>
> Removing "CRASHTIME=X" from vmcoreinfo_note will break user-space tools.
> For example, makedumpfile gets vmcoreinfo note information by reading
> "/sys/kernel/vmcoreinfo"  its PA, then get its "VA = PA | PAGE_OFFSET",
> and then get the timestamp. This operates in the first kernel even before
> kdump is loaded.

Thinking about it more, this is not a problem for "makedumpfile --mem-usage",
as the system doesn't have "CRASHTIME" before a crash. But we may still
have the following concerns.

>
> Actually, even moving vmcoreinfo_note[] into the crash memory, it
> may have problems, for example, on s390 system the crash memory
> range will be unmapped, so I guess it may cause some risks.
>
> Additionally, there is no available way for us to allocate a page from the
> crash memory during kernel initialization, we only can achieve this during
> the kexec syscalls. There is not a neat way to implement a function to
> allocate pages from the crash memory during kernel initialization without
> some hack code added, because user-space tools(like kexec-tools) can
> allocate the crash segment by their own ways from the crash memory.
>
> That's why I only copy vmcoreinfo_data[] into the crash memory, and
> not touch vmcoreinfo_note, so vmcoreinfo_data is well protected in
> the crash memory copy, then in crash_save_vmcoreinfo(), we copy
> this guaranteed copy into vmcoreinfo_note[], so the correctness of
> vmcoreinfo_note[] is guaranteed. This is what [PATCH v3 3/3] does.
>
> The current crash_save_vmcoreinfo() only involves memory(memcpy)
> operations even for get_seconds(no locks), the only risk I can think
> of now is that vmcoreinfo_note pointer may be corrupted. If it is a concern,
> I guess we can put it into struct kimage just like vmcoreinfo_XXX_copy
> in this patch. After all if kimage structure was corrupted when crash happens,
> we can do nothing but have to accept the fate.
>
> So does it really deserve to eliminate crash_save_vmcoreinfo()?
>
> Regards,
> Xunlei
>
>> As we are looking at reliability concerns removing CRASHTIME should make
>> everything in vmcoreinfo a boot time constant.  Which should simplify
>> everything considerably.
>>
>> Which means we only need to worry about the per-cpu notes being written
>> at the time of a crash.
>>
>>> Suggested-by: Eric Biederman <ebiederm@xmission.com>
>>> Signed-off-by: Xunlei Pang <xlpang@redhat.com>
>>> ---
>>>  arch/ia64/kernel/machine_kexec.c |  5 -----
>>>  arch/x86/kernel/crash.c          |  2 +-
>>>  include/linux/kexec.h            |  2 +-
>>>  kernel/kexec_core.c              | 29 ++++++++++++++++++++++++-----
>>>  kernel/ksysfs.c                  |  2 +-
>>>  5 files changed, 27 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c
>>> index 599507b..c14815d 100644
>>> --- a/arch/ia64/kernel/machine_kexec.c
>>> +++ b/arch/ia64/kernel/machine_kexec.c
>>> @@ -163,8 +163,3 @@ void arch_crash_save_vmcoreinfo(void)
>>>  #endif
>>>  }
>>>  
>>> -phys_addr_t paddr_vmcoreinfo_note(void)
>>> -{
>>> -	return ia64_tpa((unsigned long)(char *)&vmcoreinfo_note);
>>> -}
>>> -
>>> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
>>> index 3741461..4d35fbb 100644
>>> --- a/arch/x86/kernel/crash.c
>>> +++ b/arch/x86/kernel/crash.c
>>> @@ -456,7 +456,7 @@ static int prepare_elf64_headers(struct crash_elf_data *ced,
>>>  	bufp += sizeof(Elf64_Phdr);
>>>  	phdr->p_type = PT_NOTE;
>>>  	phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
>>> -	phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note);
>>> +	phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
>>>  	(ehdr->e_phnum)++;
>>>  
>>>  #ifdef CONFIG_X86_64
>>> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
>>> index e98e546..f1c601b 100644
>>> --- a/include/linux/kexec.h
>>> +++ b/include/linux/kexec.h
>>> @@ -317,7 +317,7 @@ extern void *kexec_purgatory_get_symbol_addr(struct kimage *image,
>>>  extern struct resource crashk_low_res;
>>>  typedef u32 note_buf_t[KEXEC_NOTE_BYTES/4];
>>>  extern note_buf_t __percpu *crash_notes;
>>> -extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
>>> +extern u32 *vmcoreinfo_note;
>>>  extern size_t vmcoreinfo_size;
>>>  extern size_t vmcoreinfo_max_size;
>>>  
>>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
>>> index bfe62d5..e3a4bda 100644
>>> --- a/kernel/kexec_core.c
>>> +++ b/kernel/kexec_core.c
>>> @@ -52,10 +52,10 @@
>>>  note_buf_t __percpu *crash_notes;
>>>  
>>>  /* vmcoreinfo stuff */
>>> -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
>>> -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
>>> +static unsigned char *vmcoreinfo_data;
>>>  size_t vmcoreinfo_size;
>>> -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
>>> +size_t vmcoreinfo_max_size = VMCOREINFO_BYTES;
>>> +u32 *vmcoreinfo_note;
>>>  
>>>  /* Flag to indicate we are going to kexec a new kernel */
>>>  bool kexec_in_progress = false;
>>> @@ -1369,6 +1369,9 @@ static void update_vmcoreinfo_note(void)
>>>  
>>>  void crash_save_vmcoreinfo(void)
>>>  {
>>> +	if (!vmcoreinfo_note)
>>> +		return;
>>> +
>>>  	vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
>>>  	update_vmcoreinfo_note();
>>>  }
>>> @@ -1397,13 +1400,29 @@ void vmcoreinfo_append_str(const char *fmt, ...)
>>>  void __weak arch_crash_save_vmcoreinfo(void)
>>>  {}
>>>  
>>> -phys_addr_t __weak paddr_vmcoreinfo_note(void)
>>> +phys_addr_t paddr_vmcoreinfo_note(void)
>>>  {
>>> -	return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note);
>>> +	return __pa(vmcoreinfo_note);
>>>  }
>>>  
>>>  static int __init crash_save_vmcoreinfo_init(void)
>>>  {
>>> +	/* One page should be enough for VMCOREINFO_BYTES under all archs */
>>> +	vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
>>> +	if (!vmcoreinfo_data) {
>>> +		pr_warn("Memory allocation for vmcoreinfo_data failed\n");
>>> +		return -ENOMEM;
>>> +	}
>>> +
>>> +	vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
>>> +						GFP_KERNEL | __GFP_ZERO);
>>> +	if (!vmcoreinfo_note) {
>>> +		free_page((unsigned long)vmcoreinfo_data);
>>> +		vmcoreinfo_data = NULL;
>>> +		pr_warn("Memory allocation for vmcoreinfo_note failed\n");
>>> +		return -ENOMEM;
>>> +	}
>>> +
>>>  	VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
>>>  	VMCOREINFO_PAGESIZE(PAGE_SIZE);
>>>  
>>> diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
>>> index ee1bc1b..9de6fcc 100644
>>> --- a/kernel/ksysfs.c
>>> +++ b/kernel/ksysfs.c
>>> @@ -130,7 +130,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
>>>  {
>>>  	phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
>>>  	return sprintf(buf, "%pa %x\n", &vmcore_base,
>>> -		       (unsigned int)sizeof(vmcoreinfo_note));
>>> +			(unsigned int)VMCOREINFO_NOTE_SIZE);
>>>  }
>>>  KERNEL_ATTR_RO(vmcoreinfo);
>
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec


_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
  2017-03-22  4:30         ` Dave Young
@ 2017-03-22  9:34           ` Xunlei Pang
  -1 siblings, 0 replies; 36+ messages in thread
From: Xunlei Pang @ 2017-03-22  9:34 UTC (permalink / raw)
  To: Dave Young, Eric W. Biederman
  Cc: Baoquan He, Xunlei Pang, Atsushi Kumagai, Petr Tesarik,
	linux-kernel, hbathini, akpm, holzheu, kexec

On 03/22/2017 at 12:30 PM, Dave Young wrote:
> On 03/21/17 at 10:18pm, Eric W. Biederman wrote:
>> Dave Young <dyoung@redhat.com> writes:
>>
>>> On 03/20/17 at 10:33pm, Eric W. Biederman wrote:
>>>> Xunlei Pang <xlpang@redhat.com> writes:
>>>>
>>>>> As Eric said,
>>>>> "what we need to do is move the variable vmcoreinfo_note out
>>>>> of the kernel's .bss section.  And modify the code to regenerate
>>>>> and keep this information in something like the control page.
>>>>>
>>>>> Definitely something like this needs a page all to itself, and ideally
>>>>> far away from any other kernel data structures.  I clearly was not
>>>>> watching closely the data someone decided to keep this silly thing
>>>>> in the kernel's .bss section."
>>>>>
>>>>> This patch allocates extra pages for these vmcoreinfo_XXX variables,
>>>>> one advantage is that it enhances some safety of vmcoreinfo, because
>>>>> vmcoreinfo now is kept far away from other kernel data structures.
>>>> Can you precede this patch with a patch that removes CRASHTIME from
>>>> vmcoreinfo?  If someone actually cares we can add a separate note that holds
>>>> a 64bit crashtime in the per cpu notes.  
>>> I think makedumpfile is using it, but I also vote to remove the
>>> CRASHTIME. It is better not to do this while crashing and a makedumpfile
>>> userspace patch is needed to drop the use of it.
>>>

Moving the CRASHTIME info to the cpu note of the crashed cpu may be a good
way. In the kdump kernel, the notes of the vmcore elfhdr will be merged into one
big note section; I don't know how makedumpfile or crash handles that big note
section. If they process the notes in some order, breakage will definitely happen...

fadump may also be affected.

Regards,
Xunlei

>>>> As we are looking at reliability concerns removing CRASHTIME should make
>>>> everything in vmcoreinfo a boot time constant.  Which should simplify
>>>> everything considerably.
>>> It is a nice improvement..
>> We also need to take a close look at what s390 is doing with vmcoreinfo.
>> As apparently it is reading it in a different kind of crashdump process.
> Yes, need careful review from s390 and maybe ppc64 especially about
> patch 2/3, better to have comments from IBM about s390 dump tool and ppc
> fadump. Added more cc.
>
> Thanks
> Dave
>
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
@ 2017-03-22  9:34           ` Xunlei Pang
  0 siblings, 0 replies; 36+ messages in thread
From: Xunlei Pang @ 2017-03-22  9:34 UTC (permalink / raw)
  To: Dave Young, Eric W. Biederman
  Cc: Baoquan He, Xunlei Pang, Atsushi Kumagai, Petr Tesarik,
	linux-kernel, hbathini, akpm, holzheu, kexec

On 03/22/2017 at 12:30 PM, Dave Young wrote:
> On 03/21/17 at 10:18pm, Eric W. Biederman wrote:
>> Dave Young <dyoung@redhat.com> writes:
>>
>>> On 03/20/17 at 10:33pm, Eric W. Biederman wrote:
>>>> Xunlei Pang <xlpang@redhat.com> writes:
>>>>
>>>>> As Eric said,
>>>>> "what we need to do is move the variable vmcoreinfo_note out
>>>>> of the kernel's .bss section.  And modify the code to regenerate
>>>>> and keep this information in something like the control page.
>>>>>
>>>>> Definitely something like this needs a page all to itself, and ideally
>>>>> far away from any other kernel data structures.  I clearly was not
>>>>> watching closely the data someone decided to keep this silly thing
>>>>> in the kernel's .bss section."
>>>>>
>>>>> This patch allocates extra pages for these vmcoreinfo_XXX variables,
>>>>> one advantage is that it enhances some safety of vmcoreinfo, because
>>>>> vmcoreinfo now is kept far away from other kernel data structures.
>>>> Can you precede this patch with a patch that removes CRASHTIME from
>>>> vmcoreinfo?  If someone actually cares we can add a separate note that holds
>>>> a 64bit crashtime in the per cpu notes.  
>>> I think makedumpfile is using it, but I also vote to remove the
>>> CRASHTIME. It is better not to do this while crashing and a makedumpfile
>>> userspace patch is needed to drop the use of it.
>>>

Moving the CRASHTIME info to the cpu note of the crashed cpu may be a good
way. In the kdump kernel, the notes of the vmcore elfhdr will be merged into one
big note section; I don't know how makedumpfile or crash handles that big note
section. If they process the notes in some order, breakage will definitely happen...

fadump may also be affected.

Regards,
Xunlei

>>>> As we are looking at reliability concerns removing CRASHTIME should make
>>>> everything in vmcoreinfo a boot time constant.  Which should simplify
>>>> everything considerably.
>>> It is a nice improvement..
>> We also need to take a close look at what s390 is doing with vmcoreinfo.
>> As apparently it is reading it in a different kind of crashdump process.
> Yes, need careful review from s390 and maybe ppc64 especially about
> patch 2/3, better to have comments from IBM about s390 dump tool and ppc
> fadump. Added more cc.
>
> Thanks
> Dave
>
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec


_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
  2017-03-22  4:30         ` Dave Young
@ 2017-03-22 11:46           ` Hari Bathini
  -1 siblings, 0 replies; 36+ messages in thread
From: Hari Bathini @ 2017-03-22 11:46 UTC (permalink / raw)
  To: Dave Young, Eric W. Biederman
  Cc: Baoquan He, Xunlei Pang, Atsushi Kumagai, Petr Tesarik,
	linux-kernel, akpm, holzheu, kexec

Hi Dave,


On Wednesday 22 March 2017 10:00 AM, Dave Young wrote:
> On 03/21/17 at 10:18pm, Eric W. Biederman wrote:
>> Dave Young <dyoung@redhat.com> writes:
>>
>>> On 03/20/17 at 10:33pm, Eric W. Biederman wrote:
>>>> Xunlei Pang <xlpang@redhat.com> writes:
>>>>
>>>>> As Eric said,
>>>>> "what we need to do is move the variable vmcoreinfo_note out
>>>>> of the kernel's .bss section.  And modify the code to regenerate
>>>>> and keep this information in something like the control page.
>>>>>
>>>>> Definitely something like this needs a page all to itself, and ideally
>>>>> far away from any other kernel data structures.  I clearly was not
>>>>> watching closely the data someone decided to keep this silly thing
>>>>> in the kernel's .bss section."
>>>>>
>>>>> This patch allocates extra pages for these vmcoreinfo_XXX variables,
>>>>> one advantage is that it enhances some safety of vmcoreinfo, because
>>>>> vmcoreinfo now is kept far away from other kernel data structures.
>>>> Can you preceed this patch with a patch that removes CRASHTIME from
>>>> vmcoreinfo?  If someone actually cares we can add a separate note that holds
>>>> a 64bit crashtime in the per cpu notes.
>>> I think makedumpfile is using it, but I also vote to remove the
>>> CRASHTIME. It is better not to do this while crashing and a makedumpfile
>>> userspace patch is needed to drop the use of it.
>>>
>>>> As we are looking at reliability concerns removing CRASHTIME should make
>>>> everything in vmcoreinfo a boot time constant.  Which should simplify
>>>> everything considerably.
>>> It is a nice improvement..
>> We also need to take a close look at what s390 is doing with vmcoreinfo.
>> As apparently it is reading it in a different kind of crashdump process.
> Yes, need careful review from s390 and maybe ppc64 especially about
> patch 2/3, better to have comments from IBM about s390 dump tool and ppc
> fadump. Added more cc.

w.r.t powerpc/fadump, this patch-set works fine..

Thanks
Hari

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
@ 2017-03-22 11:46           ` Hari Bathini
  0 siblings, 0 replies; 36+ messages in thread
From: Hari Bathini @ 2017-03-22 11:46 UTC (permalink / raw)
  To: Dave Young, Eric W. Biederman
  Cc: Baoquan He, Xunlei Pang, Atsushi Kumagai, Petr Tesarik,
	linux-kernel, akpm, holzheu, kexec

Hi Dave,


On Wednesday 22 March 2017 10:00 AM, Dave Young wrote:
> On 03/21/17 at 10:18pm, Eric W. Biederman wrote:
>> Dave Young <dyoung@redhat.com> writes:
>>
>>> On 03/20/17 at 10:33pm, Eric W. Biederman wrote:
>>>> Xunlei Pang <xlpang@redhat.com> writes:
>>>>
>>>>> As Eric said,
>>>>> "what we need to do is move the variable vmcoreinfo_note out
>>>>> of the kernel's .bss section.  And modify the code to regenerate
>>>>> and keep this information in something like the control page.
>>>>>
>>>>> Definitely something like this needs a page all to itself, and ideally
>>>>> far away from any other kernel data structures.  I clearly was not
>>>>> watching closely the data someone decided to keep this silly thing
>>>>> in the kernel's .bss section."
>>>>>
>>>>> This patch allocates extra pages for these vmcoreinfo_XXX variables,
>>>>> one advantage is that it enhances some safety of vmcoreinfo, because
>>>>> vmcoreinfo now is kept far away from other kernel data structures.
>>>> Can you preceed this patch with a patch that removes CRASHTIME from
>>>> vmcoreinfo?  If someone actually cares we can add a separate note that holds
>>>> a 64bit crashtime in the per cpu notes.
>>> I think makedumpfile is using it, but I also vote to remove the
>>> CRASHTIME. It is better not to do this while crashing and a makedumpfile
>>> userspace patch is needed to drop the use of it.
>>>
>>>> As we are looking at reliability concerns removing CRASHTIME should make
>>>> everything in vmcoreinfo a boot time constant.  Which should simplify
>>>> everything considerably.
>>> It is a nice improvement..
>> We also need to take a close look at what s390 is doing with vmcoreinfo.
>> As apparently it is reading it in a different kind of crashdump process.
> Yes, need careful review from s390 and maybe ppc64 especially about
> patch 2/3, better to have comments from IBM about s390 dump tool and ppc
> fadump. Added more cc.

w.r.t powerpc/fadump, this patch-set works fine..

Thanks
Hari


_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
  2017-03-22  9:34           ` Xunlei Pang
@ 2017-03-22 12:15             ` Hari Bathini
  -1 siblings, 0 replies; 36+ messages in thread
From: Hari Bathini @ 2017-03-22 12:15 UTC (permalink / raw)
  To: xlpang, Dave Young, Eric W. Biederman
  Cc: Baoquan He, Atsushi Kumagai, Petr Tesarik, linux-kernel, akpm,
	holzheu, kexec

Hi Xunlei,


On Wednesday 22 March 2017 03:04 PM, Xunlei Pang wrote:
> On 03/22/2017 at 12:30 PM, Dave Young wrote:
>> On 03/21/17 at 10:18pm, Eric W. Biederman wrote:
>>> Dave Young <dyoung@redhat.com> writes:
>>>
>>>> On 03/20/17 at 10:33pm, Eric W. Biederman wrote:
>>>>> Xunlei Pang <xlpang@redhat.com> writes:
>>>>>
>>>>>> As Eric said,
>>>>>> "what we need to do is move the variable vmcoreinfo_note out
>>>>>> of the kernel's .bss section.  And modify the code to regenerate
>>>>>> and keep this information in something like the control page.
>>>>>>
>>>>>> Definitely something like this needs a page all to itself, and ideally
>>>>>> far away from any other kernel data structures.  I clearly was not
>>>>>> watching closely the data someone decided to keep this silly thing
>>>>>> in the kernel's .bss section."
>>>>>>
>>>>>> This patch allocates extra pages for these vmcoreinfo_XXX variables,
>>>>>> one advantage is that it enhances some safety of vmcoreinfo, because
>>>>>> vmcoreinfo now is kept far away from other kernel data structures.
>>>>> Can you preceed this patch with a patch that removes CRASHTIME from
>>>>> vmcoreinfo?  If someone actually cares we can add a separate note that holds
>>>>> a 64bit crashtime in the per cpu notes.
>>>> I think makedumpfile is using it, but I also vote to remove the
>>>> CRASHTIME. It is better not to do this while crashing and a makedumpfile
>>>> userspace patch is needed to drop the use of it.
>>>>
> By moving the CRASHTIME info to the cpu note of crashed cpu may be a good
> way. In kdump kernel, notes of vmcore elfhdr will be merged into one big note
> section, I don't know how makedumpfile or crash handle the big note section?
> If they process the note in some order, breakage will definitely happen...
>
> There is also a fadump may be affected.
>

I would like to keep tabs on such a change, as fadump builds cpu notes
differently and such a change may have an impact on it, considering it
depends on the same tools - crash, makedumpfile..

Thanks
Hari

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
@ 2017-03-22 12:15             ` Hari Bathini
  0 siblings, 0 replies; 36+ messages in thread
From: Hari Bathini @ 2017-03-22 12:15 UTC (permalink / raw)
  To: xlpang, Dave Young, Eric W. Biederman
  Cc: Baoquan He, kexec, Atsushi Kumagai, Petr Tesarik, linux-kernel,
	akpm, holzheu

Hi Xunlei,


On Wednesday 22 March 2017 03:04 PM, Xunlei Pang wrote:
> On 03/22/2017 at 12:30 PM, Dave Young wrote:
>> On 03/21/17 at 10:18pm, Eric W. Biederman wrote:
>>> Dave Young <dyoung@redhat.com> writes:
>>>
>>>> On 03/20/17 at 10:33pm, Eric W. Biederman wrote:
>>>>> Xunlei Pang <xlpang@redhat.com> writes:
>>>>>
>>>>>> As Eric said,
>>>>>> "what we need to do is move the variable vmcoreinfo_note out
>>>>>> of the kernel's .bss section.  And modify the code to regenerate
>>>>>> and keep this information in something like the control page.
>>>>>>
>>>>>> Definitely something like this needs a page all to itself, and ideally
>>>>>> far away from any other kernel data structures.  I clearly was not
>>>>>> watching closely the data someone decided to keep this silly thing
>>>>>> in the kernel's .bss section."
>>>>>>
>>>>>> This patch allocates extra pages for these vmcoreinfo_XXX variables,
>>>>>> one advantage is that it enhances some safety of vmcoreinfo, because
>>>>>> vmcoreinfo now is kept far away from other kernel data structures.
>>>>> Can you preceed this patch with a patch that removes CRASHTIME from
>>>>> vmcoreinfo?  If someone actually cares we can add a separate note that holds
>>>>> a 64bit crashtime in the per cpu notes.
>>>> I think makedumpfile is using it, but I also vote to remove the
>>>> CRASHTIME. It is better not to do this while crashing and a makedumpfile
>>>> userspace patch is needed to drop the use of it.
>>>>
> By moving the CRASHTIME info to the cpu note of crashed cpu may be a good
> way. In kdump kernel, notes of vmcore elfhdr will be merged into one big note
> section, I don't know how makedumpfile or crash handle the big note section?
> If they process the note in some order, breakage will definitely happen...
>
> There is also a fadump may be affected.
>

I would like to keep tabs on such a change, as fadump builds cpu notes
differently and such a change may have an impact on it, considering it
depends on the same tools - crash, makedumpfile..

Thanks
Hari


_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
  2017-03-22  4:30         ` Dave Young
@ 2017-03-22 20:48           ` Michael Holzheu
  -1 siblings, 0 replies; 36+ messages in thread
From: Michael Holzheu @ 2017-03-22 20:48 UTC (permalink / raw)
  To: Dave Young
  Cc: Eric W. Biederman, Baoquan He, Xunlei Pang, Atsushi Kumagai,
	Petr Tesarik, linux-kernel, akpm, kexec, mahesh, hbathini

Am Wed, 22 Mar 2017 12:30:04 +0800
schrieb Dave Young <dyoung@redhat.com>:

> On 03/21/17 at 10:18pm, Eric W. Biederman wrote:
> > Dave Young <dyoung@redhat.com> writes:
> > 

[snip]

> > > I think makedumpfile is using it, but I also vote to remove the
> > > CRASHTIME. It is better not to do this while crashing and a makedumpfile
> > > userspace patch is needed to drop the use of it.
> > >
> > >> 
> > >> As we are looking at reliability concerns removing CRASHTIME should make
> > >> everything in vmcoreinfo a boot time constant.  Which should simplify
> > >> everything considerably.
> > >
> > > It is a nice improvement..
> > 
> > We also need to take a close look at what s390 is doing with vmcoreinfo.
> > As apparently it is reading it in a different kind of crashdump process.
> 
> Yes, need careful review from s390 and maybe ppc64 especially about
> patch 2/3, better to have comments from IBM about s390 dump tool and ppc
> fadump. Added more cc.

On s390 we have at least an issue with patch 1/3. For stand-alone dump
and also because we create the ELF header for kdump in the new
kernel we save the pointer to the vmcoreinfo note in the old kernel on a
defined memory address in our absolute zero lowcore.

This is done in arch/s390/kernel/setup.c:

static void __init setup_vmcoreinfo(void)
{
        mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note());
}

Since with patch 1/3 paddr_vmcoreinfo_note() returns NULL at this point in
time we have a problem here.

To solve this - I think - we could move the initialization to
arch/s390/kernel/machine_kexec.c:

void arch_crash_save_vmcoreinfo(void)
{
        VMCOREINFO_SYMBOL(lowcore_ptr);
        VMCOREINFO_SYMBOL(high_memory);
        VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS);
        mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note());
}

Probably related to this is my observation that patch 3/3 leads to
an empty VMCOREINFO note for kdump on s390. The note is there ...

# readelf -n /var/crash/127.0.0.1-2017-03-22-21:14:39/vmcore | grep VMCORE
  VMCOREINFO           0x0000068e       Unknown note type: (0x00000000)

But it contains only zeros.

Unfortunately I have not yet understood the reason for this.

Michael

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
@ 2017-03-22 20:48           ` Michael Holzheu
  0 siblings, 0 replies; 36+ messages in thread
From: Michael Holzheu @ 2017-03-22 20:48 UTC (permalink / raw)
  To: Dave Young
  Cc: Baoquan He, Xunlei Pang, Atsushi Kumagai, Petr Tesarik,
	linux-kernel, Eric W. Biederman, hbathini, akpm, kexec

Am Wed, 22 Mar 2017 12:30:04 +0800
schrieb Dave Young <dyoung@redhat.com>:

> On 03/21/17 at 10:18pm, Eric W. Biederman wrote:
> > Dave Young <dyoung@redhat.com> writes:
> > 

[snip]

> > > I think makedumpfile is using it, but I also vote to remove the
> > > CRASHTIME. It is better not to do this while crashing and a makedumpfile
> > > userspace patch is needed to drop the use of it.
> > >
> > >> 
> > >> As we are looking at reliability concerns removing CRASHTIME should make
> > >> everything in vmcoreinfo a boot time constant.  Which should simplify
> > >> everything considerably.
> > >
> > > It is a nice improvement..
> > 
> > We also need to take a close look at what s390 is doing with vmcoreinfo.
> > As apparently it is reading it in a different kind of crashdump process.
> 
> Yes, need careful review from s390 and maybe ppc64 especially about
> patch 2/3, better to have comments from IBM about s390 dump tool and ppc
> fadump. Added more cc.

On s390 we have at least an issue with patch 1/3. For stand-alone dump
and also because we create the ELF header for kdump in the new
kernel we save the pointer to the vmcoreinfo note in the old kernel on a
defined memory address in our absolute zero lowcore.

This is done in arch/s390/kernel/setup.c:

static void __init setup_vmcoreinfo(void)
{
        mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note());
}

Since with patch 1/3 paddr_vmcoreinfo_note() returns NULL at this point in
time we have a problem here.

To solve this - I think - we could move the initialization to
arch/s390/kernel/machine_kexec.c:

void arch_crash_save_vmcoreinfo(void)
{
        VMCOREINFO_SYMBOL(lowcore_ptr);
        VMCOREINFO_SYMBOL(high_memory);
        VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS);
        mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note());
}

Probably related to this is my observation that patch 3/3 leads to
an empty VMCOREINFO note for kdump on s390. The note is there ...

# readelf -n /var/crash/127.0.0.1-2017-03-22-21:14:39/vmcore | grep VMCORE
  VMCOREINFO           0x0000068e       Unknown note type: (0x00000000)

But it contains only zeros.

Unfortunately I have not yet understood the reason for this.

Michael


_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
  2017-03-22 20:48           ` Michael Holzheu
@ 2017-03-23  9:23             ` Xunlei Pang
  -1 siblings, 0 replies; 36+ messages in thread
From: Xunlei Pang @ 2017-03-23  9:23 UTC (permalink / raw)
  To: Michael Holzheu, Dave Young
  Cc: Baoquan He, Xunlei Pang, Atsushi Kumagai, Petr Tesarik,
	linux-kernel, Eric W. Biederman, hbathini, akpm, kexec

On 03/23/2017 at 04:48 AM, Michael Holzheu wrote:
> Am Wed, 22 Mar 2017 12:30:04 +0800
> schrieb Dave Young <dyoung@redhat.com>:
>
>> On 03/21/17 at 10:18pm, Eric W. Biederman wrote:
>>> Dave Young <dyoung@redhat.com> writes:
>>>
> [snip]
>
>>>> I think makedumpfile is using it, but I also vote to remove the
>>>> CRASHTIME. It is better not to do this while crashing and a makedumpfile
>>>> userspace patch is needed to drop the use of it.
>>>>
>>>>> As we are looking at reliability concerns removing CRASHTIME should make
>>>>> everything in vmcoreinfo a boot time constant.  Which should simplify
>>>>> everything considerably.
>>>> It is a nice improvement..
>>> We also need to take a close look at what s390 is doing with vmcoreinfo.
>>> As apparently it is reading it in a different kind of crashdump process.
>> Yes, need careful review from s390 and maybe ppc64 especially about
>> patch 2/3, better to have comments from IBM about s390 dump tool and ppc
>> fadump. Added more cc.
> On s390 we have at least an issue with patch 1/3. For stand-alone dump
> and also because we create the ELF header for kdump in the new
> kernel we save the pointer to the vmcoreinfo note in the old kernel on a
> defined memory address in our absolute zero lowcore.
>
> This is done in arch/s390/kernel/setup.c:
>
> static void __init setup_vmcoreinfo(void)
> {
>         mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note());
> }
>
> Since with patch 1/3 paddr_vmcoreinfo_note() returns NULL at this point in
> time we have a problem here.
>
> To solve this - I think - we could move the initialization to
> arch/s390/kernel/machine_kexec.c:
>
> void arch_crash_save_vmcoreinfo(void)
> {
>         VMCOREINFO_SYMBOL(lowcore_ptr);
>         VMCOREINFO_SYMBOL(high_memory);
>         VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS);
>         mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note());
> }
>
> Probably related to this is my observation that patch 3/3 leads to
> an empty VMCOREINFO note for kdump on s390. The note is there ...
>
> # readelf -n /var/crash/127.0.0.1-2017-03-22-21:14:39/vmcore | grep VMCORE
>   VMCOREINFO           0x0000068e       Unknown note type: (0x00000000)
>
> But it contains only zeros.

Yes, this is a good catch, I will do more tests.

Thanks,
Xunlei

>
> Unfortunately I have not yet understood the reason for this.
>
> Michael
>
>
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
@ 2017-03-23  9:23             ` Xunlei Pang
  0 siblings, 0 replies; 36+ messages in thread
From: Xunlei Pang @ 2017-03-23  9:23 UTC (permalink / raw)
  To: Michael Holzheu, Dave Young
  Cc: Baoquan He, Xunlei Pang, Atsushi Kumagai, Petr Tesarik,
	linux-kernel, Eric W. Biederman, hbathini, akpm, kexec

On 03/23/2017 at 04:48 AM, Michael Holzheu wrote:
> Am Wed, 22 Mar 2017 12:30:04 +0800
> schrieb Dave Young <dyoung@redhat.com>:
>
>> On 03/21/17 at 10:18pm, Eric W. Biederman wrote:
>>> Dave Young <dyoung@redhat.com> writes:
>>>
> [snip]
>
>>>> I think makedumpfile is using it, but I also vote to remove the
>>>> CRASHTIME. It is better not to do this while crashing and a makedumpfile
>>>> userspace patch is needed to drop the use of it.
>>>>
>>>>> As we are looking at reliability concerns removing CRASHTIME should make
>>>>> everything in vmcoreinfo a boot time constant.  Which should simplify
>>>>> everything considerably.
>>>> It is a nice improvement..
>>> We also need to take a close look at what s390 is doing with vmcoreinfo.
>>> As apparently it is reading it in a different kind of crashdump process.
>> Yes, need careful review from s390 and maybe ppc64 especially about
>> patch 2/3, better to have comments from IBM about s390 dump tool and ppc
>> fadump. Added more cc.
> On s390 we have at least an issue with patch 1/3. For stand-alone dump
> and also because we create the ELF header for kdump in the new
> kernel we save the pointer to the vmcoreinfo note in the old kernel on a
> defined memory address in our absolute zero lowcore.
>
> This is done in arch/s390/kernel/setup.c:
>
> static void __init setup_vmcoreinfo(void)
> {
>         mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note());
> }
>
> Since with patch 1/3 paddr_vmcoreinfo_note() returns NULL at this point in
> time we have a problem here.
>
> To solve this - I think - we could move the initialization to
> arch/s390/kernel/machine_kexec.c:
>
> void arch_crash_save_vmcoreinfo(void)
> {
>         VMCOREINFO_SYMBOL(lowcore_ptr);
>         VMCOREINFO_SYMBOL(high_memory);
>         VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS);
>         mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note());
> }
>
> Probably related to this is my observation that patch 3/3 leads to
> an empty VMCOREINFO note for kdump on s390. The note is there ...
>
> # readelf -n /var/crash/127.0.0.1-2017-03-22-21:14:39/vmcore | grep VMCORE
>   VMCOREINFO           0x0000068e       Unknown note type: (0x00000000)
>
> But it contains only zeros.

Yes, this is a good catch, I will do more tests.

Thanks,
Xunlei

>
> Unfortunately I have not yet understood the reason for this.
>
> Michael
>
>
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec


_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
  2017-03-23  9:23             ` Xunlei Pang
@ 2017-03-23 17:46               ` Michael Holzheu
  -1 siblings, 0 replies; 36+ messages in thread
From: Michael Holzheu @ 2017-03-23 17:46 UTC (permalink / raw)
  To: xlpang
  Cc: xpang, Dave Young, Baoquan He, Atsushi Kumagai, Petr Tesarik,
	linux-kernel, Eric W. Biederman, hbathini, akpm, kexec

Am Thu, 23 Mar 2017 17:23:53 +0800
schrieb Xunlei Pang <xpang@redhat.com>:

> On 03/23/2017 at 04:48 AM, Michael Holzheu wrote:
> > Am Wed, 22 Mar 2017 12:30:04 +0800
> > schrieb Dave Young <dyoung@redhat.com>:
> >
> >> On 03/21/17 at 10:18pm, Eric W. Biederman wrote:
> >>> Dave Young <dyoung@redhat.com> writes:
> >>>
> > [snip]
> >
> >>>> I think makedumpfile is using it, but I also vote to remove the
> >>>> CRASHTIME. It is better not to do this while crashing and a makedumpfile
> >>>> userspace patch is needed to drop the use of it.
> >>>>
> >>>>> As we are looking at reliability concerns removing CRASHTIME should make
> >>>>> everything in vmcoreinfo a boot time constant.  Which should simplify
> >>>>> everything considerably.
> >>>> It is a nice improvement..
> >>> We also need to take a close look at what s390 is doing with vmcoreinfo.
> >>> As apparently it is reading it in a different kind of crashdump process.
> >> Yes, need careful review from s390 and maybe ppc64 especially about
> >> patch 2/3, better to have comments from IBM about s390 dump tool and ppc
> >> fadump. Added more cc.
> > On s390 we have at least an issue with patch 1/3. For stand-alone dump
> > and also because we create the ELF header for kdump in the new
> > kernel we save the pointer to the vmcoreinfo note in the old kernel on a
> > defined memory address in our absolute zero lowcore.
> >
> > This is done in arch/s390/kernel/setup.c:
> >
> > static void __init setup_vmcoreinfo(void)
> > {
> >         mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note());
> > }
> >
> > Since with patch 1/3 paddr_vmcoreinfo_note() returns NULL at this point in
> > time we have a problem here.
> >
> > To solve this - I think - we could move the initialization to
> > arch/s390/kernel/machine_kexec.c:
> >
> > void arch_crash_save_vmcoreinfo(void)
> > {
> >         VMCOREINFO_SYMBOL(lowcore_ptr);
> >         VMCOREINFO_SYMBOL(high_memory);
> >         VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS);
> >         mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note());
> > }
> >
> > Probably related to this is my observation that patch 3/3 leads to
> > an empty VMCOREINFO note for kdump on s390. The note is there ...
> >
> > # readelf -n /var/crash/127.0.0.1-2017-03-22-21:14:39/vmcore | grep VMCORE
> >   VMCOREINFO           0x0000068e       Unknown note type: (0x00000000)
> >
> > But it contains only zeros.
> 
> Yes, this is a good catch, I will do more tests.

Hello Xunlei,

After spending some time on this, I now understand the problem:

In patch 3/3 you copy vmcoreinfo into the control page before
machine_kexec_prepare() is called. For s390 we give back all the
crashkernel memory to the hypervisor before the new crashkernel
is loaded:

/*
 * Give back memory to hypervisor before new kdump is loaded
 */
static int machine_kexec_prepare_kdump(void)
{
#ifdef CONFIG_CRASH_DUMP
        if (MACHINE_IS_VM)
                diag10_range(PFN_DOWN(crashk_res.start),
                             PFN_DOWN(crashk_res.end - crashk_res.start + 1));
        return 0;
#else
        return -EINVAL;
#endif
}

So after machine_kexec_prepare_kdump() the contents of your control page
are gone and therefore the vmcoreinfo ELF note contains only zeros.

If you call kimage_crash_copy_vmcoreinfo() after
machine_kexec_prepare_kdump() the problem should be solved for s390.

Regards
Michael

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
@ 2017-03-23 17:46               ` Michael Holzheu
  0 siblings, 0 replies; 36+ messages in thread
From: Michael Holzheu @ 2017-03-23 17:46 UTC (permalink / raw)
  To: xlpang
  Cc: Baoquan He, xpang, Atsushi Kumagai, Petr Tesarik, linux-kernel,
	Eric W. Biederman, hbathini, akpm, Dave Young, kexec

Am Thu, 23 Mar 2017 17:23:53 +0800
schrieb Xunlei Pang <xpang@redhat.com>:

> On 03/23/2017 at 04:48 AM, Michael Holzheu wrote:
> > Am Wed, 22 Mar 2017 12:30:04 +0800
> > schrieb Dave Young <dyoung@redhat.com>:
> >
> >> On 03/21/17 at 10:18pm, Eric W. Biederman wrote:
> >>> Dave Young <dyoung@redhat.com> writes:
> >>>
> > [snip]
> >
> >>>> I think makedumpfile is using it, but I also vote to remove the
> >>>> CRASHTIME. It is better not to do this while crashing and a makedumpfile
> >>>> userspace patch is needed to drop the use of it.
> >>>>
> >>>>> As we are looking at reliability concerns removing CRASHTIME should make
> >>>>> everything in vmcoreinfo a boot time constant.  Which should simplify
> >>>>> everything considerably.
> >>>> It is a nice improvement..
> >>> We also need to take a close look at what s390 is doing with vmcoreinfo.
> >>> As apparently it is reading it in a different kind of crashdump process.
> >> Yes, need careful review from s390 and maybe ppc64 especially about
> >> patch 2/3, better to have comments from IBM about s390 dump tool and ppc
> >> fadump. Added more cc.
> > On s390 we have at least an issue with patch 1/3. For stand-alone dump
> > and also because we create the ELF header for kdump in the new
> > kernel we save the pointer to the vmcoreinfo note in the old kernel on a
> > defined memory address in our absolute zero lowcore.
> >
> > This is done in arch/s390/kernel/setup.c:
> >
> > static void __init setup_vmcoreinfo(void)
> > {
> >         mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note());
> > }
> >
> > Since with patch 1/3 paddr_vmcoreinfo_note() returns NULL at this point in
> > time we have a problem here.
> >
> > To solve this - I think - we could move the initialization to
> > arch/s390/kernel/machine_kexec.c:
> >
> > void arch_crash_save_vmcoreinfo(void)
> > {
> >         VMCOREINFO_SYMBOL(lowcore_ptr);
> >         VMCOREINFO_SYMBOL(high_memory);
> >         VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS);
> >         mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note());
> > }
> >
> > Probably related to this is my observation that patch 3/3 leads to
> > an empty VMCOREINFO note for kdump on s390. The note is there ...
> >
> > # readelf -n /var/crash/127.0.0.1-2017-03-22-21:14:39/vmcore | grep VMCORE
> >   VMCOREINFO           0x0000068e       Unknown note type: (0x00000000)
> >
> > But it contains only zeros.
> 
> Yes, this is a good catch, I will do more tests.

Hello Xunlei,

After spending some time on this, I now understand the problem:

In patch 3/3 you copy vmcoreinfo into the control page before
machine_kexec_prepare() is called. For s390 we give back all the
crashkernel memory to the hypervisor before the new crashkernel
is loaded:

/*
 * Give back memory to hypervisor before new kdump is loaded
 */
static int machine_kexec_prepare_kdump(void)
{
#ifdef CONFIG_CRASH_DUMP
        if (MACHINE_IS_VM)
                diag10_range(PFN_DOWN(crashk_res.start),
                             PFN_DOWN(crashk_res.end - crashk_res.start + 1));
        return 0;
#else
        return -EINVAL;
#endif
}

So after machine_kexec_prepare_kdump() the contents of your control page
are gone and therefore the vmcoreinfo ELF note contains only zeros.

If you call kimage_crash_copy_vmcoreinfo() after
machine_kexec_prepare_kdump() the problem should be solved for s390.

Regards
Michael


_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
  2017-03-23 17:46               ` Michael Holzheu
@ 2017-03-24 11:03                 ` Xunlei Pang
  -1 siblings, 0 replies; 36+ messages in thread
From: Xunlei Pang @ 2017-03-24 11:03 UTC (permalink / raw)
  To: Michael Holzheu, xlpang
  Cc: Baoquan He, Atsushi Kumagai, Petr Tesarik, linux-kernel,
	Eric W. Biederman, hbathini, akpm, Dave Young, kexec

On 03/24/2017 at 01:46 AM, Michael Holzheu wrote:
> Am Thu, 23 Mar 2017 17:23:53 +0800
> schrieb Xunlei Pang <xpang@redhat.com>:
>
>> On 03/23/2017 at 04:48 AM, Michael Holzheu wrote:
>>> Am Wed, 22 Mar 2017 12:30:04 +0800
>>> schrieb Dave Young <dyoung@redhat.com>:
>>>
>>>> On 03/21/17 at 10:18pm, Eric W. Biederman wrote:
>>>>> Dave Young <dyoung@redhat.com> writes:
>>>>>
>>> [snip]
>>>
>>>>>> I think makedumpfile is using it, but I also vote to remove the
>>>>>> CRASHTIME. It is better not to do this while crashing and a makedumpfile
>>>>>> userspace patch is needed to drop the use of it.
>>>>>>
>>>>>>> As we are looking at reliability concerns removing CRASHTIME should make
>>>>>>> everything in vmcoreinfo a boot time constant.  Which should simplify
>>>>>>> everything considerably.
>>>>>> It is a nice improvement..
>>>>> We also need to take a close look at what s390 is doing with vmcoreinfo.
>>>>> As apparently it is reading it in a different kind of crashdump process.
>>>> Yes, need careful review from s390 and maybe ppc64 especially about
>>>> patch 2/3, better to have comments from IBM about s390 dump tool and ppc
>>>> fadump. Added more cc.
>>> On s390 we have at least an issue with patch 1/3. For stand-alone dump
>>> and also because we create the ELF header for kdump in the new
>>> kernel we save the pointer to the vmcoreinfo note in the old kernel on a
>>> defined memory address in our absolute zero lowcore.
>>>
>>> This is done in arch/s390/kernel/setup.c:
>>>
>>> static void __init setup_vmcoreinfo(void)
>>> {
>>>         mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note());
>>> }
>>>
>>> Since with patch 1/3 paddr_vmcoreinfo_note() returns NULL at this point in
>>> time we have a problem here.
>>>
>>> To solve this - I think - we could move the initialization to
>>> arch/s390/kernel/machine_kexec.c:
>>>
>>> void arch_crash_save_vmcoreinfo(void)
>>> {
>>>         VMCOREINFO_SYMBOL(lowcore_ptr);
>>>         VMCOREINFO_SYMBOL(high_memory);
>>>         VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS);
>>>         mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note());
>>> }
>>>
>>> Probably related to this is my observation that patch 3/3 leads to
>>> an empty VMCOREINFO note for kdump on s390. The note is there ...
>>>
>>> # readelf -n /var/crash/127.0.0.1-2017-03-22-21:14:39/vmcore | grep VMCORE
>>>   VMCOREINFO           0x0000068e       Unknown note type: (0x00000000)
>>>
>>> But it contains only zeros.
>> Yes, this is a good catch, I will do more tests.
> Hello Xunlei,
>
> After spending some time on this, I now understood the problem:
>
> In patch 3/3 you copy vmcoreinfo into the control page before
> machine_kexec_prepare() is called. For s390 we give back all the
> crashkernel memory to the hypervisor before the new crashkernel
> is loaded:
>
> /*
>  * Give back memory to hypervisor before new kdump is loaded
>  */
> static int machine_kexec_prepare_kdump(void)
> {
> #ifdef CONFIG_CRASH_DUMP
>         if (MACHINE_IS_VM)
>                 diag10_range(PFN_DOWN(crashk_res.start),
>                              PFN_DOWN(crashk_res.end - crashk_res.start + 1));
>         return 0;
> #else
>         return -EINVAL;
> #endif
> }
>
> So after machine_kexec_prepare_kdump() the contents of your control page
> is gone and therefore the vmcorinfo ELF note contains only zeros.
>
> If you call kimage_crash_copy_vmcoreinfo() after
> machine_kexec_prepare_kdump() the problem should be solved for s390.

Will update, thanks for finding the root cause.

Regards,
Xunlei

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section
@ 2017-03-24 11:03                 ` Xunlei Pang
  0 siblings, 0 replies; 36+ messages in thread
From: Xunlei Pang @ 2017-03-24 11:03 UTC (permalink / raw)
  To: Michael Holzheu, xlpang
  Cc: Baoquan He, kexec, Atsushi Kumagai, Petr Tesarik, linux-kernel,
	Eric W. Biederman, hbathini, akpm, Dave Young

On 03/24/2017 at 01:46 AM, Michael Holzheu wrote:
> Am Thu, 23 Mar 2017 17:23:53 +0800
> schrieb Xunlei Pang <xpang@redhat.com>:
>
>> On 03/23/2017 at 04:48 AM, Michael Holzheu wrote:
>>> Am Wed, 22 Mar 2017 12:30:04 +0800
>>> schrieb Dave Young <dyoung@redhat.com>:
>>>
>>>> On 03/21/17 at 10:18pm, Eric W. Biederman wrote:
>>>>> Dave Young <dyoung@redhat.com> writes:
>>>>>
>>> [snip]
>>>
>>>>>> I think makedumpfile is using it, but I also vote to remove the
>>>>>> CRASHTIME. It is better not to do this while crashing and a makedumpfile
>>>>>> userspace patch is needed to drop the use of it.
>>>>>>
>>>>>>> As we are looking at reliability concerns removing CRASHTIME should make
>>>>>>> everything in vmcoreinfo a boot time constant.  Which should simplify
>>>>>>> everything considerably.
>>>>>> It is a nice improvement..
>>>>> We also need to take a close look at what s390 is doing with vmcoreinfo.
>>>>> As apparently it is reading it in a different kind of crashdump process.
>>>> Yes, need careful review from s390 and maybe ppc64 especially about
>>>> patch 2/3, better to have comments from IBM about s390 dump tool and ppc
>>>> fadump. Added more cc.
>>> On s390 we have at least an issue with patch 1/3. For stand-alone dump
>>> and also because we create the ELF header for kdump in the new
>>> kernel we save the pointer to the vmcoreinfo note in the old kernel on a
>>> defined memory address in our absolute zero lowcore.
>>>
>>> This is done in arch/s390/kernel/setup.c:
>>>
>>> static void __init setup_vmcoreinfo(void)
>>> {
>>>         mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note());
>>> }
>>>
>>> Since with patch 1/3 paddr_vmcoreinfo_note() returns NULL at this point in
>>> time we have a problem here.
>>>
>>> To solve this - I think - we could move the initialization to
>>> arch/s390/kernel/machine_kexec.c:
>>>
>>> void arch_crash_save_vmcoreinfo(void)
>>> {
>>>         VMCOREINFO_SYMBOL(lowcore_ptr);
>>>         VMCOREINFO_SYMBOL(high_memory);
>>>         VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS);
>>>         mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note());
>>> }
>>>
>>> Probably related to this is my observation that patch 3/3 leads to
>>> an empty VMCOREINFO note for kdump on s390. The note is there ...
>>>
>>> # readelf -n /var/crash/127.0.0.1-2017-03-22-21:14:39/vmcore | grep VMCORE
>>>   VMCOREINFO           0x0000068e       Unknown note type: (0x00000000)
>>>
>>> But it contains only zeros.
>> Yes, this is a good catch, I will do more tests.
> Hello Xunlei,
>
> After spending some time on this, I now understand the problem:
>
> In patch 3/3 you copy vmcoreinfo into the control page before
> machine_kexec_prepare() is called. For s390 we give back all the
> crashkernel memory to the hypervisor before the new crashkernel
> is loaded:
>
> /*
>  * Give back memory to hypervisor before new kdump is loaded
>  */
> static int machine_kexec_prepare_kdump(void)
> {
> #ifdef CONFIG_CRASH_DUMP
>         if (MACHINE_IS_VM)
>                 diag10_range(PFN_DOWN(crashk_res.start),
>                              PFN_DOWN(crashk_res.end - crashk_res.start + 1));
>         return 0;
> #else
>         return -EINVAL;
> #endif
> }
>
> So after machine_kexec_prepare_kdump() the contents of your control page
> are gone and therefore the vmcoreinfo ELF note contains only zeros.
>
> If you call kimage_crash_copy_vmcoreinfo() after
> machine_kexec_prepare_kdump() the problem should be solved for s390.

Will update, thanks for finding the root cause.

Regards,
Xunlei

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 36+ messages in thread

end of thread, other threads:[~2017-03-24 11:00 UTC | newest]

Thread overview: 36+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-03-20  5:50 [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section Xunlei Pang
2017-03-20  5:50 ` Xunlei Pang
2017-03-20  5:50 ` [PATCH v3 2/3] powerpc/fadump: Use the correct VMCOREINFO_NOTE_SIZE for phdr Xunlei Pang
2017-03-20  5:50   ` Xunlei Pang
2017-03-20  5:50 ` [PATCH v3 3/3] kdump: Relocate vmcoreinfo to the crash memory range Xunlei Pang
2017-03-20  5:50   ` Xunlei Pang
2017-03-21  3:33 ` [PATCH v3 1/3] kexec: Move vmcoreinfo out of the kernel's .bss section Eric W. Biederman
2017-03-21  3:33   ` Eric W. Biederman
2017-03-22  2:55   ` Dave Young
2017-03-22  2:55     ` Dave Young
2017-03-22  3:18     ` Eric W. Biederman
2017-03-22  3:18       ` Eric W. Biederman
2017-03-22  4:30       ` Dave Young
2017-03-22  4:30         ` Dave Young
2017-03-22  9:34         ` Xunlei Pang
2017-03-22  9:34           ` Xunlei Pang
2017-03-22 12:15           ` Hari Bathini
2017-03-22 12:15             ` Hari Bathini
2017-03-22 11:46         ` Hari Bathini
2017-03-22 11:46           ` Hari Bathini
2017-03-22 20:48         ` Michael Holzheu
2017-03-22 20:48           ` Michael Holzheu
2017-03-23  9:23           ` Xunlei Pang
2017-03-23  9:23             ` Xunlei Pang
2017-03-23 17:46             ` Michael Holzheu
2017-03-23 17:46               ` Michael Holzheu
2017-03-24 11:03               ` Xunlei Pang
2017-03-24 11:03                 ` Xunlei Pang
2017-03-22  8:55   ` Xunlei Pang
2017-03-22  8:55     ` Xunlei Pang
2017-03-22  9:16     ` Xunlei Pang
2017-03-22  9:16       ` Xunlei Pang
2017-03-22  9:17     ` Xunlei Pang
2017-03-22  9:17       ` Xunlei Pang
2017-03-21  9:27 ` Petr Tesarik
2017-03-21  9:27   ` Petr Tesarik

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.