All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] kexec: Add quick kexec support for kernel
@ 2020-08-14  5:52 ` Sang Yan
  0 siblings, 0 replies; 25+ messages in thread
From: Sang Yan @ 2020-08-14  5:52 UTC (permalink / raw)
  To: kexec, ebiederm, linux-kernel, xiexiuqi, guohanjun
  Cc: zhuling8, luanjianhai, luchunhua

In normal kexec, relocating kernel may cost 5 ~ 10 seconds, to
copy all segments from vmalloced memory to kernel boot memory,
because of disabled mmu.

We introduce quick kexec to save time of copying memory as above,
just like kdump(kexec on crash), by using reserved memory
"Quick Kexec".

Constructing quick kimage as the same as crash kernel,
then simply copy all segments of kimage to reserved memroy.

We also add this support in syscall kexec_load using flags
of KEXEC_QUICK.

Signed-off-by: Sang Yan <sangyan@huawei.com>
---
 arch/Kconfig               | 10 ++++++++++
 include/linux/ioport.h     |  3 +++
 include/linux/kexec.h      | 13 +++++++++++-
 include/uapi/linux/kexec.h |  3 +++
 kernel/kexec.c             | 10 ++++++++++
 kernel/kexec_core.c        | 41 +++++++++++++++++++++++++++++---------
 6 files changed, 70 insertions(+), 10 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 3329fa143637..eca782cb8e29 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -21,6 +21,16 @@ config KEXEC_CORE
 config KEXEC_ELF
 	bool
 
+config QUICK_KEXEC
+	bool "Support for quick kexec"
+	depends on KEXEC_CORE
+	help
+	  Say y here to enable this feature.
+	  It use reserved memory to accelerate kexec, just like crash
+	  kexec, load new kernel and initrd to reserved memory, and
+	  boot new kernel on that memory. It will save the time of
+	  relocating kernel.
+
 config HAVE_IMA_KEXEC
 	bool
 
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 6c2b06fe8beb..f37c632accbe 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -136,6 +136,9 @@ enum {
 	IORES_DESC_DEVICE_PRIVATE_MEMORY	= 6,
 	IORES_DESC_RESERVED			= 7,
 	IORES_DESC_SOFT_RESERVED		= 8,
+#ifdef CONFIG_QUICK_KEXEC
+	IORES_DESC_QUICK_KEXEC			= 9,
+#endif
 };
 
 /*
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 9e93bef52968..976bf9631070 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -269,9 +269,12 @@ struct kimage {
 	unsigned long control_page;
 
 	/* Flags to indicate special processing */
-	unsigned int type : 1;
+	unsigned int type : 2;
 #define KEXEC_TYPE_DEFAULT 0
 #define KEXEC_TYPE_CRASH   1
+#ifdef CONFIG_QUICK_KEXEC
+#define KEXEC_TYPE_QUICK   2
+#endif
 	unsigned int preserve_context : 1;
 	/* If set, we are using file mode kexec syscall */
 	unsigned int file_mode:1;
@@ -331,6 +334,11 @@ extern int kexec_load_disabled;
 #define KEXEC_FLAGS    (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT)
 #endif
 
+#ifdef CONFIG_QUICK_KEXEC
+#undef KEXEC_FLAGS
+#define KEXEC_FLAGS    (KEXEC_ON_CRASH | KEXEC_QUICK)
+#endif
+
 /* List of defined/legal kexec file flags */
 #define KEXEC_FILE_FLAGS	(KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \
 				 KEXEC_FILE_NO_INITRAMFS)
@@ -340,6 +348,9 @@ extern int kexec_load_disabled;
 extern struct resource crashk_res;
 extern struct resource crashk_low_res;
 extern note_buf_t __percpu *crash_notes;
+#ifdef CONFIG_QUICK_KEXEC
+extern struct resource quick_kexec_res;
+#endif
 
 /* flag to track if kexec reboot is in progress */
 extern bool kexec_in_progress;
diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
index 05669c87a0af..e3213614b713 100644
--- a/include/uapi/linux/kexec.h
+++ b/include/uapi/linux/kexec.h
@@ -12,6 +12,9 @@
 /* kexec flags for different usage scenarios */
 #define KEXEC_ON_CRASH		0x00000001
 #define KEXEC_PRESERVE_CONTEXT	0x00000002
+#ifdef CONFIG_QUICK_KEXEC
+#define KEXEC_QUICK		0x00000004
+#endif
 #define KEXEC_ARCH_MASK		0xffff0000
 
 /*
diff --git a/kernel/kexec.c b/kernel/kexec.c
index f977786fe498..428af4cd3e1a 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -44,6 +44,9 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
 	int ret;
 	struct kimage *image;
 	bool kexec_on_panic = flags & KEXEC_ON_CRASH;
+#ifdef CONFIG_QUICK_KEXEC
+	bool kexec_on_quick = flags & KEXEC_QUICK;
+#endif
 
 	if (kexec_on_panic) {
 		/* Verify we have a valid entry point */
@@ -69,6 +72,13 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
 		image->type = KEXEC_TYPE_CRASH;
 	}
 
+#ifdef CONFIG_QUICK_KEXEC
+	if (kexec_on_quick) {
+		image->control_page = quick_kexec_res.start;
+		image->type = KEXEC_TYPE_QUICK;
+	}
+#endif
+
 	ret = sanity_check_segment_list(image);
 	if (ret)
 		goto out_free_image;
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index c19c0dad1ebe..b73dd749368b 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -70,6 +70,16 @@ struct resource crashk_low_res = {
 	.desc  = IORES_DESC_CRASH_KERNEL
 };
 
+#ifdef CONFIG_QUICK_KEXEC
+struct resource quick_kexec_res = {
+	.name  = "Quick kexec",
+	.start = 0,
+	.end   = 0,
+	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+	.desc  = IORES_DESC_QUICK_KEXEC
+};
+#endif
+
 int kexec_should_crash(struct task_struct *p)
 {
 	/*
@@ -413,8 +423,10 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
 	return pages;
 }
 
-static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
-						      unsigned int order)
+
+static struct page *kimage_alloc_special_control_pages(struct kimage *image,
+						       unsigned int order,
+						       unsigned long end)
 {
 	/* Control pages are special, they are the intermediaries
 	 * that are needed while we copy the rest of the pages
@@ -444,7 +456,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 	size = (1 << order) << PAGE_SHIFT;
 	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
 	hole_end   = hole_start + size - 1;
-	while (hole_end <= crashk_res.end) {
+	while (hole_end <= end) {
 		unsigned long i;
 
 		cond_resched();
@@ -479,7 +491,6 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 	return pages;
 }
 
-
 struct page *kimage_alloc_control_pages(struct kimage *image,
 					 unsigned int order)
 {
@@ -490,8 +501,15 @@ struct page *kimage_alloc_control_pages(struct kimage *image,
 		pages = kimage_alloc_normal_control_pages(image, order);
 		break;
 	case KEXEC_TYPE_CRASH:
-		pages = kimage_alloc_crash_control_pages(image, order);
+		pages = kimage_alloc_special_control_pages(image, order,
+							   crashk_res.end);
+		break;
+#ifdef CONFIG_QUICK_KEXEC
+	case KEXEC_TYPE_QUICK:
+		pages = kimage_alloc_special_control_pages(image, order,
+							   quick_kexec_res.end);
 		break;
+#endif
 	}
 
 	return pages;
@@ -847,11 +865,11 @@ static int kimage_load_normal_segment(struct kimage *image,
 	return result;
 }
 
-static int kimage_load_crash_segment(struct kimage *image,
+static int kimage_load_special_segment(struct kimage *image,
 					struct kexec_segment *segment)
 {
-	/* For crash dumps kernels we simply copy the data from
-	 * user space to it's destination.
+	/* For crash dumps kernels and quick kexec kernels
+	 * we simply copy the data from user space to it's destination.
 	 * We do things a page at a time for the sake of kmap.
 	 */
 	unsigned long maddr;
@@ -925,8 +943,13 @@ int kimage_load_segment(struct kimage *image,
 		result = kimage_load_normal_segment(image, segment);
 		break;
 	case KEXEC_TYPE_CRASH:
-		result = kimage_load_crash_segment(image, segment);
+		result = kimage_load_special_segment(image, segment);
+		break;
+#ifdef CONFIG_QUICK_KEXEC
+	case KEXEC_TYPE_QUICK:
+		result = kimage_load_special_segment(image, segment);
 		break;
+#endif
 	}
 
 	return result;
-- 
2.19.1


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 1/2] kexec: Add quick kexec support for kernel
@ 2020-08-14  5:52 ` Sang Yan
  0 siblings, 0 replies; 25+ messages in thread
From: Sang Yan @ 2020-08-14  5:52 UTC (permalink / raw)
  To: kexec, ebiederm, linux-kernel, xiexiuqi, guohanjun
  Cc: luanjianhai, zhuling8, luchunhua

In normal kexec, relocating kernel may cost 5 ~ 10 seconds, to
copy all segments from vmalloced memory to kernel boot memory,
because of disabled mmu.

We introduce quick kexec to save time of copying memory as above,
just like kdump(kexec on crash), by using reserved memory
"Quick Kexec".

Constructing quick kimage as the same as crash kernel,
then simply copy all segments of kimage to reserved memroy.

We also add this support in syscall kexec_load using flags
of KEXEC_QUICK.

Signed-off-by: Sang Yan <sangyan@huawei.com>
---
 arch/Kconfig               | 10 ++++++++++
 include/linux/ioport.h     |  3 +++
 include/linux/kexec.h      | 13 +++++++++++-
 include/uapi/linux/kexec.h |  3 +++
 kernel/kexec.c             | 10 ++++++++++
 kernel/kexec_core.c        | 41 +++++++++++++++++++++++++++++---------
 6 files changed, 70 insertions(+), 10 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 3329fa143637..eca782cb8e29 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -21,6 +21,16 @@ config KEXEC_CORE
 config KEXEC_ELF
 	bool
 
+config QUICK_KEXEC
+	bool "Support for quick kexec"
+	depends on KEXEC_CORE
+	help
+	  Say y here to enable this feature.
+	  It use reserved memory to accelerate kexec, just like crash
+	  kexec, load new kernel and initrd to reserved memory, and
+	  boot new kernel on that memory. It will save the time of
+	  relocating kernel.
+
 config HAVE_IMA_KEXEC
 	bool
 
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 6c2b06fe8beb..f37c632accbe 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -136,6 +136,9 @@ enum {
 	IORES_DESC_DEVICE_PRIVATE_MEMORY	= 6,
 	IORES_DESC_RESERVED			= 7,
 	IORES_DESC_SOFT_RESERVED		= 8,
+#ifdef CONFIG_QUICK_KEXEC
+	IORES_DESC_QUICK_KEXEC			= 9,
+#endif
 };
 
 /*
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 9e93bef52968..976bf9631070 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -269,9 +269,12 @@ struct kimage {
 	unsigned long control_page;
 
 	/* Flags to indicate special processing */
-	unsigned int type : 1;
+	unsigned int type : 2;
 #define KEXEC_TYPE_DEFAULT 0
 #define KEXEC_TYPE_CRASH   1
+#ifdef CONFIG_QUICK_KEXEC
+#define KEXEC_TYPE_QUICK   2
+#endif
 	unsigned int preserve_context : 1;
 	/* If set, we are using file mode kexec syscall */
 	unsigned int file_mode:1;
@@ -331,6 +334,11 @@ extern int kexec_load_disabled;
 #define KEXEC_FLAGS    (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT)
 #endif
 
+#ifdef CONFIG_QUICK_KEXEC
+#undef KEXEC_FLAGS
+#define KEXEC_FLAGS    (KEXEC_ON_CRASH | KEXEC_QUICK)
+#endif
+
 /* List of defined/legal kexec file flags */
 #define KEXEC_FILE_FLAGS	(KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \
 				 KEXEC_FILE_NO_INITRAMFS)
@@ -340,6 +348,9 @@ extern int kexec_load_disabled;
 extern struct resource crashk_res;
 extern struct resource crashk_low_res;
 extern note_buf_t __percpu *crash_notes;
+#ifdef CONFIG_QUICK_KEXEC
+extern struct resource quick_kexec_res;
+#endif
 
 /* flag to track if kexec reboot is in progress */
 extern bool kexec_in_progress;
diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
index 05669c87a0af..e3213614b713 100644
--- a/include/uapi/linux/kexec.h
+++ b/include/uapi/linux/kexec.h
@@ -12,6 +12,9 @@
 /* kexec flags for different usage scenarios */
 #define KEXEC_ON_CRASH		0x00000001
 #define KEXEC_PRESERVE_CONTEXT	0x00000002
+#ifdef CONFIG_QUICK_KEXEC
+#define KEXEC_QUICK		0x00000004
+#endif
 #define KEXEC_ARCH_MASK		0xffff0000
 
 /*
diff --git a/kernel/kexec.c b/kernel/kexec.c
index f977786fe498..428af4cd3e1a 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -44,6 +44,9 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
 	int ret;
 	struct kimage *image;
 	bool kexec_on_panic = flags & KEXEC_ON_CRASH;
+#ifdef CONFIG_QUICK_KEXEC
+	bool kexec_on_quick = flags & KEXEC_QUICK;
+#endif
 
 	if (kexec_on_panic) {
 		/* Verify we have a valid entry point */
@@ -69,6 +72,13 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
 		image->type = KEXEC_TYPE_CRASH;
 	}
 
+#ifdef CONFIG_QUICK_KEXEC
+	if (kexec_on_quick) {
+		image->control_page = quick_kexec_res.start;
+		image->type = KEXEC_TYPE_QUICK;
+	}
+#endif
+
 	ret = sanity_check_segment_list(image);
 	if (ret)
 		goto out_free_image;
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index c19c0dad1ebe..b73dd749368b 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -70,6 +70,16 @@ struct resource crashk_low_res = {
 	.desc  = IORES_DESC_CRASH_KERNEL
 };
 
+#ifdef CONFIG_QUICK_KEXEC
+struct resource quick_kexec_res = {
+	.name  = "Quick kexec",
+	.start = 0,
+	.end   = 0,
+	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+	.desc  = IORES_DESC_QUICK_KEXEC
+};
+#endif
+
 int kexec_should_crash(struct task_struct *p)
 {
 	/*
@@ -413,8 +423,10 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
 	return pages;
 }
 
-static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
-						      unsigned int order)
+
+static struct page *kimage_alloc_special_control_pages(struct kimage *image,
+						       unsigned int order,
+						       unsigned long end)
 {
 	/* Control pages are special, they are the intermediaries
 	 * that are needed while we copy the rest of the pages
@@ -444,7 +456,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 	size = (1 << order) << PAGE_SHIFT;
 	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
 	hole_end   = hole_start + size - 1;
-	while (hole_end <= crashk_res.end) {
+	while (hole_end <= end) {
 		unsigned long i;
 
 		cond_resched();
@@ -479,7 +491,6 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 	return pages;
 }
 
-
 struct page *kimage_alloc_control_pages(struct kimage *image,
 					 unsigned int order)
 {
@@ -490,8 +501,15 @@ struct page *kimage_alloc_control_pages(struct kimage *image,
 		pages = kimage_alloc_normal_control_pages(image, order);
 		break;
 	case KEXEC_TYPE_CRASH:
-		pages = kimage_alloc_crash_control_pages(image, order);
+		pages = kimage_alloc_special_control_pages(image, order,
+							   crashk_res.end);
+		break;
+#ifdef CONFIG_QUICK_KEXEC
+	case KEXEC_TYPE_QUICK:
+		pages = kimage_alloc_special_control_pages(image, order,
+							   quick_kexec_res.end);
 		break;
+#endif
 	}
 
 	return pages;
@@ -847,11 +865,11 @@ static int kimage_load_normal_segment(struct kimage *image,
 	return result;
 }
 
-static int kimage_load_crash_segment(struct kimage *image,
+static int kimage_load_special_segment(struct kimage *image,
 					struct kexec_segment *segment)
 {
-	/* For crash dumps kernels we simply copy the data from
-	 * user space to it's destination.
+	/* For crash dumps kernels and quick kexec kernels
+	 * we simply copy the data from user space to it's destination.
 	 * We do things a page at a time for the sake of kmap.
 	 */
 	unsigned long maddr;
@@ -925,8 +943,13 @@ int kimage_load_segment(struct kimage *image,
 		result = kimage_load_normal_segment(image, segment);
 		break;
 	case KEXEC_TYPE_CRASH:
-		result = kimage_load_crash_segment(image, segment);
+		result = kimage_load_special_segment(image, segment);
+		break;
+#ifdef CONFIG_QUICK_KEXEC
+	case KEXEC_TYPE_QUICK:
+		result = kimage_load_special_segment(image, segment);
 		break;
+#endif
 	}
 
 	return result;
-- 
2.19.1


_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 2/2] arm64: Reserve memory for quick kexec
  2020-08-14  5:52 ` Sang Yan
@ 2020-08-14  5:52   ` Sang Yan
  -1 siblings, 0 replies; 25+ messages in thread
From: Sang Yan @ 2020-08-14  5:52 UTC (permalink / raw)
  To: kexec, ebiederm, linux-kernel, xiexiuqi, guohanjun
  Cc: zhuling8, luanjianhai, luchunhua

Reserve memory for quick kexec on arm64
with cmdline "quickkexec=".

Signed-off-by: Sang Yan <sangyan@huawei.com>
---
 arch/arm64/kernel/setup.c |  6 ++++++
 arch/arm64/mm/init.c      | 43 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+)

diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 77c4c9bad1b8..2a5dc032d95e 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -369,6 +369,12 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p)
 	 */
 	init_task.thread_info.ttbr0 = __pa_symbol(empty_zero_page);
 #endif
+#ifdef CONFIG_QUICK_KEXEC
+		if (quick_kexec_res.end &&
+		    quick_kexec_res.start >= res->start &&
+		    quick_kexec_res.end <= res->end)
+			request_resource(res, &quick_kexec_res);
+#endif
 
 	if (boot_args[1] || boot_args[2] || boot_args[3]) {
 		pr_err("WARNING: x1-x3 nonzero in violation of boot protocol:\n"
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 481d22c32a2e..579acb93728f 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -130,6 +130,45 @@ static void __init reserve_crashkernel(void)
 }
 #endif /* CONFIG_KEXEC_CORE */
 
+#ifdef CONFIG_QUICK_KEXEC
+static int __init parse_quick_kexec(char *p)
+{
+	if (!p)
+		return 0;
+
+	quick_kexec_res.end = PAGE_ALIGN(memparse(p, NULL));
+
+	return 0;
+}
+early_param("quickkexec", parse_quick_kexec);
+
+static void __init reserve_quick_kexec(void)
+{
+	unsigned long long mem_start, mem_len;
+
+	mem_len = quick_kexec_res.end;
+	if (mem_len == 0)
+		return;
+
+	/* Current arm64 boot protocol requires 2MB alignment */
+	mem_start = memblock_find_in_range(0, ARCH_LOW_ADDRESS_LIMIT,
+			mem_len, CRASH_ALIGN);
+	if (mem_start == 0) {
+		pr_warn("cannot allocate quick kexec mem (size:0x%llx)\n",
+			mem_len);
+		quick_kexec_res.end = 0;
+		return;
+	}
+
+	memblock_reserve(mem_start, mem_len);
+	pr_info("quick kexec mem reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
+		mem_start, mem_start + mem_len,	mem_len >> 20);
+
+	quick_kexec_res.start = mem_start;
+	quick_kexec_res.end = mem_start + mem_len - 1;
+}
+#endif
+
 #ifdef CONFIG_CRASH_DUMP
 static int __init early_init_dt_scan_elfcorehdr(unsigned long node,
 		const char *uname, int depth, void *data)
@@ -399,6 +438,10 @@ void __init arm64_memblock_init(void)
 
 	reserve_crashkernel();
 
+#ifdef CONFIG_QUICK_KEXEC
+	reserve_quick_kexec();
+#endif
+
 	reserve_elfcorehdr();
 
 	high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
-- 
2.19.1


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 2/2] arm64: Reserve memory for quick kexec
@ 2020-08-14  5:52   ` Sang Yan
  0 siblings, 0 replies; 25+ messages in thread
From: Sang Yan @ 2020-08-14  5:52 UTC (permalink / raw)
  To: kexec, ebiederm, linux-kernel, xiexiuqi, guohanjun
  Cc: luanjianhai, zhuling8, luchunhua

Reserve memory for quick kexec on arm64
with cmdline "quickkexec=".

Signed-off-by: Sang Yan <sangyan@huawei.com>
---
 arch/arm64/kernel/setup.c |  6 ++++++
 arch/arm64/mm/init.c      | 43 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+)

diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 77c4c9bad1b8..2a5dc032d95e 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -369,6 +369,12 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p)
 	 */
 	init_task.thread_info.ttbr0 = __pa_symbol(empty_zero_page);
 #endif
+#ifdef CONFIG_QUICK_KEXEC
+		if (quick_kexec_res.end &&
+		    quick_kexec_res.start >= res->start &&
+		    quick_kexec_res.end <= res->end)
+			request_resource(res, &quick_kexec_res);
+#endif
 
 	if (boot_args[1] || boot_args[2] || boot_args[3]) {
 		pr_err("WARNING: x1-x3 nonzero in violation of boot protocol:\n"
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 481d22c32a2e..579acb93728f 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -130,6 +130,45 @@ static void __init reserve_crashkernel(void)
 }
 #endif /* CONFIG_KEXEC_CORE */
 
+#ifdef CONFIG_QUICK_KEXEC
+static int __init parse_quick_kexec(char *p)
+{
+	if (!p)
+		return 0;
+
+	quick_kexec_res.end = PAGE_ALIGN(memparse(p, NULL));
+
+	return 0;
+}
+early_param("quickkexec", parse_quick_kexec);
+
+static void __init reserve_quick_kexec(void)
+{
+	unsigned long long mem_start, mem_len;
+
+	mem_len = quick_kexec_res.end;
+	if (mem_len == 0)
+		return;
+
+	/* Current arm64 boot protocol requires 2MB alignment */
+	mem_start = memblock_find_in_range(0, ARCH_LOW_ADDRESS_LIMIT,
+			mem_len, CRASH_ALIGN);
+	if (mem_start == 0) {
+		pr_warn("cannot allocate quick kexec mem (size:0x%llx)\n",
+			mem_len);
+		quick_kexec_res.end = 0;
+		return;
+	}
+
+	memblock_reserve(mem_start, mem_len);
+	pr_info("quick kexec mem reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
+		mem_start, mem_start + mem_len,	mem_len >> 20);
+
+	quick_kexec_res.start = mem_start;
+	quick_kexec_res.end = mem_start + mem_len - 1;
+}
+#endif
+
 #ifdef CONFIG_CRASH_DUMP
 static int __init early_init_dt_scan_elfcorehdr(unsigned long node,
 		const char *uname, int depth, void *data)
@@ -399,6 +438,10 @@ void __init arm64_memblock_init(void)
 
 	reserve_crashkernel();
 
+#ifdef CONFIG_QUICK_KEXEC
+	reserve_quick_kexec();
+#endif
+
 	reserve_elfcorehdr();
 
 	high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
-- 
2.19.1


_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/2] kexec: Add quick kexec support for kernel
  2020-08-14  5:52 ` Sang Yan
@ 2020-08-14  6:58   ` Dave Young
  -1 siblings, 0 replies; 25+ messages in thread
From: Dave Young @ 2020-08-14  6:58 UTC (permalink / raw)
  To: Sang Yan
  Cc: kexec, ebiederm, linux-kernel, xiexiuqi, guohanjun, luanjianhai,
	zhuling8, luchunhua, pasha.tatashin

On 08/14/20 at 01:52am, Sang Yan wrote:
> In normal kexec, relocating kernel may cost 5 ~ 10 seconds, to
> copy all segments from vmalloced memory to kernel boot memory,
> because of disabled mmu.

It is not the case on all archs, I assume your case is arm64, please
describe it in patch log :)

About the arm64 problem, I know Pavel Tatashin is working on a patchset
to improve the performance with enabling mmu.

I added Pavel in cc, can you try his patches?

> 
> We introduce quick kexec to save time of copying memory as above,
> just like kdump(kexec on crash), by using reserved memory
> "Quick Kexec".

This approach may have gain, but it also introduce extra requirements to
pre-reserve a memory region.  I wonder how Eric thinks about the idea.

Anyway the "quick" name sounds not very good, I would suggest do not
introduce a new param, and the code can check if pre-reserved region
exist then use it, if not then fallback to old way.

> 
> Constructing quick kimage as the same as crash kernel,
> then simply copy all segments of kimage to reserved memroy.
> 
> We also add this support in syscall kexec_load using flags
> of KEXEC_QUICK.
> 
> Signed-off-by: Sang Yan <sangyan@huawei.com>
> ---
>  arch/Kconfig               | 10 ++++++++++
>  include/linux/ioport.h     |  3 +++
>  include/linux/kexec.h      | 13 +++++++++++-
>  include/uapi/linux/kexec.h |  3 +++
>  kernel/kexec.c             | 10 ++++++++++
>  kernel/kexec_core.c        | 41 +++++++++++++++++++++++++++++---------
>  6 files changed, 70 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 3329fa143637..eca782cb8e29 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -21,6 +21,16 @@ config KEXEC_CORE
>  config KEXEC_ELF
>  	bool
>  
> +config QUICK_KEXEC
> +	bool "Support for quick kexec"
> +	depends on KEXEC_CORE
> +	help
> +	  Say y here to enable this feature.
> +	  It use reserved memory to accelerate kexec, just like crash
> +	  kexec, load new kernel and initrd to reserved memory, and
> +	  boot new kernel on that memory. It will save the time of
> +	  relocating kernel.
> +
>  config HAVE_IMA_KEXEC
>  	bool
>  
> diff --git a/include/linux/ioport.h b/include/linux/ioport.h
> index 6c2b06fe8beb..f37c632accbe 100644
> --- a/include/linux/ioport.h
> +++ b/include/linux/ioport.h
> @@ -136,6 +136,9 @@ enum {
>  	IORES_DESC_DEVICE_PRIVATE_MEMORY	= 6,
>  	IORES_DESC_RESERVED			= 7,
>  	IORES_DESC_SOFT_RESERVED		= 8,
> +#ifdef CONFIG_QUICK_KEXEC
> +	IORES_DESC_QUICK_KEXEC			= 9,
> +#endif
>  };
>  
>  /*
> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
> index 9e93bef52968..976bf9631070 100644
> --- a/include/linux/kexec.h
> +++ b/include/linux/kexec.h
> @@ -269,9 +269,12 @@ struct kimage {
>  	unsigned long control_page;
>  
>  	/* Flags to indicate special processing */
> -	unsigned int type : 1;
> +	unsigned int type : 2;
>  #define KEXEC_TYPE_DEFAULT 0
>  #define KEXEC_TYPE_CRASH   1
> +#ifdef CONFIG_QUICK_KEXEC
> +#define KEXEC_TYPE_QUICK   2
> +#endif
>  	unsigned int preserve_context : 1;
>  	/* If set, we are using file mode kexec syscall */
>  	unsigned int file_mode:1;
> @@ -331,6 +334,11 @@ extern int kexec_load_disabled;
>  #define KEXEC_FLAGS    (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT)
>  #endif
>  
> +#ifdef CONFIG_QUICK_KEXEC
> +#undef KEXEC_FLAGS
> +#define KEXEC_FLAGS    (KEXEC_ON_CRASH | KEXEC_QUICK)
> +#endif
> +
>  /* List of defined/legal kexec file flags */
>  #define KEXEC_FILE_FLAGS	(KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \
>  				 KEXEC_FILE_NO_INITRAMFS)
> @@ -340,6 +348,9 @@ extern int kexec_load_disabled;
>  extern struct resource crashk_res;
>  extern struct resource crashk_low_res;
>  extern note_buf_t __percpu *crash_notes;
> +#ifdef CONFIG_QUICK_KEXEC
> +extern struct resource quick_kexec_res;
> +#endif
>  
>  /* flag to track if kexec reboot is in progress */
>  extern bool kexec_in_progress;
> diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
> index 05669c87a0af..e3213614b713 100644
> --- a/include/uapi/linux/kexec.h
> +++ b/include/uapi/linux/kexec.h
> @@ -12,6 +12,9 @@
>  /* kexec flags for different usage scenarios */
>  #define KEXEC_ON_CRASH		0x00000001
>  #define KEXEC_PRESERVE_CONTEXT	0x00000002
> +#ifdef CONFIG_QUICK_KEXEC
> +#define KEXEC_QUICK		0x00000004
> +#endif
>  #define KEXEC_ARCH_MASK		0xffff0000
>  
>  /*
> diff --git a/kernel/kexec.c b/kernel/kexec.c
> index f977786fe498..428af4cd3e1a 100644
> --- a/kernel/kexec.c
> +++ b/kernel/kexec.c
> @@ -44,6 +44,9 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
>  	int ret;
>  	struct kimage *image;
>  	bool kexec_on_panic = flags & KEXEC_ON_CRASH;
> +#ifdef CONFIG_QUICK_KEXEC
> +	bool kexec_on_quick = flags & KEXEC_QUICK;
> +#endif
>  
>  	if (kexec_on_panic) {
>  		/* Verify we have a valid entry point */
> @@ -69,6 +72,13 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
>  		image->type = KEXEC_TYPE_CRASH;
>  	}
>  
> +#ifdef CONFIG_QUICK_KEXEC
> +	if (kexec_on_quick) {
> +		image->control_page = quick_kexec_res.start;
> +		image->type = KEXEC_TYPE_QUICK;
> +	}
> +#endif
> +
>  	ret = sanity_check_segment_list(image);
>  	if (ret)
>  		goto out_free_image;
> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
> index c19c0dad1ebe..b73dd749368b 100644
> --- a/kernel/kexec_core.c
> +++ b/kernel/kexec_core.c
> @@ -70,6 +70,16 @@ struct resource crashk_low_res = {
>  	.desc  = IORES_DESC_CRASH_KERNEL
>  };
>  
> +#ifdef CONFIG_QUICK_KEXEC
> +struct resource quick_kexec_res = {
> +	.name  = "Quick kexec",
> +	.start = 0,
> +	.end   = 0,
> +	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
> +	.desc  = IORES_DESC_QUICK_KEXEC
> +};
> +#endif
> +
>  int kexec_should_crash(struct task_struct *p)
>  {
>  	/*
> @@ -413,8 +423,10 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
>  	return pages;
>  }
>  
> -static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
> -						      unsigned int order)
> +
> +static struct page *kimage_alloc_special_control_pages(struct kimage *image,
> +						       unsigned int order,
> +						       unsigned long end)
>  {
>  	/* Control pages are special, they are the intermediaries
>  	 * that are needed while we copy the rest of the pages
> @@ -444,7 +456,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
>  	size = (1 << order) << PAGE_SHIFT;
>  	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
>  	hole_end   = hole_start + size - 1;
> -	while (hole_end <= crashk_res.end) {
> +	while (hole_end <= end) {
>  		unsigned long i;
>  
>  		cond_resched();
> @@ -479,7 +491,6 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
>  	return pages;
>  }
>  
> -
>  struct page *kimage_alloc_control_pages(struct kimage *image,
>  					 unsigned int order)
>  {
> @@ -490,8 +501,15 @@ struct page *kimage_alloc_control_pages(struct kimage *image,
>  		pages = kimage_alloc_normal_control_pages(image, order);
>  		break;
>  	case KEXEC_TYPE_CRASH:
> -		pages = kimage_alloc_crash_control_pages(image, order);
> +		pages = kimage_alloc_special_control_pages(image, order,
> +							   crashk_res.end);
> +		break;
> +#ifdef CONFIG_QUICK_KEXEC
> +	case KEXEC_TYPE_QUICK:
> +		pages = kimage_alloc_special_control_pages(image, order,
> +							   quick_kexec_res.end);
>  		break;
> +#endif
>  	}
>  
>  	return pages;
> @@ -847,11 +865,11 @@ static int kimage_load_normal_segment(struct kimage *image,
>  	return result;
>  }
>  
> -static int kimage_load_crash_segment(struct kimage *image,
> +static int kimage_load_special_segment(struct kimage *image,
>  					struct kexec_segment *segment)
>  {
> -	/* For crash dumps kernels we simply copy the data from
> -	 * user space to it's destination.
> +	/* For crash dumps kernels and quick kexec kernels
> +	 * we simply copy the data from user space to it's destination.
>  	 * We do things a page at a time for the sake of kmap.
>  	 */
>  	unsigned long maddr;
> @@ -925,8 +943,13 @@ int kimage_load_segment(struct kimage *image,
>  		result = kimage_load_normal_segment(image, segment);
>  		break;
>  	case KEXEC_TYPE_CRASH:
> -		result = kimage_load_crash_segment(image, segment);
> +		result = kimage_load_special_segment(image, segment);
> +		break;
> +#ifdef CONFIG_QUICK_KEXEC
> +	case KEXEC_TYPE_QUICK:
> +		result = kimage_load_special_segment(image, segment);
>  		break;
> +#endif
>  	}
>  
>  	return result;
> -- 
> 2.19.1
> 
> 
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec
> 

Thanks
Dave


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/2] kexec: Add quick kexec support for kernel
@ 2020-08-14  6:58   ` Dave Young
  0 siblings, 0 replies; 25+ messages in thread
From: Dave Young @ 2020-08-14  6:58 UTC (permalink / raw)
  To: Sang Yan
  Cc: luanjianhai, pasha.tatashin, xiexiuqi, kexec, linux-kernel,
	luchunhua, ebiederm, guohanjun, zhuling8

On 08/14/20 at 01:52am, Sang Yan wrote:
> In normal kexec, relocating kernel may cost 5 ~ 10 seconds, to
> copy all segments from vmalloced memory to kernel boot memory,
> because of disabled mmu.

It is not the case on all archs, I assume your case is arm64, please
describe it in patch log :)

About the arm64 problem, I know Pavel Tatashin is working on a patchset
to improve the performance with enabling mmu.

I added Pavel in cc, can you try his patches?

> 
> We introduce quick kexec to save time of copying memory as above,
> just like kdump(kexec on crash), by using reserved memory
> "Quick Kexec".

This approach may have gain, but it also introduce extra requirements to
pre-reserve a memory region.  I wonder how Eric thinks about the idea.

Anyway the "quick" name sounds not very good, I would suggest do not
introduce a new param, and the code can check if pre-reserved region
exist then use it, if not then fallback to old way.

> 
> Constructing quick kimage as the same as crash kernel,
> then simply copy all segments of kimage to reserved memroy.
> 
> We also add this support in syscall kexec_load using flags
> of KEXEC_QUICK.
> 
> Signed-off-by: Sang Yan <sangyan@huawei.com>
> ---
>  arch/Kconfig               | 10 ++++++++++
>  include/linux/ioport.h     |  3 +++
>  include/linux/kexec.h      | 13 +++++++++++-
>  include/uapi/linux/kexec.h |  3 +++
>  kernel/kexec.c             | 10 ++++++++++
>  kernel/kexec_core.c        | 41 +++++++++++++++++++++++++++++---------
>  6 files changed, 70 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 3329fa143637..eca782cb8e29 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -21,6 +21,16 @@ config KEXEC_CORE
>  config KEXEC_ELF
>  	bool
>  
> +config QUICK_KEXEC
> +	bool "Support for quick kexec"
> +	depends on KEXEC_CORE
> +	help
> +	  Say y here to enable this feature.
> +	  It use reserved memory to accelerate kexec, just like crash
> +	  kexec, load new kernel and initrd to reserved memory, and
> +	  boot new kernel on that memory. It will save the time of
> +	  relocating kernel.
> +
>  config HAVE_IMA_KEXEC
>  	bool
>  
> diff --git a/include/linux/ioport.h b/include/linux/ioport.h
> index 6c2b06fe8beb..f37c632accbe 100644
> --- a/include/linux/ioport.h
> +++ b/include/linux/ioport.h
> @@ -136,6 +136,9 @@ enum {
>  	IORES_DESC_DEVICE_PRIVATE_MEMORY	= 6,
>  	IORES_DESC_RESERVED			= 7,
>  	IORES_DESC_SOFT_RESERVED		= 8,
> +#ifdef CONFIG_QUICK_KEXEC
> +	IORES_DESC_QUICK_KEXEC			= 9,
> +#endif
>  };
>  
>  /*
> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
> index 9e93bef52968..976bf9631070 100644
> --- a/include/linux/kexec.h
> +++ b/include/linux/kexec.h
> @@ -269,9 +269,12 @@ struct kimage {
>  	unsigned long control_page;
>  
>  	/* Flags to indicate special processing */
> -	unsigned int type : 1;
> +	unsigned int type : 2;
>  #define KEXEC_TYPE_DEFAULT 0
>  #define KEXEC_TYPE_CRASH   1
> +#ifdef CONFIG_QUICK_KEXEC
> +#define KEXEC_TYPE_QUICK   2
> +#endif
>  	unsigned int preserve_context : 1;
>  	/* If set, we are using file mode kexec syscall */
>  	unsigned int file_mode:1;
> @@ -331,6 +334,11 @@ extern int kexec_load_disabled;
>  #define KEXEC_FLAGS    (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT)
>  #endif
>  
> +#ifdef CONFIG_QUICK_KEXEC
> +#undef KEXEC_FLAGS
> +#define KEXEC_FLAGS    (KEXEC_ON_CRASH | KEXEC_QUICK)
> +#endif
> +
>  /* List of defined/legal kexec file flags */
>  #define KEXEC_FILE_FLAGS	(KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \
>  				 KEXEC_FILE_NO_INITRAMFS)
> @@ -340,6 +348,9 @@ extern int kexec_load_disabled;
>  extern struct resource crashk_res;
>  extern struct resource crashk_low_res;
>  extern note_buf_t __percpu *crash_notes;
> +#ifdef CONFIG_QUICK_KEXEC
> +extern struct resource quick_kexec_res;
> +#endif
>  
>  /* flag to track if kexec reboot is in progress */
>  extern bool kexec_in_progress;
> diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
> index 05669c87a0af..e3213614b713 100644
> --- a/include/uapi/linux/kexec.h
> +++ b/include/uapi/linux/kexec.h
> @@ -12,6 +12,9 @@
>  /* kexec flags for different usage scenarios */
>  #define KEXEC_ON_CRASH		0x00000001
>  #define KEXEC_PRESERVE_CONTEXT	0x00000002
> +#ifdef CONFIG_QUICK_KEXEC
> +#define KEXEC_QUICK		0x00000004
> +#endif
>  #define KEXEC_ARCH_MASK		0xffff0000
>  
>  /*
> diff --git a/kernel/kexec.c b/kernel/kexec.c
> index f977786fe498..428af4cd3e1a 100644
> --- a/kernel/kexec.c
> +++ b/kernel/kexec.c
> @@ -44,6 +44,9 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
>  	int ret;
>  	struct kimage *image;
>  	bool kexec_on_panic = flags & KEXEC_ON_CRASH;
> +#ifdef CONFIG_QUICK_KEXEC
> +	bool kexec_on_quick = flags & KEXEC_QUICK;
> +#endif
>  
>  	if (kexec_on_panic) {
>  		/* Verify we have a valid entry point */
> @@ -69,6 +72,13 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
>  		image->type = KEXEC_TYPE_CRASH;
>  	}
>  
> +#ifdef CONFIG_QUICK_KEXEC
> +	if (kexec_on_quick) {
> +		image->control_page = quick_kexec_res.start;
> +		image->type = KEXEC_TYPE_QUICK;
> +	}
> +#endif
> +
>  	ret = sanity_check_segment_list(image);
>  	if (ret)
>  		goto out_free_image;
> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
> index c19c0dad1ebe..b73dd749368b 100644
> --- a/kernel/kexec_core.c
> +++ b/kernel/kexec_core.c
> @@ -70,6 +70,16 @@ struct resource crashk_low_res = {
>  	.desc  = IORES_DESC_CRASH_KERNEL
>  };
>  
> +#ifdef CONFIG_QUICK_KEXEC
> +struct resource quick_kexec_res = {
> +	.name  = "Quick kexec",
> +	.start = 0,
> +	.end   = 0,
> +	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
> +	.desc  = IORES_DESC_QUICK_KEXEC
> +};
> +#endif
> +
>  int kexec_should_crash(struct task_struct *p)
>  {
>  	/*
> @@ -413,8 +423,10 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
>  	return pages;
>  }
>  
> -static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
> -						      unsigned int order)
> +
> +static struct page *kimage_alloc_special_control_pages(struct kimage *image,
> +						       unsigned int order,
> +						       unsigned long end)
>  {
>  	/* Control pages are special, they are the intermediaries
>  	 * that are needed while we copy the rest of the pages
> @@ -444,7 +456,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
>  	size = (1 << order) << PAGE_SHIFT;
>  	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
>  	hole_end   = hole_start + size - 1;
> -	while (hole_end <= crashk_res.end) {
> +	while (hole_end <= end) {
>  		unsigned long i;
>  
>  		cond_resched();
> @@ -479,7 +491,6 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
>  	return pages;
>  }
>  
> -
>  struct page *kimage_alloc_control_pages(struct kimage *image,
>  					 unsigned int order)
>  {
> @@ -490,8 +501,15 @@ struct page *kimage_alloc_control_pages(struct kimage *image,
>  		pages = kimage_alloc_normal_control_pages(image, order);
>  		break;
>  	case KEXEC_TYPE_CRASH:
> -		pages = kimage_alloc_crash_control_pages(image, order);
> +		pages = kimage_alloc_special_control_pages(image, order,
> +							   crashk_res.end);
> +		break;
> +#ifdef CONFIG_QUICK_KEXEC
> +	case KEXEC_TYPE_QUICK:
> +		pages = kimage_alloc_special_control_pages(image, order,
> +							   quick_kexec_res.end);
>  		break;
> +#endif
>  	}
>  
>  	return pages;
> @@ -847,11 +865,11 @@ static int kimage_load_normal_segment(struct kimage *image,
>  	return result;
>  }
>  
> -static int kimage_load_crash_segment(struct kimage *image,
> +static int kimage_load_special_segment(struct kimage *image,
>  					struct kexec_segment *segment)
>  {
> -	/* For crash dumps kernels we simply copy the data from
> -	 * user space to it's destination.
> +	/* For crash dumps kernels and quick kexec kernels
> +	 * we simply copy the data from user space to it's destination.
>  	 * We do things a page at a time for the sake of kmap.
>  	 */
>  	unsigned long maddr;
> @@ -925,8 +943,13 @@ int kimage_load_segment(struct kimage *image,
>  		result = kimage_load_normal_segment(image, segment);
>  		break;
>  	case KEXEC_TYPE_CRASH:
> -		result = kimage_load_crash_segment(image, segment);
> +		result = kimage_load_special_segment(image, segment);
> +		break;
> +#ifdef CONFIG_QUICK_KEXEC
> +	case KEXEC_TYPE_QUICK:
> +		result = kimage_load_special_segment(image, segment);
>  		break;
> +#endif
>  	}
>  
>  	return result;
> -- 
> 2.19.1
> 
> 
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec
> 

Thanks
Dave


_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/2] kexec: Add quick kexec support for kernel
  2020-08-14  6:58   ` Dave Young
@ 2020-08-14  8:21     ` Sang Yan
  -1 siblings, 0 replies; 25+ messages in thread
From: Sang Yan @ 2020-08-14  8:21 UTC (permalink / raw)
  To: Dave Young
  Cc: kexec, ebiederm, linux-kernel, xiexiuqi, guohanjun, luanjianhai,
	zhuling8, luchunhua, pasha.tatashin



On 08/14/20 14:58, Dave Young wrote:
> On 08/14/20 at 01:52am, Sang Yan wrote:
>> In normal kexec, relocating kernel may cost 5 ~ 10 seconds, to
>> copy all segments from vmalloced memory to kernel boot memory,
>> because of disabled mmu.
> 
> It is not the case on all archs, I assume your case is arm64, please
> describe it in patch log :)
> 
Yes, it's particularly obvious on arm64. I will add it to the patch log,
and test how long it takes on x86 and other arch.

> About the arm64 problem, I know Pavel Tatashin is working on a patchset
> to improve the performance with enabling mmu.
> 
> I added Pavel in cc, can you try his patches?
> 
Thanks for your tips, I will try these patches. @Pavel.
Disable mmu after finishing copying pages?
>>
>> We introduce quick kexec to save time of copying memory as above,
>> just like kdump(kexec on crash), by using reserved memory
>> "Quick Kexec".
> 
> This approach may have gain, but it also introduce extra requirements to
> pre-reserve a memory region.  I wonder how Eric thinks about the idea.
> 
> Anyway the "quick" name sounds not very good, I would suggest do not
> introduce a new param, and the code can check if pre-reserved region
> exist then use it, if not then fallback to old way.
> 
aha. I agree with it, but I thought it may change the old behaviors of
kexec_load.

I will update a new patch without introducing new flags and new params.

Thanks a lot.

>>
>> Constructing quick kimage as the same as crash kernel,
>> then simply copy all segments of kimage to reserved memroy.
>>
>> We also add this support in syscall kexec_load using flags
>> of KEXEC_QUICK.
>>
>> Signed-off-by: Sang Yan <sangyan@huawei.com>
>> ---
>>  arch/Kconfig               | 10 ++++++++++
>>  include/linux/ioport.h     |  3 +++
>>  include/linux/kexec.h      | 13 +++++++++++-
>>  include/uapi/linux/kexec.h |  3 +++
>>  kernel/kexec.c             | 10 ++++++++++
>>  kernel/kexec_core.c        | 41 +++++++++++++++++++++++++++++---------
>>  6 files changed, 70 insertions(+), 10 deletions(-)
>>
>> diff --git a/arch/Kconfig b/arch/Kconfig
>> index 3329fa143637..eca782cb8e29 100644
>> --- a/arch/Kconfig
>> +++ b/arch/Kconfig
>> @@ -21,6 +21,16 @@ config KEXEC_CORE
>>  config KEXEC_ELF
>>  	bool
>>  
>> +config QUICK_KEXEC
>> +	bool "Support for quick kexec"
>> +	depends on KEXEC_CORE
>> +	help
>> +	  Say y here to enable this feature.
>> +	  It use reserved memory to accelerate kexec, just like crash
>> +	  kexec, load new kernel and initrd to reserved memory, and
>> +	  boot new kernel on that memory. It will save the time of
>> +	  relocating kernel.
>> +
>>  config HAVE_IMA_KEXEC
>>  	bool
>>  
>> diff --git a/include/linux/ioport.h b/include/linux/ioport.h
>> index 6c2b06fe8beb..f37c632accbe 100644
>> --- a/include/linux/ioport.h
>> +++ b/include/linux/ioport.h
>> @@ -136,6 +136,9 @@ enum {
>>  	IORES_DESC_DEVICE_PRIVATE_MEMORY	= 6,
>>  	IORES_DESC_RESERVED			= 7,
>>  	IORES_DESC_SOFT_RESERVED		= 8,
>> +#ifdef CONFIG_QUICK_KEXEC
>> +	IORES_DESC_QUICK_KEXEC			= 9,
>> +#endif
>>  };
>>  
>>  /*
>> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
>> index 9e93bef52968..976bf9631070 100644
>> --- a/include/linux/kexec.h
>> +++ b/include/linux/kexec.h
>> @@ -269,9 +269,12 @@ struct kimage {
>>  	unsigned long control_page;
>>  
>>  	/* Flags to indicate special processing */
>> -	unsigned int type : 1;
>> +	unsigned int type : 2;
>>  #define KEXEC_TYPE_DEFAULT 0
>>  #define KEXEC_TYPE_CRASH   1
>> +#ifdef CONFIG_QUICK_KEXEC
>> +#define KEXEC_TYPE_QUICK   2
>> +#endif
>>  	unsigned int preserve_context : 1;
>>  	/* If set, we are using file mode kexec syscall */
>>  	unsigned int file_mode:1;
>> @@ -331,6 +334,11 @@ extern int kexec_load_disabled;
>>  #define KEXEC_FLAGS    (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT)
>>  #endif
>>  
>> +#ifdef CONFIG_QUICK_KEXEC
>> +#undef KEXEC_FLAGS
>> +#define KEXEC_FLAGS    (KEXEC_ON_CRASH | KEXEC_QUICK)
>> +#endif
>> +
>>  /* List of defined/legal kexec file flags */
>>  #define KEXEC_FILE_FLAGS	(KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \
>>  				 KEXEC_FILE_NO_INITRAMFS)
>> @@ -340,6 +348,9 @@ extern int kexec_load_disabled;
>>  extern struct resource crashk_res;
>>  extern struct resource crashk_low_res;
>>  extern note_buf_t __percpu *crash_notes;
>> +#ifdef CONFIG_QUICK_KEXEC
>> +extern struct resource quick_kexec_res;
>> +#endif
>>  
>>  /* flag to track if kexec reboot is in progress */
>>  extern bool kexec_in_progress;
>> diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
>> index 05669c87a0af..e3213614b713 100644
>> --- a/include/uapi/linux/kexec.h
>> +++ b/include/uapi/linux/kexec.h
>> @@ -12,6 +12,9 @@
>>  /* kexec flags for different usage scenarios */
>>  #define KEXEC_ON_CRASH		0x00000001
>>  #define KEXEC_PRESERVE_CONTEXT	0x00000002
>> +#ifdef CONFIG_QUICK_KEXEC
>> +#define KEXEC_QUICK		0x00000004
>> +#endif
>>  #define KEXEC_ARCH_MASK		0xffff0000
>>  
>>  /*
>> diff --git a/kernel/kexec.c b/kernel/kexec.c
>> index f977786fe498..428af4cd3e1a 100644
>> --- a/kernel/kexec.c
>> +++ b/kernel/kexec.c
>> @@ -44,6 +44,9 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
>>  	int ret;
>>  	struct kimage *image;
>>  	bool kexec_on_panic = flags & KEXEC_ON_CRASH;
>> +#ifdef CONFIG_QUICK_KEXEC
>> +	bool kexec_on_quick = flags & KEXEC_QUICK;
>> +#endif
>>  
>>  	if (kexec_on_panic) {
>>  		/* Verify we have a valid entry point */
>> @@ -69,6 +72,13 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
>>  		image->type = KEXEC_TYPE_CRASH;
>>  	}
>>  
>> +#ifdef CONFIG_QUICK_KEXEC
>> +	if (kexec_on_quick) {
>> +		image->control_page = quick_kexec_res.start;
>> +		image->type = KEXEC_TYPE_QUICK;
>> +	}
>> +#endif
>> +
>>  	ret = sanity_check_segment_list(image);
>>  	if (ret)
>>  		goto out_free_image;
>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
>> index c19c0dad1ebe..b73dd749368b 100644
>> --- a/kernel/kexec_core.c
>> +++ b/kernel/kexec_core.c
>> @@ -70,6 +70,16 @@ struct resource crashk_low_res = {
>>  	.desc  = IORES_DESC_CRASH_KERNEL
>>  };
>>  
>> +#ifdef CONFIG_QUICK_KEXEC
>> +struct resource quick_kexec_res = {
>> +	.name  = "Quick kexec",
>> +	.start = 0,
>> +	.end   = 0,
>> +	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
>> +	.desc  = IORES_DESC_QUICK_KEXEC
>> +};
>> +#endif
>> +
>>  int kexec_should_crash(struct task_struct *p)
>>  {
>>  	/*
>> @@ -413,8 +423,10 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
>>  	return pages;
>>  }
>>  
>> -static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
>> -						      unsigned int order)
>> +
>> +static struct page *kimage_alloc_special_control_pages(struct kimage *image,
>> +						       unsigned int order,
>> +						       unsigned long end)
>>  {
>>  	/* Control pages are special, they are the intermediaries
>>  	 * that are needed while we copy the rest of the pages
>> @@ -444,7 +456,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
>>  	size = (1 << order) << PAGE_SHIFT;
>>  	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
>>  	hole_end   = hole_start + size - 1;
>> -	while (hole_end <= crashk_res.end) {
>> +	while (hole_end <= end) {
>>  		unsigned long i;
>>  
>>  		cond_resched();
>> @@ -479,7 +491,6 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
>>  	return pages;
>>  }
>>  
>> -
>>  struct page *kimage_alloc_control_pages(struct kimage *image,
>>  					 unsigned int order)
>>  {
>> @@ -490,8 +501,15 @@ struct page *kimage_alloc_control_pages(struct kimage *image,
>>  		pages = kimage_alloc_normal_control_pages(image, order);
>>  		break;
>>  	case KEXEC_TYPE_CRASH:
>> -		pages = kimage_alloc_crash_control_pages(image, order);
>> +		pages = kimage_alloc_special_control_pages(image, order,
>> +							   crashk_res.end);
>> +		break;
>> +#ifdef CONFIG_QUICK_KEXEC
>> +	case KEXEC_TYPE_QUICK:
>> +		pages = kimage_alloc_special_control_pages(image, order,
>> +							   quick_kexec_res.end);
>>  		break;
>> +#endif
>>  	}
>>  
>>  	return pages;
>> @@ -847,11 +865,11 @@ static int kimage_load_normal_segment(struct kimage *image,
>>  	return result;
>>  }
>>  
>> -static int kimage_load_crash_segment(struct kimage *image,
>> +static int kimage_load_special_segment(struct kimage *image,
>>  					struct kexec_segment *segment)
>>  {
>> -	/* For crash dumps kernels we simply copy the data from
>> -	 * user space to it's destination.
>> +	/* For crash dumps kernels and quick kexec kernels
>> +	 * we simply copy the data from user space to it's destination.
>>  	 * We do things a page at a time for the sake of kmap.
>>  	 */
>>  	unsigned long maddr;
>> @@ -925,8 +943,13 @@ int kimage_load_segment(struct kimage *image,
>>  		result = kimage_load_normal_segment(image, segment);
>>  		break;
>>  	case KEXEC_TYPE_CRASH:
>> -		result = kimage_load_crash_segment(image, segment);
>> +		result = kimage_load_special_segment(image, segment);
>> +		break;
>> +#ifdef CONFIG_QUICK_KEXEC
>> +	case KEXEC_TYPE_QUICK:
>> +		result = kimage_load_special_segment(image, segment);
>>  		break;
>> +#endif
>>  	}
>>  
>>  	return result;
>> -- 
>> 2.19.1
>>
>>
>> _______________________________________________
>> kexec mailing list
>> kexec@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/kexec
>>
> 
> Thanks
> Dave
> 
> 
> .
> 
Thanks
Sang Yan


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/2] kexec: Add quick kexec support for kernel
@ 2020-08-14  8:21     ` Sang Yan
  0 siblings, 0 replies; 25+ messages in thread
From: Sang Yan @ 2020-08-14  8:21 UTC (permalink / raw)
  To: Dave Young
  Cc: luanjianhai, pasha.tatashin, xiexiuqi, kexec, linux-kernel,
	luchunhua, ebiederm, guohanjun, zhuling8



On 08/14/20 14:58, Dave Young wrote:
> On 08/14/20 at 01:52am, Sang Yan wrote:
>> In normal kexec, relocating kernel may cost 5 ~ 10 seconds, to
>> copy all segments from vmalloced memory to kernel boot memory,
>> because of disabled mmu.
> 
> It is not the case on all archs, I assume your case is arm64, please
> describe it in patch log :)
> 
Yes, it's particularly obvious on arm64. I will add it to the patch log,
and test how long it takes on x86 and other arch.

> About the arm64 problem, I know Pavel Tatashin is working on a patchset
> to improve the performance with enabling mmu.
> 
> I added Pavel in cc, can you try his patches?
> 
Thanks for your tips, I will try these patches. @Pavel.
Disable mmu after finishing copying pages?
>>
>> We introduce quick kexec to save time of copying memory as above,
>> just like kdump(kexec on crash), by using reserved memory
>> "Quick Kexec".
> 
> This approach may have gain, but it also introduce extra requirements to
> pre-reserve a memory region.  I wonder how Eric thinks about the idea.
> 
> Anyway the "quick" name sounds not very good, I would suggest do not
> introduce a new param, and the code can check if pre-reserved region
> exist then use it, if not then fallback to old way.
> 
aha. I agree with it, but I thought it may change the old behaviors of
kexec_load.

I will update a new patch without introducing new flags and new params.

Thanks a lot.

>>
>> Constructing quick kimage as the same as crash kernel,
>> then simply copy all segments of kimage to reserved memroy.
>>
>> We also add this support in syscall kexec_load using flags
>> of KEXEC_QUICK.
>>
>> Signed-off-by: Sang Yan <sangyan@huawei.com>
>> ---
>>  arch/Kconfig               | 10 ++++++++++
>>  include/linux/ioport.h     |  3 +++
>>  include/linux/kexec.h      | 13 +++++++++++-
>>  include/uapi/linux/kexec.h |  3 +++
>>  kernel/kexec.c             | 10 ++++++++++
>>  kernel/kexec_core.c        | 41 +++++++++++++++++++++++++++++---------
>>  6 files changed, 70 insertions(+), 10 deletions(-)
>>
>> diff --git a/arch/Kconfig b/arch/Kconfig
>> index 3329fa143637..eca782cb8e29 100644
>> --- a/arch/Kconfig
>> +++ b/arch/Kconfig
>> @@ -21,6 +21,16 @@ config KEXEC_CORE
>>  config KEXEC_ELF
>>  	bool
>>  
>> +config QUICK_KEXEC
>> +	bool "Support for quick kexec"
>> +	depends on KEXEC_CORE
>> +	help
>> +	  Say y here to enable this feature.
>> +	  It use reserved memory to accelerate kexec, just like crash
>> +	  kexec, load new kernel and initrd to reserved memory, and
>> +	  boot new kernel on that memory. It will save the time of
>> +	  relocating kernel.
>> +
>>  config HAVE_IMA_KEXEC
>>  	bool
>>  
>> diff --git a/include/linux/ioport.h b/include/linux/ioport.h
>> index 6c2b06fe8beb..f37c632accbe 100644
>> --- a/include/linux/ioport.h
>> +++ b/include/linux/ioport.h
>> @@ -136,6 +136,9 @@ enum {
>>  	IORES_DESC_DEVICE_PRIVATE_MEMORY	= 6,
>>  	IORES_DESC_RESERVED			= 7,
>>  	IORES_DESC_SOFT_RESERVED		= 8,
>> +#ifdef CONFIG_QUICK_KEXEC
>> +	IORES_DESC_QUICK_KEXEC			= 9,
>> +#endif
>>  };
>>  
>>  /*
>> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
>> index 9e93bef52968..976bf9631070 100644
>> --- a/include/linux/kexec.h
>> +++ b/include/linux/kexec.h
>> @@ -269,9 +269,12 @@ struct kimage {
>>  	unsigned long control_page;
>>  
>>  	/* Flags to indicate special processing */
>> -	unsigned int type : 1;
>> +	unsigned int type : 2;
>>  #define KEXEC_TYPE_DEFAULT 0
>>  #define KEXEC_TYPE_CRASH   1
>> +#ifdef CONFIG_QUICK_KEXEC
>> +#define KEXEC_TYPE_QUICK   2
>> +#endif
>>  	unsigned int preserve_context : 1;
>>  	/* If set, we are using file mode kexec syscall */
>>  	unsigned int file_mode:1;
>> @@ -331,6 +334,11 @@ extern int kexec_load_disabled;
>>  #define KEXEC_FLAGS    (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT)
>>  #endif
>>  
>> +#ifdef CONFIG_QUICK_KEXEC
>> +#undef KEXEC_FLAGS
>> +#define KEXEC_FLAGS    (KEXEC_ON_CRASH | KEXEC_QUICK)
>> +#endif
>> +
>>  /* List of defined/legal kexec file flags */
>>  #define KEXEC_FILE_FLAGS	(KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \
>>  				 KEXEC_FILE_NO_INITRAMFS)
>> @@ -340,6 +348,9 @@ extern int kexec_load_disabled;
>>  extern struct resource crashk_res;
>>  extern struct resource crashk_low_res;
>>  extern note_buf_t __percpu *crash_notes;
>> +#ifdef CONFIG_QUICK_KEXEC
>> +extern struct resource quick_kexec_res;
>> +#endif
>>  
>>  /* flag to track if kexec reboot is in progress */
>>  extern bool kexec_in_progress;
>> diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
>> index 05669c87a0af..e3213614b713 100644
>> --- a/include/uapi/linux/kexec.h
>> +++ b/include/uapi/linux/kexec.h
>> @@ -12,6 +12,9 @@
>>  /* kexec flags for different usage scenarios */
>>  #define KEXEC_ON_CRASH		0x00000001
>>  #define KEXEC_PRESERVE_CONTEXT	0x00000002
>> +#ifdef CONFIG_QUICK_KEXEC
>> +#define KEXEC_QUICK		0x00000004
>> +#endif
>>  #define KEXEC_ARCH_MASK		0xffff0000
>>  
>>  /*
>> diff --git a/kernel/kexec.c b/kernel/kexec.c
>> index f977786fe498..428af4cd3e1a 100644
>> --- a/kernel/kexec.c
>> +++ b/kernel/kexec.c
>> @@ -44,6 +44,9 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
>>  	int ret;
>>  	struct kimage *image;
>>  	bool kexec_on_panic = flags & KEXEC_ON_CRASH;
>> +#ifdef CONFIG_QUICK_KEXEC
>> +	bool kexec_on_quick = flags & KEXEC_QUICK;
>> +#endif
>>  
>>  	if (kexec_on_panic) {
>>  		/* Verify we have a valid entry point */
>> @@ -69,6 +72,13 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
>>  		image->type = KEXEC_TYPE_CRASH;
>>  	}
>>  
>> +#ifdef CONFIG_QUICK_KEXEC
>> +	if (kexec_on_quick) {
>> +		image->control_page = quick_kexec_res.start;
>> +		image->type = KEXEC_TYPE_QUICK;
>> +	}
>> +#endif
>> +
>>  	ret = sanity_check_segment_list(image);
>>  	if (ret)
>>  		goto out_free_image;
>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
>> index c19c0dad1ebe..b73dd749368b 100644
>> --- a/kernel/kexec_core.c
>> +++ b/kernel/kexec_core.c
>> @@ -70,6 +70,16 @@ struct resource crashk_low_res = {
>>  	.desc  = IORES_DESC_CRASH_KERNEL
>>  };
>>  
>> +#ifdef CONFIG_QUICK_KEXEC
>> +struct resource quick_kexec_res = {
>> +	.name  = "Quick kexec",
>> +	.start = 0,
>> +	.end   = 0,
>> +	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
>> +	.desc  = IORES_DESC_QUICK_KEXEC
>> +};
>> +#endif
>> +
>>  int kexec_should_crash(struct task_struct *p)
>>  {
>>  	/*
>> @@ -413,8 +423,10 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
>>  	return pages;
>>  }
>>  
>> -static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
>> -						      unsigned int order)
>> +
>> +static struct page *kimage_alloc_special_control_pages(struct kimage *image,
>> +						       unsigned int order,
>> +						       unsigned long end)
>>  {
>>  	/* Control pages are special, they are the intermediaries
>>  	 * that are needed while we copy the rest of the pages
>> @@ -444,7 +456,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
>>  	size = (1 << order) << PAGE_SHIFT;
>>  	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
>>  	hole_end   = hole_start + size - 1;
>> -	while (hole_end <= crashk_res.end) {
>> +	while (hole_end <= end) {
>>  		unsigned long i;
>>  
>>  		cond_resched();
>> @@ -479,7 +491,6 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
>>  	return pages;
>>  }
>>  
>> -
>>  struct page *kimage_alloc_control_pages(struct kimage *image,
>>  					 unsigned int order)
>>  {
>> @@ -490,8 +501,15 @@ struct page *kimage_alloc_control_pages(struct kimage *image,
>>  		pages = kimage_alloc_normal_control_pages(image, order);
>>  		break;
>>  	case KEXEC_TYPE_CRASH:
>> -		pages = kimage_alloc_crash_control_pages(image, order);
>> +		pages = kimage_alloc_special_control_pages(image, order,
>> +							   crashk_res.end);
>> +		break;
>> +#ifdef CONFIG_QUICK_KEXEC
>> +	case KEXEC_TYPE_QUICK:
>> +		pages = kimage_alloc_special_control_pages(image, order,
>> +							   quick_kexec_res.end);
>>  		break;
>> +#endif
>>  	}
>>  
>>  	return pages;
>> @@ -847,11 +865,11 @@ static int kimage_load_normal_segment(struct kimage *image,
>>  	return result;
>>  }
>>  
>> -static int kimage_load_crash_segment(struct kimage *image,
>> +static int kimage_load_special_segment(struct kimage *image,
>>  					struct kexec_segment *segment)
>>  {
>> -	/* For crash dumps kernels we simply copy the data from
>> -	 * user space to it's destination.
>> +	/* For crash dumps kernels and quick kexec kernels
>> +	 * we simply copy the data from user space to it's destination.
>>  	 * We do things a page at a time for the sake of kmap.
>>  	 */
>>  	unsigned long maddr;
>> @@ -925,8 +943,13 @@ int kimage_load_segment(struct kimage *image,
>>  		result = kimage_load_normal_segment(image, segment);
>>  		break;
>>  	case KEXEC_TYPE_CRASH:
>> -		result = kimage_load_crash_segment(image, segment);
>> +		result = kimage_load_special_segment(image, segment);
>> +		break;
>> +#ifdef CONFIG_QUICK_KEXEC
>> +	case KEXEC_TYPE_QUICK:
>> +		result = kimage_load_special_segment(image, segment);
>>  		break;
>> +#endif
>>  	}
>>  
>>  	return result;
>> -- 
>> 2.19.1
>>
>>
>> _______________________________________________
>> kexec mailing list
>> kexec@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/kexec
>>
> 
> Thanks
> Dave
> 
> 
> .
> 
Thanks
Sang Yan


_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/2] kexec: Add quick kexec support for kernel
  2020-08-14  8:21     ` Sang Yan
@ 2020-08-14 11:24       ` Dave Young
  -1 siblings, 0 replies; 25+ messages in thread
From: Dave Young @ 2020-08-14 11:24 UTC (permalink / raw)
  To: Sang Yan
  Cc: kexec, ebiederm, linux-kernel, xiexiuqi, guohanjun, luanjianhai,
	zhuling8, luchunhua, pasha.tatashin

Hi,

On 08/14/20 at 04:21pm, Sang Yan wrote:
> 
> 
> On 08/14/20 14:58, Dave Young wrote:
> > On 08/14/20 at 01:52am, Sang Yan wrote:
> >> In normal kexec, relocating kernel may cost 5 ~ 10 seconds, to
> >> copy all segments from vmalloced memory to kernel boot memory,
> >> because of disabled mmu.
> > 
> > It is not the case on all archs, I assume your case is arm64, please
> > describe it in patch log :)
> > 
> Yes, it's particularly obvious on arm64. I will add it to the patch log,
> and test how long it takes on x86 and other arch.
> 
> > About the arm64 problem, I know Pavel Tatashin is working on a patchset
> > to improve the performance with enabling mmu.
> > 
> > I added Pavel in cc, can you try his patches?
> > 
> Thanks for your tips, I will try these patches. @Pavel.
> Disable mmu after finishing copying pages?
> >>
> >> We introduce quick kexec to save time of copying memory as above,
> >> just like kdump(kexec on crash), by using reserved memory
> >> "Quick Kexec".
> > 
> > This approach may have gain, but it also introduce extra requirements to
> > pre-reserve a memory region.  I wonder how Eric thinks about the idea.
> > 
> > Anyway the "quick" name sounds not very good, I would suggest do not
> > introduce a new param, and the code can check if pre-reserved region
> > exist then use it, if not then fallback to old way.
> > 
> aha. I agree with it, but I thought it may change the old behaviors of
> kexec_load.
> 
> I will update a new patch without introducing new flags and new params.

Frankly I'm still not sure it is worth to introduce a new interface if the
improvement can be done in arch code like Pavel is doing.  Can you try
that first?

Thanks
Dave


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/2] kexec: Add quick kexec support for kernel
@ 2020-08-14 11:24       ` Dave Young
  0 siblings, 0 replies; 25+ messages in thread
From: Dave Young @ 2020-08-14 11:24 UTC (permalink / raw)
  To: Sang Yan
  Cc: luanjianhai, pasha.tatashin, xiexiuqi, kexec, linux-kernel,
	luchunhua, ebiederm, guohanjun, zhuling8

Hi,

On 08/14/20 at 04:21pm, Sang Yan wrote:
> 
> 
> On 08/14/20 14:58, Dave Young wrote:
> > On 08/14/20 at 01:52am, Sang Yan wrote:
> >> In normal kexec, relocating kernel may cost 5 ~ 10 seconds, to
> >> copy all segments from vmalloced memory to kernel boot memory,
> >> because of disabled mmu.
> > 
> > It is not the case on all archs, I assume your case is arm64, please
> > describe it in patch log :)
> > 
> Yes, it's particularly obvious on arm64. I will add it to the patch log,
> and test how long it takes on x86 and other arch.
> 
> > About the arm64 problem, I know Pavel Tatashin is working on a patchset
> > to improve the performance with enabling mmu.
> > 
> > I added Pavel in cc, can you try his patches?
> > 
> Thanks for your tips, I will try these patches. @Pavel.
> Disable mmu after finishing copying pages?
> >>
> >> We introduce quick kexec to save time of copying memory as above,
> >> just like kdump(kexec on crash), by using reserved memory
> >> "Quick Kexec".
> > 
> > This approach may have gain, but it also introduce extra requirements to
> > pre-reserve a memory region.  I wonder how Eric thinks about the idea.
> > 
> > Anyway the "quick" name sounds not very good, I would suggest do not
> > introduce a new param, and the code can check if pre-reserved region
> > exist then use it, if not then fallback to old way.
> > 
> aha. I agree with it, but I thought it may change the old behaviors of
> kexec_load.
> 
> I will update a new patch without introducing new flags and new params.

Frankly I'm still not sure it is worth to introduce a new interface if the
improvement can be done in arch code like Pavel is doing.  Can you try
that first?

Thanks
Dave


_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/2] kexec: Add quick kexec support for kernel
  2020-08-14  5:52 ` Sang Yan
@ 2020-08-14 15:17   ` Eric W. Biederman
  -1 siblings, 0 replies; 25+ messages in thread
From: Eric W. Biederman @ 2020-08-14 15:17 UTC (permalink / raw)
  To: Sang Yan
  Cc: kexec, linux-kernel, xiexiuqi, guohanjun, zhuling8, luanjianhai,
	luchunhua

Sang Yan <sangyan@huawei.com> writes:

> In normal kexec, relocating kernel may cost 5 ~ 10 seconds, to
> copy all segments from vmalloced memory to kernel boot memory,
> because of disabled mmu.

I haven't seen kexec that slow since I tested on my 16Mhz 386.

That machine has an excuse it really is slow.  Anything else
that takes seconds is almost certainly slow because someone
has misconfigured things to not cache the data copied by kexec.

I humbly suggest that you fix the arm64 code so that the data gets
cached.

Eric

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/2] kexec: Add quick kexec support for kernel
@ 2020-08-14 15:17   ` Eric W. Biederman
  0 siblings, 0 replies; 25+ messages in thread
From: Eric W. Biederman @ 2020-08-14 15:17 UTC (permalink / raw)
  To: Sang Yan
  Cc: luanjianhai, xiexiuqi, kexec, linux-kernel, luchunhua, guohanjun,
	zhuling8

Sang Yan <sangyan@huawei.com> writes:

> In normal kexec, relocating kernel may cost 5 ~ 10 seconds, to
> copy all segments from vmalloced memory to kernel boot memory,
> because of disabled mmu.

I haven't seen kexec that slow since I tested on my 16Mhz 386.

That machine has an excuse it really is slow.  Anything else
that takes seconds is almost certainly slow because someone
has misconfigured things to not cache the data copied by kexec.

I humbly suggest that you fix the arm64 code so that the data gets
cached.

Eric

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/2] kexec: Add quick kexec support for kernel
  2020-08-14 11:24       ` Dave Young
@ 2020-08-14 19:22         ` Pavel Tatashin
  -1 siblings, 0 replies; 25+ messages in thread
From: Pavel Tatashin @ 2020-08-14 19:22 UTC (permalink / raw)
  To: Dave Young
  Cc: Sang Yan, kexec mailing list, Eric W. Biederman, LKML, xiexiuqi,
	guohanjun, luanjianhai, zhuling8, luchunhua, James Morse

On Fri, Aug 14, 2020 at 7:24 AM Dave Young <dyoung@redhat.com> wrote:
>
> Hi,
>
> On 08/14/20 at 04:21pm, Sang Yan wrote:
> >
> >
> > On 08/14/20 14:58, Dave Young wrote:
> > > On 08/14/20 at 01:52am, Sang Yan wrote:
> > >> In normal kexec, relocating kernel may cost 5 ~ 10 seconds, to
> > >> copy all segments from vmalloced memory to kernel boot memory,
> > >> because of disabled mmu.
> > >
> > > It is not the case on all archs, I assume your case is arm64, please
> > > describe it in patch log :)
> > >
> > Yes, it's particularly obvious on arm64. I will add it to the patch log,
> > and test how long it takes on x86 and other arch.
> >
> > > About the arm64 problem, I know Pavel Tatashin is working on a patchset
> > > to improve the performance with enabling mmu.
> > >
> > > I added Pavel in cc, can you try his patches?
> > >
> > Thanks for your tips, I will try these patches. @Pavel.
> > Disable mmu after finishing copying pages?
> > >>
> > >> We introduce quick kexec to save time of copying memory as above,
> > >> just like kdump(kexec on crash), by using reserved memory
> > >> "Quick Kexec".
> > >
> > > This approach may have gain, but it also introduce extra requirements to
> > > pre-reserve a memory region.  I wonder how Eric thinks about the idea.
> > >
> > > Anyway the "quick" name sounds not very good, I would suggest do not
> > > introduce a new param, and the code can check if pre-reserved region
> > > exist then use it, if not then fallback to old way.
> > >
> > aha. I agree with it, but I thought it may change the old behaviors of
> > kexec_load.
> >
> > I will update a new patch without introducing new flags and new params.
>
> Frankly I'm still not sure it is worth to introduce a new interface if the
> improvement can be done in arch code like Pavel is doing.  Can you try
> that first?

Hi Dave,

Thank you for including me into this discussion.

My patches will fix this issue. This is an ARM64 specific problem and
I did not see this to be performance problem on x86 during kexec
relocation. This happens because on ARM64 relocation is performed with
MMU disabled, and when MMU is disabled the caching is disabled as
well.

I have a patch series that fixes this entirely, but James Morse
(+CCed) and I still have not agreed on the final approach. We had an
off-list conversation about it, and we need to continue it in public
ML.

Here is some history:

This is the original series that I sent a year ago. It basically
proposes the same thing as this series from Sang Yan:
https://lore.kernel.org/lkml/20190709182014.16052-1-pasha.tatashin@soleen.com/

Once, I realized that with enabling MMU the relocation is issue is
gone completely, I sent a new series, and this is the latest version
of that series:
https://lore.kernel.org/lkml/20200326032420.27220-1-pasha.tatashin@soleen.com/

It has been tested in production, and several people from different
companies commented to me that they are using it as well.

After my patch series was sent out, James created a new branch in his
tree with his approach of enabling MMU without having a new VA space,
but instead re-use what the kernel  has now. I have not tested that
branch yet.

Here are some comments from James Morse and the off-list discussion we had:
-------
It sounds like you are depending on write streaming mode to meet your
target performance.
This isn't even CPU specific, its cache and firmware configuration specific!
I don't think we should optimise a general purpose operating system
based on things like this.
..
I think the best approach is going to be to eliminate the relocations entirely.
...
I'm afraid I view this kexec-map thing as high-risk duct-tape over the
kexec core code
deliberately scattering the kexec payload.
I'd prefer any approach that causes the payload to be stored in-place
from the beginning
as that benefits other architectures too.
-------

It appears James is leaning to the approach of not performing
relocation at all and use what is proposed by Sang Yan and me during
my first approach for this problem. However, I have several issues
with this take, which if addressed would be OK for me.
1. The newer, more secure kexec syscall kexec_file_load(), which
allows to check the IMA integrity of the loaded file does not have a
way to specify the area in memory where to place the kernel. We are
using this syscall in production, and cannot go back to kexec_load()
for security reasons.
2. Reserving memory means wasting memory during run-time. Our machine
has only 8G of RAM, and reserving even 128M for the next kernel is an
expensive proposition. Now we start loading the next kernel after some
non essential processes are stopped, but before essential processes
are stopped for the lowest downtime possible.
3. Disabling relocation means changes in the common code, which I am
not sure actually helps any other platform beside ARM64, so I am
worried it won't be accepted into upstream.

Thank you,
Pasha

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/2] kexec: Add quick kexec support for kernel
@ 2020-08-14 19:22         ` Pavel Tatashin
  0 siblings, 0 replies; 25+ messages in thread
From: Pavel Tatashin @ 2020-08-14 19:22 UTC (permalink / raw)
  To: Dave Young
  Cc: luanjianhai, Sang Yan, xiexiuqi, kexec mailing list, LKML,
	luchunhua, James Morse, Eric W. Biederman, guohanjun, zhuling8

On Fri, Aug 14, 2020 at 7:24 AM Dave Young <dyoung@redhat.com> wrote:
>
> Hi,
>
> On 08/14/20 at 04:21pm, Sang Yan wrote:
> >
> >
> > On 08/14/20 14:58, Dave Young wrote:
> > > On 08/14/20 at 01:52am, Sang Yan wrote:
> > >> In normal kexec, relocating kernel may cost 5 ~ 10 seconds, to
> > >> copy all segments from vmalloced memory to kernel boot memory,
> > >> because of disabled mmu.
> > >
> > > It is not the case on all archs, I assume your case is arm64, please
> > > describe it in patch log :)
> > >
> > Yes, it's particularly obvious on arm64. I will add it to the patch log,
> > and test how long it takes on x86 and other arch.
> >
> > > About the arm64 problem, I know Pavel Tatashin is working on a patchset
> > > to improve the performance with enabling mmu.
> > >
> > > I added Pavel in cc, can you try his patches?
> > >
> > Thanks for your tips, I will try these patches. @Pavel.
> > Disable mmu after finishing copying pages?
> > >>
> > >> We introduce quick kexec to save time of copying memory as above,
> > >> just like kdump(kexec on crash), by using reserved memory
> > >> "Quick Kexec".
> > >
> > > This approach may have gain, but it also introduce extra requirements to
> > > pre-reserve a memory region.  I wonder how Eric thinks about the idea.
> > >
> > > Anyway the "quick" name sounds not very good, I would suggest do not
> > > introduce a new param, and the code can check if pre-reserved region
> > > exist then use it, if not then fallback to old way.
> > >
> > aha. I agree with it, but I thought it may change the old behaviors of
> > kexec_load.
> >
> > I will update a new patch without introducing new flags and new params.
>
> Frankly I'm still not sure it is worth to introduce a new interface if the
> improvement can be done in arch code like Pavel is doing.  Can you try
> that first?

Hi Dave,

Thank you for including me into this discussion.

My patches will fix this issue. This is an ARM64 specific problem and
I did not see this to be performance problem on x86 during kexec
relocation. This happens because on ARM64 relocation is performed with
MMU disabled, and when MMU is disabled the caching is disabled as
well.

I have a patch series that fixes this entirely, but James Morse
(+CCed) and I still have not agreed on the final approach. We had an
off-list conversation about it, and we need to continue it in public
ML.

Here is some history:

This is the original series that I sent a year ago. It basically
proposes the same thing as this series from Sang Yan:
https://lore.kernel.org/lkml/20190709182014.16052-1-pasha.tatashin@soleen.com/

Once, I realized that with enabling MMU the relocation is issue is
gone completely, I sent a new series, and this is the latest version
of that series:
https://lore.kernel.org/lkml/20200326032420.27220-1-pasha.tatashin@soleen.com/

It has been tested in production, and several people from different
companies commented to me that they are using it as well.

After my patch series was sent out, James created a new branch in his
tree with his approach of enabling MMU without having a new VA space,
but instead re-use what the kernel  has now. I have not tested that
branch yet.

Here are some comments from James Morse and the off-list discussion we had:
-------
It sounds like you are depending on write streaming mode to meet your
target performance.
This isn't even CPU specific, its cache and firmware configuration specific!
I don't think we should optimise a general purpose operating system
based on things like this.
..
I think the best approach is going to be to eliminate the relocations entirely.
...
I'm afraid I view this kexec-map thing as high-risk duct-tape over the
kexec core code
deliberately scattering the kexec payload.
I'd prefer any approach that causes the payload to be stored in-place
from the beginning
as that benefits other architectures too.
-------

It appears James is leaning to the approach of not performing
relocation at all and use what is proposed by Sang Yan and me during
my first approach for this problem. However, I have several issues
with this take, which if addressed would be OK for me.
1. The newer, more secure kexec syscall kexec_file_load(), which
allows to check the IMA integrity of the loaded file does not have a
way to specify the area in memory where to place the kernel. We are
using this syscall in production, and cannot go back to kexec_load()
for security reasons.
2. Reserving memory means wasting memory during run-time. Our machine
has only 8G of RAM, and reserving even 128M for the next kernel is an
expensive proposition. Now we start loading the next kernel after some
non essential processes are stopped, but before essential processes
are stopped for the lowest downtime possible.
3. Disabling relocation means changes in the common code, which I am
not sure actually helps any other platform beside ARM64, so I am
worried it won't be accepted into upstream.

Thank you,
Pasha

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/2] arm64: Reserve memory for quick kexec
  2020-08-14  5:52   ` Sang Yan
  (?)
@ 2020-08-16  4:11     ` kernel test robot
  -1 siblings, 0 replies; 25+ messages in thread
From: kernel test robot @ 2020-08-16  4:11 UTC (permalink / raw)
  To: Sang Yan, kexec, ebiederm, linux-kernel, xiexiuqi, guohanjun
  Cc: kbuild-all, zhuling8, luanjianhai, luchunhua

[-- Attachment #1: Type: text/plain, Size: 5296 bytes --]

Hi Sang,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on arm64/for-next/core]
[also build test ERROR on linux/master linus/master v5.8 next-20200814]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Sang-Yan/kexec-Add-quick-kexec-support-for-kernel/20200814-142840
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git for-next/core
config: arm64-randconfig-r011-20200816 (attached as .config)
compiler: aarch64-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=arm64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   arch/arm64/kernel/setup.c: In function 'setup_arch':
>> arch/arm64/kernel/setup.c:374:32: error: 'res' undeclared (first use in this function)
     374 |       quick_kexec_res.start >= res->start &&
         |                                ^~~
   arch/arm64/kernel/setup.c:374:32: note: each undeclared identifier is reported only once for each function it appears in
--
   arch/arm64/mm/init.c: In function 'reserve_quick_kexec':
>> arch/arm64/mm/init.c:155:13: error: 'CRASH_ALIGN' undeclared (first use in this function); did you mean 'CMSG_ALIGN'?
     155 |    mem_len, CRASH_ALIGN);
         |             ^~~~~~~~~~~
         |             CMSG_ALIGN
   arch/arm64/mm/init.c:155:13: note: each undeclared identifier is reported only once for each function it appears in

vim +/res +374 arch/arm64/kernel/setup.c

   284	
   285	void __init setup_arch(char **cmdline_p)
   286	{
   287		init_mm.start_code = (unsigned long) _text;
   288		init_mm.end_code   = (unsigned long) _etext;
   289		init_mm.end_data   = (unsigned long) _edata;
   290		init_mm.brk	   = (unsigned long) _end;
   291	
   292		*cmdline_p = boot_command_line;
   293	
   294		/*
   295		 * If know now we are going to need KPTI then use non-global
   296		 * mappings from the start, avoiding the cost of rewriting
   297		 * everything later.
   298		 */
   299		arm64_use_ng_mappings = kaslr_requires_kpti();
   300	
   301		early_fixmap_init();
   302		early_ioremap_init();
   303	
   304		setup_machine_fdt(__fdt_pointer);
   305	
   306		/*
   307		 * Initialise the static keys early as they may be enabled by the
   308		 * cpufeature code and early parameters.
   309		 */
   310		jump_label_init();
   311		parse_early_param();
   312	
   313		/*
   314		 * Unmask asynchronous aborts and fiq after bringing up possible
   315		 * earlycon. (Report possible System Errors once we can report this
   316		 * occurred).
   317		 */
   318		local_daif_restore(DAIF_PROCCTX_NOIRQ);
   319	
   320		/*
   321		 * TTBR0 is only used for the identity mapping at this stage. Make it
   322		 * point to zero page to avoid speculatively fetching new entries.
   323		 */
   324		cpu_uninstall_idmap();
   325	
   326		xen_early_init();
   327		efi_init();
   328	
   329		if (!efi_enabled(EFI_BOOT) && ((u64)_text % MIN_KIMG_ALIGN) != 0)
   330		     pr_warn(FW_BUG "Kernel image misaligned at boot, please fix your bootloader!");
   331	
   332		arm64_memblock_init();
   333	
   334		paging_init();
   335	
   336		acpi_table_upgrade();
   337	
   338		/* Parse the ACPI tables for possible boot-time configuration */
   339		acpi_boot_table_init();
   340	
   341		if (acpi_disabled)
   342			unflatten_device_tree();
   343	
   344		bootmem_init();
   345	
   346		kasan_init();
   347	
   348		request_standard_resources();
   349	
   350		early_ioremap_reset();
   351	
   352		if (acpi_disabled)
   353			psci_dt_init();
   354		else
   355			psci_acpi_init();
   356	
   357		init_bootcpu_ops();
   358		smp_init_cpus();
   359		smp_build_mpidr_hash();
   360	
   361		/* Init percpu seeds for random tags after cpus are set up. */
   362		kasan_init_tags();
   363	
   364	#ifdef CONFIG_ARM64_SW_TTBR0_PAN
   365		/*
   366		 * Make sure init_thread_info.ttbr0 always generates translation
   367		 * faults in case uaccess_enable() is inadvertently called by the init
   368		 * thread.
   369		 */
   370		init_task.thread_info.ttbr0 = __pa_symbol(empty_zero_page);
   371	#endif
   372	#ifdef CONFIG_QUICK_KEXEC
   373			if (quick_kexec_res.end &&
 > 374			    quick_kexec_res.start >= res->start &&
   375			    quick_kexec_res.end <= res->end)
   376				request_resource(res, &quick_kexec_res);
   377	#endif
   378	
   379		if (boot_args[1] || boot_args[2] || boot_args[3]) {
   380			pr_err("WARNING: x1-x3 nonzero in violation of boot protocol:\n"
   381				"\tx1: %016llx\n\tx2: %016llx\n\tx3: %016llx\n"
   382				"This indicates a broken bootloader or old kernel\n",
   383				boot_args[1], boot_args[2], boot_args[3]);
   384		}
   385	}
   386	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 33210 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/2] arm64: Reserve memory for quick kexec
@ 2020-08-16  4:11     ` kernel test robot
  0 siblings, 0 replies; 25+ messages in thread
From: kernel test robot @ 2020-08-16  4:11 UTC (permalink / raw)
  To: Sang Yan, kexec, ebiederm, linux-kernel, xiexiuqi, guohanjun
  Cc: luanjianhai, zhuling8, kbuild-all, luchunhua

[-- Attachment #1: Type: text/plain, Size: 5296 bytes --]

Hi Sang,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on arm64/for-next/core]
[also build test ERROR on linux/master linus/master v5.8 next-20200814]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Sang-Yan/kexec-Add-quick-kexec-support-for-kernel/20200814-142840
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git for-next/core
config: arm64-randconfig-r011-20200816 (attached as .config)
compiler: aarch64-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=arm64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   arch/arm64/kernel/setup.c: In function 'setup_arch':
>> arch/arm64/kernel/setup.c:374:32: error: 'res' undeclared (first use in this function)
     374 |       quick_kexec_res.start >= res->start &&
         |                                ^~~
   arch/arm64/kernel/setup.c:374:32: note: each undeclared identifier is reported only once for each function it appears in
--
   arch/arm64/mm/init.c: In function 'reserve_quick_kexec':
>> arch/arm64/mm/init.c:155:13: error: 'CRASH_ALIGN' undeclared (first use in this function); did you mean 'CMSG_ALIGN'?
     155 |    mem_len, CRASH_ALIGN);
         |             ^~~~~~~~~~~
         |             CMSG_ALIGN
   arch/arm64/mm/init.c:155:13: note: each undeclared identifier is reported only once for each function it appears in

vim +/res +374 arch/arm64/kernel/setup.c

   284	
   285	void __init setup_arch(char **cmdline_p)
   286	{
   287		init_mm.start_code = (unsigned long) _text;
   288		init_mm.end_code   = (unsigned long) _etext;
   289		init_mm.end_data   = (unsigned long) _edata;
   290		init_mm.brk	   = (unsigned long) _end;
   291	
   292		*cmdline_p = boot_command_line;
   293	
   294		/*
   295		 * If know now we are going to need KPTI then use non-global
   296		 * mappings from the start, avoiding the cost of rewriting
   297		 * everything later.
   298		 */
   299		arm64_use_ng_mappings = kaslr_requires_kpti();
   300	
   301		early_fixmap_init();
   302		early_ioremap_init();
   303	
   304		setup_machine_fdt(__fdt_pointer);
   305	
   306		/*
   307		 * Initialise the static keys early as they may be enabled by the
   308		 * cpufeature code and early parameters.
   309		 */
   310		jump_label_init();
   311		parse_early_param();
   312	
   313		/*
   314		 * Unmask asynchronous aborts and fiq after bringing up possible
   315		 * earlycon. (Report possible System Errors once we can report this
   316		 * occurred).
   317		 */
   318		local_daif_restore(DAIF_PROCCTX_NOIRQ);
   319	
   320		/*
   321		 * TTBR0 is only used for the identity mapping at this stage. Make it
   322		 * point to zero page to avoid speculatively fetching new entries.
   323		 */
   324		cpu_uninstall_idmap();
   325	
   326		xen_early_init();
   327		efi_init();
   328	
   329		if (!efi_enabled(EFI_BOOT) && ((u64)_text % MIN_KIMG_ALIGN) != 0)
   330		     pr_warn(FW_BUG "Kernel image misaligned at boot, please fix your bootloader!");
   331	
   332		arm64_memblock_init();
   333	
   334		paging_init();
   335	
   336		acpi_table_upgrade();
   337	
   338		/* Parse the ACPI tables for possible boot-time configuration */
   339		acpi_boot_table_init();
   340	
   341		if (acpi_disabled)
   342			unflatten_device_tree();
   343	
   344		bootmem_init();
   345	
   346		kasan_init();
   347	
   348		request_standard_resources();
   349	
   350		early_ioremap_reset();
   351	
   352		if (acpi_disabled)
   353			psci_dt_init();
   354		else
   355			psci_acpi_init();
   356	
   357		init_bootcpu_ops();
   358		smp_init_cpus();
   359		smp_build_mpidr_hash();
   360	
   361		/* Init percpu seeds for random tags after cpus are set up. */
   362		kasan_init_tags();
   363	
   364	#ifdef CONFIG_ARM64_SW_TTBR0_PAN
   365		/*
   366		 * Make sure init_thread_info.ttbr0 always generates translation
   367		 * faults in case uaccess_enable() is inadvertently called by the init
   368		 * thread.
   369		 */
   370		init_task.thread_info.ttbr0 = __pa_symbol(empty_zero_page);
   371	#endif
   372	#ifdef CONFIG_QUICK_KEXEC
   373			if (quick_kexec_res.end &&
 > 374			    quick_kexec_res.start >= res->start &&
   375			    quick_kexec_res.end <= res->end)
   376				request_resource(res, &quick_kexec_res);
   377	#endif
   378	
   379		if (boot_args[1] || boot_args[2] || boot_args[3]) {
   380			pr_err("WARNING: x1-x3 nonzero in violation of boot protocol:\n"
   381				"\tx1: %016llx\n\tx2: %016llx\n\tx3: %016llx\n"
   382				"This indicates a broken bootloader or old kernel\n",
   383				boot_args[1], boot_args[2], boot_args[3]);
   384		}
   385	}
   386	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 33210 bytes --]

[-- Attachment #3: Type: text/plain, Size: 143 bytes --]

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/2] arm64: Reserve memory for quick kexec
@ 2020-08-16  4:11     ` kernel test robot
  0 siblings, 0 replies; 25+ messages in thread
From: kernel test robot @ 2020-08-16  4:11 UTC (permalink / raw)
  To: kbuild-all

[-- Attachment #1: Type: text/plain, Size: 5445 bytes --]

Hi Sang,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on arm64/for-next/core]
[also build test ERROR on linux/master linus/master v5.8 next-20200814]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Sang-Yan/kexec-Add-quick-kexec-support-for-kernel/20200814-142840
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git for-next/core
config: arm64-randconfig-r011-20200816 (attached as .config)
compiler: aarch64-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=arm64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   arch/arm64/kernel/setup.c: In function 'setup_arch':
>> arch/arm64/kernel/setup.c:374:32: error: 'res' undeclared (first use in this function)
     374 |       quick_kexec_res.start >= res->start &&
         |                                ^~~
   arch/arm64/kernel/setup.c:374:32: note: each undeclared identifier is reported only once for each function it appears in
--
   arch/arm64/mm/init.c: In function 'reserve_quick_kexec':
>> arch/arm64/mm/init.c:155:13: error: 'CRASH_ALIGN' undeclared (first use in this function); did you mean 'CMSG_ALIGN'?
     155 |    mem_len, CRASH_ALIGN);
         |             ^~~~~~~~~~~
         |             CMSG_ALIGN
   arch/arm64/mm/init.c:155:13: note: each undeclared identifier is reported only once for each function it appears in

vim +/res +374 arch/arm64/kernel/setup.c

   284	
   285	void __init setup_arch(char **cmdline_p)
   286	{
   287		init_mm.start_code = (unsigned long) _text;
   288		init_mm.end_code   = (unsigned long) _etext;
   289		init_mm.end_data   = (unsigned long) _edata;
   290		init_mm.brk	   = (unsigned long) _end;
   291	
   292		*cmdline_p = boot_command_line;
   293	
   294		/*
   295		 * If know now we are going to need KPTI then use non-global
   296		 * mappings from the start, avoiding the cost of rewriting
   297		 * everything later.
   298		 */
   299		arm64_use_ng_mappings = kaslr_requires_kpti();
   300	
   301		early_fixmap_init();
   302		early_ioremap_init();
   303	
   304		setup_machine_fdt(__fdt_pointer);
   305	
   306		/*
   307		 * Initialise the static keys early as they may be enabled by the
   308		 * cpufeature code and early parameters.
   309		 */
   310		jump_label_init();
   311		parse_early_param();
   312	
   313		/*
   314		 * Unmask asynchronous aborts and fiq after bringing up possible
   315		 * earlycon. (Report possible System Errors once we can report this
   316		 * occurred).
   317		 */
   318		local_daif_restore(DAIF_PROCCTX_NOIRQ);
   319	
   320		/*
   321		 * TTBR0 is only used for the identity mapping at this stage. Make it
   322		 * point to zero page to avoid speculatively fetching new entries.
   323		 */
   324		cpu_uninstall_idmap();
   325	
   326		xen_early_init();
   327		efi_init();
   328	
   329		if (!efi_enabled(EFI_BOOT) && ((u64)_text % MIN_KIMG_ALIGN) != 0)
   330		     pr_warn(FW_BUG "Kernel image misaligned at boot, please fix your bootloader!");
   331	
   332		arm64_memblock_init();
   333	
   334		paging_init();
   335	
   336		acpi_table_upgrade();
   337	
   338		/* Parse the ACPI tables for possible boot-time configuration */
   339		acpi_boot_table_init();
   340	
   341		if (acpi_disabled)
   342			unflatten_device_tree();
   343	
   344		bootmem_init();
   345	
   346		kasan_init();
   347	
   348		request_standard_resources();
   349	
   350		early_ioremap_reset();
   351	
   352		if (acpi_disabled)
   353			psci_dt_init();
   354		else
   355			psci_acpi_init();
   356	
   357		init_bootcpu_ops();
   358		smp_init_cpus();
   359		smp_build_mpidr_hash();
   360	
   361		/* Init percpu seeds for random tags after cpus are set up. */
   362		kasan_init_tags();
   363	
   364	#ifdef CONFIG_ARM64_SW_TTBR0_PAN
   365		/*
   366		 * Make sure init_thread_info.ttbr0 always generates translation
   367		 * faults in case uaccess_enable() is inadvertently called by the init
   368		 * thread.
   369		 */
   370		init_task.thread_info.ttbr0 = __pa_symbol(empty_zero_page);
   371	#endif
   372	#ifdef CONFIG_QUICK_KEXEC
   373			if (quick_kexec_res.end &&
 > 374			    quick_kexec_res.start >= res->start &&
   375			    quick_kexec_res.end <= res->end)
   376				request_resource(res, &quick_kexec_res);
   377	#endif
   378	
   379		if (boot_args[1] || boot_args[2] || boot_args[3]) {
   380			pr_err("WARNING: x1-x3 nonzero in violation of boot protocol:\n"
   381				"\tx1: %016llx\n\tx2: %016llx\n\tx3: %016llx\n"
   382				"This indicates a broken bootloader or old kernel\n",
   383				boot_args[1], boot_args[2], boot_args[3]);
   384		}
   385	}
   386	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org

[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 33210 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/2] kexec: Add quick kexec support for kernel
  2020-08-14 19:22         ` Pavel Tatashin
@ 2020-08-17 12:14           ` James Morse
  -1 siblings, 0 replies; 25+ messages in thread
From: James Morse @ 2020-08-17 12:14 UTC (permalink / raw)
  To: Pavel Tatashin, Dave Young
  Cc: Sang Yan, kexec mailing list, Eric W. Biederman, LKML, xiexiuqi,
	guohanjun, luanjianhai, zhuling8, luchunhua

Hi guys,

On 14/08/2020 20:22, Pavel Tatashin wrote:
> On Fri, Aug 14, 2020 at 7:24 AM Dave Young <dyoung@redhat.com> wrote:
>> On 08/14/20 at 04:21pm, Sang Yan wrote:
>>> On 08/14/20 14:58, Dave Young wrote:
>>>> On 08/14/20 at 01:52am, Sang Yan wrote:
>>> Yes, it's particularly obvious on arm64. I will add it to the patch log,
>>> and test how long it takes on x86 and other arch.

Earlier versions of kexec-tools had the in-purgatory checksum enabled unconditionally.
More recent versions let you disable it, I think the parameter is called no-checks. This
saves some time, but the relocations still have to be done.


>>>> About the arm64 problem, I know Pavel Tatashin is working on a patchset
>>>> to improve the performance with enabling mmu.
>>>>
>>>> I added Pavel in cc, can you try his patches?
>>>>
>>> Thanks for your tips, I will try these patches. @Pavel.
>>> Disable mmu after finishing copying pages?

>>>>> We introduce quick kexec to save time of copying memory as above,
>>>>> just like kdump(kexec on crash), by using reserved memory
>>>>> "Quick Kexec".
>>>>
>>>> This approach may have gain, but it also introduce extra requirements to
>>>> pre-reserve a memory region.  I wonder how Eric thinks about the idea.
>>>>
>>>> Anyway the "quick" name sounds not very good, I would suggest do not
>>>> introduce a new param, and the code can check if pre-reserved region
>>>> exist then use it, if not then fallback to old way.
>>>>
>>> aha. I agree with it, but I thought it may change the old behaviors of
>>> kexec_load.
>>>
>>> I will update a new patch without introducing new flags and new params.
>>
>> Frankly I'm still not sure it is worth to introduce a new interface if the
>> improvement can be done in arch code like Pavel is doing.  Can you try
>> that first?

> My patches will fix this issue. This is an ARM64 specific problem and
> I did not see this to be performance problem on x86 during kexec
> relocation. This happens because on ARM64 relocation is performed with
> MMU disabled, and when MMU is disabled the caching is disabled as
> well.

> I have a patch series that fixes this entirely, but James Morse
> (+CCed) and I still have not agreed on the final approach. We had an
> off-list conversation about it, and we need to continue it in public
> ML.
> 
> Here is some history:
> 
> This is the original series that I sent a year ago. It basically
> proposes the same thing as this series from Sang Yan:
> https://lore.kernel.org/lkml/20190709182014.16052-1-pasha.tatashin@soleen.com/
> 
> Once, I realized that with enabling MMU the relocation is issue is
> gone completely, I sent a new series, and this is the latest version
> of that series:
> https://lore.kernel.org/lkml/20200326032420.27220-1-pasha.tatashin@soleen.com/
> 
> It has been tested in production, and several people from different
> companies commented to me that they are using it as well.
> 
> After my patch series was sent out, James created a new branch in his
> tree with his approach of enabling MMU without having a new VA space,
> but instead re-use what the kernel  has now. I have not tested that
> branch yet.

For context, that is here:
http://www.linux-arm.org/git?p=linux-jm.git;a=shortlog;h=refs/heads/kexec%2Bmmu/v0

I think we can maintain this approach, but it doesn't work for Pavel, as he has extra
requirements. I stopped looking at it because it became a solution no-one needed.


> Here are some comments from James Morse and the off-list discussion we had:
> -------
> It sounds like you are depending on write streaming mode to meet your
> target performance.
> This isn't even CPU specific, its cache and firmware configuration specific!
> I don't think we should optimise a general purpose operating system
> based on things like this.
> ..
> I think the best approach is going to be to eliminate the relocations entirely.> ...
> I'm afraid I view this kexec-map thing as high-risk duct-tape over the
> kexec core code
> deliberately scattering the kexec payload.
> I'd prefer any approach that causes the payload to be stored in-place
> from the beginning
> as that benefits other architectures too.
> -------

The 'eliminate relocations' comment goes with some of the context you removed.


> It appears James is leaning to the approach of not performing
> relocation at all and use what is proposed by Sang Yan and me during
> my first approach for this problem.

The background to that is Pavel's timing requirements: Enabling the MMU isn't enough, from
his description he also depends on re-arranging the memory so the CPU only sees increasing
virtual addresses. This is what my 'write streaming' comment refers to.
Doing this requires rewriting the relocation assembly code.

If we enable the MMU during kexec relocation, I expect someone on a memory constrained
system to come out of the woodwork screaming 'regression'. Systems with insufficient
memory to allocate the page tables will no longer be able to kexec.

If we keep the relocation assembly code as it is, its possible for it to handle MMU-on and
MMU-off relocations with a very small adjustment. It just won't work for Pavel, as
enabling the MMU is not enough.

I'm confident we won't get a second copy of the relocation code, that only runs on some
platforms, past the arch code maintainer.


> However, I have several issues
> with this take, which if addressed would be OK for me.
> 1. The newer, more secure kexec syscall kexec_file_load(), which
> allows to check the IMA integrity of the loaded file does not have a
> way to specify the area in memory where to place the kernel. We are
> using this syscall in production, and cannot go back to kexec_load()
> for security reasons.
> 2. Reserving memory means wasting memory during run-time. Our machine
> has only 8G of RAM, and reserving even 128M 

You're loading a 128M kernel!?


> for the next kernel is an
> expensive proposition. Now we start loading the next kernel after some
> non essential processes are stopped, but before essential processes
> are stopped for the lowest downtime possible.

> 3. Disabling relocation means changes in the common code, which I am
> not sure actually helps any other platform beside ARM64, so I am
> worried it won't be accepted into upstream.

I'm happy to post the MMU-enabled series, it needs to be maintainable and solve someone's
problem.

To chip away at the rest of Pavel's problem, my suggestions were:
 * Allocate things in place. If we can allocate any 2MB hugepage, we can place the DTB
  there and not need to relocate it.

 * use huge pages more generally in the core code. With the MMU enabled, this might keep
   the core in write-streaming-mode for longer. (might, because this is very platform
   specific).

 * Store the kexec payload in the crashkernel carveout, to eliminate the relocations
   completely. This is the bit Pavel quoted.

To expand on the carveout:
The crashkernel carveout is sized to load the payload, and memory to run the kdump kernel
entirely from within the carveout. Its obviously bigger than the payload it contains.

If you load your kexec kernel into the 'memory' part of the carveout, it won't overwrite
the kdump payload, and it wont require relocation, as its already stored in place. arm64's
arch code will spot these in-place buffers, and skip the relocation.

If you kdump even after doing this, the kdump kernel sees the kexec payload as
uninitialised memory, and will overwrite it.

If we did this, we wouldn't need to enable the MMU, and it should skip most of the
relocation code on all architectures without any changes to the arch code.


On arm64 the kernel and initramfs only need relocating because their physical addresses
are baked into the DTB. The DTB's physical address is then passed to the new kernel.
From memory: 'relocatable kernel' is detectable from the image header, and the support
predates arm64's kexec support.


James

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/2] kexec: Add quick kexec support for kernel
@ 2020-08-17 12:14           ` James Morse
  0 siblings, 0 replies; 25+ messages in thread
From: James Morse @ 2020-08-17 12:14 UTC (permalink / raw)
  To: Pavel Tatashin, Dave Young
  Cc: luanjianhai, Sang Yan, xiexiuqi, kexec mailing list, LKML,
	luchunhua, Eric W. Biederman, guohanjun, zhuling8

Hi guys,

On 14/08/2020 20:22, Pavel Tatashin wrote:
> On Fri, Aug 14, 2020 at 7:24 AM Dave Young <dyoung@redhat.com> wrote:
>> On 08/14/20 at 04:21pm, Sang Yan wrote:
>>> On 08/14/20 14:58, Dave Young wrote:
>>>> On 08/14/20 at 01:52am, Sang Yan wrote:
>>> Yes, it's particularly obvious on arm64. I will add it to the patch log,
>>> and test how long it takes on x86 and other arch.

Earlier versions of kexec-tools had the in-purgatory checksum enabled unconditionally.
More recent versions let you disable it, I think the parameter is called no-checks. This
saves some time, but the relocations still have to be done.


>>>> About the arm64 problem, I know Pavel Tatashin is working on a patchset
>>>> to improve the performance with enabling mmu.
>>>>
>>>> I added Pavel in cc, can you try his patches?
>>>>
>>> Thanks for your tips, I will try these patches. @Pavel.
>>> Disable mmu after finishing copying pages?

>>>>> We introduce quick kexec to save time of copying memory as above,
>>>>> just like kdump(kexec on crash), by using reserved memory
>>>>> "Quick Kexec".
>>>>
>>>> This approach may have gain, but it also introduce extra requirements to
>>>> pre-reserve a memory region.  I wonder how Eric thinks about the idea.
>>>>
>>>> Anyway the "quick" name sounds not very good, I would suggest do not
>>>> introduce a new param, and the code can check if pre-reserved region
>>>> exist then use it, if not then fallback to old way.
>>>>
>>> aha. I agree with it, but I thought it may change the old behaviors of
>>> kexec_load.
>>>
>>> I will update a new patch without introducing new flags and new params.
>>
>> Frankly I'm still not sure it is worth to introduce a new interface if the
>> improvement can be done in arch code like Pavel is doing.  Can you try
>> that first?

> My patches will fix this issue. This is an ARM64 specific problem and
> I did not see this to be performance problem on x86 during kexec
> relocation. This happens because on ARM64 relocation is performed with
> MMU disabled, and when MMU is disabled the caching is disabled as
> well.

> I have a patch series that fixes this entirely, but James Morse
> (+CCed) and I still have not agreed on the final approach. We had an
> off-list conversation about it, and we need to continue it in public
> ML.
> 
> Here is some history:
> 
> This is the original series that I sent a year ago. It basically
> proposes the same thing as this series from Sang Yan:
> https://lore.kernel.org/lkml/20190709182014.16052-1-pasha.tatashin@soleen.com/
> 
> Once, I realized that with enabling MMU the relocation is issue is
> gone completely, I sent a new series, and this is the latest version
> of that series:
> https://lore.kernel.org/lkml/20200326032420.27220-1-pasha.tatashin@soleen.com/
> 
> It has been tested in production, and several people from different
> companies commented to me that they are using it as well.
> 
> After my patch series was sent out, James created a new branch in his
> tree with his approach of enabling MMU without having a new VA space,
> but instead re-use what the kernel  has now. I have not tested that
> branch yet.

For context, that is here:
http://www.linux-arm.org/git?p=linux-jm.git;a=shortlog;h=refs/heads/kexec%2Bmmu/v0

I think we can maintain this approach, but it doesn't work for Pavel, as he has extra
requirements. I stopped looking at it because it became a solution no-one needed.


> Here are some comments from James Morse and the off-list discussion we had:
> -------
> It sounds like you are depending on write streaming mode to meet your
> target performance.
> This isn't even CPU specific, its cache and firmware configuration specific!
> I don't think we should optimise a general purpose operating system
> based on things like this.
> ..
> I think the best approach is going to be to eliminate the relocations entirely.> ...
> I'm afraid I view this kexec-map thing as high-risk duct-tape over the
> kexec core code
> deliberately scattering the kexec payload.
> I'd prefer any approach that causes the payload to be stored in-place
> from the beginning
> as that benefits other architectures too.
> -------

The 'eliminate relocations' comment goes with some of the context you removed.


> It appears James is leaning to the approach of not performing
> relocation at all and use what is proposed by Sang Yan and me during
> my first approach for this problem.

The background to that is Pavel's timing requirements: Enabling the MMU isn't enough, from
his description he also depends on re-arranging the memory so the CPU only sees increasing
virtual addresses. This is what my 'write streaming' comment refers to.
Doing this requires rewriting the relocation assembly code.

If we enable the MMU during kexec relocation, I expect someone on a memory constrained
system to come out of the woodwork screaming 'regression'. Systems with insufficient
memory to allocate the page tables will no longer be able to kexec.

If we keep the relocation assembly code as it is, its possible for it to handle MMU-on and
MMU-off relocations with a very small adjustment. It just won't work for Pavel, as
enabling the MMU is not enough.

I'm confident we won't get a second copy of the relocation code, that only runs on some
platforms, past the arch code maintainer.


> However, I have several issues
> with this take, which if addressed would be OK for me.
> 1. The newer, more secure kexec syscall kexec_file_load(), which
> allows to check the IMA integrity of the loaded file does not have a
> way to specify the area in memory where to place the kernel. We are
> using this syscall in production, and cannot go back to kexec_load()
> for security reasons.
> 2. Reserving memory means wasting memory during run-time. Our machine
> has only 8G of RAM, and reserving even 128M 

You're loading a 128M kernel!?


> for the next kernel is an
> expensive proposition. Now we start loading the next kernel after some
> non essential processes are stopped, but before essential processes
> are stopped for the lowest downtime possible.

> 3. Disabling relocation means changes in the common code, which I am
> not sure actually helps any other platform beside ARM64, so I am
> worried it won't be accepted into upstream.

I'm happy to post the MMU-enabled series, it needs to be maintainable and solve someone's
problem.

To chip away at the rest of Pavel's problem, my suggestions were:
 * Allocate things in place. If we can allocate any 2MB hugepage, we can place the DTB
  there and not need to relocate it.

 * use huge pages more generally in the core code. With the MMU enabled, this might keep
   the core in write-streaming-mode for longer. (might, because this is very platform
   specific).

 * Store the kexec payload in the crashkernel carveout, to eliminate the relocations
   completely. This is the bit Pavel quoted.

To expand on the carveout:
The crashkernel carveout is sized to load the payload, and memory to run the kdump kernel
entirely from within the carveout. Its obviously bigger than the payload it contains.

If you load your kexec kernel into the 'memory' part of the carveout, it won't overwrite
the kdump payload, and it wont require relocation, as its already stored in place. arm64's
arch code will spot these in-place buffers, and skip the relocation.

If you kdump even after doing this, the kdump kernel sees the kexec payload as
uninitialised memory, and will overwrite it.

If we did this, we wouldn't need to enable the MMU, and it should skip most of the
relocation code on all architectures without any changes to the arch code.


On arm64 the kernel and initramfs only need relocating because their physical addresses
are baked into the DTB. The DTB's physical address is then passed to the new kernel.
From memory: 'relocatable kernel' is detectable from the image header, and the support
predates arm64's kexec support.


James

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/2] kexec: Add quick kexec support for kernel
  2020-08-14  5:52 ` Sang Yan
@ 2020-08-17 13:42   ` Pavel Machek
  -1 siblings, 0 replies; 25+ messages in thread
From: Pavel Machek @ 2020-08-17 13:42 UTC (permalink / raw)
  To: Sang Yan
  Cc: kexec, ebiederm, linux-kernel, xiexiuqi, guohanjun, zhuling8,
	luanjianhai, luchunhua

Hi!

> +config QUICK_KEXEC
> +	bool "Support for quick kexec"
> +	depends on KEXEC_CORE
> +	help
> +	  Say y here to enable this feature.

?

> +	  It use reserved memory to accelerate kexec, just like crash

uses

> +	  kexec, load new kernel and initrd to reserved memory, and
> +	  boot new kernel on that memory. It will save the time of
> +	  relocating kernel.

loads a new.... boots new... 

>  	IORES_DESC_DEVICE_PRIVATE_MEMORY	= 6,
>  	IORES_DESC_RESERVED			= 7,
>  	IORES_DESC_SOFT_RESERVED		= 8,
> +#ifdef CONFIG_QUICK_KEXEC
> +	IORES_DESC_QUICK_KEXEC			= 9,
> +#endif
>  };

Remove ifdef.

>  /*
> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
> index 9e93bef52968..976bf9631070 100644
> --- a/include/linux/kexec.h
> +++ b/include/linux/kexec.h
> @@ -269,9 +269,12 @@ struct kimage {
>  	unsigned long control_page;
>  
>  	/* Flags to indicate special processing */
> -	unsigned int type : 1;
> +	unsigned int type : 2;
>  #define KEXEC_TYPE_DEFAULT 0
>  #define KEXEC_TYPE_CRASH   1
> +#ifdef CONFIG_QUICK_KEXEC
> +#define KEXEC_TYPE_QUICK   2
> +#endif
>  	unsigned int preserve_context : 1;

Here, too.

> +++ b/include/uapi/linux/kexec.h
> @@ -12,6 +12,9 @@
>  /* kexec flags for different usage scenarios */
>  #define KEXEC_ON_CRASH		0x00000001
>  #define KEXEC_PRESERVE_CONTEXT	0x00000002
> +#ifdef CONFIG_QUICK_KEXEC
> +#define KEXEC_QUICK		0x00000004
> +#endif
>  #define KEXEC_ARCH_MASK		0xffff0000

And here.

									Pavel

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/2] kexec: Add quick kexec support for kernel
@ 2020-08-17 13:42   ` Pavel Machek
  0 siblings, 0 replies; 25+ messages in thread
From: Pavel Machek @ 2020-08-17 13:42 UTC (permalink / raw)
  To: Sang Yan
  Cc: luanjianhai, xiexiuqi, kexec, linux-kernel, luchunhua, ebiederm,
	guohanjun, zhuling8

Hi!

> +config QUICK_KEXEC
> +	bool "Support for quick kexec"
> +	depends on KEXEC_CORE
> +	help
> +	  Say y here to enable this feature.

?

> +	  It use reserved memory to accelerate kexec, just like crash

uses

> +	  kexec, load new kernel and initrd to reserved memory, and
> +	  boot new kernel on that memory. It will save the time of
> +	  relocating kernel.

loads a new.... boots new... 

>  	IORES_DESC_DEVICE_PRIVATE_MEMORY	= 6,
>  	IORES_DESC_RESERVED			= 7,
>  	IORES_DESC_SOFT_RESERVED		= 8,
> +#ifdef CONFIG_QUICK_KEXEC
> +	IORES_DESC_QUICK_KEXEC			= 9,
> +#endif
>  };

Remove ifdef.

>  /*
> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
> index 9e93bef52968..976bf9631070 100644
> --- a/include/linux/kexec.h
> +++ b/include/linux/kexec.h
> @@ -269,9 +269,12 @@ struct kimage {
>  	unsigned long control_page;
>  
>  	/* Flags to indicate special processing */
> -	unsigned int type : 1;
> +	unsigned int type : 2;
>  #define KEXEC_TYPE_DEFAULT 0
>  #define KEXEC_TYPE_CRASH   1
> +#ifdef CONFIG_QUICK_KEXEC
> +#define KEXEC_TYPE_QUICK   2
> +#endif
>  	unsigned int preserve_context : 1;

Here, too.

> +++ b/include/uapi/linux/kexec.h
> @@ -12,6 +12,9 @@
>  /* kexec flags for different usage scenarios */
>  #define KEXEC_ON_CRASH		0x00000001
>  #define KEXEC_PRESERVE_CONTEXT	0x00000002
> +#ifdef CONFIG_QUICK_KEXEC
> +#define KEXEC_QUICK		0x00000004
> +#endif
>  #define KEXEC_ARCH_MASK		0xffff0000

And here.

									Pavel

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/2] kexec: Add quick kexec support for kernel
  2020-08-17 13:42   ` Pavel Machek
@ 2020-08-18  6:49     ` Sang Yan
  -1 siblings, 0 replies; 25+ messages in thread
From: Sang Yan @ 2020-08-18  6:49 UTC (permalink / raw)
  To: Pavel Machek
  Cc: kexec, ebiederm, linux-kernel, xiexiuqi, guohanjun, zhuling8,
	luanjianhai, luchunhua


On 8/17/2020 9:42 PM, Pavel Machek wrote:
> Hi!
> 
>> +config QUICK_KEXEC
>> +	bool "Support for quick kexec"
>> +	depends on KEXEC_CORE
>> +	help
>> +	  Say y here to enable this feature.
> 
> ?
> 
>> +	  It use reserved memory to accelerate kexec, just like crash
> 
> uses
> 
>> +	  kexec, load new kernel and initrd to reserved memory, and
>> +	  boot new kernel on that memory. It will save the time of
>> +	  relocating kernel.
> 
> loads a new.... boots new... 
> 
>>  	IORES_DESC_DEVICE_PRIVATE_MEMORY	= 6,
>>  	IORES_DESC_RESERVED			= 7,
>>  	IORES_DESC_SOFT_RESERVED		= 8,
>> +#ifdef CONFIG_QUICK_KEXEC
>> +	IORES_DESC_QUICK_KEXEC			= 9,
>> +#endif
>>  };
> 
> Remove ifdef.
> 
>>  /*
>> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
>> index 9e93bef52968..976bf9631070 100644
>> --- a/include/linux/kexec.h
>> +++ b/include/linux/kexec.h
>> @@ -269,9 +269,12 @@ struct kimage {
>>  	unsigned long control_page;
>>  
>>  	/* Flags to indicate special processing */
>> -	unsigned int type : 1;
>> +	unsigned int type : 2;
>>  #define KEXEC_TYPE_DEFAULT 0
>>  #define KEXEC_TYPE_CRASH   1
>> +#ifdef CONFIG_QUICK_KEXEC
>> +#define KEXEC_TYPE_QUICK   2
>> +#endif
>>  	unsigned int preserve_context : 1;
> 
> Here, too.
> 
>> +++ b/include/uapi/linux/kexec.h
>> @@ -12,6 +12,9 @@
>>  /* kexec flags for different usage scenarios */
>>  #define KEXEC_ON_CRASH		0x00000001
>>  #define KEXEC_PRESERVE_CONTEXT	0x00000002
>> +#ifdef CONFIG_QUICK_KEXEC
>> +#define KEXEC_QUICK		0x00000004
>> +#endif
>>  #define KEXEC_ARCH_MASK		0xffff0000
> 
> And here.
> 
> 									Pavel
> 
> .
> 

Thanks a lot for your review.

Sang Yan.


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/2] kexec: Add quick kexec support for kernel
@ 2020-08-18  6:49     ` Sang Yan
  0 siblings, 0 replies; 25+ messages in thread
From: Sang Yan @ 2020-08-18  6:49 UTC (permalink / raw)
  To: Pavel Machek
  Cc: luanjianhai, xiexiuqi, kexec, linux-kernel, luchunhua, ebiederm,
	guohanjun, zhuling8


On 8/17/2020 9:42 PM, Pavel Machek wrote:
> Hi!
> 
>> +config QUICK_KEXEC
>> +	bool "Support for quick kexec"
>> +	depends on KEXEC_CORE
>> +	help
>> +	  Say y here to enable this feature.
> 
> ?
> 
>> +	  It use reserved memory to accelerate kexec, just like crash
> 
> uses
> 
>> +	  kexec, load new kernel and initrd to reserved memory, and
>> +	  boot new kernel on that memory. It will save the time of
>> +	  relocating kernel.
> 
> loads a new.... boots new... 
> 
>>  	IORES_DESC_DEVICE_PRIVATE_MEMORY	= 6,
>>  	IORES_DESC_RESERVED			= 7,
>>  	IORES_DESC_SOFT_RESERVED		= 8,
>> +#ifdef CONFIG_QUICK_KEXEC
>> +	IORES_DESC_QUICK_KEXEC			= 9,
>> +#endif
>>  };
> 
> Remove ifdef.
> 
>>  /*
>> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
>> index 9e93bef52968..976bf9631070 100644
>> --- a/include/linux/kexec.h
>> +++ b/include/linux/kexec.h
>> @@ -269,9 +269,12 @@ struct kimage {
>>  	unsigned long control_page;
>>  
>>  	/* Flags to indicate special processing */
>> -	unsigned int type : 1;
>> +	unsigned int type : 2;
>>  #define KEXEC_TYPE_DEFAULT 0
>>  #define KEXEC_TYPE_CRASH   1
>> +#ifdef CONFIG_QUICK_KEXEC
>> +#define KEXEC_TYPE_QUICK   2
>> +#endif
>>  	unsigned int preserve_context : 1;
> 
> Here, too.
> 
>> +++ b/include/uapi/linux/kexec.h
>> @@ -12,6 +12,9 @@
>>  /* kexec flags for different usage scenarios */
>>  #define KEXEC_ON_CRASH		0x00000001
>>  #define KEXEC_PRESERVE_CONTEXT	0x00000002
>> +#ifdef CONFIG_QUICK_KEXEC
>> +#define KEXEC_QUICK		0x00000004
>> +#endif
>>  #define KEXEC_ARCH_MASK		0xffff0000
> 
> And here.
> 
> 									Pavel
> 
> .
> 

Thanks a lot for your review.

Sang Yan.


_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/2] kexec: Add quick kexec support for kernel
  2020-08-17 12:14           ` James Morse
@ 2020-08-19 12:37             ` Dave Young
  -1 siblings, 0 replies; 25+ messages in thread
From: Dave Young @ 2020-08-19 12:37 UTC (permalink / raw)
  To: James Morse
  Cc: Pavel Tatashin, Sang Yan, kexec mailing list, Eric W. Biederman,
	LKML, xiexiuqi, guohanjun, luanjianhai, zhuling8, luchunhua

On 08/17/20 at 01:14pm, James Morse wrote:
> Hi guys,
> 
> On 14/08/2020 20:22, Pavel Tatashin wrote:
> > On Fri, Aug 14, 2020 at 7:24 AM Dave Young <dyoung@redhat.com> wrote:
> >> On 08/14/20 at 04:21pm, Sang Yan wrote:
> >>> On 08/14/20 14:58, Dave Young wrote:
> >>>> On 08/14/20 at 01:52am, Sang Yan wrote:
> >>> Yes, it's particularly obvious on arm64. I will add it to the patch log,
> >>> and test how long it takes on x86 and other arch.
> 
> Earlier versions of kexec-tools had the in-purgatory checksum enabled unconditionally.
> More recent versions let you disable it, I think the parameter is called no-checks. This
> saves some time, but the relocations still have to be done.
> 
> 
> >>>> About the arm64 problem, I know Pavel Tatashin is working on a patchset
> >>>> to improve the performance with enabling mmu.
> >>>>
> >>>> I added Pavel in cc, can you try his patches?
> >>>>
> >>> Thanks for your tips, I will try these patches. @Pavel.
> >>> Disable mmu after finishing copying pages?
> 
> >>>>> We introduce quick kexec to save time of copying memory as above,
> >>>>> just like kdump(kexec on crash), by using reserved memory
> >>>>> "Quick Kexec".
> >>>>
> >>>> This approach may have gain, but it also introduce extra requirements to
> >>>> pre-reserve a memory region.  I wonder how Eric thinks about the idea.
> >>>>
> >>>> Anyway the "quick" name sounds not very good, I would suggest do not
> >>>> introduce a new param, and the code can check if pre-reserved region
> >>>> exist then use it, if not then fallback to old way.
> >>>>
> >>> aha. I agree with it, but I thought it may change the old behaviors of
> >>> kexec_load.
> >>>
> >>> I will update a new patch without introducing new flags and new params.
> >>
> >> Frankly I'm still not sure it is worth to introduce a new interface if the
> >> improvement can be done in arch code like Pavel is doing.  Can you try
> >> that first?
> 
> > My patches will fix this issue. This is an ARM64 specific problem and
> > I did not see this to be performance problem on x86 during kexec
> > relocation. This happens because on ARM64 relocation is performed with
> > MMU disabled, and when MMU is disabled the caching is disabled as
> > well.
> 
> > I have a patch series that fixes this entirely, but James Morse
> > (+CCed) and I still have not agreed on the final approach. We had an
> > off-list conversation about it, and we need to continue it in public
> > ML.
> > 
> > Here is some history:
> > 
> > This is the original series that I sent a year ago. It basically
> > proposes the same thing as this series from Sang Yan:
> > https://lore.kernel.org/lkml/20190709182014.16052-1-pasha.tatashin@soleen.com/
> > 
> > Once, I realized that with enabling MMU the relocation is issue is
> > gone completely, I sent a new series, and this is the latest version
> > of that series:
> > https://lore.kernel.org/lkml/20200326032420.27220-1-pasha.tatashin@soleen.com/
> > 
> > It has been tested in production, and several people from different
> > companies commented to me that they are using it as well.
> > 
> > After my patch series was sent out, James created a new branch in his
> > tree with his approach of enabling MMU without having a new VA space,
> > but instead re-use what the kernel  has now. I have not tested that
> > branch yet.
> 
> For context, that is here:
> http://www.linux-arm.org/git?p=linux-jm.git;a=shortlog;h=refs/heads/kexec%2Bmmu/v0
> 
> I think we can maintain this approach, but it doesn't work for Pavel, as he has extra
> requirements. I stopped looking at it because it became a solution no-one needed.
> 
> 
> > Here are some comments from James Morse and the off-list discussion we had:
> > -------
> > It sounds like you are depending on write streaming mode to meet your
> > target performance.
> > This isn't even CPU specific, its cache and firmware configuration specific!
> > I don't think we should optimise a general purpose operating system
> > based on things like this.
> > ..
> > I think the best approach is going to be to eliminate the relocations entirely.> ...
> > I'm afraid I view this kexec-map thing as high-risk duct-tape over the
> > kexec core code
> > deliberately scattering the kexec payload.
> > I'd prefer any approach that causes the payload to be stored in-place
> > from the beginning
> > as that benefits other architectures too.
> > -------
> 
> The 'eliminate relocations' comment goes with some of the context you removed.
> 
> 
> > It appears James is leaning to the approach of not performing
> > relocation at all and use what is proposed by Sang Yan and me during
> > my first approach for this problem.
> 
> The background to that is Pavel's timing requirements: Enabling the MMU isn't enough, from
> his description he also depends on re-arranging the memory so the CPU only sees increasing
> virtual addresses. This is what my 'write streaming' comment refers to.
> Doing this requires rewriting the relocation assembly code.
> 
> If we enable the MMU during kexec relocation, I expect someone on a memory constrained
> system to come out of the woodwork screaming 'regression'. Systems with insufficient
> memory to allocate the page tables will no longer be able to kexec.
> 
> If we keep the relocation assembly code as it is, its possible for it to handle MMU-on and
> MMU-off relocations with a very small adjustment. It just won't work for Pavel, as
> enabling the MMU is not enough.
> 
> I'm confident we won't get a second copy of the relocation code, that only runs on some
> platforms, past the arch code maintainer.
> 
> 
> > However, I have several issues
> > with this take, which if addressed would be OK for me.
> > 1. The newer, more secure kexec syscall kexec_file_load(), which
> > allows to check the IMA integrity of the loaded file does not have a
> > way to specify the area in memory where to place the kernel. We are
> > using this syscall in production, and cannot go back to kexec_load()
> > for security reasons.
> > 2. Reserving memory means wasting memory during run-time. Our machine
> > has only 8G of RAM, and reserving even 128M 
> 
> You're loading a 128M kernel!?
> 
> 
> > for the next kernel is an
> > expensive proposition. Now we start loading the next kernel after some
> > non essential processes are stopped, but before essential processes
> > are stopped for the lowest downtime possible.
> 
> > 3. Disabling relocation means changes in the common code, which I am
> > not sure actually helps any other platform beside ARM64, so I am
> > worried it won't be accepted into upstream.
> 
> I'm happy to post the MMU-enabled series, it needs to be maintainable and solve someone's
> problem.
> 
> To chip away at the rest of Pavel's problem, my suggestions were:
>  * Allocate things in place. If we can allocate any 2MB hugepage, we can place the DTB
>   there and not need to relocate it.
> 
>  * use huge pages more generally in the core code. With the MMU enabled, this might keep
>    the core in write-streaming-mode for longer. (might, because this is very platform
>    specific).
> 
>  * Store the kexec payload in the crashkernel carveout, to eliminate the relocations
>    completely. This is the bit Pavel quoted.
> 
> To expand on the carveout:
> The crashkernel carveout is sized to load the payload, and memory to run the kdump kernel
> entirely from within the carveout. Its obviously bigger than the payload it contains.
> 
> If you load your kexec kernel into the 'memory' part of the carveout, it won't overwrite
> the kdump payload, and it wont require relocation, as its already stored in place. arm64's
> arch code will spot these in-place buffers, and skip the relocation.
> 
> If you kdump even after doing this, the kdump kernel sees the kexec payload as
> uninitialised memory, and will overwrite it.
> 
> If we did this, we wouldn't need to enable the MMU, and it should skip most of the
> relocation code on all architectures without any changes to the arch code.

I'm not very confident about this approach.  

Kdump usually goes with a minimum initramfs, but for kexec reboot the
normal initramfs generated by distribution could be larger, and there
could be some other cases which can cause troubles.

Another way, maybe one can copy the kernel, dtb, initrd etc with the
very early boot code. And the later kernel code just use the kexec_file_load
function to load them, since they are already in kernel buffers so the
relocations are not necessary.

Current kexec_file_load use 'fd' to load file content, we can have some
interface to load from memory buffers.

Anyway this is just a wild idea.

> 
> On arm64 the kernel and initramfs only need relocating because their physical addresses
> are baked into the DTB. The DTB's physical address is then passed to the new kernel.
> From memory: 'relocatable kernel' is detectable from the image header, and the support
> predates arm64's kexec support.
> 
> 
> James
> 

Thanks
Dave


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 1/2] kexec: Add quick kexec support for kernel
@ 2020-08-19 12:37             ` Dave Young
  0 siblings, 0 replies; 25+ messages in thread
From: Dave Young @ 2020-08-19 12:37 UTC (permalink / raw)
  To: James Morse
  Cc: luanjianhai, Pavel Tatashin, xiexiuqi, kexec mailing list, LKML,
	luchunhua, Eric W. Biederman, guohanjun, zhuling8, Sang Yan

On 08/17/20 at 01:14pm, James Morse wrote:
> Hi guys,
> 
> On 14/08/2020 20:22, Pavel Tatashin wrote:
> > On Fri, Aug 14, 2020 at 7:24 AM Dave Young <dyoung@redhat.com> wrote:
> >> On 08/14/20 at 04:21pm, Sang Yan wrote:
> >>> On 08/14/20 14:58, Dave Young wrote:
> >>>> On 08/14/20 at 01:52am, Sang Yan wrote:
> >>> Yes, it's particularly obvious on arm64. I will add it to the patch log,
> >>> and test how long it takes on x86 and other arch.
> 
> Earlier versions of kexec-tools had the in-purgatory checksum enabled unconditionally.
> More recent versions let you disable it, I think the parameter is called no-checks. This
> saves some time, but the relocations still have to be done.
> 
> 
> >>>> About the arm64 problem, I know Pavel Tatashin is working on a patchset
> >>>> to improve the performance with enabling mmu.
> >>>>
> >>>> I added Pavel in cc, can you try his patches?
> >>>>
> >>> Thanks for your tips, I will try these patches. @Pavel.
> >>> Disable mmu after finishing copying pages?
> 
> >>>>> We introduce quick kexec to save time of copying memory as above,
> >>>>> just like kdump(kexec on crash), by using reserved memory
> >>>>> "Quick Kexec".
> >>>>
> >>>> This approach may have gain, but it also introduce extra requirements to
> >>>> pre-reserve a memory region.  I wonder how Eric thinks about the idea.
> >>>>
> >>>> Anyway the "quick" name sounds not very good, I would suggest do not
> >>>> introduce a new param, and the code can check if pre-reserved region
> >>>> exist then use it, if not then fallback to old way.
> >>>>
> >>> aha. I agree with it, but I thought it may change the old behaviors of
> >>> kexec_load.
> >>>
> >>> I will update a new patch without introducing new flags and new params.
> >>
> >> Frankly I'm still not sure it is worth to introduce a new interface if the
> >> improvement can be done in arch code like Pavel is doing.  Can you try
> >> that first?
> 
> > My patches will fix this issue. This is an ARM64 specific problem and
> > I did not see this to be performance problem on x86 during kexec
> > relocation. This happens because on ARM64 relocation is performed with
> > MMU disabled, and when MMU is disabled the caching is disabled as
> > well.
> 
> > I have a patch series that fixes this entirely, but James Morse
> > (+CCed) and I still have not agreed on the final approach. We had an
> > off-list conversation about it, and we need to continue it in public
> > ML.
> > 
> > Here is some history:
> > 
> > This is the original series that I sent a year ago. It basically
> > proposes the same thing as this series from Sang Yan:
> > https://lore.kernel.org/lkml/20190709182014.16052-1-pasha.tatashin@soleen.com/
> > 
> > Once, I realized that with enabling MMU the relocation is issue is
> > gone completely, I sent a new series, and this is the latest version
> > of that series:
> > https://lore.kernel.org/lkml/20200326032420.27220-1-pasha.tatashin@soleen.com/
> > 
> > It has been tested in production, and several people from different
> > companies commented to me that they are using it as well.
> > 
> > After my patch series was sent out, James created a new branch in his
> > tree with his approach of enabling MMU without having a new VA space,
> > but instead re-use what the kernel  has now. I have not tested that
> > branch yet.
> 
> For context, that is here:
> http://www.linux-arm.org/git?p=linux-jm.git;a=shortlog;h=refs/heads/kexec%2Bmmu/v0
> 
> I think we can maintain this approach, but it doesn't work for Pavel, as he has extra
> requirements. I stopped looking at it because it became a solution no-one needed.
> 
> 
> > Here are some comments from James Morse and the off-list discussion we had:
> > -------
> > It sounds like you are depending on write streaming mode to meet your
> > target performance.
> > This isn't even CPU specific, its cache and firmware configuration specific!
> > I don't think we should optimise a general purpose operating system
> > based on things like this.
> > ..
> > I think the best approach is going to be to eliminate the relocations entirely.> ...
> > I'm afraid I view this kexec-map thing as high-risk duct-tape over the
> > kexec core code
> > deliberately scattering the kexec payload.
> > I'd prefer any approach that causes the payload to be stored in-place
> > from the beginning
> > as that benefits other architectures too.
> > -------
> 
> The 'eliminate relocations' comment goes with some of the context you removed.
> 
> 
> > It appears James is leaning to the approach of not performing
> > relocation at all and use what is proposed by Sang Yan and me during
> > my first approach for this problem.
> 
> The background to that is Pavel's timing requirements: Enabling the MMU isn't enough, from
> his description he also depends on re-arranging the memory so the CPU only sees increasing
> virtual addresses. This is what my 'write streaming' comment refers to.
> Doing this requires rewriting the relocation assembly code.
> 
> If we enable the MMU during kexec relocation, I expect someone on a memory constrained
> system to come out of the woodwork screaming 'regression'. Systems with insufficient
> memory to allocate the page tables will no longer be able to kexec.
> 
> If we keep the relocation assembly code as it is, its possible for it to handle MMU-on and
> MMU-off relocations with a very small adjustment. It just won't work for Pavel, as
> enabling the MMU is not enough.
> 
> I'm confident we won't get a second copy of the relocation code, that only runs on some
> platforms, past the arch code maintainer.
> 
> 
> > However, I have several issues
> > with this take, which if addressed would be OK for me.
> > 1. The newer, more secure kexec syscall kexec_file_load(), which
> > allows to check the IMA integrity of the loaded file does not have a
> > way to specify the area in memory where to place the kernel. We are
> > using this syscall in production, and cannot go back to kexec_load()
> > for security reasons.
> > 2. Reserving memory means wasting memory during run-time. Our machine
> > has only 8G of RAM, and reserving even 128M 
> 
> You're loading a 128M kernel!?
> 
> 
> > for the next kernel is an
> > expensive proposition. Now we start loading the next kernel after some
> > non essential processes are stopped, but before essential processes
> > are stopped for the lowest downtime possible.
> 
> > 3. Disabling relocation means changes in the common code, which I am
> > not sure actually helps any other platform beside ARM64, so I am
> > worried it won't be accepted into upstream.
> 
> I'm happy to post the MMU-enabled series, it needs to be maintainable and solve someone's
> problem.
> 
> To chip away at the rest of Pavel's problem, my suggestions were:
>  * Allocate things in place. If we can allocate any 2MB hugepage, we can place the DTB
>   there and not need to relocate it.
> 
>  * use huge pages more generally in the core code. With the MMU enabled, this might keep
>    the core in write-streaming-mode for longer. (might, because this is very platform
>    specific).
> 
>  * Store the kexec payload in the crashkernel carveout, to eliminate the relocations
>    completely. This is the bit Pavel quoted.
> 
> To expand on the carveout:
> The crashkernel carveout is sized to load the payload, and memory to run the kdump kernel
> entirely from within the carveout. Its obviously bigger than the payload it contains.
> 
> If you load your kexec kernel into the 'memory' part of the carveout, it won't overwrite
> the kdump payload, and it wont require relocation, as its already stored in place. arm64's
> arch code will spot these in-place buffers, and skip the relocation.
> 
> If you kdump even after doing this, the kdump kernel sees the kexec payload as
> uninitialised memory, and will overwrite it.
> 
> If we did this, we wouldn't need to enable the MMU, and it should skip most of the
> relocation code on all architectures without any changes to the arch code.

I'm not very confident about this approach.  

Kdump usually goes with a minimum initramfs, but for kexec reboot the
normal initramfs generated by distribution could be larger, and there
could be some other cases which can cause troubles.

Another way, maybe one can copy the kernel, dtb, initrd etc with the
very early boot code. And the later kernel code just use the kexec_file_load
function to load them, since they are already in kernel buffers so the
relocations are not necessary.

Current kexec_file_load use 'fd' to load file content, we can have some
interface to load from memory buffers.

Anyway this is just a wild idea.

> 
> On arm64 the kernel and initramfs only need relocating because their physical addresses
> are baked into the DTB. The DTB's physical address is then passed to the new kernel.
> From memory: 'relocatable kernel' is detectable from the image header, and the support
> predates arm64's kexec support.
> 
> 
> James
> 

Thanks
Dave


_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2020-08-19 12:37 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-08-14  5:52 [PATCH 1/2] kexec: Add quick kexec support for kernel Sang Yan
2020-08-14  5:52 ` Sang Yan
2020-08-14  5:52 ` [PATCH 2/2] arm64: Reserve memory for quick kexec Sang Yan
2020-08-14  5:52   ` Sang Yan
2020-08-16  4:11   ` kernel test robot
2020-08-16  4:11     ` kernel test robot
2020-08-16  4:11     ` kernel test robot
2020-08-14  6:58 ` [PATCH 1/2] kexec: Add quick kexec support for kernel Dave Young
2020-08-14  6:58   ` Dave Young
2020-08-14  8:21   ` Sang Yan
2020-08-14  8:21     ` Sang Yan
2020-08-14 11:24     ` Dave Young
2020-08-14 11:24       ` Dave Young
2020-08-14 19:22       ` Pavel Tatashin
2020-08-14 19:22         ` Pavel Tatashin
2020-08-17 12:14         ` James Morse
2020-08-17 12:14           ` James Morse
2020-08-19 12:37           ` Dave Young
2020-08-19 12:37             ` Dave Young
2020-08-14 15:17 ` Eric W. Biederman
2020-08-14 15:17   ` Eric W. Biederman
2020-08-17 13:42 ` Pavel Machek
2020-08-17 13:42   ` Pavel Machek
2020-08-18  6:49   ` Sang Yan
2020-08-18  6:49     ` Sang Yan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.