linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] x86/purgatory: provide config to disable purgatory
@ 2021-11-23 15:05 Usama Arif
  2021-11-23 15:24 ` Borislav Petkov
  0 siblings, 1 reply; 9+ messages in thread
From: Usama Arif @ 2021-11-23 15:05 UTC (permalink / raw)
  To: x86, linux-kernel, vgoyal, tglx; +Cc: fam.zheng, Usama Arif

This can help in reducing boot time if purgatory is not needed
as the sha256 digest of kexec segments is no longer calculated
or verified if the config is disabled.

Signed-off-by: Usama Arif <usama.arif@bytedance.com>
Reviewed-by: Fam Zheng <fam.zheng@bytedance.com>
---
 arch/powerpc/Kbuild               |  2 +-
 arch/powerpc/Kconfig              |  2 +-
 arch/s390/Kbuild                  |  2 +-
 arch/s390/Kconfig                 |  2 +-
 arch/s390/purgatory/Makefile      |  2 +-
 arch/x86/Kbuild                   |  2 +-
 arch/x86/Kconfig                  |  6 ++--
 arch/x86/kernel/kexec-bzimage64.c | 59 +++++++++++++++++++++------------------
 arch/x86/purgatory/Makefile       |  2 +-
 kernel/kexec_file.c               |  6 ++--
 10 files changed, 46 insertions(+), 39 deletions(-)

diff --git a/arch/powerpc/Kbuild b/arch/powerpc/Kbuild
index 22cd0d55a892..072e62d7898e 100644
--- a/arch/powerpc/Kbuild
+++ b/arch/powerpc/Kbuild
@@ -15,7 +15,7 @@ obj-$(CONFIG_KVM)  += kvm/
 
 obj-$(CONFIG_PERF_EVENTS) += perf/
 obj-$(CONFIG_KEXEC_CORE)  += kexec/
-obj-$(CONFIG_KEXEC_FILE)  += purgatory/
+obj-$(CONFIG_KEXEC_PURGATORY)  += purgatory/
 
 # for cleaning
 subdir- += boot
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index dea74d7717c0..58bdfd1abb44 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -557,7 +557,7 @@ config KEXEC_FILE
 	  for kernel and initramfs as opposed to a list of segments as is the
 	  case for the older kexec call.
 
-config ARCH_HAS_KEXEC_PURGATORY
+config KEXEC_PURGATORY
 	def_bool KEXEC_FILE
 
 config RELOCATABLE
diff --git a/arch/s390/Kbuild b/arch/s390/Kbuild
index 76e362277179..2ed4ee5cdf59 100644
--- a/arch/s390/Kbuild
+++ b/arch/s390/Kbuild
@@ -7,7 +7,7 @@ obj-$(CONFIG_S390_HYPFS_FS)	+= hypfs/
 obj-$(CONFIG_APPLDATA_BASE)	+= appldata/
 obj-y				+= net/
 obj-$(CONFIG_PCI)		+= pci/
-obj-$(CONFIG_ARCH_HAS_KEXEC_PURGATORY) += purgatory/
+obj-$(CONFIG_KEXEC_PURGATORY) += purgatory/
 
 # for cleaning
 subdir- += boot tools
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 2a5bb4f29cfe..d15bdaa0e198 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -538,7 +538,7 @@ config KEXEC_FILE
 	  kexec system call this system call takes file descriptors for the
 	  kernel and initramfs as arguments.
 
-config ARCH_HAS_KEXEC_PURGATORY
+config KEXEC_PURGATORY
 	def_bool y
 	depends on KEXEC_FILE
 
diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile
index 360ada80d20c..03cac6d7310a 100644
--- a/arch/s390/purgatory/Makefile
+++ b/arch/s390/purgatory/Makefile
@@ -51,4 +51,4 @@ $(obj)/purgatory.ro: $(obj)/purgatory $(obj)/purgatory.chk FORCE
 $(obj)/kexec-purgatory.o: $(obj)/kexec-purgatory.S $(obj)/purgatory.ro FORCE
 	$(call if_changed_rule,as_o_S)
 
-obj-$(CONFIG_ARCH_HAS_KEXEC_PURGATORY) += kexec-purgatory.o
+obj-$(CONFIG_KEXEC_PURGATORY) += kexec-purgatory.o
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index f384cb1a4f7a..9089438ed6d8 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -24,7 +24,7 @@ obj-$(CONFIG_IA32_EMULATION) += ia32/
 obj-y += platform/
 obj-y += net/
 
-obj-$(CONFIG_KEXEC_FILE) += purgatory/
+obj-$(CONFIG_KEXEC_PURGATORY) += purgatory/
 
 # for cleaning
 subdir- += boot tools
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7399327d1eff..7efe6dbfdc67 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2000,8 +2000,10 @@ config KEXEC_FILE
 	  for kernel and initramfs as opposed to list of segments as
 	  accepted by previous system call.
 
-config ARCH_HAS_KEXEC_PURGATORY
-	def_bool KEXEC_FILE
+config KEXEC_PURGATORY
+	bool "A standalone relocatable object run between the 2 kernels during kexec"
+	depends on KEXEC_FILE
+	default y
 
 config KEXEC_SIG
 	bool "Verify kernel signature during kexec_file_load() syscall"
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 170d0fd68b1f..bf37a2c4ab8b 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -374,18 +374,19 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
 			return ERR_PTR(ret);
 	}
 
-	/*
-	 * Load purgatory. For 64bit entry point, purgatory  code can be
-	 * anywhere.
-	 */
-	ret = kexec_load_purgatory(image, &pbuf);
-	if (ret) {
-		pr_err("Loading purgatory failed\n");
-		return ERR_PTR(ret);
-	}
-
-	pr_debug("Loaded purgatory at 0x%lx\n", pbuf.mem);
+	if (IS_ENABLED(CONFIG_KEXEC_PURGATORY)) {
+		/*
+		 * Load purgatory. For 64bit entry point, purgatory  code can be
+		 * anywhere.
+		 */
+		ret = kexec_load_purgatory(image, &pbuf);
+		if (ret) {
+			pr_err("Loading purgatory failed\n");
+			return ERR_PTR(ret);
+		}
 
+		pr_debug("Loaded purgatory at 0x%lx\n", pbuf.mem);
+	}
 
 	/*
 	 * Load Bootparams and cmdline and space for efi stuff.
@@ -466,28 +467,32 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
 	params->hdr.type_of_loader = 0x0D << 4;
 	params->hdr.loadflags = 0;
 
-	/* Setup purgatory regs for entry */
-	ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64,
-					     sizeof(regs64), 1);
-	if (ret)
-		goto out_free_params;
+	if (IS_ENABLED(CONFIG_KEXEC_PURGATORY)) {
+		/* Setup purgatory regs for entry */
+		ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64,
+							sizeof(regs64), 1);
+		if (ret)
+			goto out_free_params;
+	}
 
 	regs64.rbx = 0; /* Bootstrap Processor */
 	regs64.rsi = bootparam_load_addr;
 	regs64.rip = kernel_load_addr + 0x200;
-	stack = kexec_purgatory_get_symbol_addr(image, "stack_end");
-	if (IS_ERR(stack)) {
-		pr_err("Could not find address of symbol stack_end\n");
-		ret = -EINVAL;
-		goto out_free_params;
-	}
 
-	regs64.rsp = (unsigned long)stack;
-	ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64,
-					     sizeof(regs64), 0);
-	if (ret)
-		goto out_free_params;
+	if (IS_ENABLED(CONFIG_KEXEC_PURGATORY)) {
+		stack = kexec_purgatory_get_symbol_addr(image, "stack_end");
+		if (IS_ERR(stack)) {
+			pr_err("Could not find address of symbol stack_end\n");
+			ret = -EINVAL;
+			goto out_free_params;
+		}
 
+		regs64.rsp = (unsigned long)stack;
+		ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64,
+							sizeof(regs64), 0);
+		if (ret)
+			goto out_free_params;
+	}
 	ret = setup_boot_parameters(image, params, bootparam_load_addr,
 				    efi_map_offset, efi_map_sz,
 				    efi_setup_data_offset);
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index 95ea17a9d20c..688b3f21be8f 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -81,4 +81,4 @@ quiet_cmd_bin2c = BIN2C   $@
 $(obj)/kexec-purgatory.c: $(obj)/purgatory.ro $(obj)/purgatory.chk FORCE
 	$(call if_changed,bin2c)
 
-obj-$(CONFIG_KEXEC_FILE)	+= kexec-purgatory.o
+obj-$(CONFIG_KEXEC_PURGATORY)	+= kexec-purgatory.o
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 8347fc158d2b..e5f4c2d27249 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -724,7 +724,7 @@ static int kexec_calculate_store_digests(struct kimage *image)
 	struct kexec_sha_region *sha_regions;
 	struct purgatory_info *pi = &image->purgatory_info;
 
-	if (!IS_ENABLED(CONFIG_ARCH_HAS_KEXEC_PURGATORY))
+	if (!IS_ENABLED(CONFIG_KEXEC_PURGATORY))
 		return 0;
 
 	zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
@@ -829,7 +829,7 @@ static int kexec_calculate_store_digests(struct kimage *image)
 	return ret;
 }
 
-#ifdef CONFIG_ARCH_HAS_KEXEC_PURGATORY
+#ifdef CONFIG_KEXEC_PURGATORY
 /*
  * kexec_purgatory_setup_kbuf - prepare buffer to load purgatory.
  * @pi:		Purgatory to be loaded.
@@ -1176,7 +1176,7 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
 
 	return 0;
 }
-#endif /* CONFIG_ARCH_HAS_KEXEC_PURGATORY */
+#endif /* CONFIG_KEXEC_PURGATORY */
 
 int crash_exclude_mem_range(struct crash_mem *mem,
 			    unsigned long long mstart, unsigned long long mend)
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] x86/purgatory: provide config to disable purgatory
  2021-11-23 15:05 [PATCH] x86/purgatory: provide config to disable purgatory Usama Arif
@ 2021-11-23 15:24 ` Borislav Petkov
  2021-11-29 14:04   ` Vivek Goyal
  0 siblings, 1 reply; 9+ messages in thread
From: Borislav Petkov @ 2021-11-23 15:24 UTC (permalink / raw)
  To: Usama Arif; +Cc: x86, linux-kernel, vgoyal, tglx, fam.zheng

On Tue, Nov 23, 2021 at 03:05:08PM +0000, Usama Arif wrote:
> This can help in reducing boot time if purgatory is not needed
> as the sha256 digest of kexec segments is no longer calculated
> or verified if the config is disabled.

I'd prefer a commit message to say:

"Disable purgatory because of real-life use case X. With it disabled,
booting a second kernel is sped up by Y."

"Just because" and adding yet another config option is not worth the
effort, otherwise.

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] x86/purgatory: provide config to disable purgatory
  2021-11-23 15:24 ` Borislav Petkov
@ 2021-11-29 14:04   ` Vivek Goyal
       [not found]     ` <79517d3c-3674-cc21-fbdc-b26946809756@bytedance.com>
  0 siblings, 1 reply; 9+ messages in thread
From: Vivek Goyal @ 2021-11-29 14:04 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Usama Arif, x86, linux-kernel, tglx, fam.zheng,
	Eric W. Biederman, Baoquan He, Dave Young

On Tue, Nov 23, 2021 at 04:24:01PM +0100, Borislav Petkov wrote:
> On Tue, Nov 23, 2021 at 03:05:08PM +0000, Usama Arif wrote:
> > This can help in reducing boot time if purgatory is not needed
> > as the sha256 digest of kexec segments is no longer calculated
> > or verified if the config is disabled.
> 
> I'd prefer a commit message to say:
> 
> "Disable purgatory because of real-life use case X. With it disabled,
> booting a second kernel is sped up by Y."
> 
> "Just because" and adding yet another config option is not worth the
> effort, otherwise.

Agreed. What's the use case.

Also this cheksum is used to make sure purgatory is not corrupted. So
this is sort of saftey mechanism to make sure things are still the
same as we expected before we start executing this piece of code. Hence
this does not sound like an optional feature to me (even if it speeds
up things a bit).

BTW, how much speed up do you see.

Thanks
Vivek


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [External] Re: [PATCH] x86/purgatory: provide config to disable purgatory
       [not found]     ` <79517d3c-3674-cc21-fbdc-b26946809756@bytedance.com>
@ 2021-11-29 16:53       ` Eric W. Biederman
  2021-12-01 10:29         ` Usama Arif
  0 siblings, 1 reply; 9+ messages in thread
From: Eric W. Biederman @ 2021-11-29 16:53 UTC (permalink / raw)
  To: Usama Arif
  Cc: Vivek Goyal, Borislav Petkov, x86, linux-kernel, tglx, fam.zheng,
	Baoquan He, Dave Young

Usama Arif <usama.arif@bytedance.com> writes:

> Hi,
>
> Thanks for your replies. I have submitted a v2 of the patch with a
> much more detailed commit message including reason for the patch and timing values.
>
> The time taken from reboot to running init process was measured
> with both purgatory enabled and disabled over 20 runs and the
> averages are:
> Purgatory disabled:
> - TSC = 3908766161 cycles
> - ktime = 606.8 ms
> Purgatory enabled:
> - TSC = 5005811885 cycles (28.1% worse)
> - ktime = 843.1 ms (38.9% worse)
>
>
> Our reason for this patch is that it helps reduce the downtime of servers when
> the host kernel managing multiple VMs needs to be updated via kexec,
> but it makes reboot with kexec much faster so should be a general improvement in
> boot time if purgatory is not needed and could have other usecases as well.
> I believe only x86, powerpc and s390 have purgatory supported, other platforms
> like arm64 dont have it implemented yet, so with the reboot time improvement seen,
> it would be a good idea to have the option to disable purgatory completely but set default to y.
> We also have the CONFIG_KEXEC_BZIMAGE_VERIFY_SIG which can be enabled to verify the next
> kernel image to be booted and purgatory can be completely skipped if
> not required.

CONFIG_KEXEC_BZIMAGE_VERIFY_SIG is something totally and completely
different.  It's job is to verify that the kernel to be booted comes
from a trusted source.   The sha256 verification in purgatory's job
is to verify that memory the kernel cares about was not corrupted
during the kexec process.

I believe when you say purgatory you are really talking about that
sha256 checksum.  It really is not possible to disable all of
the code that runs between kernels, as the old and the new kernel may
run at the same addresses.  Anything that runs between the two kernels
is what is referred to as purgatory.  Even if it is just a small
assembly stub.

That sha256 verification is always needed for kexec on panic, there are
by the nature of a kernel panic too many unknowns to have any confidence
the new kernel will not be corrupted in the process of kexec before it
gets started.

For an ordinary kexec it might be possible to say that you have a
reliable kernel shutdown process and you know for a fact that something
won't come along and corrupt the kernel.  I find that a questionable
assertion.  I haven't seen anyone yet whose focus when getting an
ordinary kexec to work as anything other than making certain all of the
drivers are shutdown properly.

I have seen countless times when a network packet comes in a the wrong
time and the target kernel's memory is corrupted before it gets far
enough to initialize the network driver.

For a 0.2s speed up you are talking about disabling all of the safety
checks in a very dangerous situation.  How much can you can in
performance by optimizing the sha256 implementation instead of using
what is essentially a reference implementation in basic C that I copied
from somewhere long ago.

Optimize the sha256 implementation and the memory copy loop and then
show how the tiny bit of time that is left is on a mission critical path
and must be removed.  Then we can reasonably talk about a config option
for disabling the sha256 implementation in the kexec in not-panic case.

That sha256 implementation in part so that we can all sleep at night
because we don't have to deal with very very strange heizenbugs.

Eric


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [External] Re: [PATCH] x86/purgatory: provide config to disable purgatory
  2021-11-29 16:53       ` [External] " Eric W. Biederman
@ 2021-12-01 10:29         ` Usama Arif
  2021-12-06 16:51           ` Usama Arif
  0 siblings, 1 reply; 9+ messages in thread
From: Usama Arif @ 2021-12-01 10:29 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Vivek Goyal, Borislav Petkov, x86, linux-kernel, tglx, fam.zheng,
	Baoquan He, Dave Young

Hi,

(Resending the reply as my email client had updated and inserted html 
code and caused a bounceback from the mailing list, sorry about that.)

Thanks for your reply, I have responded with further comments/questions 
inline below, and also have provided some context for the patch at the 
start:

The patch is not introducing a new CONFIG option, as can be seen in the 
v2 patch diff. It is converting an existing CONFIG option that only 
enabled purgatory for specific architectures in which it has been 
implemented (x86, powerpc and s390) to an option that can that allow 
purgatory to be disabled (with default enabled) and only provides code 
to disable purgatory for x86 only. From what i see, purgatory is 
currently not yet implemented in other architectures like arm64 
kexec_file case and riscv, so this would on a high level,
provide only the option to make kexec on x86 similar to other the other 
architectures.

As a background to the discussion, the usecase we are aiming is to 
update the host kernel with kexec in servers which is managing multiple 
VMs, to cut down the downtime of servers as much as possible so that its 
not noticeable to VMs. The patch is aimed at x86 purgatory code 
specifically. We are targetting other optimizations as well in other 
areas in boot path to cut down the 600ms time much further, so that cut 
down of 200ms downtime is significant in the usecase described. I did 
have a few comments/questions about your reply, please see my responses 
below:


On 29/11/2021 16:53, Eric W. Biederman wrote:
> Usama Arif <usama.arif@bytedance.com> writes:
> 
>> Hi,
>>
>> Thanks for your replies. I have submitted a v2 of the patch with a
>> much more detailed commit message including reason for the patch and timing values.
>>
>> The time taken from reboot to running init process was measured
>> with both purgatory enabled and disabled over 20 runs and the
>> averages are:
>> Purgatory disabled:
>> - TSC = 3908766161 cycles
>> - ktime = 606.8 ms
>> Purgatory enabled:
>> - TSC = 5005811885 cycles (28.1% worse)
>> - ktime = 843.1 ms (38.9% worse)
>>
>>
>> Our reason for this patch is that it helps reduce the downtime of servers when
>> the host kernel managing multiple VMs needs to be updated via kexec,
>> but it makes reboot with kexec much faster so should be a general improvement in
>> boot time if purgatory is not needed and could have other usecases as well.
>> I believe only x86, powerpc and s390 have purgatory supported, other platforms
>> like arm64 dont have it implemented yet, so with the reboot time improvement seen,
>> it would be a good idea to have the option to disable purgatory completely but set default to y.
>> We also have the CONFIG_KEXEC_BZIMAGE_VERIFY_SIG which can be enabled to verify the next
>> kernel image to be booted and purgatory can be completely skipped if
>> not required.
> 
> CONFIG_KEXEC_BZIMAGE_VERIFY_SIG is something totally and completely
> different.  It's job is to verify that the kernel to be booted comes
> from a trusted source.   The sha256 verification in purgatory's job
> is to verify that memory the kernel cares about was not corrupted
> during the kexec process.
> 

Thanks, acknowledged.

> I believe when you say purgatory you are really talking about that
> sha256 checksum.  It really is not possible to disable all of
> the code that runs between kernels, as the old and the new kernel may
> run at the same addresses.  Anything that runs between the two kernels
> is what is referred to as purgatory.  Even if it is just a small
> assembly stub.
> 
 >

With this patch, i am trying to give an option (with default purgatory 
enabled)to disable purgatory completely on x86 only, i.e. no code 
running between 2 kernels on x86. From my understanding there is no 
purgatory in arm64 kexec_file case, in riscv and in some other archs as 
well, so I am not sure why its not possible to disable purgatory (all 
code running between 2 kernels) in x86? Unless i misunderstood something 
about the working of other platforms? In x86 case, from what i see 
relocate_kernel is still part of the older kernel and not purgatory, and 
with my patch, purgatory.ro is not built and kexec does execute 
successfully with purgatory disabled.

About your point for the old and the new kernel may run at the same 
addresses,i dont think that can happen as in bzImage64_load function its 
loaded using the kbuf struct. This doesnt happen in other architectures 
(for e.g. arm64) that dont implement purgatory and any of the tests I 
conducted with the patch applied due to the use of kbuf struct.

 From what i see in the code, purgatory in x86 specifically, has 2 main 
functions, sha256 verification and loading the %rsi register for 
bootparam_load_addr. In this patch purgatory was disabled, which 
resulted in sha256 verification disabled and bootparam_load_addr moved 
to relocate_kernel. Our analysis revealed that most of the time is spent 
in the sha256 verification, so I would be ok to reformat the patch
to make the sha verification optional and keep purgatory enabled if 
thats preferred? Although in my opinion the current patch of only 
providing an option to disable purgatory seems much better.


> That sha256 verification is always needed for kexec on panic, there are
> by the nature of a kernel panic too many unknowns to have any confidence
> the new kernel will not be corrupted in the process of kexec before it
> gets started.
> 
> For an ordinary kexec it might be possible to say that you have a
> reliable kernel shutdown process and you know for a fact that something
> won't come along and corrupt the kernel.  I find that a questionable
> assertion.  I haven't seen anyone yet whose focus when getting an
> ordinary kexec to work as anything other than making certain all of the
> drivers are shutdown properly.
> 
> I have seen countless times when a network packet comes in a the wrong
> time and the target kernel's memory is corrupted before it gets far
> enough to initialize the network driver. >

I agree that when doing testing and research, there are things that can
go wrong during kexec process, but i assume that the expectation is that
in production environment, when for e.g. updating a kernel with kexec in 
servers,the expectation is that kexec will execute successfully as long 
as there is nothing wrong in the old kernel and the new kernel.

Maybe I didn't understand this correctly, but if a network packet comes 
in and corrupts the kernel memory, wont it also cause a problem even 
when purgatory is enabled? I agree that issue like these would be caught 
earlier with purgatory, but then if something like this breaks the kexec 
process in a production environment where the old and new kernel are 
well tested, trusted and expected to work, wouldn't there be a much more 
fundamental issue with the reliability of the current kexec process.

> For a 0.2s speed up you are talking about disabling all of the safety
> checks in a very dangerous situation.  How much can you can in
> performance by optimizing the sha256 implementation instead of using
> what is essentially a reference implementation in basic C that I copied
> from somewhere long ago.
> 
> Optimize the sha256 implementation and the memory copy loop and then
> show how the tiny bit of time that is left is on a mission critical path
> and must be removed.  Then we can reasonably talk about a config option
> for disabling the sha256 implementation in the kexec in not-panic case.
> 
Thanks for this, I assume that over here you are suggesting for e.g. in 
x86 to replace the existing sha verification implementation in purgatory 
with the one in arch/x86/crypto/sha256_ssse3_glue.c? Also could you 
point to the memory copy loop to optimize? I can have a look at these 2 
options.

Even with these optimizations, i think the v2 patch should still be 
considered. The patch in the end is providing only an option to disable 
purgatory on x86 only, with the default value of keeping it enabled and 
not changing kexec behaviour. The CONFIG already existed, it is just 
renamed and now provides user the option to select to disable purgatory 
for x86, rather than it being architecture dependent.

Thanks again for the review and comments,
Usama


> That sha256 implementation in part so that we can all sleep at night
> because we don't have to deal with very very strange heizenbugs.
> 
> Eric
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [External] Re: [PATCH] x86/purgatory: provide config to disable purgatory
  2021-12-01 10:29         ` Usama Arif
@ 2021-12-06 16:51           ` Usama Arif
  2021-12-16 18:05             ` Usama Arif
  0 siblings, 1 reply; 9+ messages in thread
From: Usama Arif @ 2021-12-06 16:51 UTC (permalink / raw)
  To: Eric W. Biederman, Vivek Goyal
  Cc: Borislav Petkov, x86, linux-kernel, tglx, fam.zheng, Baoquan He,
	Dave Young

Hi,

I have sent a v3 of the patch with a much clearer commit message, please
let me know if there are any comments for the v3 patch or my responses 
below.

Thanks,
Usama

On 01/12/2021 10:29, Usama Arif wrote:
> Hi,
> 
> (Resending the reply as my email client had updated and inserted html 
> code and caused a bounceback from the mailing list, sorry about that.)
> 
> Thanks for your reply, I have responded with further comments/questions 
> inline below, and also have provided some context for the patch at the 
> start:
> 
> The patch is not introducing a new CONFIG option, as can be seen in the 
> v2 patch diff. It is converting an existing CONFIG option that only 
> enabled purgatory for specific architectures in which it has been 
> implemented (x86, powerpc and s390) to an option that can that allow 
> purgatory to be disabled (with default enabled) and only provides code 
> to disable purgatory for x86 only. From what i see, purgatory is 
> currently not yet implemented in other architectures like arm64 
> kexec_file case and riscv, so this would on a high level,
> provide only the option to make kexec on x86 similar to other the other 
> architectures.
> 
> As a background to the discussion, the usecase we are aiming is to 
> update the host kernel with kexec in servers which is managing multiple 
> VMs, to cut down the downtime of servers as much as possible so that its 
> not noticeable to VMs. The patch is aimed at x86 purgatory code 
> specifically. We are targetting other optimizations as well in other 
> areas in boot path to cut down the 600ms time much further, so that cut 
> down of 200ms downtime is significant in the usecase described. I did 
> have a few comments/questions about your reply, please see my responses 
> below:
> 
> 
> On 29/11/2021 16:53, Eric W. Biederman wrote:
>> Usama Arif <usama.arif@bytedance.com> writes:
>>
>>> Hi,
>>>
>>> Thanks for your replies. I have submitted a v2 of the patch with a
>>> much more detailed commit message including reason for the patch and 
>>> timing values.
>>>
>>> The time taken from reboot to running init process was measured
>>> with both purgatory enabled and disabled over 20 runs and the
>>> averages are:
>>> Purgatory disabled:
>>> - TSC = 3908766161 cycles
>>> - ktime = 606.8 ms
>>> Purgatory enabled:
>>> - TSC = 5005811885 cycles (28.1% worse)
>>> - ktime = 843.1 ms (38.9% worse)
>>>
>>>
>>> Our reason for this patch is that it helps reduce the downtime of 
>>> servers when
>>> the host kernel managing multiple VMs needs to be updated via kexec,
>>> but it makes reboot with kexec much faster so should be a general 
>>> improvement in
>>> boot time if purgatory is not needed and could have other usecases as 
>>> well.
>>> I believe only x86, powerpc and s390 have purgatory supported, other 
>>> platforms
>>> like arm64 dont have it implemented yet, so with the reboot time 
>>> improvement seen,
>>> it would be a good idea to have the option to disable purgatory 
>>> completely but set default to y.
>>> We also have the CONFIG_KEXEC_BZIMAGE_VERIFY_SIG which can be enabled 
>>> to verify the next
>>> kernel image to be booted and purgatory can be completely skipped if
>>> not required.
>>
>> CONFIG_KEXEC_BZIMAGE_VERIFY_SIG is something totally and completely
>> different.  It's job is to verify that the kernel to be booted comes
>> from a trusted source.   The sha256 verification in purgatory's job
>> is to verify that memory the kernel cares about was not corrupted
>> during the kexec process.
>>
> 
> Thanks, acknowledged.
> 
>> I believe when you say purgatory you are really talking about that
>> sha256 checksum.  It really is not possible to disable all of
>> the code that runs between kernels, as the old and the new kernel may
>> run at the same addresses.  Anything that runs between the two kernels
>> is what is referred to as purgatory.  Even if it is just a small
>> assembly stub.
>>
>  >
> 
> With this patch, i am trying to give an option (with default purgatory 
> enabled)to disable purgatory completely on x86 only, i.e. no code 
> running between 2 kernels on x86. From my understanding there is no 
> purgatory in arm64 kexec_file case, in riscv and in some other archs as 
> well, so I am not sure why its not possible to disable purgatory (all 
> code running between 2 kernels) in x86? Unless i misunderstood something 
> about the working of other platforms? In x86 case, from what i see 
> relocate_kernel is still part of the older kernel and not purgatory, and 
> with my patch, purgatory.ro is not built and kexec does execute 
> successfully with purgatory disabled.
> 
> About your point for the old and the new kernel may run at the same 
> addresses,i dont think that can happen as in bzImage64_load function its 
> loaded using the kbuf struct. This doesnt happen in other architectures 
> (for e.g. arm64) that dont implement purgatory and any of the tests I 
> conducted with the patch applied due to the use of kbuf struct.
> 
>  From what i see in the code, purgatory in x86 specifically, has 2 main 
> functions, sha256 verification and loading the %rsi register for 
> bootparam_load_addr. In this patch purgatory was disabled, which 
> resulted in sha256 verification disabled and bootparam_load_addr moved 
> to relocate_kernel. Our analysis revealed that most of the time is spent 
> in the sha256 verification, so I would be ok to reformat the patch
> to make the sha verification optional and keep purgatory enabled if 
> thats preferred? Although in my opinion the current patch of only 
> providing an option to disable purgatory seems much better.
> 
> 
>> That sha256 verification is always needed for kexec on panic, there are
>> by the nature of a kernel panic too many unknowns to have any confidence
>> the new kernel will not be corrupted in the process of kexec before it
>> gets started.
>>
>> For an ordinary kexec it might be possible to say that you have a
>> reliable kernel shutdown process and you know for a fact that something
>> won't come along and corrupt the kernel.  I find that a questionable
>> assertion.  I haven't seen anyone yet whose focus when getting an
>> ordinary kexec to work as anything other than making certain all of the
>> drivers are shutdown properly.
>>
>> I have seen countless times when a network packet comes in a the wrong
>> time and the target kernel's memory is corrupted before it gets far
>> enough to initialize the network driver. >
> 
> I agree that when doing testing and research, there are things that can
> go wrong during kexec process, but i assume that the expectation is that
> in production environment, when for e.g. updating a kernel with kexec in 
> servers,the expectation is that kexec will execute successfully as long 
> as there is nothing wrong in the old kernel and the new kernel.
> 
> Maybe I didn't understand this correctly, but if a network packet comes 
> in and corrupts the kernel memory, wont it also cause a problem even 
> when purgatory is enabled? I agree that issue like these would be caught 
> earlier with purgatory, but then if something like this breaks the kexec 
> process in a production environment where the old and new kernel are 
> well tested, trusted and expected to work, wouldn't there be a much more 
> fundamental issue with the reliability of the current kexec process.
> 
>> For a 0.2s speed up you are talking about disabling all of the safety
>> checks in a very dangerous situation.  How much can you can in
>> performance by optimizing the sha256 implementation instead of using
>> what is essentially a reference implementation in basic C that I copied
>> from somewhere long ago.
>>
>> Optimize the sha256 implementation and the memory copy loop and then
>> show how the tiny bit of time that is left is on a mission critical path
>> and must be removed.  Then we can reasonably talk about a config option
>> for disabling the sha256 implementation in the kexec in not-panic case.
>>
> Thanks for this, I assume that over here you are suggesting for e.g. in 
> x86 to replace the existing sha verification implementation in purgatory 
> with the one in arch/x86/crypto/sha256_ssse3_glue.c? Also could you 
> point to the memory copy loop to optimize? I can have a look at these 2 
> options.
> 
> Even with these optimizations, i think the v2 patch should still be 
> considered. The patch in the end is providing only an option to disable 
> purgatory on x86 only, with the default value of keeping it enabled and 
> not changing kexec behaviour. The CONFIG already existed, it is just 
> renamed and now provides user the option to select to disable purgatory 
> for x86, rather than it being architecture dependent.
> 
> Thanks again for the review and comments,
> Usama
> 
> 
>> That sha256 implementation in part so that we can all sleep at night
>> because we don't have to deal with very very strange heizenbugs.
>>
>> Eric
>>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] x86/purgatory: provide config to disable purgatory
  2021-12-06 16:51           ` Usama Arif
@ 2021-12-16 18:05             ` Usama Arif
  2021-12-17  0:56               ` Baoquan He
  0 siblings, 1 reply; 9+ messages in thread
From: Usama Arif @ 2021-12-16 18:05 UTC (permalink / raw)
  To: Eric W. Biederman, Vivek Goyal, Dave Young, Baoquan He, tglx,
	Borislav Petkov
  Cc: x86, linux-kernel, fam.zheng

Hi,

Just wanted to check again if there were any comments on my responses 
below or the v3 of the patch at 
https://lore.kernel.org/lkml/20211206164724.2125489-1-usama.arif@bytedance.com/.

Thanks!
Usama

On 06/12/2021 16:51, Usama Arif wrote:
> Hi,
> 
> I have sent a v3 of the patch with a much clearer commit message, please
> let me know if there are any comments for the v3 patch or my responses 
> below.
> 
> Thanks,
> Usama
> 
> On 01/12/2021 10:29, Usama Arif wrote:
>> Hi,
>>
>> (Resending the reply as my email client had updated and inserted html 
>> code and caused a bounceback from the mailing list, sorry about that.)
>>
>> Thanks for your reply, I have responded with further 
>> comments/questions inline below, and also have provided some context 
>> for the patch at the start:
>>
>> The patch is not introducing a new CONFIG option, as can be seen in 
>> the v2 patch diff. It is converting an existing CONFIG option that 
>> only enabled purgatory for specific architectures in which it has been 
>> implemented (x86, powerpc and s390) to an option that can that allow 
>> purgatory to be disabled (with default enabled) and only provides code 
>> to disable purgatory for x86 only. From what i see, purgatory is 
>> currently not yet implemented in other architectures like arm64 
>> kexec_file case and riscv, so this would on a high level,
>> provide only the option to make kexec on x86 similar to other the 
>> other architectures.
>>
>> As a background to the discussion, the usecase we are aiming is to 
>> update the host kernel with kexec in servers which is managing 
>> multiple VMs, to cut down the downtime of servers as much as possible 
>> so that its not noticeable to VMs. The patch is aimed at x86 purgatory 
>> code specifically. We are targetting other optimizations as well in 
>> other areas in boot path to cut down the 600ms time much further, so 
>> that cut down of 200ms downtime is significant in the usecase 
>> described. I did have a few comments/questions about your reply, 
>> please see my responses below:
>>
>>
>> On 29/11/2021 16:53, Eric W. Biederman wrote:
>>> Usama Arif <usama.arif@bytedance.com> writes:
>>>
>>>> Hi,
>>>>
>>>> Thanks for your replies. I have submitted a v2 of the patch with a
>>>> much more detailed commit message including reason for the patch and 
>>>> timing values.
>>>>
>>>> The time taken from reboot to running init process was measured
>>>> with both purgatory enabled and disabled over 20 runs and the
>>>> averages are:
>>>> Purgatory disabled:
>>>> - TSC = 3908766161 cycles
>>>> - ktime = 606.8 ms
>>>> Purgatory enabled:
>>>> - TSC = 5005811885 cycles (28.1% worse)
>>>> - ktime = 843.1 ms (38.9% worse)
>>>>
>>>>
>>>> Our reason for this patch is that it helps reduce the downtime of 
>>>> servers when
>>>> the host kernel managing multiple VMs needs to be updated via kexec,
>>>> but it makes reboot with kexec much faster so should be a general 
>>>> improvement in
>>>> boot time if purgatory is not needed and could have other usecases 
>>>> as well.
>>>> I believe only x86, powerpc and s390 have purgatory supported, other 
>>>> platforms
>>>> like arm64 dont have it implemented yet, so with the reboot time 
>>>> improvement seen,
>>>> it would be a good idea to have the option to disable purgatory 
>>>> completely but set default to y.
>>>> We also have the CONFIG_KEXEC_BZIMAGE_VERIFY_SIG which can be 
>>>> enabled to verify the next
>>>> kernel image to be booted and purgatory can be completely skipped if
>>>> not required.
>>>
>>> CONFIG_KEXEC_BZIMAGE_VERIFY_SIG is something totally and completely
>>> different.  It's job is to verify that the kernel to be booted comes
>>> from a trusted source.   The sha256 verification in purgatory's job
>>> is to verify that memory the kernel cares about was not corrupted
>>> during the kexec process.
>>>
>>
>> Thanks, acknowledged.
>>
>>> I believe when you say purgatory you are really talking about that
>>> sha256 checksum.  It really is not possible to disable all of
>>> the code that runs between kernels, as the old and the new kernel may
>>> run at the same addresses.  Anything that runs between the two kernels
>>> is what is referred to as purgatory.  Even if it is just a small
>>> assembly stub.
>>>
>>  >
>>
>> With this patch, i am trying to give an option (with default purgatory 
>> enabled)to disable purgatory completely on x86 only, i.e. no code 
>> running between 2 kernels on x86. From my understanding there is no 
>> purgatory in arm64 kexec_file case, in riscv and in some other archs 
>> as well, so I am not sure why its not possible to disable purgatory 
>> (all code running between 2 kernels) in x86? Unless i misunderstood 
>> something about the working of other platforms? In x86 case, from what 
>> i see relocate_kernel is still part of the older kernel and not 
>> purgatory, and with my patch, purgatory.ro is not built and kexec does 
>> execute successfully with purgatory disabled.
>>
>> About your point for the old and the new kernel may run at the same 
>> addresses,i dont think that can happen as in bzImage64_load function 
>> its loaded using the kbuf struct. This doesnt happen in other 
>> architectures (for e.g. arm64) that dont implement purgatory and any 
>> of the tests I conducted with the patch applied due to the use of kbuf 
>> struct.
>>
>>  From what i see in the code, purgatory in x86 specifically, has 2 
>> main functions, sha256 verification and loading the %rsi register for 
>> bootparam_load_addr. In this patch purgatory was disabled, which 
>> resulted in sha256 verification disabled and bootparam_load_addr moved 
>> to relocate_kernel. Our analysis revealed that most of the time is 
>> spent in the sha256 verification, so I would be ok to reformat the patch
>> to make the sha verification optional and keep purgatory enabled if 
>> thats preferred? Although in my opinion the current patch of only 
>> providing an option to disable purgatory seems much better.
>>
>>
>>> That sha256 verification is always needed for kexec on panic, there are
>>> by the nature of a kernel panic too many unknowns to have any confidence
>>> the new kernel will not be corrupted in the process of kexec before it
>>> gets started.
>>>
>>> For an ordinary kexec it might be possible to say that you have a
>>> reliable kernel shutdown process and you know for a fact that something
>>> won't come along and corrupt the kernel.  I find that a questionable
>>> assertion.  I haven't seen anyone yet whose focus when getting an
>>> ordinary kexec to work as anything other than making certain all of the
>>> drivers are shutdown properly.
>>>
>>> I have seen countless times when a network packet comes in a the wrong
>>> time and the target kernel's memory is corrupted before it gets far
>>> enough to initialize the network driver. >
>>
>> I agree that when doing testing and research, there are things that can
>> go wrong during kexec process, but i assume that the expectation is that
>> in production environment, when for e.g. updating a kernel with kexec 
>> in servers,the expectation is that kexec will execute successfully as 
>> long as there is nothing wrong in the old kernel and the new kernel.
>>
>> Maybe I didn't understand this correctly, but if a network packet 
>> comes in and corrupts the kernel memory, wont it also cause a problem 
>> even when purgatory is enabled? I agree that issue like these would be 
>> caught earlier with purgatory, but then if something like this breaks 
>> the kexec process in a production environment where the old and new 
>> kernel are well tested, trusted and expected to work, wouldn't there 
>> be a much more fundamental issue with the reliability of the current 
>> kexec process.
>>
>>> For a 0.2s speed up you are talking about disabling all of the safety
>>> checks in a very dangerous situation.  How much can you can in
>>> performance by optimizing the sha256 implementation instead of using
>>> what is essentially a reference implementation in basic C that I copied
>>> from somewhere long ago.
>>>
>>> Optimize the sha256 implementation and the memory copy loop and then
>>> show how the tiny bit of time that is left is on a mission critical path
>>> and must be removed.  Then we can reasonably talk about a config option
>>> for disabling the sha256 implementation in the kexec in not-panic case.
>>>
>> Thanks for this, I assume that over here you are suggesting for e.g. 
>> in x86 to replace the existing sha verification implementation in 
>> purgatory with the one in arch/x86/crypto/sha256_ssse3_glue.c? Also 
>> could you point to the memory copy loop to optimize? I can have a look 
>> at these 2 options.
>>
>> Even with these optimizations, i think the v2 patch should still be 
>> considered. The patch in the end is providing only an option to 
>> disable purgatory on x86 only, with the default value of keeping it 
>> enabled and not changing kexec behaviour. The CONFIG already existed, 
>> it is just renamed and now provides user the option to select to 
>> disable purgatory for x86, rather than it being architecture dependent.
>>
>> Thanks again for the review and comments,
>> Usama
>>
>>
>>> That sha256 implementation in part so that we can all sleep at night
>>> because we don't have to deal with very very strange heizenbugs.
>>>
>>> Eric
>>>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] x86/purgatory: provide config to disable purgatory
  2021-12-16 18:05             ` Usama Arif
@ 2021-12-17  0:56               ` Baoquan He
  0 siblings, 0 replies; 9+ messages in thread
From: Baoquan He @ 2021-12-17  0:56 UTC (permalink / raw)
  To: Usama Arif
  Cc: Eric W. Biederman, Vivek Goyal, Dave Young, tglx,
	Borislav Petkov, x86, linux-kernel, fam.zheng

On 12/16/21 at 06:05pm, Usama Arif wrote:
> Hi,
> 
> Just wanted to check again if there were any comments on my responses below
> or the v3 of the patch at https://lore.kernel.org/lkml/20211206164724.2125489-1-usama.arif@bytedance.com/.

I will have a look. Could you please CC kexec mailing list too next time
when kexec/kdump related patch posted so that more people can help review
if concerned or interested?

I usually don't switch to lkml mail folder unless have to, since it
contains too many mails and take longer time.

> 
> Thanks!
> Usama
> 
> On 06/12/2021 16:51, Usama Arif wrote:
> > Hi,
> > 
> > I have sent a v3 of the patch with a much clearer commit message, please
> > let me know if there are any comments for the v3 patch or my responses
> > below.
> > 
> > Thanks,
> > Usama
> > 
> > On 01/12/2021 10:29, Usama Arif wrote:
> > > Hi,
> > > 
> > > (Resending the reply as my email client had updated and inserted
> > > html code and caused a bounceback from the mailing list, sorry about
> > > that.)
> > > 
> > > Thanks for your reply, I have responded with further
> > > comments/questions inline below, and also have provided some context
> > > for the patch at the start:
> > > 
> > > The patch is not introducing a new CONFIG option, as can be seen in
> > > the v2 patch diff. It is converting an existing CONFIG option that
> > > only enabled purgatory for specific architectures in which it has
> > > been implemented (x86, powerpc and s390) to an option that can that
> > > allow purgatory to be disabled (with default enabled) and only
> > > provides code to disable purgatory for x86 only. From what i see,
> > > purgatory is currently not yet implemented in other architectures
> > > like arm64 kexec_file case and riscv, so this would on a high level,
> > > provide only the option to make kexec on x86 similar to other the
> > > other architectures.
> > > 
> > > As a background to the discussion, the usecase we are aiming is to
> > > update the host kernel with kexec in servers which is managing
> > > multiple VMs, to cut down the downtime of servers as much as
> > > possible so that its not noticeable to VMs. The patch is aimed at
> > > x86 purgatory code specifically. We are targetting other
> > > optimizations as well in other areas in boot path to cut down the
> > > 600ms time much further, so that cut down of 200ms downtime is
> > > significant in the usecase described. I did have a few
> > > comments/questions about your reply, please see my responses below:
> > > 
> > > 
> > > On 29/11/2021 16:53, Eric W. Biederman wrote:
> > > > Usama Arif <usama.arif@bytedance.com> writes:
> > > > 
> > > > > Hi,
> > > > > 
> > > > > Thanks for your replies. I have submitted a v2 of the patch with a
> > > > > much more detailed commit message including reason for the
> > > > > patch and timing values.
> > > > > 
> > > > > The time taken from reboot to running init process was measured
> > > > > with both purgatory enabled and disabled over 20 runs and the
> > > > > averages are:
> > > > > Purgatory disabled:
> > > > > - TSC = 3908766161 cycles
> > > > > - ktime = 606.8 ms
> > > > > Purgatory enabled:
> > > > > - TSC = 5005811885 cycles (28.1% worse)
> > > > > - ktime = 843.1 ms (38.9% worse)
> > > > > 
> > > > > 
> > > > > Our reason for this patch is that it helps reduce the
> > > > > downtime of servers when
> > > > > the host kernel managing multiple VMs needs to be updated via kexec,
> > > > > but it makes reboot with kexec much faster so should be a
> > > > > general improvement in
> > > > > boot time if purgatory is not needed and could have other
> > > > > usecases as well.
> > > > > I believe only x86, powerpc and s390 have purgatory
> > > > > supported, other platforms
> > > > > like arm64 dont have it implemented yet, so with the reboot
> > > > > time improvement seen,
> > > > > it would be a good idea to have the option to disable
> > > > > purgatory completely but set default to y.
> > > > > We also have the CONFIG_KEXEC_BZIMAGE_VERIFY_SIG which can
> > > > > be enabled to verify the next
> > > > > kernel image to be booted and purgatory can be completely skipped if
> > > > > not required.
> > > > 
> > > > CONFIG_KEXEC_BZIMAGE_VERIFY_SIG is something totally and completely
> > > > different.  It's job is to verify that the kernel to be booted comes
> > > > from a trusted source.   The sha256 verification in purgatory's job
> > > > is to verify that memory the kernel cares about was not corrupted
> > > > during the kexec process.
> > > > 
> > > 
> > > Thanks, acknowledged.
> > > 
> > > > I believe when you say purgatory you are really talking about that
> > > > sha256 checksum.  It really is not possible to disable all of
> > > > the code that runs between kernels, as the old and the new kernel may
> > > > run at the same addresses.  Anything that runs between the two kernels
> > > > is what is referred to as purgatory.  Even if it is just a small
> > > > assembly stub.
> > > > 
> > >  >
> > > 
> > > With this patch, i am trying to give an option (with default
> > > purgatory enabled)to disable purgatory completely on x86 only, i.e.
> > > no code running between 2 kernels on x86. From my understanding
> > > there is no purgatory in arm64 kexec_file case, in riscv and in some
> > > other archs as well, so I am not sure why its not possible to
> > > disable purgatory (all code running between 2 kernels) in x86?
> > > Unless i misunderstood something about the working of other
> > > platforms? In x86 case, from what i see relocate_kernel is still
> > > part of the older kernel and not purgatory, and with my patch,
> > > purgatory.ro is not built and kexec does execute successfully with
> > > purgatory disabled.
> > > 
> > > About your point for the old and the new kernel may run at the same
> > > addresses,i dont think that can happen as in bzImage64_load function
> > > its loaded using the kbuf struct. This doesnt happen in other
> > > architectures (for e.g. arm64) that dont implement purgatory and any
> > > of the tests I conducted with the patch applied due to the use of
> > > kbuf struct.
> > > 
> > >  From what i see in the code, purgatory in x86 specifically, has 2
> > > main functions, sha256 verification and loading the %rsi register
> > > for bootparam_load_addr. In this patch purgatory was disabled, which
> > > resulted in sha256 verification disabled and bootparam_load_addr
> > > moved to relocate_kernel. Our analysis revealed that most of the
> > > time is spent in the sha256 verification, so I would be ok to
> > > reformat the patch
> > > to make the sha verification optional and keep purgatory enabled if
> > > thats preferred? Although in my opinion the current patch of only
> > > providing an option to disable purgatory seems much better.
> > > 
> > > 
> > > > That sha256 verification is always needed for kexec on panic, there are
> > > > by the nature of a kernel panic too many unknowns to have any confidence
> > > > the new kernel will not be corrupted in the process of kexec before it
> > > > gets started.
> > > > 
> > > > For an ordinary kexec it might be possible to say that you have a
> > > > reliable kernel shutdown process and you know for a fact that something
> > > > won't come along and corrupt the kernel.  I find that a questionable
> > > > assertion.  I haven't seen anyone yet whose focus when getting an
> > > > ordinary kexec to work as anything other than making certain all of the
> > > > drivers are shutdown properly.
> > > > 
> > > > I have seen countless times when a network packet comes in a the wrong
> > > > time and the target kernel's memory is corrupted before it gets far
> > > > enough to initialize the network driver. >
> > > 
> > > I agree that when doing testing and research, there are things that can
> > > go wrong during kexec process, but i assume that the expectation is that
> > > in production environment, when for e.g. updating a kernel with
> > > kexec in servers,the expectation is that kexec will execute
> > > successfully as long as there is nothing wrong in the old kernel and
> > > the new kernel.
> > > 
> > > Maybe I didn't understand this correctly, but if a network packet
> > > comes in and corrupts the kernel memory, wont it also cause a
> > > problem even when purgatory is enabled? I agree that issue like
> > > these would be caught earlier with purgatory, but then if something
> > > like this breaks the kexec process in a production environment where
> > > the old and new kernel are well tested, trusted and expected to
> > > work, wouldn't there be a much more fundamental issue with the
> > > reliability of the current kexec process.
> > > 
> > > > For a 0.2s speed up you are talking about disabling all of the safety
> > > > checks in a very dangerous situation.  How much can you can in
> > > > performance by optimizing the sha256 implementation instead of using
> > > > what is essentially a reference implementation in basic C that I copied
> > > > from somewhere long ago.
> > > > 
> > > > Optimize the sha256 implementation and the memory copy loop and then
> > > > show how the tiny bit of time that is left is on a mission critical path
> > > > and must be removed.  Then we can reasonably talk about a config option
> > > > for disabling the sha256 implementation in the kexec in not-panic case.
> > > > 
> > > Thanks for this, I assume that over here you are suggesting for e.g.
> > > in x86 to replace the existing sha verification implementation in
> > > purgatory with the one in arch/x86/crypto/sha256_ssse3_glue.c? Also
> > > could you point to the memory copy loop to optimize? I can have a
> > > look at these 2 options.
> > > 
> > > Even with these optimizations, i think the v2 patch should still be
> > > considered. The patch in the end is providing only an option to
> > > disable purgatory on x86 only, with the default value of keeping it
> > > enabled and not changing kexec behaviour. The CONFIG already
> > > existed, it is just renamed and now provides user the option to
> > > select to disable purgatory for x86, rather than it being
> > > architecture dependent.
> > > 
> > > Thanks again for the review and comments,
> > > Usama
> > > 
> > > 
> > > > That sha256 implementation in part so that we can all sleep at night
> > > > because we don't have to deal with very very strange heizenbugs.
> > > > 
> > > > Eric
> > > > 
> 


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH] x86/purgatory: provide config to disable purgatory
@ 2021-11-29 15:01 Usama Arif
  0 siblings, 0 replies; 9+ messages in thread
From: Usama Arif @ 2021-11-29 15:01 UTC (permalink / raw)
  To: x86, linux-kernel, vgoyal, bp, tglx; +Cc: fam.zheng, Usama Arif

Disabling purgatory reduces the boot time if it is not needed
as the sha256 digest of kexec segments is no longer calculated
or verified. This helps reduce the downtime of servers when
the host kernel managing multiple VMs needs to be updated via kexec.

The usage of registers %r9-%r13 needed to be moved to %r10-%r14
in relocate_kernel_64.S as bootparam_load_addr is passed in %r9.

The time taken from reboot to running init process was measured
with both purgatory enabled and disabled over 20 runs and the
averages are:
Purgatory disabled:
- TSC = 3908766161 cycles
- ktime = 606.8 ms
Purgatory enabled:
- TSC = 5005811885 cycles (28.1% worse)
- ktime = 843.1 ms (38.9% worse)

Hence disabling purgatory has a significant improvement in kexec
time.

Signed-off-by: Usama Arif <usama.arif@bytedance.com>
Reviewed-by: Fam Zheng <fam.zheng@bytedance.com>

---
v2 changes:
 - Updated commit message to include timing and usecase.
 - Fixed bug in v1 in which bootparam_load_addr was not passed when
   purgatory was disabled.
---
 arch/powerpc/Kbuild                  |  2 +-
 arch/powerpc/Kconfig                 |  2 +-
 arch/s390/Kbuild                     |  2 +-
 arch/s390/Kconfig                    |  2 +-
 arch/s390/purgatory/Makefile         |  2 +-
 arch/x86/Kbuild                      |  2 +-
 arch/x86/Kconfig                     |  6 ++-
 arch/x86/include/asm/kexec.h         |  3 +-
 arch/x86/kernel/kexec-bzimage64.c    | 70 ++++++++++++++++------------
 arch/x86/kernel/machine_kexec_64.c   |  3 +-
 arch/x86/kernel/relocate_kernel_64.S |  8 ++++
 arch/x86/purgatory/Makefile          |  2 +-
 include/linux/kexec.h                |  5 ++
 kernel/kexec_file.c                  |  9 ++--
 14 files changed, 73 insertions(+), 45 deletions(-)

diff --git a/arch/powerpc/Kbuild b/arch/powerpc/Kbuild
index 22cd0d55a892..072e62d7898e 100644
--- a/arch/powerpc/Kbuild
+++ b/arch/powerpc/Kbuild
@@ -15,7 +15,7 @@ obj-$(CONFIG_KVM)  += kvm/
 
 obj-$(CONFIG_PERF_EVENTS) += perf/
 obj-$(CONFIG_KEXEC_CORE)  += kexec/
-obj-$(CONFIG_KEXEC_FILE)  += purgatory/
+obj-$(CONFIG_KEXEC_PURGATORY)  += purgatory/
 
 # for cleaning
 subdir- += boot
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index dea74d7717c0..58bdfd1abb44 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -557,7 +557,7 @@ config KEXEC_FILE
 	  for kernel and initramfs as opposed to a list of segments as is the
 	  case for the older kexec call.
 
-config ARCH_HAS_KEXEC_PURGATORY
+config KEXEC_PURGATORY
 	def_bool KEXEC_FILE
 
 config RELOCATABLE
diff --git a/arch/s390/Kbuild b/arch/s390/Kbuild
index 76e362277179..2ed4ee5cdf59 100644
--- a/arch/s390/Kbuild
+++ b/arch/s390/Kbuild
@@ -7,7 +7,7 @@ obj-$(CONFIG_S390_HYPFS_FS)	+= hypfs/
 obj-$(CONFIG_APPLDATA_BASE)	+= appldata/
 obj-y				+= net/
 obj-$(CONFIG_PCI)		+= pci/
-obj-$(CONFIG_ARCH_HAS_KEXEC_PURGATORY) += purgatory/
+obj-$(CONFIG_KEXEC_PURGATORY) += purgatory/
 
 # for cleaning
 subdir- += boot tools
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 2a5bb4f29cfe..d15bdaa0e198 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -538,7 +538,7 @@ config KEXEC_FILE
 	  kexec system call this system call takes file descriptors for the
 	  kernel and initramfs as arguments.
 
-config ARCH_HAS_KEXEC_PURGATORY
+config KEXEC_PURGATORY
 	def_bool y
 	depends on KEXEC_FILE
 
diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile
index 360ada80d20c..03cac6d7310a 100644
--- a/arch/s390/purgatory/Makefile
+++ b/arch/s390/purgatory/Makefile
@@ -51,4 +51,4 @@ $(obj)/purgatory.ro: $(obj)/purgatory $(obj)/purgatory.chk FORCE
 $(obj)/kexec-purgatory.o: $(obj)/kexec-purgatory.S $(obj)/purgatory.ro FORCE
 	$(call if_changed_rule,as_o_S)
 
-obj-$(CONFIG_ARCH_HAS_KEXEC_PURGATORY) += kexec-purgatory.o
+obj-$(CONFIG_KEXEC_PURGATORY) += kexec-purgatory.o
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index f384cb1a4f7a..9089438ed6d8 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -24,7 +24,7 @@ obj-$(CONFIG_IA32_EMULATION) += ia32/
 obj-y += platform/
 obj-y += net/
 
-obj-$(CONFIG_KEXEC_FILE) += purgatory/
+obj-$(CONFIG_KEXEC_PURGATORY) += purgatory/
 
 # for cleaning
 subdir- += boot tools
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7399327d1eff..7efe6dbfdc67 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2000,8 +2000,10 @@ config KEXEC_FILE
 	  for kernel and initramfs as opposed to list of segments as
 	  accepted by previous system call.
 
-config ARCH_HAS_KEXEC_PURGATORY
-	def_bool KEXEC_FILE
+config KEXEC_PURGATORY
+	bool "A standalone relocatable object run between the 2 kernels during kexec"
+	depends on KEXEC_FILE
+	default y
 
 config KEXEC_SIG
 	bool "Verify kernel signature during kexec_file_load() syscall"
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 11b7c06e2828..8c33bb32b593 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -129,7 +129,8 @@ relocate_kernel(unsigned long indirection_page,
 		unsigned long page_list,
 		unsigned long start_address,
 		unsigned int preserve_context,
-		unsigned int host_mem_enc_active);
+		unsigned int host_mem_enc_active,
+		unsigned long bootparam_load_addr);
 #endif
 
 #define ARCH_HAS_KIMAGE_ARCH
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 170d0fd68b1f..dbaa0e8c7634 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -374,18 +374,19 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
 			return ERR_PTR(ret);
 	}
 
-	/*
-	 * Load purgatory. For 64bit entry point, purgatory  code can be
-	 * anywhere.
-	 */
-	ret = kexec_load_purgatory(image, &pbuf);
-	if (ret) {
-		pr_err("Loading purgatory failed\n");
-		return ERR_PTR(ret);
-	}
-
-	pr_debug("Loaded purgatory at 0x%lx\n", pbuf.mem);
+	if (IS_ENABLED(CONFIG_KEXEC_PURGATORY)) {
+		/*
+		 * Load purgatory. For 64bit entry point, purgatory  code can be
+		 * anywhere.
+		 */
+		ret = kexec_load_purgatory(image, &pbuf);
+		if (ret) {
+			pr_err("Loading purgatory failed\n");
+			return ERR_PTR(ret);
+		}
 
+		pr_debug("Loaded purgatory at 0x%lx\n", pbuf.mem);
+	}
 
 	/*
 	 * Load Bootparams and cmdline and space for efi stuff.
@@ -466,28 +467,37 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
 	params->hdr.type_of_loader = 0x0D << 4;
 	params->hdr.loadflags = 0;
 
-	/* Setup purgatory regs for entry */
-	ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64,
-					     sizeof(regs64), 1);
-	if (ret)
-		goto out_free_params;
+	if (IS_ENABLED(CONFIG_KEXEC_PURGATORY)) {
+		/* Setup purgatory regs for entry */
+		ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64,
+						     sizeof(regs64), 1);
+		if (ret)
+			goto out_free_params;
 
-	regs64.rbx = 0; /* Bootstrap Processor */
-	regs64.rsi = bootparam_load_addr;
-	regs64.rip = kernel_load_addr + 0x200;
-	stack = kexec_purgatory_get_symbol_addr(image, "stack_end");
-	if (IS_ERR(stack)) {
-		pr_err("Could not find address of symbol stack_end\n");
-		ret = -EINVAL;
-		goto out_free_params;
-	}
+		regs64.rbx = 0; /* Bootstrap Processor */
+		regs64.rsi = bootparam_load_addr;
+		regs64.rip = kernel_load_addr + 0x200;
 
-	regs64.rsp = (unsigned long)stack;
-	ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64,
-					     sizeof(regs64), 0);
-	if (ret)
-		goto out_free_params;
+		stack = kexec_purgatory_get_symbol_addr(image, "stack_end");
+		if (IS_ERR(stack)) {
+			pr_err("Could not find address of symbol stack_end\n");
+			ret = -EINVAL;
+			goto out_free_params;
+		}
 
+		regs64.rsp = (unsigned long)stack;
+		ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64,
+						     sizeof(regs64), 0);
+		if (ret)
+			goto out_free_params;
+	} else {
+		/*
+		 * Pass kernel and bootparam load address to relocate_kernel
+		 * if purgatory is disabled.
+		 */
+		image->start = kernel_load_addr + 0x200;
+		image->bootparam_load_addr = bootparam_load_addr;
+	}
 	ret = setup_boot_parameters(image, params, bootparam_load_addr,
 				    efi_map_offset, efi_map_sz,
 				    efi_setup_data_offset);
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index f5da4a18070a..f7b009768652 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -359,7 +359,8 @@ void machine_kexec(struct kimage *image)
 				       (unsigned long)page_list,
 				       image->start,
 				       image->preserve_context,
-				       cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT));
+				       cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT),
+				       image->bootparam_load_addr);
 
 #ifdef CONFIG_KEXEC_JUMP
 	if (image->preserve_context)
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index c8fe74a28143..12789c8eabe6 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -48,6 +48,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
 	 * %rdx start address
 	 * %rcx preserve_context
 	 * %r8  host_mem_enc_active
+	 * %r9  bootparam_load_addr
 	 */
 
 	/* Save the CPU context, used for jumping back */
@@ -59,6 +60,9 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
 	pushq %r15
 	pushf
 
+	/* Save bootparam_load_addr in %r14  */
+	movq	%r9, %r14
+
 	movq	PTR(VA_CONTROL_PAGE)(%rsi), %r11
 	movq	%rsp, RSP(%r11)
 	movq	%cr0, %rax
@@ -179,7 +183,11 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
 	xorl	%ebx, %ebx
 	xorl    %ecx, %ecx
 	xorl    %edx, %edx
+#ifdef CONFIG_KEXEC_PURGATORY
 	xorl    %esi, %esi
+#else
+	movq    %r14, %rsi
+#endif
 	xorl    %edi, %edi
 	xorl    %ebp, %ebp
 	xorl	%r8d, %r8d
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index 95ea17a9d20c..688b3f21be8f 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -81,4 +81,4 @@ quiet_cmd_bin2c = BIN2C   $@
 $(obj)/kexec-purgatory.c: $(obj)/purgatory.ro $(obj)/purgatory.chk FORCE
 	$(call if_changed,bin2c)
 
-obj-$(CONFIG_KEXEC_FILE)	+= kexec-purgatory.o
+obj-$(CONFIG_KEXEC_PURGATORY)	+= kexec-purgatory.o
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 0c994ae37729..818c3770158f 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -313,6 +313,11 @@ struct kimage {
 	void *elf_headers;
 	unsigned long elf_headers_sz;
 	unsigned long elf_load_addr;
+
+#ifndef CONFIG_PURGATORY
+	unsigned long bootparam_load_addr;
+#endif
+
 };
 
 /* kexec interface functions */
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 8347fc158d2b..ad3131880a37 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -108,6 +108,7 @@ int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
 }
 #endif
 
+#ifdef CONFIG_KEXEC_PURGATORY
 /*
  * arch_kexec_apply_relocations_add - apply relocations of type RELA
  * @pi:		Purgatory to be relocated.
@@ -141,7 +142,7 @@ arch_kexec_apply_relocations(struct purgatory_info *pi, Elf_Shdr *section,
 	pr_err("REL relocation unsupported.\n");
 	return -ENOEXEC;
 }
-
+#endif /* CONFIG_KEXEC_PURGATORY */
 /*
  * Free up memory used by kernel, initrd, and command line. This is temporary
  * memory allocation which is not needed any more after these buffers have
@@ -724,7 +725,7 @@ static int kexec_calculate_store_digests(struct kimage *image)
 	struct kexec_sha_region *sha_regions;
 	struct purgatory_info *pi = &image->purgatory_info;
 
-	if (!IS_ENABLED(CONFIG_ARCH_HAS_KEXEC_PURGATORY))
+	if (!IS_ENABLED(CONFIG_KEXEC_PURGATORY))
 		return 0;
 
 	zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
@@ -829,7 +830,7 @@ static int kexec_calculate_store_digests(struct kimage *image)
 	return ret;
 }
 
-#ifdef CONFIG_ARCH_HAS_KEXEC_PURGATORY
+#ifdef CONFIG_KEXEC_PURGATORY
 /*
  * kexec_purgatory_setup_kbuf - prepare buffer to load purgatory.
  * @pi:		Purgatory to be loaded.
@@ -1176,7 +1177,7 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
 
 	return 0;
 }
-#endif /* CONFIG_ARCH_HAS_KEXEC_PURGATORY */
+#endif /* CONFIG_KEXEC_PURGATORY */
 
 int crash_exclude_mem_range(struct crash_mem *mem,
 			    unsigned long long mstart, unsigned long long mend)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2021-12-17  0:56 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-11-23 15:05 [PATCH] x86/purgatory: provide config to disable purgatory Usama Arif
2021-11-23 15:24 ` Borislav Petkov
2021-11-29 14:04   ` Vivek Goyal
     [not found]     ` <79517d3c-3674-cc21-fbdc-b26946809756@bytedance.com>
2021-11-29 16:53       ` [External] " Eric W. Biederman
2021-12-01 10:29         ` Usama Arif
2021-12-06 16:51           ` Usama Arif
2021-12-16 18:05             ` Usama Arif
2021-12-17  0:56               ` Baoquan He
2021-11-29 15:01 Usama Arif

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).