* [PATCH v12 0/7] crash: Kernel handling of CPU and memory hot un/plug
@ 2022-09-09 21:05 Eric DeVolder
  2022-09-09 21:05 ` [PATCH v12 1/7] crash: move crash_prepare_elf64_headers Eric DeVolder
                   ` (7 more replies)
  0 siblings, 8 replies; 57+ messages in thread
From: Eric DeVolder @ 2022-09-09 21:05 UTC
  To: linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal
  Cc: tglx, mingo, bp, dave.hansen, hpa, nramas, thomas.lendacky, robh,
	efault, rppt, david, sourabhjain, konrad.wilk, boris.ostrovsky,
	eric.devolder

When the kdump service is loaded, if a CPU or memory is hot
un/plugged, the crash elfcorehdr, which describes the CPUs
and memory in the system, must also be updated, else the resulting
vmcore is inaccurate (eg. missing either CPU context or memory
regions).

The current solution utilizes udev to initiate an unload-then-reload
of the kdump image (ie. kernel, initrd, boot_params, purgatory and
elfcorehdr) by the userspace kexec utility. In previous posts I have
outlined the significant performance problems related to offloading
this activity to userspace.

This patchset introduces a generic crash hot un/plug handler that
registers with the CPU and memory notifiers. Upon CPU or memory
changes, this generic handler is invoked and performs important
housekeeping, for example obtaining the appropriate lock, and then
invokes an architecture specific handler to do the appropriate
updates.

In the case of x86_64, the arch-specific handler generates a new
elfcorehdr and overwrites the old one in memory; no userspace
involvement is needed.
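
In outline, the flow introduced by this patchset (patches 3 and 7)
is:

  CPU or memory hot un/plug event
   -> crash_cpuhp_online/offline() or crash_memhp_notifier()
       -> handle_hotplug_event()     (generic: takes kexec_mutex and
                                      locates the elfcorehdr segment)
           -> arch_crash_handle_hotplug_event()
                                     (x86: regenerates the elfcorehdr
                                      and writes it in place)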

To realize the benefits of this patchset (or to test it), one must
make a couple of minor changes to userspace:

 - Prevent udev from updating kdump crash kernel on hot un/plug changes.
   Add the following as the first lines to the udev rule file
   /usr/lib/udev/rules.d/98-kexec.rules:

   # The kernel handles updates to crash elfcorehdr for cpu and memory changes
   SUBSYSTEM=="cpu", ATTRS{crash_hotplug}=="1", GOTO="kdump_reload_end"
   SUBSYSTEM=="memory", ATTRS{crash_hotplug}=="1", GOTO="kdump_reload_end"

   These lines will cause cpu and memory hot un/plug events to be
   skipped within this rule file, if the kernel has these changes
   enabled (see the sysfs check example following this list).

 - Change to using kexec_file_load for loading the kdump kernel.
   Eg. on RHEL: in /usr/bin/kdumpctl, change to:
    standard_kexec_args="-p -d -s"
   which adds the -s to select the kexec_file_load syscall.
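
As a quick sanity check (a sketch, assuming a kernel built with this
patchset and the relevant hotplug config options enabled), the sysfs
attributes introduced in patch 6 can be read directly:

  $ cat /sys/devices/system/cpu/crash_hotplug
  1
  $ cat /sys/devices/system/memory/crash_hotplug
  1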

This patchset also supports kexec_load, given a modified kexec
userspace utility; a working changeset to the kexec utility is
provided below. (With it, the above change to standard_kexec_args
would instead append --hotplug rather than -s.)
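
For reference, a hypothetical direct invocation of the modified
utility might look like the following (the kernel/initrd paths and
--append options are illustrative and vary by distribution):

  kexec -p -d --hotplug /boot/vmlinuz-$(uname -r) \
       --initrd=/boot/initramfs-$(uname -r)kdump.img \
       --append="irqpoll nr_cpus=1 reset_devices"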

  diff --git a/kexec/arch/i386/crashdump-x86.c b/kexec/arch/i386/crashdump-x86.c
  index 9826f6d..4ed395a 100644
  --- a/kexec/arch/i386/crashdump-x86.c
  +++ b/kexec/arch/i386/crashdump-x86.c
  @@ -48,6 +48,7 @@
   #include <x86/x86-linux.h>
   
   extern struct arch_options_t arch_options;
  +extern int do_hotplug;
   
   static int get_kernel_page_offset(struct kexec_info *UNUSED(info),
   				  struct crash_elf_info *elf_info)
  @@ -975,6 +976,14 @@ int load_crashdump_segments(struct kexec_info *info, char* mod_cmdline,
   	} else {
   		memsz = bufsz;
   	}
  +
  +	/* If hotplug support enabled, use larger size to accommodate changes */
  +	if (do_hotplug) {
  +		long int nr_cpus = get_nr_cpus();
  +		memsz = (nr_cpus + CRASH_MAX_MEMORY_RANGES) * sizeof(Elf64_Phdr);
  +	}
  +
  +	info->elfcorehdr =
   	elfcorehdr = add_buffer(info, tmp, bufsz, memsz, align, min_base,
   							max_addr, -1);
   	dbgprintf("Created elf header segment at 0x%lx\n", elfcorehdr);
  diff --git a/kexec/crashdump-elf.c b/kexec/crashdump-elf.c
  index b8bb686..5e29f7a 100644
  --- a/kexec/crashdump-elf.c
  +++ b/kexec/crashdump-elf.c
  @@ -43,11 +43,7 @@ int FUNC(struct kexec_info *info,
   	int (*get_note_info)(int cpu, uint64_t *addr, uint64_t *len);
   	long int count_cpu;
   
  -	if (xen_present())
  -		nr_cpus = xen_get_nr_phys_cpus();
  -	else
  -		nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
  -
  +	nr_cpus = get_nr_cpus();
   	if (nr_cpus < 0) {
   		return -1;
   	}
  diff --git a/kexec/crashdump.h b/kexec/crashdump.h
  index 18bd691..28d3278 100644
  --- a/kexec/crashdump.h
  +++ b/kexec/crashdump.h
  @@ -57,7 +57,6 @@ unsigned long phys_to_virt(struct crash_elf_info *elf_info,
   			   unsigned long long paddr);
   
   unsigned long xen_architecture(struct crash_elf_info *elf_info);
  -int xen_get_nr_phys_cpus(void);
   int xen_get_note(int cpu, uint64_t *addr, uint64_t *len);
   int xen_get_crashkernel_region(uint64_t *start, uint64_t *end);
   
  diff --git a/kexec/kexec-xen.h b/kexec/kexec-xen.h
  index 70fb576..f54a2dd 100644
  --- a/kexec/kexec-xen.h
  +++ b/kexec/kexec-xen.h
  @@ -83,5 +83,6 @@ extern int __xc_interface_close(xc_interface *xch);
   #endif
   
   int xen_get_kexec_range(int range, uint64_t *start, uint64_t *end);
  +int xen_get_nr_phys_cpus(void);
   
   #endif /* KEXEC_XEN_H */
  diff --git a/kexec/kexec.c b/kexec/kexec.c
  index 829a6ea..3668b73 100644
  --- a/kexec/kexec.c
  +++ b/kexec/kexec.c
  @@ -58,6 +58,7 @@
   
   unsigned long long mem_min = 0;
   unsigned long long mem_max = ULONG_MAX;
  +int do_hotplug = 0;
   static unsigned long kexec_flags = 0;
   /* Flags for kexec file (fd) based syscall */
   static unsigned long kexec_file_flags = 0;
  @@ -489,6 +490,17 @@ static int add_backup_segments(struct kexec_info *info,
   	return 0;
   }
   
  +long int get_nr_cpus(void)
  +{
  +	long int nr_cpus;
  +
  +	if (xen_present())
  +		nr_cpus = xen_get_nr_phys_cpus();
  +	else
  +		nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
  +	return nr_cpus;
  +}
  +
   static char *slurp_fd(int fd, const char *filename, off_t size, off_t *nread)
   {
   	char *buf;
  @@ -672,6 +684,14 @@ static void update_purgatory(struct kexec_info *info)
   		if (info->segment[i].mem == (void *)info->rhdr.rel_addr) {
   			continue;
   		}
  +
  +		/* Don't include elfcorehdr in the checksum, if hotplug
  +		 * support enabled.
  +		 */
  +		if (do_hotplug && (info->segment[i].mem == (void *)info->elfcorehdr)) {
  +			continue;
  +		}
  +
   		sha256_update(&ctx, info->segment[i].buf,
   			      info->segment[i].bufsz);
   		nullsz = info->segment[i].memsz - info->segment[i].bufsz;
  @@ -1565,6 +1585,9 @@ int main(int argc, char *argv[])
   		case OPT_PRINT_CKR_SIZE:
   			print_crashkernel_region_size();
   			return 0;
  +		case OPT_HOTPLUG:
  +			do_hotplug = 1;
  +			break;
   		default:
   			break;
   		}
  diff --git a/kexec/kexec.h b/kexec/kexec.h
  index 0f97a97..b0428cc 100644
  --- a/kexec/kexec.h
  +++ b/kexec/kexec.h
  @@ -169,6 +169,7 @@ struct kexec_info {
   	int command_line_len;
   
   	int skip_checks;
  +	unsigned long elfcorehdr;
   };
   
   struct arch_map_entry {
  @@ -231,7 +232,8 @@ extern int file_types;
   #define OPT_PRINT_CKR_SIZE	262
   #define OPT_LOAD_LIVE_UPDATE	263
   #define OPT_EXEC_LIVE_UPDATE	264
  -#define OPT_MAX			265
  +#define OPT_HOTPLUG		265
  +#define OPT_MAX		266
   #define KEXEC_OPTIONS \
   	{ "help",		0, 0, OPT_HELP }, \
   	{ "version",		0, 0, OPT_VERSION }, \
  @@ -258,6 +260,7 @@ extern int file_types;
   	{ "debug",		0, 0, OPT_DEBUG }, \
   	{ "status",		0, 0, OPT_STATUS }, \
   	{ "print-ckr-size",     0, 0, OPT_PRINT_CKR_SIZE }, \
  +	{ "hotplug",		0, 0, OPT_HOTPLUG }, \
   
   #define KEXEC_OPT_STR "h?vdfixyluet:pscaS"
   
  @@ -290,6 +293,8 @@ extern unsigned long add_buffer_phys_virt(struct kexec_info *info,
   	int buf_end, int phys);
   extern void arch_reuse_initrd(void);
   
  +extern long int get_nr_cpus(void);
  +
   extern int ifdown(void);
   
   extern char purgatory[];

Regards,
eric
---
v12: 9sep2022
 - Rebased onto 6.0-rc4
 - Addressed some minor formatting items, per Baoquan

v11: 26aug2022
 https://lkml.org/lkml/2022/8/26/963
 - Rebased onto 6.0-rc2
 - Redid the rework of __weak to use asm/kexec.h, per Baoquan
 - Reworked some comments and minor items, per Baoquan

v10: 21jul2022
 https://lkml.org/lkml/2022/7/21/1007
 - Rebased to 5.19.0-rc7
 - Per Sourabh, corrected build issue with arch_un/map_crash_pages()
   for architectures not supporting this feature.
 - Per David Hildenbrand, removed the WARN_ONCE() altogether.
 - Per Dave Hansen, converted to use of kmap_local_page().
 - Per Baoquan He, replaced use of __weak with the kexec technique.

v9: 13jun2022
 https://lkml.org/lkml/2022/6/13/3382
 - Rebased to 5.18.0
 - Per Sourabh, moved crash_prepare_elf64_headers() into common
   crash_core.c to avoid compile issues with kexec_load only path.
 - Per David Hildebrand, replaced mutex_trylock() with mutex_lock().
 - Changed the __weak arch_crash_handle_hotplug_event() to utilize
   WARN_ONCE() instead of WARN(). Fixed some formatting issues.
 - Per Sourabh, introduced sysfs attribute crash_hotplug for memory
   and CPUs; for use by userspace (udev) to determine if the kernel
   performs crash hot un/plug support.
 - Per Sourabh, moved the code detecting the elfcorehdr segment from
   arch/x86 into crash_core:handle_hotplug_event() so both kexec_load
   and kexec_file_load can benefit.
 - Updated userspace kexec-tools kexec utility to reflect change to
   using CRASH_MAX_MEMORY_RANGES and get_nr_cpus().
 - Updated the new proposed udev rules to reflect using the sysfs
   attributes crash_hotplug.

v8: 5may2022
 https://lkml.org/lkml/2022/5/5/1133
 - Per Borislav Petkov, eliminated CONFIG_CRASH_HOTPLUG in favor
   of CONFIG_HOTPLUG_CPU || CONFIG_MEMORY_HOTPLUG, ie a new define
   is not needed. Also use of IS_ENABLED() rather than #ifdef's.
   Renamed crash_hotplug_handler() to handle_hotplug_event().
   And other corrections.
 - Per Baoquan, minimized the parameters to
   arch_crash_handle_hotplug_event() to hp_action and cpu.
 - Introduce KEXEC_CRASH_HP_INVALID_CPU definition, per Baoquan.
 - Per Sourabh Jain, renamed and repurposed CRASH_HOTPLUG_ELFCOREHDR_SZ
   to CONFIG_CRASH_MAX_MEMORY_RANGES, mirroring kexec-tools change
   by David Hildenbrand. Folded this patch into the x86
   kexec_file_load support patch.

v7: 13apr2022
 https://lkml.org/lkml/2022/4/13/850
 - Resolved parameter usage to crash_hotplug_handler(), per Baoquan.

v6: 1apr2022
 https://lkml.org/lkml/2022/4/1/1203
 - Reword commit messages and some comment cleanup per Baoquan.
 - Changed elf_index to elfcorehdr_index for clarity.
 - Minor code changes per Baoquan.

v5: 3mar2022
 https://lkml.org/lkml/2022/3/3/674
 - Reworded description of CRASH_HOTPLUG_ELFCOREHDR_SZ, per
   David Hildenbrand.
 - Refactored slightly a few patches per Baoquan recommendation.

v4: 9feb2022
 https://lkml.org/lkml/2022/2/9/1406
 - Refactored patches per Baoquan's suggestions.
 - A few corrections, per Baoquan.

v3: 10jan2022
 https://lkml.org/lkml/2022/1/10/1212
 - Rebased per Baoquan He's request.
 - Changed memory notifier per David Hildenbrand.
 - Providing example kexec userspace change in cover letter.

RFC v2: 7dec2021
 https://lkml.org/lkml/2021/12/7/1088
 - Acting upon Baoquan He's suggestion of removing the elfcorehdr
   from the purgatory list of segments, removed the purgatory code
   from the patchset; it is significantly simpler now.

RFC v1: 18nov2021
 https://lkml.org/lkml/2021/11/18/845
 - working patchset demonstrating kernel handling of hotplug
   updates to x86 elfcorehdr for kexec_file_load

RFC: 14dec2020
 https://lkml.org/lkml/2020/12/14/532
 - proposed concept of allowing kernel to handle hotplug update
   of elfcorehdr
---

Eric DeVolder (7):
  crash: move crash_prepare_elf64_headers
  crash: prototype change for crash_prepare_elf64_headers
  crash: add generic infrastructure for crash hotplug support
  kexec: exclude elfcorehdr from the segment digest
  kexec: exclude hot remove cpu from elfcorehdr notes
  crash: memory and cpu hotplug sysfs attributes
  x86/crash: Add x86 crash hotplug support

 .../admin-guide/mm/memory-hotplug.rst         |   8 +
 Documentation/core-api/cpu_hotplug.rst        |  18 ++
 arch/arm64/kernel/machine_kexec_file.c        |   6 +-
 arch/powerpc/kexec/file_load_64.c             |   2 +-
 arch/riscv/kernel/elf_kexec.c                 |   7 +-
 arch/x86/Kconfig                              |  11 +
 arch/x86/include/asm/kexec.h                  |  20 ++
 arch/x86/kernel/crash.c                       | 104 +++++++-
 drivers/base/cpu.c                            |  14 +
 drivers/base/memory.c                         |  13 +
 include/linux/crash_core.h                    |   8 +
 include/linux/kexec.h                         |  41 ++-
 kernel/crash_core.c                           | 249 ++++++++++++++++++
 kernel/kexec_file.c                           | 105 +-------
 14 files changed, 497 insertions(+), 109 deletions(-)

-- 
2.31.1



* [PATCH v12 1/7] crash: move crash_prepare_elf64_headers
  2022-09-09 21:05 [PATCH v12 0/7] crash: Kernel handling of CPU and memory hot un/plug Eric DeVolder
@ 2022-09-09 21:05 ` Eric DeVolder
  2022-09-09 21:05 ` [PATCH v12 2/7] crash: prototype change for crash_prepare_elf64_headers Eric DeVolder
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 57+ messages in thread
From: Eric DeVolder @ 2022-09-09 21:05 UTC
  To: linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal
  Cc: tglx, mingo, bp, dave.hansen, hpa, nramas, thomas.lendacky, robh,
	efault, rppt, david, sourabhjain, konrad.wilk, boris.ostrovsky,
	eric.devolder

At the outcome of this patch set, crash_prepare_elf64_headers()
is utilized on both the kexec_file_load and kexec_load paths. As
such, this function needs to move out of kexec_file.c and into the
common location crash_core.c.

No functionality change.

Signed-off-by: Eric DeVolder <eric.devolder@oracle.com>
Acked-by: Baoquan He <bhe@redhat.com>
---
 kernel/crash_core.c | 100 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/kexec_file.c |  99 -------------------------------------------
 2 files changed, 100 insertions(+), 99 deletions(-)

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index a0eb4d5cf557..46c160d14045 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -10,6 +10,7 @@
 #include <linux/utsname.h>
 #include <linux/vmalloc.h>
 #include <linux/sizes.h>
+#include <linux/kexec.h>
 
 #include <asm/page.h>
 #include <asm/sections.h>
@@ -314,6 +315,105 @@ static int __init parse_crashkernel_dummy(char *arg)
 }
 early_param("crashkernel", parse_crashkernel_dummy);
 
+int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
+			  void **addr, unsigned long *sz)
+{
+	Elf64_Ehdr *ehdr;
+	Elf64_Phdr *phdr;
+	unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz;
+	unsigned char *buf;
+	unsigned int cpu, i;
+	unsigned long long notes_addr;
+	unsigned long mstart, mend;
+
+	/* extra phdr for vmcoreinfo ELF note */
+	nr_phdr = nr_cpus + 1;
+	nr_phdr += mem->nr_ranges;
+
+	/*
+	 * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
+	 * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64).
+	 * I think this is required by tools like gdb. So same physical
+	 * memory will be mapped in two ELF headers. One will contain kernel
+	 * text virtual addresses and other will have __va(physical) addresses.
+	 */
+
+	nr_phdr++;
+	elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr);
+	elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN);
+
+	buf = vzalloc(elf_sz);
+	if (!buf)
+		return -ENOMEM;
+
+	ehdr = (Elf64_Ehdr *)buf;
+	phdr = (Elf64_Phdr *)(ehdr + 1);
+	memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
+	ehdr->e_ident[EI_CLASS] = ELFCLASS64;
+	ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
+	ehdr->e_ident[EI_VERSION] = EV_CURRENT;
+	ehdr->e_ident[EI_OSABI] = ELF_OSABI;
+	memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD);
+	ehdr->e_type = ET_CORE;
+	ehdr->e_machine = ELF_ARCH;
+	ehdr->e_version = EV_CURRENT;
+	ehdr->e_phoff = sizeof(Elf64_Ehdr);
+	ehdr->e_ehsize = sizeof(Elf64_Ehdr);
+	ehdr->e_phentsize = sizeof(Elf64_Phdr);
+
+	/* Prepare one phdr of type PT_NOTE for each present CPU */
+	for_each_present_cpu(cpu) {
+		phdr->p_type = PT_NOTE;
+		notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
+		phdr->p_offset = phdr->p_paddr = notes_addr;
+		phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t);
+		(ehdr->e_phnum)++;
+		phdr++;
+	}
+
+	/* Prepare one PT_NOTE header for vmcoreinfo */
+	phdr->p_type = PT_NOTE;
+	phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
+	phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
+	(ehdr->e_phnum)++;
+	phdr++;
+
+	/* Prepare PT_LOAD type program header for kernel text region */
+	if (need_kernel_map) {
+		phdr->p_type = PT_LOAD;
+		phdr->p_flags = PF_R|PF_W|PF_X;
+		phdr->p_vaddr = (unsigned long) _text;
+		phdr->p_filesz = phdr->p_memsz = _end - _text;
+		phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
+		ehdr->e_phnum++;
+		phdr++;
+	}
+
+	/* Go through all the ranges in mem->ranges[] and prepare phdr */
+	for (i = 0; i < mem->nr_ranges; i++) {
+		mstart = mem->ranges[i].start;
+		mend = mem->ranges[i].end;
+
+		phdr->p_type = PT_LOAD;
+		phdr->p_flags = PF_R|PF_W|PF_X;
+		phdr->p_offset  = mstart;
+
+		phdr->p_paddr = mstart;
+		phdr->p_vaddr = (unsigned long) __va(mstart);
+		phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
+		phdr->p_align = 0;
+		ehdr->e_phnum++;
+		pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
+			phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
+			ehdr->e_phnum, phdr->p_offset);
+		phdr++;
+	}
+
+	*addr = buf;
+	*sz = elf_sz;
+	return 0;
+}
+
 Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
 			  void *data, size_t data_len)
 {
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 1d546dc97c50..8017eeb43036 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -1217,102 +1217,3 @@ int crash_exclude_mem_range(struct crash_mem *mem,
 	mem->nr_ranges++;
 	return 0;
 }
-
-int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
-			  void **addr, unsigned long *sz)
-{
-	Elf64_Ehdr *ehdr;
-	Elf64_Phdr *phdr;
-	unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz;
-	unsigned char *buf;
-	unsigned int cpu, i;
-	unsigned long long notes_addr;
-	unsigned long mstart, mend;
-
-	/* extra phdr for vmcoreinfo ELF note */
-	nr_phdr = nr_cpus + 1;
-	nr_phdr += mem->nr_ranges;
-
-	/*
-	 * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
-	 * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64).
-	 * I think this is required by tools like gdb. So same physical
-	 * memory will be mapped in two ELF headers. One will contain kernel
-	 * text virtual addresses and other will have __va(physical) addresses.
-	 */
-
-	nr_phdr++;
-	elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr);
-	elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN);
-
-	buf = vzalloc(elf_sz);
-	if (!buf)
-		return -ENOMEM;
-
-	ehdr = (Elf64_Ehdr *)buf;
-	phdr = (Elf64_Phdr *)(ehdr + 1);
-	memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
-	ehdr->e_ident[EI_CLASS] = ELFCLASS64;
-	ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
-	ehdr->e_ident[EI_VERSION] = EV_CURRENT;
-	ehdr->e_ident[EI_OSABI] = ELF_OSABI;
-	memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD);
-	ehdr->e_type = ET_CORE;
-	ehdr->e_machine = ELF_ARCH;
-	ehdr->e_version = EV_CURRENT;
-	ehdr->e_phoff = sizeof(Elf64_Ehdr);
-	ehdr->e_ehsize = sizeof(Elf64_Ehdr);
-	ehdr->e_phentsize = sizeof(Elf64_Phdr);
-
-	/* Prepare one phdr of type PT_NOTE for each present CPU */
-	for_each_present_cpu(cpu) {
-		phdr->p_type = PT_NOTE;
-		notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
-		phdr->p_offset = phdr->p_paddr = notes_addr;
-		phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t);
-		(ehdr->e_phnum)++;
-		phdr++;
-	}
-
-	/* Prepare one PT_NOTE header for vmcoreinfo */
-	phdr->p_type = PT_NOTE;
-	phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
-	phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
-	(ehdr->e_phnum)++;
-	phdr++;
-
-	/* Prepare PT_LOAD type program header for kernel text region */
-	if (need_kernel_map) {
-		phdr->p_type = PT_LOAD;
-		phdr->p_flags = PF_R|PF_W|PF_X;
-		phdr->p_vaddr = (unsigned long) _text;
-		phdr->p_filesz = phdr->p_memsz = _end - _text;
-		phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
-		ehdr->e_phnum++;
-		phdr++;
-	}
-
-	/* Go through all the ranges in mem->ranges[] and prepare phdr */
-	for (i = 0; i < mem->nr_ranges; i++) {
-		mstart = mem->ranges[i].start;
-		mend = mem->ranges[i].end;
-
-		phdr->p_type = PT_LOAD;
-		phdr->p_flags = PF_R|PF_W|PF_X;
-		phdr->p_offset  = mstart;
-
-		phdr->p_paddr = mstart;
-		phdr->p_vaddr = (unsigned long) __va(mstart);
-		phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
-		phdr->p_align = 0;
-		ehdr->e_phnum++;
-		pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
-			phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
-			ehdr->e_phnum, phdr->p_offset);
-		phdr++;
-	}
-
-	*addr = buf;
-	*sz = elf_sz;
-	return 0;
-}
-- 
2.31.1



* [PATCH v12 2/7] crash: prototype change for crash_prepare_elf64_headers
  2022-09-09 21:05 [PATCH v12 0/7] crash: Kernel handling of CPU and memory hot un/plug Eric DeVolder
  2022-09-09 21:05 ` [PATCH v12 1/7] crash: move crash_prepare_elf64_headers Eric DeVolder
@ 2022-09-09 21:05 ` Eric DeVolder
  2022-09-09 21:05 ` [PATCH v12 3/7] crash: add generic infrastructure for crash hotplug support Eric DeVolder
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 57+ messages in thread
From: Eric DeVolder @ 2022-09-09 21:05 UTC
  To: linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal
  Cc: tglx, mingo, bp, dave.hansen, hpa, nramas, thomas.lendacky, robh,
	efault, rppt, david, sourabhjain, konrad.wilk, boris.ostrovsky,
	eric.devolder

From within crash_prepare_elf64_headers() there is a need to
reference the struct kimage hotplug members. As such, this
change passes the struct kimage as a parameter to
crash_prepare_elf64_headers(). The hotplug members are added
in "crash: add generic infrastructure for crash hotplug support".

This is preparation for a later patch; no functionality change.

Signed-off-by: Eric DeVolder <eric.devolder@oracle.com>
Acked-by: Baoquan He <bhe@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
---
 arch/arm64/kernel/machine_kexec_file.c | 6 +++---
 arch/powerpc/kexec/file_load_64.c      | 2 +-
 arch/riscv/kernel/elf_kexec.c          | 7 ++++---
 arch/x86/kernel/crash.c                | 2 +-
 include/linux/kexec.h                  | 7 +++++--
 kernel/crash_core.c                    | 4 ++--
 6 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c
index a11a6e14ba89..2f7b773a83bb 100644
--- a/arch/arm64/kernel/machine_kexec_file.c
+++ b/arch/arm64/kernel/machine_kexec_file.c
@@ -39,7 +39,7 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image)
 	return kexec_image_post_load_cleanup_default(image);
 }
 
-static int prepare_elf_headers(void **addr, unsigned long *sz)
+static int prepare_elf_headers(struct kimage *image, void **addr, unsigned long *sz)
 {
 	struct crash_mem *cmem;
 	unsigned int nr_ranges;
@@ -64,7 +64,7 @@ static int prepare_elf_headers(void **addr, unsigned long *sz)
 	}
 
 	/* Exclude crashkernel region */
-	ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end);
+	ret = crash_exclude_mem_range(image, cmem, crashk_res.start, crashk_res.end);
 	if (ret)
 		goto out;
 
@@ -74,7 +74,7 @@ static int prepare_elf_headers(void **addr, unsigned long *sz)
 			goto out;
 	}
 
-	ret = crash_prepare_elf64_headers(cmem, true, addr, sz);
+	ret = crash_prepare_elf64_headers(image, cmem, true, addr, sz);
 
 out:
 	kfree(cmem);
diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
index 349a781cea0b..a0af9966a8f0 100644
--- a/arch/powerpc/kexec/file_load_64.c
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -798,7 +798,7 @@ static int load_elfcorehdr_segment(struct kimage *image, struct kexec_buf *kbuf)
 		goto out;
 
 	/* Setup elfcorehdr segment */
-	ret = crash_prepare_elf64_headers(cmem, false, &headers, &headers_sz);
+	ret = crash_prepare_elf64_headers(image, cmem, false, &headers, &headers_sz);
 	if (ret) {
 		pr_err("Failed to prepare elf headers for the core\n");
 		goto out;
diff --git a/arch/riscv/kernel/elf_kexec.c b/arch/riscv/kernel/elf_kexec.c
index 0cb94992c15b..ffde73228108 100644
--- a/arch/riscv/kernel/elf_kexec.c
+++ b/arch/riscv/kernel/elf_kexec.c
@@ -118,7 +118,8 @@ static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg)
 	return 0;
 }
 
-static int prepare_elf_headers(void **addr, unsigned long *sz)
+static int prepare_elf_headers(struct kimage *image,
+	void **addr, unsigned long *sz)
 {
 	struct crash_mem *cmem;
 	unsigned int nr_ranges;
@@ -140,7 +141,7 @@ static int prepare_elf_headers(void **addr, unsigned long *sz)
 	/* Exclude crashkernel region */
 	ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end);
 	if (!ret)
-		ret = crash_prepare_elf64_headers(cmem, true, addr, sz);
+		ret = crash_prepare_elf64_headers(image, cmem, true, addr, sz);
 
 out:
 	kfree(cmem);
@@ -212,7 +213,7 @@ static void *elf_kexec_load(struct kimage *image, char *kernel_buf,
 
 	/* Add elfcorehdr */
 	if (image->type == KEXEC_TYPE_CRASH) {
-		ret = prepare_elf_headers(&headers, &headers_sz);
+		ret = prepare_elf_headers(image, &headers, &headers_sz);
 		if (ret) {
 			pr_err("Preparing elf core header failed\n");
 			goto out;
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 9730c88530fc..9ceb93c176a6 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -265,7 +265,7 @@ static int prepare_elf_headers(struct kimage *image, void **addr,
 		goto out;
 
 	/* By default prepare 64bit headers */
-	ret =  crash_prepare_elf64_headers(cmem, IS_ENABLED(CONFIG_X86_64), addr, sz);
+	ret =  crash_prepare_elf64_headers(image, cmem, IS_ENABLED(CONFIG_X86_64), addr, sz);
 
 out:
 	vfree(cmem);
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 13e6c4b58f07..4eefa631e0ae 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -253,8 +253,11 @@ struct crash_mem {
 extern int crash_exclude_mem_range(struct crash_mem *mem,
 				   unsigned long long mstart,
 				   unsigned long long mend);
-extern int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
-				       void **addr, unsigned long *sz);
+extern int crash_prepare_elf64_headers(struct kimage *image,
+				   struct crash_mem *mem,
+				   int need_kernel_map,
+				   void **addr,
+				   unsigned long *sz);
 
 #ifndef arch_kexec_apply_relocations_add
 /*
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 46c160d14045..8c648fd5897a 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -315,8 +315,8 @@ static int __init parse_crashkernel_dummy(char *arg)
 }
 early_param("crashkernel", parse_crashkernel_dummy);
 
-int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
-			  void **addr, unsigned long *sz)
+int crash_prepare_elf64_headers(struct kimage *image, struct crash_mem *mem,
+			  int need_kernel_map, void **addr, unsigned long *sz)
 {
 	Elf64_Ehdr *ehdr;
 	Elf64_Phdr *phdr;
-- 
2.31.1



* [PATCH v12 3/7] crash: add generic infrastructure for crash hotplug support
  2022-09-09 21:05 [PATCH v12 0/7] crash: Kernel handling of CPU and memory hot un/plug Eric DeVolder
  2022-09-09 21:05 ` [PATCH v12 1/7] crash: move crash_prepare_elf64_headers Eric DeVolder
  2022-09-09 21:05 ` [PATCH v12 2/7] crash: prototype change for crash_prepare_elf64_headers Eric DeVolder
@ 2022-09-09 21:05 ` Eric DeVolder
  2022-10-03 17:51   ` Sourabh Jain
  2022-10-04  6:38   ` Sourabh Jain
  2022-09-09 21:05 ` [PATCH v12 4/7] kexec: exclude elfcorehdr from the segment digest Eric DeVolder
                   ` (4 subsequent siblings)
  7 siblings, 2 replies; 57+ messages in thread
From: Eric DeVolder @ 2022-09-09 21:05 UTC
  To: linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal
  Cc: tglx, mingo, bp, dave.hansen, hpa, nramas, thomas.lendacky, robh,
	efault, rppt, david, sourabhjain, konrad.wilk, boris.ostrovsky,
	eric.devolder

CPU and memory change notifications are received in order to
regenerate the elfcorehdr.

To support cpu hotplug, a callback is registered to capture the
CPUHP_AP_ONLINE_DYN online and offline events via
cpuhp_setup_state_nocalls().

To support memory hotplug, a notifier is registered to capture the
MEM_ONLINE and MEM_OFFLINE events via register_memory_notifier().

The cpu callback and memory notifiers call handle_hotplug_event(),
which performs the needed housekeeping and then dispatches the event
to the architecture-specific arch_crash_handle_hotplug_event().
Throughout, the kexec_mutex is held.

Signed-off-by: Eric DeVolder <eric.devolder@oracle.com>
Acked-by: Baoquan He <bhe@redhat.com>
---
 include/linux/crash_core.h |   8 +++
 include/linux/kexec.h      |  26 +++++++
 kernel/crash_core.c        | 134 +++++++++++++++++++++++++++++++++++++
 3 files changed, 168 insertions(+)

diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index de62a722431e..a270f8660538 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -84,4 +84,12 @@ int parse_crashkernel_high(char *cmdline, unsigned long long system_ram,
 int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
 		unsigned long long *crash_size, unsigned long long *crash_base);
 
+#define KEXEC_CRASH_HP_REMOVE_CPU		0
+#define KEXEC_CRASH_HP_ADD_CPU			1
+#define KEXEC_CRASH_HP_REMOVE_MEMORY		2
+#define KEXEC_CRASH_HP_ADD_MEMORY		3
+#define KEXEC_CRASH_HP_INVALID_CPU		-1U
+
+struct kimage;
+
 #endif /* LINUX_CRASH_CORE_H */
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 4eefa631e0ae..9597b41136ec 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -374,6 +374,13 @@ struct kimage {
 	struct purgatory_info purgatory_info;
 #endif
 
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
+	bool hotplug_event;
+	unsigned int offlinecpu;
+	bool elfcorehdr_index_valid;
+	int elfcorehdr_index;
+#endif
+
 #ifdef CONFIG_IMA_KEXEC
 	/* Virtual address of IMA measurement buffer for kexec syscall */
 	void *ima_buffer;
@@ -503,6 +510,25 @@ static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, g
 static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { }
 #endif
 
+#ifndef arch_map_crash_pages
+static inline void *arch_map_crash_pages(unsigned long paddr,
+		unsigned long size)
+{
+	return NULL;
+}
+#endif
+
+#ifndef arch_unmap_crash_pages
+static inline void arch_unmap_crash_pages(void **ptr) { }
+#endif
+
+#ifndef arch_crash_handle_hotplug_event
+static inline void arch_crash_handle_hotplug_event(struct kimage *image,
+		unsigned int hp_action)
+{
+}
+#endif
+
 #else /* !CONFIG_KEXEC_CORE */
 struct pt_regs;
 struct task_struct;
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 8c648fd5897a..4b15d91f0b21 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -11,6 +11,8 @@
 #include <linux/vmalloc.h>
 #include <linux/sizes.h>
 #include <linux/kexec.h>
+#include <linux/memory.h>
+#include <linux/cpuhotplug.h>
 
 #include <asm/page.h>
 #include <asm/sections.h>
@@ -18,6 +20,7 @@
 #include <crypto/sha1.h>
 
 #include "kallsyms_internal.h"
+#include "kexec_internal.h"
 
 /* vmcoreinfo stuff */
 unsigned char *vmcoreinfo_data;
@@ -612,3 +615,134 @@ static int __init crash_save_vmcoreinfo_init(void)
 }
 
 subsys_initcall(crash_save_vmcoreinfo_init);
+
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
+/*
+ * To accurately reflect hot un/plug changes, the elfcorehdr (which
+ * is passed to the crash kernel via the elfcorehdr= parameter)
+ * must be updated with the new list of CPUs and memories.
+ *
+ * In order to make changes to elfcorehdr, two conditions are needed:
+ * First, the segment containing the elfcorehdr must be large enough
+ * to permit a growing number of resources. The elfcorehdr memory is
+ * typically based on CONFIG_NR_CPUS and CONFIG_CRASH_MAX_MEMORY_RANGES.
+ * Second, purgatory must explicitly exclude the elfcorehdr from the
+ * list of segments it checks (since the elfcorehdr changes and thus
+ * would require an update to purgatory itself to update the digest).
+ */
+static void handle_hotplug_event(unsigned int hp_action, unsigned int cpu)
+{
+	/* Obtain lock while changing crash information */
+	mutex_lock(&kexec_mutex);
+
+	/* Check kdump is loaded */
+	if (kexec_crash_image) {
+		struct kimage *image = kexec_crash_image;
+
+		if (hp_action == KEXEC_CRASH_HP_ADD_CPU ||
+			hp_action == KEXEC_CRASH_HP_REMOVE_CPU)
+			pr_debug("crash hp: hp_action %u, cpu %u\n", hp_action, cpu);
+		else
+			pr_debug("crash hp: hp_action %u\n", hp_action);
+
+		/*
+		 * When the struct kimage is allocated, it is wiped to zero, so
+		 * the elfcorehdr_index_valid defaults to false. Find the
+		 * segment containing the elfcorehdr, if not already found.
+		 * This works for both the kexec_load and kexec_file_load paths.
+		 */
+		if (!image->elfcorehdr_index_valid) {
+			unsigned char *ptr;
+			unsigned long mem, memsz;
+			unsigned int n;
+
+			for (n = 0; n < image->nr_segments; n++) {
+				mem = image->segment[n].mem;
+				memsz = image->segment[n].memsz;
+				ptr = arch_map_crash_pages(mem, memsz);
+				if (ptr) {
+					/* The segment containing elfcorehdr */
+					if (memcmp(ptr, ELFMAG, SELFMAG) == 0) {
+						image->elfcorehdr_index = (int)n;
+						image->elfcorehdr_index_valid = true;
+					}
+				}
+				arch_unmap_crash_pages((void **)&ptr);
+			}
+		}
+
+		if (!image->elfcorehdr_index_valid) {
+			pr_err("crash hp: unable to locate elfcorehdr segment");
+			goto out;
+		}
+
+		/* Needed in order for the segments to be updated */
+		arch_kexec_unprotect_crashkres();
+
+		/* Flag to differentiate between normal load and hotplug */
+		image->hotplug_event = true;
+
+		/* Now invoke arch-specific update handler */
+		arch_crash_handle_hotplug_event(image, hp_action);
+
+		/* No longer handling a hotplug event */
+		image->hotplug_event = false;
+
+		/* Change back to read-only */
+		arch_kexec_protect_crashkres();
+	}
+
+out:
+	/* Release lock now that update complete */
+	mutex_unlock(&kexec_mutex);
+}
+
+static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v)
+{
+	switch (val) {
+	case MEM_ONLINE:
+		handle_hotplug_event(KEXEC_CRASH_HP_ADD_MEMORY, 0);
+		break;
+
+	case MEM_OFFLINE:
+		handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_MEMORY, 0);
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block crash_memhp_nb = {
+	.notifier_call = crash_memhp_notifier,
+	.priority = 0
+};
+
+static int crash_cpuhp_online(unsigned int cpu)
+{
+	handle_hotplug_event(KEXEC_CRASH_HP_ADD_CPU, cpu);
+	return 0;
+}
+
+static int crash_cpuhp_offline(unsigned int cpu)
+{
+	handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_CPU, cpu);
+	return 0;
+}
+
+static int __init crash_hotplug_init(void)
+{
+	int result = 0;
+
+	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG))
+		register_memory_notifier(&crash_memhp_nb);
+
+	if (IS_ENABLED(CONFIG_HOTPLUG_CPU))
+		result = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+						   "crash/cpuhp",
+						   crash_cpuhp_online,
+						   crash_cpuhp_offline);
+
+	return result;
+}
+
+subsys_initcall(crash_hotplug_init);
+#endif
-- 
2.31.1



* [PATCH v12 4/7] kexec: exclude elfcorehdr from the segment digest
  2022-09-09 21:05 [PATCH v12 0/7] crash: Kernel handling of CPU and memory hot un/plug Eric DeVolder
                   ` (2 preceding siblings ...)
  2022-09-09 21:05 ` [PATCH v12 3/7] crash: add generic infrastructure for crash hotplug support Eric DeVolder
@ 2022-09-09 21:05 ` Eric DeVolder
  2022-09-09 21:05 ` [PATCH v12 5/7] kexec: exclude hot remove cpu from elfcorehdr notes Eric DeVolder
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 57+ messages in thread
From: Eric DeVolder @ 2022-09-09 21:05 UTC
  To: linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal
  Cc: tglx, mingo, bp, dave.hansen, hpa, nramas, thomas.lendacky, robh,
	efault, rppt, david, sourabhjain, konrad.wilk, boris.ostrovsky,
	eric.devolder

When a crash kernel is loaded via the kexec_file_load syscall, the
kernel places the various segments (ie crash kernel, crash initrd,
boot_params, elfcorehdr, purgatory, etc) in memory. For those
architectures that utilize purgatory, a hash digest of the segments
is calculated for integrity checking. This digest is embedded into
the purgatory image prior to placing purgatory in memory.

Since hotplug events cause changes to the elfcorehdr, purgatory
integrity checking fails (at crash time, and no vmcore is created).
As a result, this change explicitly excludes the elfcorehdr segment
from the list of segments used to create the digest. By doing so,
this permits changes to the elfcorehdr in response to hotplug events,
without having to also reload purgatory due to the change to the
digest.

Signed-off-by: Eric DeVolder <eric.devolder@oracle.com>
Acked-by: Baoquan He <bhe@redhat.com>
---
 kernel/kexec_file.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 8017eeb43036..d0c2661b3509 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -723,6 +723,12 @@ static int kexec_calculate_store_digests(struct kimage *image)
 	for (j = i = 0; i < image->nr_segments; i++) {
 		struct kexec_segment *ksegment;
 
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
+		/* Exclude elfcorehdr segment to allow future changes via hotplug */
+		if (image->elfcorehdr_index_valid && (j == image->elfcorehdr_index))
+			continue;
+#endif
+
 		ksegment = &image->segment[i];
 		/*
 		 * Skip purgatory as it will be modified once we put digest
-- 
2.31.1



* [PATCH v12 5/7] kexec: exclude hot remove cpu from elfcorehdr notes
  2022-09-09 21:05 [PATCH v12 0/7] crash: Kernel handling of CPU and memory hot un/plug Eric DeVolder
                   ` (3 preceding siblings ...)
  2022-09-09 21:05 ` [PATCH v12 4/7] kexec: exclude elfcorehdr from the segment digest Eric DeVolder
@ 2022-09-09 21:05 ` Eric DeVolder
  2022-09-09 21:05 ` [PATCH v12 6/7] crash: memory and cpu hotplug sysfs attributes Eric DeVolder
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 57+ messages in thread
From: Eric DeVolder @ 2022-09-09 21:05 UTC
  To: linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal
  Cc: tglx, mingo, bp, dave.hansen, hpa, nramas, thomas.lendacky, robh,
	efault, rppt, david, sourabhjain, konrad.wilk, boris.ostrovsky,
	eric.devolder

Due to the use of CPUHP_AP_ONLINE_DYN, upon CPU unplug, the CPU is
still in the for_each_present_cpu() list when within
handle_hotplug_event(). Thus the CPU must be explicitly excluded
when building the new list of CPUs.

This change identifies the CPU to be excluded in
handle_hotplug_event(), and adds the check that excludes that CPU
in crash_prepare_elf64_headers().

Signed-off-by: Eric DeVolder <eric.devolder@oracle.com>
Acked-by: Baoquan He <bhe@redhat.com>
---
 kernel/crash_core.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 4b15d91f0b21..5bc5159d9cb1 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -366,6 +366,11 @@ int crash_prepare_elf64_headers(struct kimage *image, struct crash_mem *mem,
 
 	/* Prepare one phdr of type PT_NOTE for each present CPU */
 	for_each_present_cpu(cpu) {
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
+		/* Skip the soon-to-be offlined cpu */
+		if (image->hotplug_event && (cpu == image->offlinecpu))
+			continue;
+#endif
 		phdr->p_type = PT_NOTE;
 		notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
 		phdr->p_offset = phdr->p_paddr = notes_addr;
@@ -682,6 +687,16 @@ static void handle_hotplug_event(unsigned int hp_action, unsigned int cpu)
 		/* Flag to differentiate between normal load and hotplug */
 		image->hotplug_event = true;
 
+		/*
+		 * Due to use of CPUHP_AP_ONLINE_DYN, upon unplug and during
+		 * this callback, the CPU is still in the for_each_present_cpu()
+		 * list. Must explicitly look to exclude this CPU when building
+		 * new list.
+		 */
+		image->offlinecpu =
+			(hp_action == KEXEC_CRASH_HP_REMOVE_CPU) ?
+				cpu : KEXEC_CRASH_HP_INVALID_CPU;
+
 		/* Now invoke arch-specific update handler */
 		arch_crash_handle_hotplug_event(image, hp_action);
 
-- 
2.31.1



* [PATCH v12 6/7] crash: memory and cpu hotplug sysfs attributes
  2022-09-09 21:05 [PATCH v12 0/7] crash: Kernel handling of CPU and memory hot un/plug Eric DeVolder
                   ` (4 preceding siblings ...)
  2022-09-09 21:05 ` [PATCH v12 5/7] kexec: exclude hot remove cpu from elfcorehdr notes Eric DeVolder
@ 2022-09-09 21:05 ` Eric DeVolder
  2022-09-09 21:05 ` [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support Eric DeVolder
  2022-09-12  3:47 ` [PATCH v12 0/7] crash: Kernel handling of CPU and memory hot un/plug Baoquan He
  7 siblings, 0 replies; 57+ messages in thread
From: Eric DeVolder @ 2022-09-09 21:05 UTC
  To: linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal
  Cc: tglx, mingo, bp, dave.hansen, hpa, nramas, thomas.lendacky, robh,
	efault, rppt, david, sourabhjain, konrad.wilk, boris.ostrovsky,
	eric.devolder

This introduces the crash_hotplug attribute for memory and CPUs
for use by userspace.  This change directly facilitates the udev
rule for managing userspace re-loading of the crash kernel upon
hot un/plug changes.

For memory, this changeset introduces the crash_hotplug attribute
to the /sys/devices/system/memory directory. For example:

 # udevadm info --attribute-walk /sys/devices/system/memory/memory81
  looking at device '/devices/system/memory/memory81':
    KERNEL=="memory81"
    SUBSYSTEM=="memory"
    DRIVER==""
    ATTR{online}=="1"
    ATTR{phys_device}=="0"
    ATTR{phys_index}=="00000051"
    ATTR{removable}=="1"
    ATTR{state}=="online"
    ATTR{valid_zones}=="Movable"

  looking at parent device '/devices/system/memory':
    KERNELS=="memory"
    SUBSYSTEMS==""
    DRIVERS==""
    ATTRS{auto_online_blocks}=="offline"
    ATTRS{block_size_bytes}=="8000000"
    ATTRS{crash_hotplug}=="1"

For CPUs, this changeset introduces the crash_hotplug attribute
to the /sys/devices/system/cpu directory. For example:

 # udevadm info --attribute-walk /sys/devices/system/cpu/cpu0
  looking at device '/devices/system/cpu/cpu0':
    KERNEL=="cpu0"
    SUBSYSTEM=="cpu"
    DRIVER=="processor"
    ATTR{crash_notes}=="277c38600"
    ATTR{crash_notes_size}=="368"
    ATTR{online}=="1"

  looking at parent device '/devices/system/cpu':
    KERNELS=="cpu"
    SUBSYSTEMS==""
    DRIVERS==""
    ATTRS{crash_hotplug}=="1"
    ATTRS{isolated}==""
    ATTRS{kernel_max}=="8191"
    ATTRS{nohz_full}=="  (null)"
    ATTRS{offline}=="4-7"
    ATTRS{online}=="0-3"
    ATTRS{possible}=="0-7"
    ATTRS{present}=="0-3"

With these sysfs attributes in place, it is possible to efficiently
instruct the udev rule to skip crash kernel reloading.

For example, the following is the proposed udev rule change for RHEL
system 98-kexec.rules (as the first lines of the rule file):

 # The kernel handles updates to crash elfcorehdr for cpu and memory changes
 SUBSYSTEM=="cpu", ATTRS{crash_hotplug}=="1", GOTO="kdump_reload_end"
 SUBSYSTEM=="memory", ATTRS{crash_hotplug}=="1", GOTO="kdump_reload_end"

When examined in the context of 98-kexec.rules, the above change
tests if crash_hotplug is set, and if so, it skips the userspace
initiated unload-then-reload of the crash kernel.

CPU and memory checks are separated in accordance with the
CONFIG_HOTPLUG_CPU and CONFIG_MEMORY_HOTPLUG kernel config options.
If an architecture supports, for example, memory hotplug but not
CPU hotplug, then the /sys/devices/system/memory/crash_hotplug
attribute file is present, but the /sys/devices/system/cpu/crash_hotplug
attribute file will NOT be present. Thus the udev rule will skip
userspace processing of memory hot un/plug events, but the rule
match will fail for CPU events, allowing userspace to process CPU
hot un/plug events (ie the unload-then-reload of the kdump capture
kernel).

Signed-off-by: Eric DeVolder <eric.devolder@oracle.com>
Acked-by: Baoquan He <bhe@redhat.com>
---
 .../admin-guide/mm/memory-hotplug.rst          |  8 ++++++++
 Documentation/core-api/cpu_hotplug.rst         | 18 ++++++++++++++++++
 drivers/base/cpu.c                             | 14 ++++++++++++++
 drivers/base/memory.c                          | 13 +++++++++++++
 include/linux/kexec.h                          |  8 ++++++++
 5 files changed, 61 insertions(+)

diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst
index a3c9e8ad8fa0..15fd1751a63c 100644
--- a/Documentation/admin-guide/mm/memory-hotplug.rst
+++ b/Documentation/admin-guide/mm/memory-hotplug.rst
@@ -293,6 +293,14 @@ The following files are currently defined:
 		       Availability depends on the CONFIG_ARCH_MEMORY_PROBE
 		       kernel configuration option.
 ``uevent``	       read-write: generic udev file for device subsystems.
+``crash_hotplug``      read-only: when changes to the system memory map
+		       occur due to hot un/plug of memory, this file contains
+		       '1' if the kernel updates the kdump capture kernel memory
+		       map itself (via elfcorehdr), or '0' if userspace must update
+		       the kdump capture kernel memory map.
+
+		       Availability depends on the CONFIG_MEMORY_HOTPLUG kernel
+		       configuration option.
 ====================== =========================================================
 
 .. note::
diff --git a/Documentation/core-api/cpu_hotplug.rst b/Documentation/core-api/cpu_hotplug.rst
index c6f4ba2fb32d..13e33d098645 100644
--- a/Documentation/core-api/cpu_hotplug.rst
+++ b/Documentation/core-api/cpu_hotplug.rst
@@ -750,6 +750,24 @@ will receive all events. A script like::
 
 can process the event further.
 
+When changes to the CPUs in the system occur, the sysfs file
+/sys/devices/system/cpu/crash_hotplug contains '1' if the kernel
+updates the kdump capture kernel list of CPUs itself (via elfcorehdr),
+or '0' if userspace must update the kdump capture kernel list of CPUs.
+
+The availability depends on the CONFIG_HOTPLUG_CPU kernel configuration
+option.
+
+To skip userspace processing of CPU hot un/plug events for kdump
+(ie the unload-then-reload to obtain a current list of CPUs), this sysfs
+file can be used in a udev rule as follows:
+
+ SUBSYSTEM=="cpu", ATTRS{crash_hotplug}=="1", GOTO="kdump_reload_end"
+
+For a cpu hot un/plug event, if the architecture supports kernel updates
+of the elfcorehdr (which contains the list of CPUs), then the rule skips
+the unload-then-reload of the kdump capture kernel.
+
 Kernel Inline Documentations Reference
 ======================================
 
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 4c98849577d4..fedbf87f9d13 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -293,6 +293,17 @@ static ssize_t print_cpus_nohz_full(struct device *dev,
 static DEVICE_ATTR(nohz_full, 0444, print_cpus_nohz_full, NULL);
 #endif
 
+#ifdef CONFIG_HOTPLUG_CPU
+#include <linux/kexec.h>
+static ssize_t crash_hotplug_show(struct device *dev,
+				     struct device_attribute *attr,
+				     char *buf)
+{
+	return sprintf(buf, "%d\n", crash_hotplug_cpu_support());
+}
+static DEVICE_ATTR_ADMIN_RO(crash_hotplug);
+#endif
+
 static void cpu_device_release(struct device *dev)
 {
 	/*
@@ -469,6 +480,9 @@ static struct attribute *cpu_root_attrs[] = {
 #ifdef CONFIG_NO_HZ_FULL
 	&dev_attr_nohz_full.attr,
 #endif
+#ifdef CONFIG_HOTPLUG_CPU
+	&dev_attr_crash_hotplug.attr,
+#endif
 #ifdef CONFIG_GENERIC_CPU_AUTOPROBE
 	&dev_attr_modalias.attr,
 #endif
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index bc60c9cd3230..b754918c3dac 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -483,6 +483,16 @@ static ssize_t auto_online_blocks_store(struct device *dev,
 
 static DEVICE_ATTR_RW(auto_online_blocks);
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+#include <linux/kexec.h>
+static ssize_t crash_hotplug_show(struct device *dev,
+				       struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", crash_hotplug_memory_support());
+}
+static DEVICE_ATTR_RO(crash_hotplug);
+#endif
+
 /*
  * Some architectures will have custom drivers to do this, and
  * will not need to do it from userspace.  The fake hot-add code
@@ -887,6 +897,9 @@ static struct attribute *memory_root_attrs[] = {
 
 	&dev_attr_block_size_bytes.attr,
 	&dev_attr_auto_online_blocks.attr,
+#ifdef CONFIG_MEMORY_HOTPLUG
+	&dev_attr_crash_hotplug.attr,
+#endif
 	NULL
 };
 
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 9597b41136ec..a48577a36fb8 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -529,6 +529,14 @@ static inline void arch_crash_handle_hotplug_event(struct kimage *image,
 }
 #endif
 
+#ifndef crash_hotplug_cpu_support
+static inline int crash_hotplug_cpu_support(void) { return 0; }
+#endif
+
+#ifndef crash_hotplug_memory_support
+static inline int crash_hotplug_memory_support(void) { return 0; }
+#endif
+
 #else /* !CONFIG_KEXEC_CORE */
 struct pt_regs;
 struct task_struct;
-- 
2.31.1



* [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-09-09 21:05 [PATCH v12 0/7] crash: Kernel handling of CPU and memory hot un/plug Eric DeVolder
                   ` (5 preceding siblings ...)
  2022-09-09 21:05 ` [PATCH v12 6/7] crash: memory and cpu hotplug sysfs attributes Eric DeVolder
@ 2022-09-09 21:05 ` Eric DeVolder
  2022-09-12  6:52   ` Borislav Petkov
  2022-09-19  7:06   ` Sourabh Jain
  2022-09-12  3:47 ` [PATCH v12 0/7] crash: Kernel handling of CPU and memory hot un/plug Baoquan He
  7 siblings, 2 replies; 57+ messages in thread
From: Eric DeVolder @ 2022-09-09 21:05 UTC
  To: linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal
  Cc: tglx, mingo, bp, dave.hansen, hpa, nramas, thomas.lendacky, robh,
	efault, rppt, david, sourabhjain, konrad.wilk, boris.ostrovsky,
	eric.devolder

For x86_64, when CPU or memory is hot un/plugged, the crash
elfcorehdr, which describes the CPUs and memory in the system,
must also be updated.

When loading the crash kernel via kexec_load or kexec_file_load,
the elfcorehdr is identified at run time in
crash_core:handle_hotplug_event().

To update the elfcorehdr for x86_64, a new elfcorehdr must be
generated from the available CPUs and memory. The new elfcorehdr
is prepared into a buffer, and then installed over the top of
the existing elfcorehdr.

In the patch 'kexec: exclude elfcorehdr from the segment digest',
the need to update purgatory due to a change in the elfcorehdr was
eliminated. As a result, no changes to purgatory or boot_params are
needed (the elfcorehdr= kernel command line parameter pointer
remains unchanged and correct); only the elfcorehdr itself changes.

To accommodate a growing number of resources via hotplug, the
elfcorehdr segment must be sized sufficiently large to accommodate
changes; see the CRASH_MAX_MEMORY_RANGES config item.
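
As a rough worked example (assuming CONFIG_NR_CPUS_DEFAULT=8192 and
the default CRASH_MAX_MEMORY_RANGES=32768), the elfcorehdr segment
would be sized as:

 (8192 + 32768) * sizeof(Elf64_Phdr) = 40960 * 56 bytes, or ~2.2 MiB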

With this change, crash hotplug for the kexec_file_load syscall
is supported. The kexec_load syscall is also supported, but
requires a corresponding change to the userspace kexec-tools.

Signed-off-by: Eric DeVolder <eric.devolder@oracle.com>
Acked-by: Baoquan He <bhe@redhat.com>
---
 arch/x86/Kconfig             |  11 ++++
 arch/x86/include/asm/kexec.h |  20 +++++++
 arch/x86/kernel/crash.c      | 102 +++++++++++++++++++++++++++++++++++
 3 files changed, 133 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f9920f1341c8..cdfc9b2fdf98 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2056,6 +2056,17 @@ config CRASH_DUMP
 	  (CONFIG_RELOCATABLE=y).
 	  For more details see Documentation/admin-guide/kdump/kdump.rst
 
+config CRASH_MAX_MEMORY_RANGES
+	depends on CRASH_DUMP && KEXEC_FILE && (HOTPLUG_CPU || MEMORY_HOTPLUG)
+	int
+	default 32768
+	help
+	  For the kexec_file_load path, specify the maximum number of
+	  memory regions, eg. as represented by the 'System RAM' entries
+	  in /proc/iomem, that the elfcorehdr buffer/segment can accommodate.
+	  This value is combined with NR_CPUS and multiplied by Elf64_Phdr
+	  size to determine the final buffer size.
+
 config KEXEC_JUMP
 	bool "kexec jump"
 	depends on KEXEC && HIBERNATION
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index a3760ca796aa..432073385b2d 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -212,6 +212,26 @@ typedef void crash_vmclear_fn(void);
 extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
 extern void kdump_nmi_shootdown_cpus(void);
 
+void *arch_map_crash_pages(unsigned long paddr, unsigned long size);
+#define arch_map_crash_pages arch_map_crash_pages
+
+void arch_unmap_crash_pages(void **ptr);
+#define arch_unmap_crash_pages arch_unmap_crash_pages
+
+void arch_crash_handle_hotplug_event(struct kimage *image,
+		unsigned int hp_action);
+#define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event
+
+#ifdef CONFIG_HOTPLUG_CPU
+static inline int crash_hotplug_cpu_support(void) { return 1; }
+#define crash_hotplug_cpu_support crash_hotplug_cpu_support
+#endif
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static inline int crash_hotplug_memory_support(void) { return 1; }
+#define crash_hotplug_memory_support crash_hotplug_memory_support
+#endif
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_X86_KEXEC_H */
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 9ceb93c176a6..8fc7d678ac72 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -25,6 +25,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/memblock.h>
+#include <linux/highmem.h>
 
 #include <asm/processor.h>
 #include <asm/hardirq.h>
@@ -397,7 +398,18 @@ int crash_load_segments(struct kimage *image)
 	image->elf_headers = kbuf.buffer;
 	image->elf_headers_sz = kbuf.bufsz;
 
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
+	/* Ensure elfcorehdr segment large enough for hotplug changes */
+	kbuf.memsz =
+		(CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES) *
+			sizeof(Elf64_Phdr);
+	/* Mark as usable to crash kernel, else crash kernel fails on boot */
+	image->elf_headers_sz = kbuf.memsz;
+	image->elfcorehdr_index = image->nr_segments;
+	image->elfcorehdr_index_valid = true;
+#else
 	kbuf.memsz = kbuf.bufsz;
+#endif
 	kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
 	kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
 	ret = kexec_add_buffer(&kbuf);
@@ -412,3 +424,93 @@ int crash_load_segments(struct kimage *image)
 	return ret;
 }
 #endif /* CONFIG_KEXEC_FILE */
+
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
+/*
+ * NOTE: The addresses and sizes passed to this routine have
+ * already been fully aligned on page boundaries. There is no
+ * need for massaging the address or size.
+ */
+void *arch_map_crash_pages(unsigned long paddr, unsigned long size)
+{
+	void *ptr = NULL;
+
+	if (size > 0) {
+		struct page *page = pfn_to_page(paddr >> PAGE_SHIFT);
+
+		ptr = kmap_local_page(page);
+	}
+
+	return ptr;
+}
+
+void arch_unmap_crash_pages(void **ptr)
+{
+	if (ptr) {
+		if (*ptr)
+			kunmap_local(*ptr);
+		*ptr = NULL;
+	}
+}
+
+/**
+ * arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes
+ * @image: the active struct kimage
+ * @hp_action: the hot un/plug action being handled
+ *
+ * To accurately reflect hot un/plug changes, the new elfcorehdr
+ * is prepared in a kernel buffer, and then it is written on top
+ * of the existing/old elfcorehdr.
+ */
+void arch_crash_handle_hotplug_event(struct kimage *image,
+	unsigned int hp_action)
+{
+	struct kexec_segment *ksegment;
+	unsigned char *ptr = NULL;
+	unsigned long elfsz = 0;
+	void *elfbuf = NULL;
+	unsigned long mem, memsz;
+
+	/*
+	 * Elfcorehdr_index_valid checked in crash_core:handle_hotplug_event()
+	 */
+	ksegment = &image->segment[image->elfcorehdr_index];
+	mem = ksegment->mem;
+	memsz = ksegment->memsz;
+
+	/*
+	 * Create the new elfcorehdr reflecting the changes to CPU and/or
+	 * memory resources.
+	 */
+	if (prepare_elf_headers(image, &elfbuf, &elfsz)) {
+		pr_err("crash hp: unable to prepare elfcore headers");
+		goto out;
+	}
+	if (elfsz > memsz) {
+		pr_err("crash hp: update elfcorehdr elfsz %lu > memsz %lu",
+			elfsz, memsz);
+		goto out;
+	}
+
+	/*
+	 * At this point, we are all but assured of success.
+	 * Copy new elfcorehdr into destination.
+	 */
+	ptr = arch_map_crash_pages(mem, memsz);
+	if (ptr) {
+		/*
+		 * Temporarily invalidate the crash image while the
+		 * elfcorehdr is updated.
+		 */
+		xchg(&kexec_crash_image, NULL);
+		memcpy_flushcache((void *)ptr, elfbuf, elfsz);
+		xchg(&kexec_crash_image, image);
+	}
+	arch_unmap_crash_pages((void **)&ptr);
+	pr_debug("crash hp: re-loaded elfcorehdr at 0x%lx\n", mem);
+
+out:
+	if (elfbuf)
+		vfree(elfbuf);
+}
+#endif
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 0/7] crash: Kernel handling of CPU and memory hot un/plug
  2022-09-09 21:05 [PATCH v12 0/7] crash: Kernel handling of CPU and memory hot un/plug Eric DeVolder
                   ` (6 preceding siblings ...)
  2022-09-09 21:05 ` [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support Eric DeVolder
@ 2022-09-12  3:47 ` Baoquan He
  7 siblings, 0 replies; 57+ messages in thread
From: Baoquan He @ 2022-09-12  3:47 UTC (permalink / raw)
  To: Eric DeVolder, bp, akpm, x86
  Cc: linux-kernel, kexec, ebiederm, dyoung, vgoyal, tglx, mingo, bp,
	dave.hansen, hpa, nramas, thomas.lendacky, robh, efault, rppt,
	david, sourabhjain, konrad.wilk, boris.ostrovsky

On 09/09/22 at 05:05pm, Eric DeVolder wrote:
> When the kdump service is loaded, if a CPU or memory is hot
> un/plugged, the crash elfcorehdr, which describes the CPUs
> and memory in the system, must also be updated, else the resulting
> vmcore is inaccurate (eg. missing either CPU context or memory
> regions).
> 
> The current solution utilizes udev to initiate an unload-then-reload
> of the kdump image (e. kernel, initrd, boot_params, puratory and
> elfcorehdr) by the userspace kexec utility. In previous posts I have
> outlined the significant performance problems related to offloading
> this activity to userspace.
> 
> This patchset introduces a generic crash hot un/plug handler that
> registers with the CPU and memory notifiers. Upon CPU or memory
> changes, this generic handler is invoked and performs important
> housekeeping, for example obtaining the appropriate lock, and then
> invokes an architecture specific handler to do the appropriate
> updates.
> 
> In the case of x86_64, the arch specific handler generates a new
> elfcorehdr, and overwrites the old one in memory. No involvement
> with userspace needed.

Thanks a lot for all the effort, Eric.

Hi Boris, Andrew,

This version looks good to me. It introduces a framework for kdump
to react to mem/cpu hotplug events and adds the x86 handler. Should
this go through the x86 tree or the mm tree? Please check what else
we need to do to fix or improve it.

Thanks
Baoquan


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-09-09 21:05 ` [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support Eric DeVolder
@ 2022-09-12  6:52   ` Borislav Petkov
  2022-09-13 19:12     ` Eric DeVolder
  2022-09-19  7:06   ` Sourabh Jain
  1 sibling, 1 reply; 57+ messages in thread
From: Borislav Petkov @ 2022-09-12  6:52 UTC (permalink / raw)
  To: Eric DeVolder
  Cc: linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal, tglx,
	mingo, dave.hansen, hpa, nramas, thomas.lendacky, robh, efault,
	rppt, david, sourabhjain, konrad.wilk, boris.ostrovsky

On Fri, Sep 09, 2022 at 05:05:09PM -0400, Eric DeVolder wrote:
> For x86_64, when CPU or memory is hot un/plugged, the crash
> elfcorehdr, which describes the CPUs and memory in the system,
> must also be updated.
> 
> When loading the crash kernel via kexec_load or kexec_file_load,

Please end function names with parentheses. Check the whole patch pls.

> the elfcorehdr is identified at run time in
> crash_core:handle_hotplug_event().
> 
> To update the elfcorehdr for x86_64, a new elfcorehdr must be
> generated from the available CPUs and memory. The new elfcorehdr
> is prepared into a buffer, and then installed over the top of
> the existing elfcorehdr.
> 
> In the patch 'kexec: exclude elfcorehdr from the segment digest'
> the need to update purgatory due to the change in elfcorehdr was
> eliminated.  As a result, no changes to purgatory or boot_params
> (as the elfcorehdr= kernel command line parameter pointer
> remains unchanged and correct) are needed, just elfcorehdr.
> 
> To accommodate a growing number of resources via hotplug, the
> elfcorehdr segment must be sufficiently large enough to accommodate
> changes, see the CRASH_MAX_MEMORY_RANGES configure item.
> 
> With this change, crash hotplug for kexec_file_load syscall
> is supported.

Redundant sentence.

> The kexec_load is also supported, but also
> requires a corresponding change to userspace kexec-tools.

Ditto.

> Signed-off-by: Eric DeVolder <eric.devolder@oracle.com>
> Acked-by: Baoquan He <bhe@redhat.com>
> ---
>  arch/x86/Kconfig             |  11 ++++
>  arch/x86/include/asm/kexec.h |  20 +++++++
>  arch/x86/kernel/crash.c      | 102 +++++++++++++++++++++++++++++++++++
>  3 files changed, 133 insertions(+)
> 
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index f9920f1341c8..cdfc9b2fdf98 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -2056,6 +2056,17 @@ config CRASH_DUMP
>  	  (CONFIG_RELOCATABLE=y).
>  	  For more details see Documentation/admin-guide/kdump/kdump.rst
>  
> +config CRASH_MAX_MEMORY_RANGES
> +	depends on CRASH_DUMP && KEXEC_FILE && (HOTPLUG_CPU || MEMORY_HOTPLUG)
> +	int
> +	default 32768
> +	help
> +	  For the kexec_file_load path, specify the maximum number of
> +	  memory regions, eg. as represented by the 'System RAM' entries
> +	  in /proc/iomem, that the elfcorehdr buffer/segment can accommodate.
> +	  This value is combined with NR_CPUS and multiplied by Elf64_Phdr
> +	  size to determine the final buffer size.

If I'm purely a user, I'm left wondering how to determine what to
specify. Do you have a guidance text somewhere you can point to from
here?

> +
>  config KEXEC_JUMP
>  	bool "kexec jump"
>  	depends on KEXEC && HIBERNATION
> diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
> index a3760ca796aa..432073385b2d 100644
> --- a/arch/x86/include/asm/kexec.h
> +++ b/arch/x86/include/asm/kexec.h
> @@ -212,6 +212,26 @@ typedef void crash_vmclear_fn(void);
>  extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
>  extern void kdump_nmi_shootdown_cpus(void);
>  
> +void *arch_map_crash_pages(unsigned long paddr, unsigned long size);
> +#define arch_map_crash_pages arch_map_crash_pages
> +
> +void arch_unmap_crash_pages(void **ptr);
> +#define arch_unmap_crash_pages arch_unmap_crash_pages
> +
> +void arch_crash_handle_hotplug_event(struct kimage *image,
> +		unsigned int hp_action);
> +#define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event
> +
> +#ifdef CONFIG_HOTPLUG_CPU
> +static inline int crash_hotplug_cpu_support(void) { return 1; }
> +#define crash_hotplug_cpu_support crash_hotplug_cpu_support
> +#endif
> +
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +static inline int crash_hotplug_memory_support(void) { return 1; }
> +#define crash_hotplug_memory_support crash_hotplug_memory_support
> +#endif
> +
>  #endif /* __ASSEMBLY__ */
>  
>  #endif /* _ASM_X86_KEXEC_H */
> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
> index 9ceb93c176a6..8fc7d678ac72 100644
> --- a/arch/x86/kernel/crash.c
> +++ b/arch/x86/kernel/crash.c
> @@ -25,6 +25,7 @@
>  #include <linux/slab.h>
>  #include <linux/vmalloc.h>
>  #include <linux/memblock.h>
> +#include <linux/highmem.h>
>  
>  #include <asm/processor.h>
>  #include <asm/hardirq.h>
> @@ -397,7 +398,18 @@ int crash_load_segments(struct kimage *image)
>  	image->elf_headers = kbuf.buffer;
>  	image->elf_headers_sz = kbuf.bufsz;
>  
> +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
> +	/* Ensure elfcorehdr segment large enough for hotplug changes */
> +	kbuf.memsz =
> +		(CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES) *
> +			sizeof(Elf64_Phdr);


	kbuf.memsz  = CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES;
	kbuf.memsz *= sizeof(Elf64_Phdr);

looks more readable to me.


> +	/* Mark as usable to crash kernel, else crash kernel fails on boot */
> +	image->elf_headers_sz = kbuf.memsz;
> +	image->elfcorehdr_index = image->nr_segments;
> +	image->elfcorehdr_index_valid = true;
> +#else
>  	kbuf.memsz = kbuf.bufsz;

Do that initialization at the top where you declare kbuf and get rid of
the #else branch.

> +#endif
>  	kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
>  	kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
>  	ret = kexec_add_buffer(&kbuf);
> @@ -412,3 +424,93 @@ int crash_load_segments(struct kimage *image)
>  	return ret;
>  }
>  #endif /* CONFIG_KEXEC_FILE */
> +
> +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)

This ugly ifdeffery is still here. Why don't you have stubs for the
!defined() cases in the header so that you can drop those here?
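
IOW, the usual stub pattern — a quick sketch using one of your
functions (the header declares the real thing when the config is on,
and an empty static inline otherwise):

	#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
	void arch_crash_handle_hotplug_event(struct kimage *image,
					     unsigned int hp_action);
	#else
	static inline void
	arch_crash_handle_hotplug_event(struct kimage *image,
					unsigned int hp_action) { }
	#endif

so the callers build unconditionally, without any ifdeffery at the
call site.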

> +/*
> + * NOTE: The addresses and sizes passed to this routine have
> + * already been fully aligned on page boundaries. There is no
> + * need for massaging the address or size.
> + */
> +void *arch_map_crash_pages(unsigned long paddr, unsigned long size)
> +{
> +	void *ptr = NULL;
> +
> +	if (size > 0) {
> +		struct page *page = pfn_to_page(paddr >> PAGE_SHIFT);
> +
> +		ptr = kmap_local_page(page);
> +	}
> +
> +	return ptr;
> +}

	if (size > 0)
		return kmap_local_page(pfn_to_page(paddr >> PAGE_SHIFT));
	else
		return NULL;

That's it.

> +
> +void arch_unmap_crash_pages(void **ptr)
> +{
> +	if (ptr) {
> +		if (*ptr)
> +			kunmap_local(*ptr);
> +		*ptr = NULL;
> +	}

Oh wow, this is just nuts. Why does it have to pass in a pointer to
pointer which you have to carefully check twice? And why is it a void
**?

And why are those called arch_ if all I see is the x86 variants? Are
there gonna be other arches? And even if, why can't the other arches do
kmap_local_page() too?

> +}
> +
> +/**
> + * arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes
> + * @image: the active struct kimage
> + * @hp_action: the hot un/plug action being handled
> + *
> + * To accurately reflect hot un/plug changes, the new elfcorehdr
> + * is prepared in a kernel buffer, and then it is written on top
> + * of the existing/old elfcorehdr.
> + */
> +void arch_crash_handle_hotplug_event(struct kimage *image,
> +	unsigned int hp_action)

Align arguments on the opening brace.

> +{
> +	struct kexec_segment *ksegment;
> +	unsigned char *ptr = NULL;
> +	unsigned long elfsz = 0;
> +	void *elfbuf = NULL;
> +	unsigned long mem, memsz;

Please sort function local variables declaration in a reverse christmas
tree order:

	<type A> longest_variable_name;
	<type B> shorter_var_name;
	<type C> even_shorter;
	<type D> i;

> +
> +	/*
> +	 * Elfcorehdr_index_valid checked in crash_core:handle_hotplug_event()

Elfcorehdr_index_valid??

> +	 */
> +	ksegment = &image->segment[image->elfcorehdr_index];
> +	mem = ksegment->mem;
> +	memsz = ksegment->memsz;
> +
> +	/*
> +	 * Create the new elfcorehdr reflecting the changes to CPU and/or
> +	 * memory resources.
> +	 */
> +	if (prepare_elf_headers(image, &elfbuf, &elfsz)) {
> +		pr_err("crash hp: unable to prepare elfcore headers");
			^^^^^^^^

this thing is done with pr_fmt(). Grep the tree for examples.
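
The usual shape, roughly (the prefix string is yours to pick):

	#undef pr_fmt
	#define pr_fmt(fmt) "crash hp: " fmt

	/* then a plain pr_err() gets the prefix automatically: */
	pr_err("unable to prepare elfcore headers\n");
	/* -> "crash hp: unable to prepare elfcore headers" */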

> +		goto out;
> +	}

The three lines above reading ksegment need to be here, where the test
is done.

> +	if (elfsz > memsz) {
> +		pr_err("crash hp: update elfcorehdr elfsz %lu > memsz %lu",
> +			elfsz, memsz);
> +		goto out;
> +	}
> +
> +	/*
> +	 * At this point, we are all but assured of success.

Who is "we"?

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-09-12  6:52   ` Borislav Petkov
@ 2022-09-13 19:12     ` Eric DeVolder
  2022-09-26 19:19       ` Eric DeVolder
  2022-09-28 16:07       ` Borislav Petkov
  0 siblings, 2 replies; 57+ messages in thread
From: Eric DeVolder @ 2022-09-13 19:12 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal, tglx,
	mingo, dave.hansen, hpa, nramas, thomas.lendacky, robh, efault,
	rppt, david, sourabhjain, konrad.wilk, boris.ostrovsky

Boris,
Thanks for the feedback! Inline responses below.
eric

On 9/12/22 01:52, Borislav Petkov wrote:
> On Fri, Sep 09, 2022 at 05:05:09PM -0400, Eric DeVolder wrote:
>> For x86_64, when CPU or memory is hot un/plugged, the crash
>> elfcorehdr, which describes the CPUs and memory in the system,
>> must also be updated.
>>
>> When loading the crash kernel via kexec_load or kexec_file_load,
> 
> Please end function names with parentheses. Check the whole patch pls.
Done.

> 
>> the elfcorehdr is identified at run time in
>> crash_core:handle_hotplug_event().
>>
>> To update the elfcorehdr for x86_64, a new elfcorehdr must be
>> generated from the available CPUs and memory. The new elfcorehdr
>> is prepared into a buffer, and then installed over the top of
>> the existing elfcorehdr.
>>
>> In the patch 'kexec: exclude elfcorehdr from the segment digest'
>> the need to update purgatory due to the change in elfcorehdr was
>> eliminated.  As a result, no changes to purgatory or boot_params
>> (as the elfcorehdr= kernel command line parameter pointer
>> remains unchanged and correct) are needed, just elfcorehdr.
>>
>> To accommodate a growing number of resources via hotplug, the
>> elfcorehdr segment must be sufficiently large enough to accommodate
>> changes, see the CRASH_MAX_MEMORY_RANGES configure item.
>>
>> With this change, crash hotplug for kexec_file_load syscall
>> is supported.
> 
> Redundant sentence.
Removed.

> 
>> The kexec_load is also supported, but also
>> requires a corresponding change to userspace kexec-tools.
> 
> Ditto.
Removed.

> 
>> Signed-off-by: Eric DeVolder <eric.devolder@oracle.com>
>> Acked-by: Baoquan He <bhe@redhat.com>
>> ---
>>   arch/x86/Kconfig             |  11 ++++
>>   arch/x86/include/asm/kexec.h |  20 +++++++
>>   arch/x86/kernel/crash.c      | 102 +++++++++++++++++++++++++++++++++++
>>   3 files changed, 133 insertions(+)
>>
>> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
>> index f9920f1341c8..cdfc9b2fdf98 100644
>> --- a/arch/x86/Kconfig
>> +++ b/arch/x86/Kconfig
>> @@ -2056,6 +2056,17 @@ config CRASH_DUMP
>>   	  (CONFIG_RELOCATABLE=y).
>>   	  For more details see Documentation/admin-guide/kdump/kdump.rst
>>   
>> +config CRASH_MAX_MEMORY_RANGES
>> +	depends on CRASH_DUMP && KEXEC_FILE && (HOTPLUG_CPU || MEMORY_HOTPLUG)
>> +	int
>> +	default 32768
>> +	help
>> +	  For the kexec_file_load path, specify the maximum number of
>> +	  memory regions, eg. as represented by the 'System RAM' entries
>> +	  in /proc/iomem, that the elfcorehdr buffer/segment can accommodate.
>> +	  This value is combined with NR_CPUS and multiplied by Elf64_Phdr
>> +	  size to determine the final buffer size.
> 
> If I'm purely a user, I'm left wondering how to determine what to
> specify. Do you have a guidance text somewhere you can point to from
> here?

This topic was discussed previously https://lkml.org/lkml/2022/3/3/372.
David points out that terminology is tricky here due to differing behaviors.
And perhaps that is your point in asking for guidance text. It can be
complicated, but it all comes down to System RAM entries.

I could perhaps offer an overly simplified example: with a 1GiB memory
block size, the CRASH_MAX_MEMORY_RANGES default of 32768 would allow for
32TiB of memory?
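
To spell out the arithmetic (back-of-the-envelope; the 8192 CPU count
is just an assumed CONFIG_NR_CPUS_DEFAULT, and sizeof(Elf64_Phdr) is
56 bytes on 64-bit):

	/*
	 * segment size = (NR_CPUS_DEFAULT + CRASH_MAX_MEMORY_RANGES)
	 *                * sizeof(Elf64_Phdr)
	 *              = (8192 + 32768) * 56 bytes ~= 2.2 MiB reserved
	 *
	 * coverage at a 1GiB block size: 32768 * 1 GiB = 32 TiB
	 */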

> 
>> +
>>   config KEXEC_JUMP
>>   	bool "kexec jump"
>>   	depends on KEXEC && HIBERNATION
>> diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
>> index a3760ca796aa..432073385b2d 100644
>> --- a/arch/x86/include/asm/kexec.h
>> +++ b/arch/x86/include/asm/kexec.h
>> @@ -212,6 +212,26 @@ typedef void crash_vmclear_fn(void);
>>   extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
>>   extern void kdump_nmi_shootdown_cpus(void);
>>   
>> +void *arch_map_crash_pages(unsigned long paddr, unsigned long size);
>> +#define arch_map_crash_pages arch_map_crash_pages
>> +
>> +void arch_unmap_crash_pages(void **ptr);
>> +#define arch_unmap_crash_pages arch_unmap_crash_pages
>> +
>> +void arch_crash_handle_hotplug_event(struct kimage *image,
>> +		unsigned int hp_action);
>> +#define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event
>> +
>> +#ifdef CONFIG_HOTPLUG_CPU
>> +static inline int crash_hotplug_cpu_support(void) { return 1; }
>> +#define crash_hotplug_cpu_support crash_hotplug_cpu_support
>> +#endif
>> +
>> +#ifdef CONFIG_MEMORY_HOTPLUG
>> +static inline int crash_hotplug_memory_support(void) { return 1; }
>> +#define crash_hotplug_memory_support crash_hotplug_memory_support
>> +#endif
>> +
>>   #endif /* __ASSEMBLY__ */
>>   
>>   #endif /* _ASM_X86_KEXEC_H */
>> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
>> index 9ceb93c176a6..8fc7d678ac72 100644
>> --- a/arch/x86/kernel/crash.c
>> +++ b/arch/x86/kernel/crash.c
>> @@ -25,6 +25,7 @@
>>   #include <linux/slab.h>
>>   #include <linux/vmalloc.h>
>>   #include <linux/memblock.h>
>> +#include <linux/highmem.h>
>>   
>>   #include <asm/processor.h>
>>   #include <asm/hardirq.h>
>> @@ -397,7 +398,18 @@ int crash_load_segments(struct kimage *image)
>>   	image->elf_headers = kbuf.buffer;
>>   	image->elf_headers_sz = kbuf.bufsz;
>>   
>> +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
>> +	/* Ensure elfcorehdr segment large enough for hotplug changes */
>> +	kbuf.memsz =
>> +		(CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES) *
>> +			sizeof(Elf64_Phdr);
> 
> 
> 	kbuf.memsz  = CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES;
> 	kbuf.memsz *= sizeof(Elf64_Phdr);
> 
> looks more readable to me.
Done.

> 
> 
>> +	/* Mark as usable to crash kernel, else crash kernel fails on boot */
>> +	image->elf_headers_sz = kbuf.memsz;
>> +	image->elfcorehdr_index = image->nr_segments;
>> +	image->elfcorehdr_index_valid = true;
>> +#else
>>   	kbuf.memsz = kbuf.bufsz;
> 
> Do that initialization at the top where you declare kbuf and get rid of
> the #else branch.
The kbuf.bufsz value is obtained via a call to prepare_elf_headers(); I cannot initialize it at its
declaration.

> 
>> +#endif
>>   	kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
>>   	kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
>>   	ret = kexec_add_buffer(&kbuf);
>> @@ -412,3 +424,93 @@ int crash_load_segments(struct kimage *image)
>>   	return ret;
>>   }
>>   #endif /* CONFIG_KEXEC_FILE */
>> +
>> +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
> 
> This ugly ifdeffery is still here. Why don't you have stubs for the
> !defined() cases in the header so that you can drop those here?
> 

I'm at a loss as to what to do differently here. You've raised this issue before; I went back and
looked at the suggestions then, and I don't see how they apply to this situation. How is this
situation different from the #ifdef CONFIG_KEXEC_FILE that immediately precedes it?

I've included a copy of the current state of this section below for additional markup.

>> +/*
>> + * NOTE: The addresses and sizes passed to this routine have
>> + * already been fully aligned on page boundaries. There is no
>> + * need for massaging the address or size.
>> + */
>> +void *arch_map_crash_pages(unsigned long paddr, unsigned long size)
>> +{
>> +	void *ptr = NULL;
>> +
>> +	if (size > 0) {
>> +		struct page *page = pfn_to_page(paddr >> PAGE_SHIFT);
>> +
>> +		ptr = kmap_local_page(page);
>> +	}
>> +
>> +	return ptr;
>> +}
> 
> 	if (size > 0)
> 		return kmap_local_page(pfn_to_page(paddr >> PAGE_SHIFT));
> 	else
> 		return NULL;
> 
> That's it.
Done.

> 
>> +
>> +void arch_unmap_crash_pages(void **ptr)
>> +{
>> +	if (ptr) {
>> +		if (*ptr)
>> +			kunmap_local(*ptr);
>> +		*ptr = NULL;
>> +	}
> 
> Oh wow, this is just nuts. Why does it have to pass in a pointer to
> pointer which you have to carefully check twice? And why is it a void
> **?
A long time ago this made sense, but it no longer makes sense. I've corrected this.

> 
> And why are those called arch_ if all I see is the x86 variants? Are
> there gonna be other arches? And even if, why can't the other arches do
> kmap_local_page() too?
Currently there is a concurrent effort for PPC support by Sourabh Jain, and in that effort 
arch_map_crash_pages() is using __va(paddr).

I do not know the nuances between kmap_local_page() and __va() to answer the question.

If kmap_local_page() works for all archs, then I'm happy to drop these arch_ variants
and use it directly.

> 
>> +}
>> +
>> +/**
>> + * arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes
>> + * @image: the active struct kimage
>> + * @hp_action: the hot un/plug action being handled
>> + *
>> + * To accurately reflect hot un/plug changes, the new elfcorehdr
>> + * is prepared in a kernel buffer, and then it is written on top
>> + * of the existing/old elfcorehdr.
>> + */
>> +void arch_crash_handle_hotplug_event(struct kimage *image,
>> +	unsigned int hp_action)
> 
> Align arguments on the opening brace.
Done.

> 
>> +{
>> +	struct kexec_segment *ksegment;
>> +	unsigned char *ptr = NULL;
>> +	unsigned long elfsz = 0;
>> +	void *elfbuf = NULL;
>> +	unsigned long mem, memsz;
> 
> Please sort function local variables declaration in a reverse christmas
> tree order:
> 
> 	<type A> longest_variable_name;
> 	<type B> shorter_var_name;
> 	<type C> even_shorter;
> 	<type D> i;
> 
Done.

>> +
>> +	/*
>> +	 * Elfcorehdr_index_valid checked in crash_core:handle_hotplug_event()
> 
> Elfcorehdr_index_valid??
Comment reworked.


> 
>> +	 */
>> +	ksegment = &image->segment[image->elfcorehdr_index];
>> +	mem = ksegment->mem;
>> +	memsz = ksegment->memsz;
>> +
>> +	/*
>> +	 * Create the new elfcorehdr reflecting the changes to CPU and/or
>> +	 * memory resources.
>> +	 */
>> +	if (prepare_elf_headers(image, &elfbuf, &elfsz)) {
>> +		pr_err("crash hp: unable to prepare elfcore headers");
> 			^^^^^^^^
> 
> this thing is done with pr_fmt(). Grep the tree for examples.
Done, thanks for pointing that out.

> 
>> +		goto out;
>> +	}
> 
> The three lines above reading ksegment need to be here, where the test
> is done.
Done.

> 
>> +	if (elfsz > memsz) {
>> +		pr_err("crash hp: update elfcorehdr elfsz %lu > memsz %lu",
>> +			elfsz, memsz);
>> +		goto out;
>> +	}
>> +
>> +	/*
>> +	 * At this point, we are all but assured of success.
> 
> Who is "we"?
> 
Comment reworked.


Here is a copy of the current state of this code, to help determine how to address the question above.

#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)

#undef pr_fmt
#define pr_fmt(fmt) "crash hp: " fmt

/*
  * NOTE: The addresses and sizes passed to this routine have
  * already been fully aligned on page boundaries. There is no
  * need for massaging the address or size.
  */
void *arch_map_crash_pages(unsigned long paddr, unsigned long size)
{
         if (size > 0)
                 return kmap_local_page(pfn_to_page(paddr >> PAGE_SHIFT));
         else
                 return NULL;
}

void arch_unmap_crash_pages(void *ptr)
{
         if (ptr)
                 kunmap_local(ptr);
}

/**
  * arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes
  * @image: the active struct kimage
  * @hp_action: the hot un/plug action being handled
  *
  * To accurately reflect hot un/plug changes, the new elfcorehdr
  * is prepared in a kernel buffer, and then it is written on top
  * of the existing/old elfcorehdr.
  */
void arch_crash_handle_hotplug_event(struct kimage *image,
                                     unsigned int hp_action)
{
         unsigned long mem, memsz;
         unsigned long elfsz = 0;
         void *elfbuf = NULL;
         void *ptr;

         /*
          * Create the new elfcorehdr reflecting the changes to CPU and/or
          * memory resources.
          */
         if (prepare_elf_headers(image, &elfbuf, &elfsz)) {
                 pr_err("unable to prepare elfcore headers");
                 goto out;
         }

         /*
          * Obtain address and size of the elfcorehdr segment, and
          * check it against the new elfcorehdr buffer.
          */
         mem = image->segment[image->elfcorehdr_index].mem;
         memsz = image->segment[image->elfcorehdr_index].memsz;
         if (elfsz > memsz) {
                 pr_err("update elfcorehdr elfsz %lu > memsz %lu",
                         elfsz, memsz);
                 goto out;
         }

         /*
          * Copy new elfcorehdr over the old elfcorehdr at destination.
          */
         ptr = arch_map_crash_pages(mem, memsz);
         if (ptr) {
                 /*
                  * Temporarily invalidate the crash image while the
                  * elfcorehdr is updated.
                  */
                 xchg(&kexec_crash_image, NULL);
                 memcpy_flushcache(ptr, elfbuf, elfsz);
                 xchg(&kexec_crash_image, image);
         }
         arch_unmap_crash_pages(ptr);
         pr_debug("re-loaded elfcorehdr at 0x%lx\n", mem);

out:
         if (elfbuf)
                 vfree(elfbuf);
}
#endif
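
For context on the xchg() pair above: the panic path only enters the
crash kernel when kexec_crash_image is non-NULL, roughly (simplified
from kernel/kexec_core.c):

	if (kexec_crash_image) {
		/* ... shutdown, then jump to the crash kernel ... */
		machine_kexec(kexec_crash_image);
	}

so NULL-ing it out means a panic that lands mid-update skips kdump
rather than booting from a half-written elfcorehdr.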


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-09-09 21:05 ` [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support Eric DeVolder
  2022-09-12  6:52   ` Borislav Petkov
@ 2022-09-19  7:06   ` Sourabh Jain
  2022-10-07 19:33     ` Eric DeVolder
  1 sibling, 1 reply; 57+ messages in thread
From: Sourabh Jain @ 2022-09-19  7:06 UTC (permalink / raw)
  To: Eric DeVolder, linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal
  Cc: tglx, mingo, bp, dave.hansen, hpa, nramas, thomas.lendacky, robh,
	efault, rppt, david, konrad.wilk, boris.ostrovsky


On 10/09/22 02:35, Eric DeVolder wrote:
> For x86_64, when CPU or memory is hot un/plugged, the crash
> elfcorehdr, which describes the CPUs and memory in the system,
> must also be updated.
>
> When loading the crash kernel via kexec_load or kexec_file_load,
> the elfcorehdr is identified at run time in
> crash_core:handle_hotplug_event().
>
> To update the elfcorehdr for x86_64, a new elfcorehdr must be
> generated from the available CPUs and memory. The new elfcorehdr
> is prepared into a buffer, and then installed over the top of
> the existing elfcorehdr.
>
> In the patch 'kexec: exclude elfcorehdr from the segment digest'
> the need to update purgatory due to the change in elfcorehdr was
> eliminated.  As a result, no changes to purgatory or boot_params
> (as the elfcorehdr= kernel command line parameter pointer
> remains unchanged and correct) are needed, just elfcorehdr.
>
> To accommodate a growing number of resources via hotplug, the
> elfcorehdr segment must be sufficiently large enough to accommodate
> changes, see the CRASH_MAX_MEMORY_RANGES configure item.
>
> With this change, crash hotplug for kexec_file_load syscall
> is supported. The kexec_load is also supported, but also
> requires a corresponding change to userspace kexec-tools.
>
> Signed-off-by: Eric DeVolder <eric.devolder@oracle.com>
> Acked-by: Baoquan He <bhe@redhat.com>
> ---
>   arch/x86/Kconfig             |  11 ++++
>   arch/x86/include/asm/kexec.h |  20 +++++++
>   arch/x86/kernel/crash.c      | 102 +++++++++++++++++++++++++++++++++++
>   3 files changed, 133 insertions(+)
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index f9920f1341c8..cdfc9b2fdf98 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -2056,6 +2056,17 @@ config CRASH_DUMP
>   	  (CONFIG_RELOCATABLE=y).
>   	  For more details see Documentation/admin-guide/kdump/kdump.rst
>   
> +config CRASH_MAX_MEMORY_RANGES
> +	depends on CRASH_DUMP && KEXEC_FILE && (HOTPLUG_CPU || MEMORY_HOTPLUG)
> +	int
> +	default 32768
> +	help
> +	  For the kexec_file_load path, specify the maximum number of
> +	  memory regions, eg. as represented by the 'System RAM' entries
> +	  in /proc/iomem, that the elfcorehdr buffer/segment can accommodate.
> +	  This value is combined with NR_CPUS and multiplied by Elf64_Phdr
> +	  size to determine the final buffer size.
> +
>   config KEXEC_JUMP
>   	bool "kexec jump"
>   	depends on KEXEC && HIBERNATION
> diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
> index a3760ca796aa..432073385b2d 100644
> --- a/arch/x86/include/asm/kexec.h
> +++ b/arch/x86/include/asm/kexec.h
> @@ -212,6 +212,26 @@ typedef void crash_vmclear_fn(void);
>   extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
>   extern void kdump_nmi_shootdown_cpus(void);
>   
> +void *arch_map_crash_pages(unsigned long paddr, unsigned long size);
> +#define arch_map_crash_pages arch_map_crash_pages
> +
> +void arch_unmap_crash_pages(void **ptr);
> +#define arch_unmap_crash_pages arch_unmap_crash_pages
> +
> +void arch_crash_handle_hotplug_event(struct kimage *image,
> +		unsigned int hp_action);
> +#define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event
> +
> +#ifdef CONFIG_HOTPLUG_CPU
> +static inline int crash_hotplug_cpu_support(void) { return 1; }
> +#define crash_hotplug_cpu_support crash_hotplug_cpu_support
> +#endif
> +
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +static inline int crash_hotplug_memory_support(void) { return 1; }
> +#define crash_hotplug_memory_support crash_hotplug_memory_support
> +#endif
> +
>   #endif /* __ASSEMBLY__ */
>   
>   #endif /* _ASM_X86_KEXEC_H */
> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
> index 9ceb93c176a6..8fc7d678ac72 100644
> --- a/arch/x86/kernel/crash.c
> +++ b/arch/x86/kernel/crash.c
> @@ -25,6 +25,7 @@
>   #include <linux/slab.h>
>   #include <linux/vmalloc.h>
>   #include <linux/memblock.h>
> +#include <linux/highmem.h>
>   
>   #include <asm/processor.h>
>   #include <asm/hardirq.h>
> @@ -397,7 +398,18 @@ int crash_load_segments(struct kimage *image)
>   	image->elf_headers = kbuf.buffer;
>   	image->elf_headers_sz = kbuf.bufsz;
>   
> +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
> +	/* Ensure elfcorehdr segment large enough for hotplug changes */
> +	kbuf.memsz =
> +		(CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES) *
> +			sizeof(Elf64_Phdr);
> +	/* Mark as usable to crash kernel, else crash kernel fails on boot */
> +	image->elf_headers_sz = kbuf.memsz;
> +	image->elfcorehdr_index = image->nr_segments;
> +	image->elfcorehdr_index_valid = true;
> +#else
>   	kbuf.memsz = kbuf.bufsz;
> +#endif
>   	kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
>   	kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
>   	ret = kexec_add_buffer(&kbuf);
> @@ -412,3 +424,93 @@ int crash_load_segments(struct kimage *image)
>   	return ret;
>   }
>   #endif /* CONFIG_KEXEC_FILE */
> +
> +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
> +/*
> + * NOTE: The addresses and sizes passed to this routine have
> + * already been fully aligned on page boundaries. There is no
> + * need for massaging the address or size.
> + */
> +void *arch_map_crash_pages(unsigned long paddr, unsigned long size)
> +{
> +	void *ptr = NULL;
> +
> +	if (size > 0) {
> +		struct page *page = pfn_to_page(paddr >> PAGE_SHIFT);
> +
> +		ptr = kmap_local_page(page);
> +	}
> +
> +	return ptr;
> +}
> +
> +void arch_unmap_crash_pages(void **ptr)
> +{
> +	if (ptr) {
> +		if (*ptr)
> +			kunmap_local(*ptr);
> +		*ptr = NULL;
> +	}
> +}
> +
> +/**
> + * arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes
> + * @image: the active struct kimage
> + * @hp_action: the hot un/plug action being handled
> + *
> + * To accurately reflect hot un/plug changes, the new elfcorehdr
> + * is prepared in a kernel buffer, and then it is written on top
> + * of the existing/old elfcorehdr.
> + */
> +void arch_crash_handle_hotplug_event(struct kimage *image,
> +	unsigned int hp_action)
> +{
> +	struct kexec_segment *ksegment;
> +	unsigned char *ptr = NULL;
> +	unsigned long elfsz = 0;
> +	void *elfbuf = NULL;
> +	unsigned long mem, memsz;
> +
> +	/*
> +	 * Elfcorehdr_index_valid checked in crash_core:handle_hotplug_event()
> +	 */
> +	ksegment = &image->segment[image->elfcorehdr_index];
> +	mem = ksegment->mem;
> +	memsz = ksegment->memsz;
> +
> +	/*
> +	 * Create the new elfcorehdr reflecting the changes to CPU and/or
> +	 * memory resources.
> +	 */
> +	if (prepare_elf_headers(image, &elfbuf, &elfsz)) {
> +		pr_err("crash hp: unable to prepare elfcore headers");
> +		goto out;

On PowerPC, the memblock structure is used to prepare the program
headers for the memory regions of the elfcorehdr. Since the above
arch-specific hotplug handler gets invoked when memory is marked
offline (MEM_OFFLINE), which is before the memblock structure gets
updated, the above handler may not work for the memory hotplug case
on PowerPC.

Just wondering: which data structure is used to get the list of memory
regions while preparing the elfcorehdr program headers on other
architectures?
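
From a quick look, x86 appears to walk the resource tree rather than
memblock — roughly this, from arch/x86/kernel/crash.c (so it would
presumably not have the MEM_OFFLINE ordering problem that memblock
has on PowerPC):

	static int get_nr_ram_ranges_callback(struct resource *res, void *arg)
	{
		unsigned int *nr_ranges = arg;

		(*nr_ranges)++;
		return 0;
	}

	/* enumerate 'System RAM', i.e. what /proc/iomem shows */
	walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback);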

Thanks,
Sourabh Jain

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-09-13 19:12     ` Eric DeVolder
@ 2022-09-26 19:19       ` Eric DeVolder
  2022-09-28 16:07       ` Borislav Petkov
  1 sibling, 0 replies; 57+ messages in thread
From: Eric DeVolder @ 2022-09-26 19:19 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal, tglx,
	mingo, dave.hansen, hpa, nramas, thomas.lendacky, robh, efault,
	rppt, david, sourabhjain, konrad.wilk, boris.ostrovsky

Boris,
I've a few questions for you below. With your responses, I am hopeful we can finish this series soon!
Thanks,
eric

On 9/13/22 14:12, Eric DeVolder wrote:
> Boris,
> Thanks for the feedback! Inline responses below.
> eric
> 
> On 9/12/22 01:52, Borislav Petkov wrote:
>> On Fri, Sep 09, 2022 at 05:05:09PM -0400, Eric DeVolder wrote:
>>> For x86_64, when CPU or memory is hot un/plugged, the crash
>>> elfcorehdr, which describes the CPUs and memory in the system,
>>> must also be updated.
>>>
>>> When loading the crash kernel via kexec_load or kexec_file_load,
>>
>> Please end function names with parentheses. Check the whole patch pls.
> Done.
> 
>>
>>> the elfcorehdr is identified at run time in
>>> crash_core:handle_hotplug_event().
>>>
>>> To update the elfcorehdr for x86_64, a new elfcorehdr must be
>>> generated from the available CPUs and memory. The new elfcorehdr
>>> is prepared into a buffer, and then installed over the top of
>>> the existing elfcorehdr.
>>>
>>> In the patch 'kexec: exclude elfcorehdr from the segment digest'
>>> the need to update purgatory due to the change in elfcorehdr was
>>> eliminated.  As a result, no changes to purgatory or boot_params
>>> (as the elfcorehdr= kernel command line parameter pointer
>>> remains unchanged and correct) are needed, just elfcorehdr.
>>>
>>> To accommodate a growing number of resources via hotplug, the
>>> elfcorehdr segment must be sufficiently large enough to accommodate
>>> changes, see the CRASH_MAX_MEMORY_RANGES configure item.
>>>
>>> With this change, crash hotplug for kexec_file_load syscall
>>> is supported.
>>
>> Redundant sentence.
> Removed.
> 
>>
>>> The kexec_load is also supported, but also
>>> requires a corresponding change to userspace kexec-tools.
>>
>> Ditto.
> Removed.
> 
>>
>>> Signed-off-by: Eric DeVolder <eric.devolder@oracle.com>
>>> Acked-by: Baoquan He <bhe@redhat.com>
>>> ---
>>>   arch/x86/Kconfig             |  11 ++++
>>>   arch/x86/include/asm/kexec.h |  20 +++++++
>>>   arch/x86/kernel/crash.c      | 102 +++++++++++++++++++++++++++++++++++
>>>   3 files changed, 133 insertions(+)
>>>
>>> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
>>> index f9920f1341c8..cdfc9b2fdf98 100644
>>> --- a/arch/x86/Kconfig
>>> +++ b/arch/x86/Kconfig
>>> @@ -2056,6 +2056,17 @@ config CRASH_DUMP
>>>         (CONFIG_RELOCATABLE=y).
>>>         For more details see Documentation/admin-guide/kdump/kdump.rst
>>> +config CRASH_MAX_MEMORY_RANGES
>>> +    depends on CRASH_DUMP && KEXEC_FILE && (HOTPLUG_CPU || MEMORY_HOTPLUG)
>>> +    int
>>> +    default 32768
>>> +    help
>>> +      For the kexec_file_load path, specify the maximum number of
>>> +      memory regions, eg. as represented by the 'System RAM' entries
>>> +      in /proc/iomem, that the elfcorehdr buffer/segment can accommodate.
>>> +      This value is combined with NR_CPUS and multiplied by Elf64_Phdr
>>> +      size to determine the final buffer size.
>>
>> If I'm purely a user, I'm left wondering how to determine what to
>> specify. Do you have a guidance text somewhere you can point to from
>> here?
> 
> This topic was discussed previously https://lkml.org/lkml/2022/3/3/372.
> David points out that terminology is tricky here due to differing behaviors.
> And perhaps that is your point in asking for guidance text. It can be
> complicated, but it all comes down to System RAM entries.
> 
> I could perhaps offer an overly simplified example: with a 1GiB memory
> block size, the CRASH_MAX_MEMORY_RANGES default of 32768 would allow for
> 32TiB of memory?
> 
>>
>>> +
>>>   config KEXEC_JUMP
>>>       bool "kexec jump"
>>>       depends on KEXEC && HIBERNATION
>>> diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
>>> index a3760ca796aa..432073385b2d 100644
>>> --- a/arch/x86/include/asm/kexec.h
>>> +++ b/arch/x86/include/asm/kexec.h
>>> @@ -212,6 +212,26 @@ typedef void crash_vmclear_fn(void);
>>>   extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
>>>   extern void kdump_nmi_shootdown_cpus(void);
>>> +void *arch_map_crash_pages(unsigned long paddr, unsigned long size);
>>> +#define arch_map_crash_pages arch_map_crash_pages
>>> +
>>> +void arch_unmap_crash_pages(void **ptr);
>>> +#define arch_unmap_crash_pages arch_unmap_crash_pages
>>> +
>>> +void arch_crash_handle_hotplug_event(struct kimage *image,
>>> +        unsigned int hp_action);
>>> +#define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event
>>> +
>>> +#ifdef CONFIG_HOTPLUG_CPU
>>> +static inline int crash_hotplug_cpu_support(void) { return 1; }
>>> +#define crash_hotplug_cpu_support crash_hotplug_cpu_support
>>> +#endif
>>> +
>>> +#ifdef CONFIG_MEMORY_HOTPLUG
>>> +static inline int crash_hotplug_memory_support(void) { return 1; }
>>> +#define crash_hotplug_memory_support crash_hotplug_memory_support
>>> +#endif
>>> +
>>>   #endif /* __ASSEMBLY__ */
>>>   #endif /* _ASM_X86_KEXEC_H */
>>> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
>>> index 9ceb93c176a6..8fc7d678ac72 100644
>>> --- a/arch/x86/kernel/crash.c
>>> +++ b/arch/x86/kernel/crash.c
>>> @@ -25,6 +25,7 @@
>>>   #include <linux/slab.h>
>>>   #include <linux/vmalloc.h>
>>>   #include <linux/memblock.h>
>>> +#include <linux/highmem.h>
>>>   #include <asm/processor.h>
>>>   #include <asm/hardirq.h>
>>> @@ -397,7 +398,18 @@ int crash_load_segments(struct kimage *image)
>>>       image->elf_headers = kbuf.buffer;
>>>       image->elf_headers_sz = kbuf.bufsz;
>>> +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
>>> +    /* Ensure elfcorehdr segment large enough for hotplug changes */
>>> +    kbuf.memsz =
>>> +        (CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES) *
>>> +            sizeof(Elf64_Phdr);
>>
>>
>>     kbuf.memsz  = CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES;
>>     kbuf.memsz *= sizeof(Elf64_Phdr);
>>
>> looks more readable to me.
> Done.
> 
>>
>>
>>> +    /* Mark as usable to crash kernel, else crash kernel fails on boot */
>>> +    image->elf_headers_sz = kbuf.memsz;
>>> +    image->elfcorehdr_index = image->nr_segments;
>>> +    image->elfcorehdr_index_valid = true;
>>> +#else
>>>       kbuf.memsz = kbuf.bufsz;
>>
>> Do that initialization at the top where you declare kbuf and get rid of
>> the #else branch.
> The kbuf.bufsz value is obtained via a call to prepare_elf_headers(); I cannot initialize it at its
> declaration.
> 
>>
>>> +#endif
>>>       kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
>>>       kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
>>>       ret = kexec_add_buffer(&kbuf);
>>> @@ -412,3 +424,93 @@ int crash_load_segments(struct kimage *image)
>>>       return ret;
>>>   }
>>>   #endif /* CONFIG_KEXEC_FILE */
>>> +
>>> +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
>>
>> This ugly ifdeffery is still here. Why don't you have stubs for the
>> !defined() cases in the header so that you can drop those here?
>>
> 
> I'm at a loss as to what to do differently here. You've raised this issue before; I went back and
> looked at the suggestions then, and I don't see how they apply to this situation. How is this
> situation different from the #ifdef CONFIG_KEXEC_FILE that immediately precedes it?
> 
> I've included a copy of the current state of this section below for additional markup.
> 
>>> +/*
>>> + * NOTE: The addresses and sizes passed to this routine have
>>> + * already been fully aligned on page boundaries. There is no
>>> + * need for massaging the address or size.
>>> + */
>>> +void *arch_map_crash_pages(unsigned long paddr, unsigned long size)
>>> +{
>>> +    void *ptr = NULL;
>>> +
>>> +    if (size > 0) {
>>> +        struct page *page = pfn_to_page(paddr >> PAGE_SHIFT);
>>> +
>>> +        ptr = kmap_local_page(page);
>>> +    }
>>> +
>>> +    return ptr;
>>> +}
>>
>>     if (size > 0)
>>         return kmap_local_page(pfn_to_page(paddr >> PAGE_SHIFT));
>>     else
>>         return NULL;
>>
>> That's it.
> Done.
> 
>>
>>> +
>>> +void arch_unmap_crash_pages(void **ptr)
>>> +{
>>> +    if (ptr) {
>>> +        if (*ptr)
>>> +            kunmap_local(*ptr);
>>> +        *ptr = NULL;
>>> +    }
>>
>> Oh wow, this is just nuts. Why does it have to pass in a pointer to
>> pointer which you have to carefully check twice? And why is it a void
>> **?
> A long time ago this made sense, but it no longer makes sense. I've corrected this.
> 
>>
>> And why are those called arch_ if all I see is the x86 variants? Are
>> there gonna be other arches? And even if, why can't the other arches do
>> kmap_local_page() too?
> Currently there is a concurrent effort for PPC support by Sourabh Jain, and in that effort 
> arch_map_crash_pages() is using __va(paddr).
> 
> I do not know the nuances between kmap_local_page() and __va() to answer the question.
> 
> If kmap_local_page() works for all archs, then I'm happy to drop these arch_ variants
> and use it directly.
> 
>>
>>> +}
>>> +
>>> +/**
>>> + * arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes
>>> + * @image: the active struct kimage
>>> + * @hp_action: the hot un/plug action being handled
>>> + *
>>> + * To accurately reflect hot un/plug changes, the new elfcorehdr
>>> + * is prepared in a kernel buffer, and then it is written on top
>>> + * of the existing/old elfcorehdr.
>>> + */
>>> +void arch_crash_handle_hotplug_event(struct kimage *image,
>>> +    unsigned int hp_action)
>>
>> Align arguments on the opening brace.
> Done.
> 
>>
>>> +{
>>> +    struct kexec_segment *ksegment;
>>> +    unsigned char *ptr = NULL;
>>> +    unsigned long elfsz = 0;
>>> +    void *elfbuf = NULL;
>>> +    unsigned long mem, memsz;
>>
>> Please sort function local variables declaration in a reverse christmas
>> tree order:
>>
>>     <type A> longest_variable_name;
>>     <type B> shorter_var_name;
>>     <type C> even_shorter;
>>     <type D> i;
>>
> Done.
> 
>>> +
>>> +    /*
>>> +     * Elfcorehdr_index_valid checked in crash_core:handle_hotplug_event()
>>
>> Elfcorehdr_index_valid??
> Comment reworked.
> 
> 
>>
>>> +     */
>>> +    ksegment = &image->segment[image->elfcorehdr_index];
>>> +    mem = ksegment->mem;
>>> +    memsz = ksegment->memsz;
>>> +
>>> +    /*
>>> +     * Create the new elfcorehdr reflecting the changes to CPU and/or
>>> +     * memory resources.
>>> +     */
>>> +    if (prepare_elf_headers(image, &elfbuf, &elfsz)) {
>>> +        pr_err("crash hp: unable to prepare elfcore headers");
>>             ^^^^^^^^
>>
>> this thing is done with pr_fmt(). Grep the tree for examples.
> Done, thanks for pointing that out.
> 
>>
>>> +        goto out;
>>> +    }
>>
>> The three lines above reading ksegment need to be here, where the test
>> is done.
> Done.
> 
>>
>>> +    if (elfsz > memsz) {
>>> +        pr_err("crash hp: update elfcorehdr elfsz %lu > memsz %lu",
>>> +            elfsz, memsz);
>>> +        goto out;
>>> +    }
>>> +
>>> +    /*
>>> +     * At this point, we are all but assured of success.
>>
>> Who is "we"?
>>
> Comment reworked.
> 
> 
> Here is a copy of the current state of this code, to help determine how to address the question above.
> 
> #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
> 
> #undef pr_fmt
> #define pr_fmt(fmt) "crash hp: " fmt
> 
> /*
>   * NOTE: The addresses and sizes passed to this routine have
>   * already been fully aligned on page boundaries. There is no
>   * need for massaging the address or size.
>   */
> void *arch_map_crash_pages(unsigned long paddr, unsigned long size)
> {
>          if (size > 0)
>                  return kmap_local_page(pfn_to_page(paddr >> PAGE_SHIFT));
>          else
>                  return NULL;
> }
> 
> void arch_unmap_crash_pages(void *ptr)
> {
>          if (ptr)
>                  kunmap_local(ptr);
> }
> 
> /**
>   * arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes
>   * @image: the active struct kimage
>   * @hp_action: the hot un/plug action being handled
>   *
>   * To accurately reflect hot un/plug changes, the new elfcorehdr
>   * is prepared in a kernel buffer, and then it is written on top
>   * of the existing/old elfcorehdr.
>   */
> void arch_crash_handle_hotplug_event(struct kimage *image,
>                                      unsigned int hp_action)
> {
>          unsigned long mem, memsz;
>          unsigned long elfsz = 0;
>          void *elfbuf = NULL;
>          void *ptr;
> 
>          /*
>           * Create the new elfcorehdr reflecting the changes to CPU and/or
>           * memory resources.
>           */
>          if (prepare_elf_headers(image, &elfbuf, &elfsz)) {
>                  pr_err("unable to prepare elfcore headers");
>                  goto out;
>          }
> 
>          /*
>           * Obtain address and size of the elfcorehdr segment, and
>           * check it against the new elfcorehdr buffer.
>           */
>          mem = image->segment[image->elfcorehdr_index].mem;
>          memsz = image->segment[image->elfcorehdr_index].memsz;
>          if (elfsz > memsz) {
>                  pr_err("update elfcorehdr elfsz %lu > memsz %lu",
>                          elfsz, memsz);
>                  goto out;
>          }
> 
>          /*
>           * Copy new elfcorehdr over the old elfcorehdr at destination.
>           */
>          ptr = arch_map_crash_pages(mem, memsz);
>          if (ptr) {
>                  /*
>                   * Temporarily invalidate the crash image while the
>                   * elfcorehdr is updated.
>                   */
>                  xchg(&kexec_crash_image, NULL);
>                  memcpy_flushcache(ptr, elfbuf, elfsz);
>                  xchg(&kexec_crash_image, image);
>          }
>          arch_unmap_crash_pages(ptr);
>          pr_debug("re-loaded elfcorehdr at 0x%lx\n", mem);
> 
> out:
>          if (elfbuf)
>                  vfree(elfbuf);
> }
> #endif
> 

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-09-13 19:12     ` Eric DeVolder
  2022-09-26 19:19       ` Eric DeVolder
@ 2022-09-28 16:07       ` Borislav Petkov
  2022-09-28 16:38         ` Borislav Petkov
  2022-09-30 15:36         ` Eric DeVolder
  1 sibling, 2 replies; 57+ messages in thread
From: Borislav Petkov @ 2022-09-28 16:07 UTC (permalink / raw)
  To: Eric DeVolder
  Cc: linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal, tglx,
	mingo, dave.hansen, hpa, nramas, thomas.lendacky, robh, efault,
	rppt, david, sourabhjain, konrad.wilk, boris.ostrovsky

On Tue, Sep 13, 2022 at 02:12:31PM -0500, Eric DeVolder wrote:
> This topic was discussed previously https://lkml.org/lkml/2022/3/3/372.

Please do not use lkml.org to refer to lkml messages. We have a
perfectly fine archival system at lore.kernel.org. You simply do

https://lore.kernel.org/r/<Message-ID>

when you want to point to a previous mail.

> David points out that terminology is tricky here due to differing behaviors.
> And perhaps that is your point in asking for guidance text. It can be
> complicated

Which means you need an explanation of how to use this even more.

And why is CONFIG_CRASH_MAX_MEMORY_RANGES even a Kconfig item and not
something you discover from the hardware?

Your help text talks about System RAM entries in /proc/iomem which means
that those entries are present somewhere in the kernel and you can read
them out and do the proper calculations dynamically instead of doing the
static CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES thing.
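
Something like this, say (a sketch only — the headroom factor is made
up, and IIRC you already have get_nr_ram_ranges_callback() in crash.c):

	unsigned int nr_ranges = 0;

	/* count the current 'System RAM' resource entries */
	walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback);

	/* size from what is there now, plus headroom for hotplug */
	kbuf.memsz = (num_possible_cpus() + 2 * nr_ranges) * sizeof(Elf64_Phdr);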

> , but it all comes down to System RAM entries.
> 
> I could perhaps offer an overly simplified example: with a 1GiB memory
> block size, the CRASH_MAX_MEMORY_RANGES default of 32768 would allow for
> 32TiB of memory?

Yes, and stick it somewhere in Documentation/admin-guide/kdump/ and
refer to it in that help text so that people can find it and read how to
use your new option.

> The kbuf.bufsz value is obtained via a call to prepare_elf_headers(); I
> cannot initialize it at its declaration.

Sorry, I meant this:

diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 8fc7d678ac72..ee6fd9f1b2b9 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -395,8 +395,9 @@ int crash_load_segments(struct kimage *image)
 	if (ret)
 		return ret;
 
-	image->elf_headers = kbuf.buffer;
-	image->elf_headers_sz = kbuf.bufsz;
+	image->elf_headers	= kbuf.buffer;
+	image->elf_headers_sz	= kbuf.bufsz;
+	kbuf.memsz		= kbuf.bufsz;
 
 #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
 	/* Ensure elfcorehdr segment large enough for hotplug changes */
@@ -407,9 +408,8 @@ int crash_load_segments(struct kimage *image)
 	image->elf_headers_sz = kbuf.memsz;
 	image->elfcorehdr_index = image->nr_segments;
 	image->elfcorehdr_index_valid = true;
-#else
-	kbuf.memsz = kbuf.bufsz;
 #endif
+
 	kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
 	kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
 	ret = kexec_add_buffer(&kbuf);

> I'm at a loss as to what to do differently here. You've raised this issue
> before; I went back and looked at the suggestions then, and I don't see how
> they apply to this situation. How is this situation different from the
> #ifdef CONFIG_KEXEC_FILE that immediately precedes it?

See the diff at the end. I'm not saying this is how you should do it
but it should give you a better idea. The logic being, the functions
in the .c file don't really need ifdeffery around them - you're adding
1-2 functions and crash.c is not that big - so they can be built in
unconditionally. You'd need the ifdeffery *in the header only* when
crash.c is not being built.

But I've done it with ifdeffery in the .c file now because yes, the
kexec code is a minefield of ifdeffery. Hell, there's ifdeffery even in
the headers for structs. Ifdeffery you don't really need. Someone should
clean that up and simplify this immensely.

> Currently there is a concurrent effort for PPC support by Sourabh
> Jain, and in that effort arch_map_crash_pages() is using __va(paddr).

Why?

> I do not know the nuances between kmap_local_page() and __va() to
> answer the question.

kmap_local_page() is a generic interface and it should work on any arch.

And it is documented even:

$ git grep kmap_local_page Documentation/
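
The difference, roughly: __va() is pure arithmetic on the direct map,
so it silently assumes the physical address *is* direct-mapped, while
kmap_local_page() does the same cheap thing where that holds and
creates a temporary mapping where it doesn't (e.g. 32-bit highmem).
Sketch:

	void *ptr;

	/* generic: works whether or not the page is direct-mapped */
	ptr = kmap_local_page(pfn_to_page(paddr >> PAGE_SHIFT));
	/* ... use ptr ... */
	kunmap_local(ptr);

	/* direct-map arithmetic only; valid iff paddr is direct-mapped */
	ptr = __va(paddr);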

> If kmap_local_page() works for all archs, then I'm happy to drop these
> arch_ variants and use it directly.

Yes, pls do.

---

diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 432073385b2d..b73c9628cd85 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -205,6 +205,17 @@ void *arch_kexec_kernel_image_load(struct kimage *image);
 
 int arch_kimage_file_post_load_cleanup(struct kimage *image);
 #define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup
+
+#ifdef CONFIG_CRASH_MAX_MEMORY_RANGES
+void *arch_map_crash_pages(unsigned long paddr, unsigned long size);
+void arch_unmap_crash_pages(void **ptr);
+void arch_crash_handle_hotplug_event(struct kimage *image, unsigned int hp_action);
+#else
+static inline void *arch_map_crash_pages(unsigned long paddr, unsigned long size) { return NULL; }
+static inline void arch_unmap_crash_pages(void **ptr) { }
+static inline void arch_crash_handle_hotplug_event(struct kimage *image, unsigned int hp_action) { }
+#endif
+
 #endif
 #endif
 
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 8fc7d678ac72..a526c893abe8 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -395,8 +395,9 @@ int crash_load_segments(struct kimage *image)
 	if (ret)
 		return ret;
 
-	image->elf_headers = kbuf.buffer;
-	image->elf_headers_sz = kbuf.bufsz;
+	image->elf_headers	= kbuf.buffer;
+	image->elf_headers_sz	= kbuf.bufsz;
+	kbuf.memsz		= kbuf.bufsz;
 
 #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
 	/* Ensure elfcorehdr segment large enough for hotplug changes */
@@ -407,9 +408,8 @@ int crash_load_segments(struct kimage *image)
 	image->elf_headers_sz = kbuf.memsz;
 	image->elfcorehdr_index = image->nr_segments;
 	image->elfcorehdr_index_valid = true;
-#else
-	kbuf.memsz = kbuf.bufsz;
 #endif
+
 	kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
 	kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
 	ret = kexec_add_buffer(&kbuf);
@@ -425,7 +425,8 @@ int crash_load_segments(struct kimage *image)
 }
 #endif /* CONFIG_KEXEC_FILE */
 
-#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
+#ifdef CONFIG_CRASH_MAX_MEMORY_RANGES
+
 /*
  * NOTE: The addresses and sizes passed to this routine have
  * already been fully aligned on page boundaries. There is no
@@ -462,8 +463,7 @@ void arch_unmap_crash_pages(void **ptr)
  * is prepared in a kernel buffer, and then it is written on top
  * of the existing/old elfcorehdr.
  */
-void arch_crash_handle_hotplug_event(struct kimage *image,
-	unsigned int hp_action)
+void arch_crash_handle_hotplug_event(struct kimage *image, unsigned int hp_action)
 {
 	struct kexec_segment *ksegment;
 	unsigned char *ptr = NULL;
@@ -513,4 +513,5 @@ void arch_crash_handle_hotplug_event(struct kimage *image,
 	if (elfbuf)
 		vfree(elfbuf);
 }
-#endif
+
+#endif /* CONFIG_CRASH_MAX_MEMORY_RANGES */
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index a48577a36fb8..0f79ad4c4f80 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -27,6 +27,19 @@ extern struct resource crashk_res;
 extern struct resource crashk_low_res;
 extern note_buf_t __percpu *crash_notes;
 
+/* Alignment required for elf header segment */
+#define ELF_CORE_HEADER_ALIGN   4096
+
+struct crash_mem_range {
+	u64 start, end;
+};
+
+struct crash_mem {
+	unsigned int max_nr_ranges;
+	unsigned int nr_ranges;
+	struct crash_mem_range ranges[];
+};
+
 #ifdef CONFIG_KEXEC_CORE
 #include <linux/list.h>
 #include <linux/compat.h>
@@ -237,19 +250,6 @@ static inline int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf)
 }
 #endif
 
-/* Alignment required for elf header segment */
-#define ELF_CORE_HEADER_ALIGN   4096
-
-struct crash_mem_range {
-	u64 start, end;
-};
-
-struct crash_mem {
-	unsigned int max_nr_ranges;
-	unsigned int nr_ranges;
-	struct crash_mem_range ranges[];
-};
-
 extern int crash_exclude_mem_range(struct crash_mem *mem,
 				   unsigned long long mstart,
 				   unsigned long long mend);
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 5bc5159d9cb1..f6b5d835f826 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -622,6 +622,15 @@ static int __init crash_save_vmcoreinfo_init(void)
 subsys_initcall(crash_save_vmcoreinfo_init);
 
 #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
+
+void __weak *arch_map_crash_pages(unsigned long paddr, unsigned long size)
+{
+	return NULL;
+}
+
+void __weak arch_unmap_crash_pages(void **ptr) { }
+void __weak arch_crash_handle_hotplug_event(struct kimage *image, unsigned int hp_action) { }
+
 /*
  * To accurately reflect hot un/plug changes, the elfcorehdr (which
  * is passed to the crash kernel via the elfcorehdr= parameter)

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-09-28 16:07       ` Borislav Petkov
@ 2022-09-28 16:38         ` Borislav Petkov
  2022-09-30 15:36         ` Eric DeVolder
  1 sibling, 0 replies; 57+ messages in thread
From: Borislav Petkov @ 2022-09-28 16:38 UTC (permalink / raw)
  To: Eric DeVolder
  Cc: linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal, tglx,
	mingo, dave.hansen, hpa, nramas, thomas.lendacky, robh, efault,
	rppt, david, sourabhjain, konrad.wilk, boris.ostrovsky

On Wed, Sep 28, 2022 at 06:07:24PM +0200, Borislav Petkov wrote:
>  #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
>  	/* Ensure elfcorehdr segment large enough for hotplug changes */
> @@ -407,9 +408,8 @@ int crash_load_segments(struct kimage *image)
>  	image->elf_headers_sz = kbuf.memsz;
>  	image->elfcorehdr_index = image->nr_segments;
>  	image->elfcorehdr_index_valid = true;

And that ifdeffery above can be made more readable too:

diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index a526c893abe8..7aab6e942761 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -399,16 +399,15 @@ int crash_load_segments(struct kimage *image)
 	image->elf_headers_sz	= kbuf.bufsz;
 	kbuf.memsz		= kbuf.bufsz;
 
-#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
 	/* Ensure elfcorehdr segment large enough for hotplug changes */
-	kbuf.memsz =
-		(CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES) *
-			sizeof(Elf64_Phdr);
-	/* Mark as usable to crash kernel, else crash kernel fails on boot */
-	image->elf_headers_sz = kbuf.memsz;
-	image->elfcorehdr_index = image->nr_segments;
-	image->elfcorehdr_index_valid = true;
-#endif
+	if (IS_ENABLED(CONFIG_CRASH_MAX_MEMORY_RANGES)) {
+		kbuf.memsz = (CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES) * sizeof(Elf64_Phdr);
+
+		/* Mark as usable to crash kernel, else crash kernel fails on boot */
+		image->elf_headers_sz = kbuf.memsz;
+		image->elfcorehdr_index = image->nr_segments;
+		image->elfcorehdr_index_valid = true;
+	}
 
 	kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
 	kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-09-28 16:07       ` Borislav Petkov
  2022-09-28 16:38         ` Borislav Petkov
@ 2022-09-30 15:36         ` Eric DeVolder
  2022-09-30 16:50           ` Borislav Petkov
                             ` (2 more replies)
  1 sibling, 3 replies; 57+ messages in thread
From: Eric DeVolder @ 2022-09-30 15:36 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal, tglx,
	mingo, dave.hansen, hpa, nramas, thomas.lendacky, robh, efault,
	rppt, david, sourabhjain, konrad.wilk, boris.ostrovsky



On 9/28/22 11:07, Borislav Petkov wrote:
> On Tue, Sep 13, 2022 at 02:12:31PM -0500, Eric DeVolder wrote:
>> This topic was discussed previously https://lkml.org/lkml/2022/3/3/372.
> 
> Please do not use lkml.org to refer to lkml messages. We have a
> perfectly fine archival system at lore.kernel.org. You simply do
> 
> https://lore.kernel.org/r/<Message-ID>
> 
> when you want to point to a previous mail.

ok, thanks for pointing that out to me.
> 
>> David points out that terminology is tricky here due to differing behaviors.
>> And perhaps that is your point in asking for guidance text. It can be
>> complicated
> 
> Which means you need an explanation how to use this even more.
> 
> And why is CONFIG_CRASH_MAX_MEMORY_RANGES even a Kconfig item and not
> something you discover from the hardware?

No, is the short answer.

> 
> Your help text talks about System RAM entries in /proc/iomem which means
> that those entries are present somewhere in the kernel and you can read
> them out and do the proper calculations dynamically instead of doing the
> static CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES thing.

The intent is to compute the max size buffer needed to contain a maximum populated elfcorehdr, which 
is primarily based on the number of CPUs and memory regions. Thus far I (and others involved) have 
not found a kernel method to determine the maximum number of memory regions possible (if you are 
aware of one, please let me know!). Thus CONFIG_CRASH_MAX_MEMORY_RANGES was born (or rather, 
borrowed from kexec-tools).

So no dynamic computation is possible, yet.
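
To make the sizing concrete, here is the arithmetic with assumed knob
values (8192 CPUs, 32768 ranges -- neither number is mandated by the
patch):

	/* e.g. (8192 + 32768) * sizeof(Elf64_Phdr) = 40960 * 56 bytes, ~2.2 MiB */
	unsigned long elfcorehdr_sz =
		(CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES) *
		sizeof(Elf64_Phdr);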

> 
>> , but it all comes down to System RAM entries.
>>
>> I could perhaps offer an overly simplified example such that for 1GiB block
>> size, for example, the CRASH_MAX_MEMORY_RANGES of 32768 would allow for 32TiB
>> of memory?
> 
> Yes, and stick it somewhere in Documentation/admin-guide/kdump/ and
> refer to it in that help text so that people can find it and read how to
> use your new option.
> 
ok

>> The kbuf.bufsz value is obtained via a call to prepare_elf_headers(); I can
>> not initialize it at its declaration.
> 
> Sorry, I meant this:
> 
> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
> index 8fc7d678ac72..ee6fd9f1b2b9 100644
> --- a/arch/x86/kernel/crash.c
> +++ b/arch/x86/kernel/crash.c
> @@ -395,8 +395,9 @@ int crash_load_segments(struct kimage *image)
>   	if (ret)
>   		return ret;
>   
> -	image->elf_headers = kbuf.buffer;
> -	image->elf_headers_sz = kbuf.bufsz;
> +	image->elf_headers	= kbuf.buffer;
> +	image->elf_headers_sz	= kbuf.bufsz;
> +	kbuf.memsz		= kbuf.bufsz;
>   
>   #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
>   	/* Ensure elfcorehdr segment large enough for hotplug changes */
> @@ -407,9 +408,8 @@ int crash_load_segments(struct kimage *image)
>   	image->elf_headers_sz = kbuf.memsz;
>   	image->elfcorehdr_index = image->nr_segments;
>   	image->elfcorehdr_index_valid = true;
> -#else
> -	kbuf.memsz = kbuf.bufsz;
>   #endif
> +
>   	kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
>   	kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
>   	ret = kexec_add_buffer(&kbuf);
> 
ok

>> I'm at a loss as to what to do differently here. You've raised this issue
>> before and I went back and looked at the suggestions then and I don't see
>> how that applies to this situation. How is this situation different than the
> #ifdef CONFIG_KEXEC_FILE that immediately precedes it?
> 
> See the diff at the end. I'm not saying this is how you should do it
> but it should give you a better idea. The logic being, the functions
> in the .c file don't really need ifdeffery around them - you're adding
> 1-2 functions and crash.c is not that big - so they can be built in
> unconditionally. You'd need the ifdeffery *in the header only* when
> crash.c is not being built.
ok; I've overlooked that scenario.
> 
> But I've done it with ifdeffery in the .c file now because yes, the
> kexec code is a minefield of ifdeffery. Hell, there's ifdeffery even in
> the headers for structs. Ifdeffery you don't really need. Someone should
> clean that up and simplify this immensely.

ok

> 
>> Currently there is a concurrent effort for PPC support by Sourabh
>> Jain, and in that effort arch_map_crash_pages() is using __va(paddr).
> 
> Why?
> 
>> I do not know the nuances between kmap_local_page() and __va() to
>> answer the question.
> 
> kmap_local_page() is a generic interface and it should work on any arch.
> 
> And it is documented even:
> 
> $ git grep kmap_local_page Documentation/
> 
>> If kmap_local_page() works for all archs, then I'm happy to drop these
>> arch_ variants and use it directly.
> 
> Yes, pls do.

I'll check with Sourabh to see if PPC can work with kmap_local_page().
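
For reference, a minimal sketch of the generic mapping (paddr, elfbuf
and len here are placeholders for the segment address, the prepared
buffer and the byte count; the thread notes the addresses are already
page-aligned):

	#include <linux/highmem.h>

	void *ptr = kmap_local_page(pfn_to_page(paddr >> PAGE_SHIFT));

	memcpy(ptr, elfbuf, len);	/* write the updated elfcorehdr */
	kunmap_local(ptr);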

> 
> ---
> 
> diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
> index 432073385b2d..b73c9628cd85 100644
> --- a/arch/x86/include/asm/kexec.h
> +++ b/arch/x86/include/asm/kexec.h
> @@ -205,6 +205,17 @@ void *arch_kexec_kernel_image_load(struct kimage *image);
>   
>   int arch_kimage_file_post_load_cleanup(struct kimage *image);
>   #define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup
> +
> +#ifdef CONFIG_CRASH_MAX_MEMORY_RANGES
So I think the use of CONFIG_CRASH_MAX_MEMORY_RANGES is not correct; it still needs to be based on 
the cpu or memory hotplug options.

> +void *arch_map_crash_pages(unsigned long paddr, unsigned long size);
> +void arch_unmap_crash_pages(void **ptr);
> +void arch_crash_handle_hotplug_event(struct kimage *image, unsigned int hp_action);
> +#else
> +void *arch_map_crash_pages(unsigned long paddr, unsigned long size) { return NULL; }
> +void arch_unmap_crash_pages(void **ptr) { }
> +void arch_crash_handle_hotplug_event(struct kimage *image, unsigned int hp_action) { }
> +#endif
> +
>   #endif
>   #endif
>   
> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
> index 8fc7d678ac72..a526c893abe8 100644
> --- a/arch/x86/kernel/crash.c
> +++ b/arch/x86/kernel/crash.c
> @@ -395,8 +395,9 @@ int crash_load_segments(struct kimage *image)
>   	if (ret)
>   		return ret;
>   
> -	image->elf_headers = kbuf.buffer;
> -	image->elf_headers_sz = kbuf.bufsz;
> +	image->elf_headers	= kbuf.buffer;
> +	image->elf_headers_sz	= kbuf.bufsz;
> +	kbuf.memsz		= kbuf.bufsz;
>   
>   #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
>   	/* Ensure elfcorehdr segment large enough for hotplug changes */
> @@ -407,9 +408,8 @@ int crash_load_segments(struct kimage *image)
>   	image->elf_headers_sz = kbuf.memsz;
>   	image->elfcorehdr_index = image->nr_segments;
>   	image->elfcorehdr_index_valid = true;
> -#else
> -	kbuf.memsz = kbuf.bufsz;
>   #endif
> +
>   	kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
>   	kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
>   	ret = kexec_add_buffer(&kbuf);
> @@ -425,7 +425,8 @@ int crash_load_segments(struct kimage *image)
>   }
>   #endif /* CONFIG_KEXEC_FILE */
>   
> -#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
> +#ifdef CONFIG_CRASH_MAX_MEMORY_RANGES
Again, I don't think CONFIG_CRASH_MAX_MEMORY_RANGES makes sense, at all.

> +
>   /*
>    * NOTE: The addresses and sizes passed to this routine have
>    * already been fully aligned on page boundaries. There is no
> @@ -462,8 +463,7 @@ void arch_unmap_crash_pages(void **ptr)
>    * is prepared in a kernel buffer, and then it is written on top
>    * of the existing/old elfcorehdr.
>    */
> -void arch_crash_handle_hotplug_event(struct kimage *image,
> -	unsigned int hp_action)
> +void arch_crash_handle_hotplug_event(struct kimage *image, unsigned int hp_action)
>   {
>   	struct kexec_segment *ksegment;
>   	unsigned char *ptr = NULL;
> @@ -513,4 +513,5 @@ void arch_crash_handle_hotplug_event(struct kimage *image,
>   	if (elfbuf)
>   		vfree(elfbuf);
>   }
> -#endif
> +
> +#endif /* CONFIG_CRASH_MAX_MEMORY_RANGES */
> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
> index a48577a36fb8..0f79ad4c4f80 100644
> --- a/include/linux/kexec.h
> +++ b/include/linux/kexec.h
> @@ -27,6 +27,19 @@ extern struct resource crashk_res;
>   extern struct resource crashk_low_res;
>   extern note_buf_t __percpu *crash_notes;
>   
> +/* Alignment required for elf header segment */
> +#define ELF_CORE_HEADER_ALIGN   4096
> +
> +struct crash_mem_range {
> +	u64 start, end;
> +};
> +
> +struct crash_mem {
> +	unsigned int max_nr_ranges;
> +	unsigned int nr_ranges;
> +	struct crash_mem_range ranges[];
> +};
> +
>   #ifdef CONFIG_KEXEC_CORE
>   #include <linux/list.h>
>   #include <linux/compat.h>
> @@ -237,19 +250,6 @@ static inline int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf)
>   }
>   #endif
>   
> -/* Alignment required for elf header segment */
> -#define ELF_CORE_HEADER_ALIGN   4096
> -
> -struct crash_mem_range {
> -	u64 start, end;
> -};
> -
> -struct crash_mem {
> -	unsigned int max_nr_ranges;
> -	unsigned int nr_ranges;
> -	struct crash_mem_range ranges[];
> -};
> -
>   extern int crash_exclude_mem_range(struct crash_mem *mem,
>   				   unsigned long long mstart,
>   				   unsigned long long mend);
> diff --git a/kernel/crash_core.c b/kernel/crash_core.c
> index 5bc5159d9cb1..f6b5d835f826 100644
> --- a/kernel/crash_core.c
> +++ b/kernel/crash_core.c
> @@ -622,6 +622,15 @@ static int __init crash_save_vmcoreinfo_init(void)
>   subsys_initcall(crash_save_vmcoreinfo_init);
>   
>   #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
> +
> +void __weak *arch_map_crash_pages(unsigned long paddr, unsigned long size)
> +{
> +	return NULL;
> +}
> +
> +void __weak arch_unmap_crash_pages(void **ptr) { }
> +void __weak arch_crash_handle_hotplug_event(struct kimage *image, unsigned int hp_action) { }
> +
I was asked by Baoquan He to eliminate the use of __weak, which I did. I followed the technique used 
by other kexec infrastructure.


>   /*
>    * To accurately reflect hot un/plug changes, the elfcorehdr (which
>    * is passed to the crash kernel via the elfcorehdr= parameter)
> 

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-09-30 15:36         ` Eric DeVolder
@ 2022-09-30 16:50           ` Borislav Petkov
  2022-09-30 17:11             ` Eric DeVolder
  2022-10-04  7:03           ` Sourabh Jain
  2022-10-04  9:10           ` Sourabh Jain
  2 siblings, 1 reply; 57+ messages in thread
From: Borislav Petkov @ 2022-09-30 16:50 UTC (permalink / raw)
  To: Eric DeVolder, Oscar Salvador, Andrew Morton, david
  Cc: linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal, tglx,
	mingo, dave.hansen, hpa, nramas, thomas.lendacky, robh, efault,
	rppt, sourabhjain, linux-mm

On Fri, Sep 30, 2022 at 10:36:49AM -0500, Eric DeVolder wrote:
> > Your help text talks about System RAM entries in /proc/iomem which means
> > that those entries are present somewhere in the kernel and you can read
> > them out and do the proper calculations dynamically instead of doing the
> > static CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES thing.
> 
> The intent is to compute the max size buffer needed to contain a maximum
> populated elfcorehdr, which is primarily based on the number of CPUs and
> memory regions. Thus far I (and others involved) have not found a kernel
> method to determine the maximum number of memory regions possible (if you
> are aware of one, please let me know!). Thus CONFIG_CRASH_MAX_MEMORY_RANGES
> was born (rather borrowed from kexec-tools).

Let's ask some mm folks.

mm folks, is there a way to enumerate all the memory regions a machine
has?

It looks to me like register_memory_resource() in mm/memory_hotplug.c
does register the resource so there should be a way to count that list
of resources or at least maintain a count somewhere so that kexec/crash
code can know how big its elfcorehdr buffer should be instead of doing a
clumsy Kconfig item where people would need to guess...

Hmm.
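
Something like this, maybe (untested sketch; note it counts what is
registered right now, using the same walker the x86 crash code already
uses for System RAM):

	static int count_ram_res(struct resource *res, void *arg)
	{
		(*(unsigned int *)arg)++;
		return 0;
	}

	unsigned int nr_ranges = 0;

	walk_system_ram_res(0, -1, &nr_ranges, count_ram_res);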

> > +#ifdef CONFIG_CRASH_MAX_MEMORY_RANGES
> So I think the use of CONFIG_CRASH_MAX_MEMORY_RANGES is not correct; it
> still needs to be based on the cpu or memory hotplug options.

You're kidding, right?

+config CRASH_MAX_MEMORY_RANGES
+	depends on CRASH_DUMP && KEXEC_FILE && (HOTPLUG_CPU || MEMORY_HOTPLUG)
	^^^^^^^^^^				^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

> > @@ -622,6 +622,15 @@ static int __init crash_save_vmcoreinfo_init(void)
> >   subsys_initcall(crash_save_vmcoreinfo_init);
> >   #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
> > +
> > +void __weak *arch_map_crash_pages(unsigned long paddr, unsigned long size)
> > +{
> > +	return NULL;
> > +}
> > +
> > +void __weak arch_unmap_crash_pages(void **ptr) { }
> > +void __weak arch_crash_handle_hotplug_event(struct kimage *image, unsigned int hp_action) { }
> > +
> I was asked by Baoquan He to eliminate the use of __weak

Because?

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-09-30 16:50           ` Borislav Petkov
@ 2022-09-30 17:11             ` Eric DeVolder
  2022-09-30 17:40               ` Borislav Petkov
  0 siblings, 1 reply; 57+ messages in thread
From: Eric DeVolder @ 2022-09-30 17:11 UTC (permalink / raw)
  To: Borislav Petkov, Oscar Salvador, Andrew Morton, david
  Cc: linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal, tglx,
	mingo, dave.hansen, hpa, nramas, thomas.lendacky, robh, efault,
	rppt, sourabhjain, linux-mm



On 9/30/22 11:50, Borislav Petkov wrote:
> On Fri, Sep 30, 2022 at 10:36:49AM -0500, Eric DeVolder wrote:
>>> Your help text talks about System RAM entries in /proc/iomem which means
>>> that those entries are present somewhere in the kernel and you can read
>>> them out and do the proper calculations dynamically instead of doing the
>>> static CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES thing.
>>
>> The intent is to compute the max size buffer needed to contain a maximum
>> populated elfcorehdr, which is primarily based on the number of CPUs and
>> memory regions. Thus far I (and others involved) have not found a kernel
>> method to determine the maximum number of memory regions possible (if you
>> are aware of one, please let me know!). Thus CONFIG_CRASH_MAX_MEMORY_RANGES
>> was born (rather borrowed from kexec-tools).
> 
> Let's ask some mm folks.
> 
> mm folks, is there a way to enumerate all the memory regions a machine
> has?
> 
> It looks to me like register_memory_resource() in mm/memory_hotplug.c
> does register the resource so there should be a way to count that list
> of resources or at least maintain a count somewhere so that kexec/crash
> code can know how big its elfcodehdr buffer should be instead of doing a
> clumsy Kconfig item where people would need to guess...
> 
> Hmm.
> 

There is of course a way to enumerate the memory regions in use on the machine; however, that is 
not what this code needs. In order to compute the maximum buffer size needed (this buffer size is 
computed once), what is needed is the count of the maximum number of memory regions possible (even 
if not currently in use).
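
To illustrate the distinction, with assumed x86_64 sparsemem numbers
(not values this patch relies on):

	#define SECTION_SIZE_BITS	27	/* 128 MiB memory sections */
	#define MAX_PHYSMEM_BITS	46	/* 64 TiB addressable */

	/* worst case: 2^(46 - 27) = 524288 possible memory sections */
	unsigned long max_ranges = 1UL << (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS);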

>>> +#ifdef CONFIG_CRASH_MAX_MEMORY_RANGES
>> So I think the use of CONFIG_CRASH_MAX_MEMORY_RANGES is not correct; it
>> still needs to be based on the cpu or memory hotplug options.
> 
> You're kidding, right?
> 
> +config CRASH_MAX_MEMORY_RANGES
> +	depends on CRASH_DUMP && KEXEC_FILE && (HOTPLUG_CPU || MEMORY_HOTPLUG)
> 	^^^^^^^^^^				^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Oh, that would be an error of haste on my part. This should be:
   depends on CRASH_DUMP && MEMORY_HOTPLUG

> 
>>> @@ -622,6 +622,15 @@ static int __init crash_save_vmcoreinfo_init(void)
>>>    subsys_initcall(crash_save_vmcoreinfo_init);
>>>    #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
>>> +
>>> +void __weak *arch_map_crash_pages(unsigned long paddr, unsigned long size)
>>> +{
>>> +	return NULL;
>>> +}
>>> +
>>> +void __weak arch_unmap_crash_pages(void **ptr) { }
>>> +void __weak arch_crash_handle_hotplug_event(struct kimage *image, unsigned int hp_action) { }
>>> +
>> I was asked by Baoquan He to eliminate the use of __weak
> 
> Because?
> 

Baoquan pointed me to:

https://lore.kernel.org/lkml/cover.1656659357.git.naveen.n.rao@linux.vnet.ibm.com/T/

Thanks,
eric

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-09-30 17:11             ` Eric DeVolder
@ 2022-09-30 17:40               ` Borislav Petkov
  2022-10-08  2:35                 ` Baoquan He
  2022-10-12 16:20                 ` Eric DeVolder
  0 siblings, 2 replies; 57+ messages in thread
From: Borislav Petkov @ 2022-09-30 17:40 UTC (permalink / raw)
  To: Eric DeVolder
  Cc: Oscar Salvador, Andrew Morton, david, linux-kernel, x86, kexec,
	ebiederm, dyoung, bhe, vgoyal, tglx, mingo, dave.hansen, hpa,
	nramas, thomas.lendacky, robh, efault, rppt, sourabhjain,
	linux-mm

On Fri, Sep 30, 2022 at 12:11:26PM -0500, Eric DeVolder wrote:
> There is of course a way to enumerate the memory regions in use on the
> machine, that is not what this code needs. In order to compute the maximum
> buffer size needed (this buffer size is computed once), the count of the
> maximum number of memory regions possible (even if not currently in use) is
> what is needed.

Isn't that max number documented somewhere in memory hotplug docs?

Because then you don't need that Kconfig item either. Imagine you're a
distro kernel distributor and you want crash to work on all machines
your kernel runs on.

So you go and set that number to max. And that would be 99% of the
kernel configs out there.

Which means, you can just set it to max without a Kconfig item.
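
IOW, something as simple as this (sketch; 32768 picked only to match
the kexec-tools value mentioned earlier in the thread):

	/* fixed worst case, no user-visible knob */
	#define CRASH_MAX_MEMORY_RANGES	32768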

> Oh, that would be an error of haste on my part. This should be:
>   depends on CRASH_DUMP && MEMORY_HOTPLUG

You need a Kconfig item which enables all this gunk as MEMORY_HOTPLUG is
not a omnipresent feature. And that Kconfig item should depend on the
other Kconfig items of the technology you need.
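
Shaped roughly like this (Kconfig sketch; the option name is made up):

	config CRASH_HOTPLUG
		bool "Update the crash elfcorehdr on CPU/memory hotplug"
		depends on CRASH_DUMP && (HOTPLUG_CPU || MEMORY_HOTPLUG)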

> Baoquan pointed me to:
> 
> https://lore.kernel.org/lkml/cover.1656659357.git.naveen.n.rao@linux.vnet.ibm.com/T/

In that thread says:

"- arch_kexec_apply_relocations_add() is only overridden by x86 and s390.
  Retain the function prototype for those and move the weak
  implementation into the header as a static inline for other
  architectures."

So yes, that's even better.
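
I.e., the header side becomes (sketch; the guard macro name is
hypothetical):

	#ifdef CONFIG_ARCH_HAS_CRASH_HOTPLUG
	void arch_crash_handle_hotplug_event(struct kimage *image, unsigned int hp_action);
	#else
	static inline void arch_crash_handle_hotplug_event(struct kimage *image,
							   unsigned int hp_action) { }
	#endif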

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 3/7] crash: add generic infrastructure for crash hotplug support
  2022-09-09 21:05 ` [PATCH v12 3/7] crash: add generic infrastructure for crash hotplug support Eric DeVolder
@ 2022-10-03 17:51   ` Sourabh Jain
  2022-10-07 19:14     ` Eric DeVolder
  2022-10-04  6:38   ` Sourabh Jain
  1 sibling, 1 reply; 57+ messages in thread
From: Sourabh Jain @ 2022-10-03 17:51 UTC (permalink / raw)
  To: Eric DeVolder, linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal
  Cc: tglx, mingo, bp, dave.hansen, hpa, nramas, thomas.lendacky, robh,
	efault, rppt, david, konrad.wilk, boris.ostrovsky

Hello Eric,

On 10/09/22 02:35, Eric DeVolder wrote:
> CPU and memory change notifications are received in order to
> regenerate the elfcorehdr.
>
> To support cpu hotplug, a callback is registered to capture the
> CPUHP_AP_ONLINE_DYN online and offline events via
> cpuhp_setup_state_nocalls().
>
> To support memory hotplug, a notifier is registered to capture the
> MEM_ONLINE and MEM_OFFLINE events via register_memory_notifier().
>
> The cpu callback and memory notifiers call handle_hotplug_event()
> which performs needed tasks and then dispatches the event to the
> architecture specific arch_crash_handle_hotplug_event(). During the
> process, the kexec_mutex is held.
>
> Signed-off-by: Eric DeVolder <eric.devolder@oracle.com>
> Acked-by: Baoquan He <bhe@redhat.com>
> ---
>   include/linux/crash_core.h |   8 +++
>   include/linux/kexec.h      |  26 +++++++
>   kernel/crash_core.c        | 134 +++++++++++++++++++++++++++++++++++++
>   3 files changed, 168 insertions(+)
>
> diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
> index de62a722431e..a270f8660538 100644
> --- a/include/linux/crash_core.h
> +++ b/include/linux/crash_core.h
> @@ -84,4 +84,12 @@ int parse_crashkernel_high(char *cmdline, unsigned long long system_ram,
>   int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
>   		unsigned long long *crash_size, unsigned long long *crash_base);
>   
> +#define KEXEC_CRASH_HP_REMOVE_CPU		0
> +#define KEXEC_CRASH_HP_ADD_CPU			1
> +#define KEXEC_CRASH_HP_REMOVE_MEMORY		2
> +#define KEXEC_CRASH_HP_ADD_MEMORY		3
> +#define KEXEC_CRASH_HP_INVALID_CPU		-1U
> +
> +struct kimage;
> +
>   #endif /* LINUX_CRASH_CORE_H */
> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
> index 4eefa631e0ae..9597b41136ec 100644
> --- a/include/linux/kexec.h
> +++ b/include/linux/kexec.h
> @@ -374,6 +374,13 @@ struct kimage {
>   	struct purgatory_info purgatory_info;
>   #endif
>   
> +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
> +	bool hotplug_event;
> +	unsigned int offlinecpu;
> +	bool elfcorehdr_index_valid;
> +	int elfcorehdr_index;
> +#endif
> +
>   #ifdef CONFIG_IMA_KEXEC
>   	/* Virtual address of IMA measurement buffer for kexec syscall */
>   	void *ima_buffer;
> @@ -503,6 +510,25 @@ static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, g
>   static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { }
>   #endif
>   
> +#ifndef arch_map_crash_pages
> +static inline void *arch_map_crash_pages(unsigned long paddr,
> +		unsigned long size)
> +{
> +	return NULL;
> +}
> +#endif
> +
> +#ifndef arch_unmap_crash_pages
> +static inline void arch_unmap_crash_pages(void **ptr) { }
> +#endif
> +
> +#ifndef arch_crash_handle_hotplug_event
> +static inline void arch_crash_handle_hotplug_event(struct kimage *image,
> +		unsigned int hp_action)
> +{
> +}
> +#endif
> +
>   #else /* !CONFIG_KEXEC_CORE */
>   struct pt_regs;
>   struct task_struct;
> diff --git a/kernel/crash_core.c b/kernel/crash_core.c
> index 8c648fd5897a..4b15d91f0b21 100644
> --- a/kernel/crash_core.c
> +++ b/kernel/crash_core.c
> @@ -11,6 +11,8 @@
>   #include <linux/vmalloc.h>
>   #include <linux/sizes.h>
>   #include <linux/kexec.h>
> +#include <linux/memory.h>
> +#include <linux/cpuhotplug.h>
>   
>   #include <asm/page.h>
>   #include <asm/sections.h>
> @@ -18,6 +20,7 @@
>   #include <crypto/sha1.h>
>   
>   #include "kallsyms_internal.h"
> +#include "kexec_internal.h"
>   
>   /* vmcoreinfo stuff */
>   unsigned char *vmcoreinfo_data;
> @@ -612,3 +615,134 @@ static int __init crash_save_vmcoreinfo_init(void)
>   }
>   
>   subsys_initcall(crash_save_vmcoreinfo_init);
> +
> +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
> +/*
> + * To accurately reflect hot un/plug changes, the elfcorehdr (which
> + * is passed to the crash kernel via the elfcorehdr= parameter)
> + * must be updated with the new list of CPUs and memories.
> + *
> + * In order to make changes to elfcorehdr, two conditions are needed:
> + * First, the segment containing the elfcorehdr must be large enough
> + * to permit a growing number of resources. The elfcorehdr memory is
> + * typically based on CONFIG_NR_CPUS and CONFIG_CRASH_MAX_MEMORY_RANGES.
> + * Second, purgatory must explicitly exclude the elfcorehdr from the
> + * list of segments it checks (since the elfcorehdr changes and thus
> + * would require an update to purgatory itself to update the digest).
> + */
> +static void handle_hotplug_event(unsigned int hp_action, unsigned int cpu)
> +{
> +	/* Obtain lock while changing crash information */
> +	mutex_lock(&kexec_mutex);
> +
> +	/* Check kdump is loaded */
> +	if (kexec_crash_image) {
> +		struct kimage *image = kexec_crash_image;
> +
> +		if (hp_action == KEXEC_CRASH_HP_ADD_CPU ||
> +			hp_action == KEXEC_CRASH_HP_REMOVE_CPU)
> +			pr_debug("crash hp: hp_action %u, cpu %u\n", hp_action, cpu);
> +		else
> +			pr_debug("crash hp: hp_action %u\n", hp_action);
> +
> +		/*
> +		 * When the struct kimage is allocated, it is wiped to zero, so
> +		 * the elfcorehdr_index_valid defaults to false. Find the
> +		 * segment containing the elfcorehdr, if not already found.
> +		 * This works for both the kexec_load and kexec_file_load paths.
> +		 */
> +		if (!image->elfcorehdr_index_valid) {
> +			unsigned char *ptr;
> +			unsigned long mem, memsz;
> +			unsigned int n;
> +
> +			for (n = 0; n < image->nr_segments; n++) {
> +				mem = image->segment[n].mem;
> +				memsz = image->segment[n].memsz;
> +				ptr = arch_map_crash_pages(mem, memsz);
> +				if (ptr) {
> +					/* The segment containing elfcorehdr */
> +					if (memcmp(ptr, ELFMAG, SELFMAG) == 0) {
> +						image->elfcorehdr_index = (int)n;
> +						image->elfcorehdr_index_valid = true;
> +					}
> +				}
> +				arch_unmap_crash_pages((void **)&ptr);
> +			}
> +		}
> +
> +		if (!image->elfcorehdr_index_valid) {
> +			pr_err("crash hp: unable to locate elfcorehdr segment");
> +			goto out;
> +		}
> +
> +		/* Needed in order for the segments to be updated */
> +		arch_kexec_unprotect_crashkres();
> +
> +		/* Flag to differentiate between normal load and hotplug */
> +		image->hotplug_event = true;
> +
> +		/* Now invoke arch-specific update handler */
> +		arch_crash_handle_hotplug_event(image, hp_action);
> +
> +		/* No longer handling a hotplug event */
> +		image->hotplug_event = false;
> +
> +		/* Change back to read-only */
> +		arch_kexec_protect_crashkres();
> +	}
> +
> +out:
> +	/* Release lock now that update complete */
> +	mutex_unlock(&kexec_mutex);
> +}
> +
> +static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v)
> +{
> +	switch (val) {
> +	case MEM_ONLINE:
> +		handle_hotplug_event(KEXEC_CRASH_HP_ADD_MEMORY, 0);
> +		break;
> +
> +	case MEM_OFFLINE:
> +		handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_MEMORY, 0);
> +		break;
> +	}
> +	return NOTIFY_OK;

Can we pass the v (memory_notify) argument to the
arch_crash_handle_hotplug_event function via handle_hotplug_event?

Because of the way memory hotplug is handled on PowerPC, it is hard
to update the elfcorehdr without the memory_notify args.

On PowerPC the memblock data structure is used to prepare the
elfcorehdr for kdump. Since the notifier used for the memory hotplug
crash handler gets initiated before the memblock data structure update
happens (as depicted below), the newly prepared elfcorehdr still holds
the old memory regions. So if the system crashes with an obsolete
elfcorehdr, makedumpfile fails to collect the vmcore.

Sequence of actions done on PowerPC to serve a memory hotplug event:

  Initiate memory hot remove
           |
           v
  offline pages
           |
           v
  initiate memory notify call chain
  for MEM_OFFLINE event.
  (same is used for crash update)
           |
           v
  prepare new elfcorehdr for kdump using
  memblock data structure
           |
           v
  update memblock data structure

How will passing memory_notify to the arch crash hotplug handler help?

memory_notify holds the start PFN and page count; with that we can get
the base address and size of the hot-unplugged memory and use them to
keep the hot-unplugged memory region from being added to the elfcorehdr.
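
E.g., a sketch of the conversion (struct memory_notify fields as in
include/linux/memory.h):

	struct memory_notify *mn = v;
	unsigned long base = PFN_PHYS(mn->start_pfn);
	unsigned long size = mn->nr_pages << PAGE_SHIFT;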

Thanks,
Sourabh Jain


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 3/7] crash: add generic infrastructure for crash hotplug support
  2022-09-09 21:05 ` [PATCH v12 3/7] crash: add generic infrastructure for crash hotplug support Eric DeVolder
  2022-10-03 17:51   ` Sourabh Jain
@ 2022-10-04  6:38   ` Sourabh Jain
  2022-10-07 19:19     ` Eric DeVolder
  1 sibling, 1 reply; 57+ messages in thread
From: Sourabh Jain @ 2022-10-04  6:38 UTC (permalink / raw)
  To: Eric DeVolder, linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal
  Cc: tglx, mingo, bp, dave.hansen, hpa, nramas, thomas.lendacky, robh,
	efault, rppt, david, konrad.wilk, boris.ostrovsky


On 10/09/22 02:35, Eric DeVolder wrote:
> CPU and memory change notifications are received in order to
> regenerate the elfcorehdr.
>
> To support cpu hotplug, a callback is registered to capture the
> CPUHP_AP_ONLINE_DYN online and offline events via
> cpuhp_setup_state_nocalls().
>
> To support memory hotplug, a notifier is registered to capture the
> MEM_ONLINE and MEM_OFFLINE events via register_memory_notifier().
>
> The cpu callback and memory notifiers call handle_hotplug_event()
> which performs needed tasks and then dispatches the event to the
> architecture specific arch_crash_handle_hotplug_event(). During the
> process, the kexec_mutex is held.
>
> Signed-off-by: Eric DeVolder <eric.devolder@oracle.com>
> Acked-by: Baoquan He <bhe@redhat.com>
> ---
>   include/linux/crash_core.h |   8 +++
>   include/linux/kexec.h      |  26 +++++++
>   kernel/crash_core.c        | 134 +++++++++++++++++++++++++++++++++++++
>   3 files changed, 168 insertions(+)
>
> diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
> index de62a722431e..a270f8660538 100644
> --- a/include/linux/crash_core.h
> +++ b/include/linux/crash_core.h
> @@ -84,4 +84,12 @@ int parse_crashkernel_high(char *cmdline, unsigned long long system_ram,
>   int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
>   		unsigned long long *crash_size, unsigned long long *crash_base);
>   
> +#define KEXEC_CRASH_HP_REMOVE_CPU		0
> +#define KEXEC_CRASH_HP_ADD_CPU			1
> +#define KEXEC_CRASH_HP_REMOVE_MEMORY		2
> +#define KEXEC_CRASH_HP_ADD_MEMORY		3
> +#define KEXEC_CRASH_HP_INVALID_CPU		-1U
> +
> +struct kimage;
> +
>   #endif /* LINUX_CRASH_CORE_H */
> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
> index 4eefa631e0ae..9597b41136ec 100644
> --- a/include/linux/kexec.h
> +++ b/include/linux/kexec.h
> @@ -374,6 +374,13 @@ struct kimage {
>   	struct purgatory_info purgatory_info;
>   #endif
>   
> +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
> +	bool hotplug_event;
> +	unsigned int offlinecpu;
> +	bool elfcorehdr_index_valid;
> +	int elfcorehdr_index;

Do we really need elfcorehdr_index_valid to decide whether
elfcorehdr_index holds a valid index?

How about initializing elfcorehdr_index to a negative number while
loading the kdump kernel (or the kexec kernel if needed), for both the
kexec_load and kexec_file_load cases, and treating that as the invalid
index until the correct one is found?

Something like this:

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 5bc5159d9cb1..0cccdb2f7f26 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -656,7 +656,7 @@ static void handle_hotplug_event(unsigned int hp_action, unsigned int cpu)
                  * segment containing the elfcorehdr, if not already found.
                  * This works for both the kexec_load and kexec_file_load paths.
                  */
-               if (!image->elfcorehdr_index_valid) {
+               if (image->elfcorehdr_index < 0) {
                         unsigned char *ptr;
                         unsigned long mem, memsz;
                         unsigned int n;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index b5e40f069768..ed1c6a88879b 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -156,6 +156,10 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
         if (ret)
                 goto out;

+       /* Below check is not necessary */
+       if (flags & KEXEC_ON_CRASH)
+               image->elfcorehdr_index = -1;
+
         /* Install the new kernel and uninstall the old */
         image = xchg(dest_image, image);

diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index d0c2661b3509..535dbc26930a 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -400,6 +400,10 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
         if (ret)
                 goto out;

+       /* Below check is not necessary */
+       if (flags & KEXEC_FILE_ON_CRASH)
+               image->elfcorehdr_index = -1;
+
         /*
          * Free up any temporary buffers allocated which are not needed
          * after image has been loaded

Thanks,
Sourabh Jain


^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-09-30 15:36         ` Eric DeVolder
  2022-09-30 16:50           ` Borislav Petkov
@ 2022-10-04  7:03           ` Sourabh Jain
  2022-10-07 19:56             ` Eric DeVolder
  2022-10-04  9:10           ` Sourabh Jain
  2 siblings, 1 reply; 57+ messages in thread
From: Sourabh Jain @ 2022-10-04  7:03 UTC (permalink / raw)
  To: Eric DeVolder, Borislav Petkov
  Cc: linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal, tglx,
	mingo, dave.hansen, hpa, nramas, thomas.lendacky, robh, efault,
	rppt, david, konrad.wilk, boris.ostrovsky


On 30/09/22 21:06, Eric DeVolder wrote:
>
>
> On 9/28/22 11:07, Borislav Petkov wrote:
>> On Tue, Sep 13, 2022 at 02:12:31PM -0500, Eric DeVolder wrote:
>>> This topic was discussed previously https://lkml.org/lkml/2022/3/3/372.
>>
>> Please do not use lkml.org to refer to lkml messages. We have a
>> perfectly fine archival system at lore.kernel.org. You simply do
>>
>> https://lore.kernel.org/r/<Message-ID>
>>
>> when you want to point to a previous mail.
>
> ok, thanks for pointing that out to me.
>>
>>> David points out that terminology is tricky here due to differing 
>>> behaviors.
>>> And perhaps that is your point in asking for guidance text. It can be
>>> complicated
>>
>> Which means you need an explanation how to use this even more.
>>
>> And why is CONFIG_CRASH_MAX_MEMORY_RANGES even a Kconfig item and not
>> something you discover from the hardware?
>
> No, is the short answer.
>
>>
>> Your help text talks about System RAM entries in /proc/iomem which means
>> that those entries are present somewhere in the kernel and you can read
>> them out and do the proper calculations dynamically instead of doing the
>> static CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES thing.
>
> The intent is to compute the max size buffer needed to contain a 
> maximum populated elfcorehdr, which is primarily based on the number 
> of CPUs and memory regions. Thus far I (and others involved) have not 
> found a kernel method to determine the maximum number of memory 
> regions possible (if you are aware of one, please let me know!). Thus 
> CONFIG_CRASH_MAX_MEMORY_RANGES was born (rather borrowed from 
> kexec-tools).
>
> So no dynamic computation is possible, yet.

Hello Eric,

How about allocating buffer space for the maximum number of program
headers possible in an elfcorehdr?

image->elf_headers_sz = kbuf.memsz = PN_XNUM * sizeof(Elf64_Phdr);

PN_XNUM is part of linux/elf.h (include/uapi/linux/elf.h).

Refer below link for more details:
https://man7.org/linux/man-pages/man5/elf.5.html
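
For scale, sizeof(Elf64_Phdr) is 56 bytes, so even this worst case
stays modest:

	/* PN_XNUM (0xffff) is the largest e_phnum before extended
	 * numbering kicks in, so 65535 phdrs is a natural upper bound:
	 * 65535 * 56 bytes = 3,669,960 bytes, ~3.5 MiB. */
	unsigned long worst_case = PN_XNUM * sizeof(Elf64_Phdr);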

Thanks,
Sourabh Jain


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-09-30 15:36         ` Eric DeVolder
  2022-09-30 16:50           ` Borislav Petkov
  2022-10-04  7:03           ` Sourabh Jain
@ 2022-10-04  9:10           ` Sourabh Jain
  2022-10-07 20:00             ` Eric DeVolder
  2 siblings, 1 reply; 57+ messages in thread
From: Sourabh Jain @ 2022-10-04  9:10 UTC (permalink / raw)
  To: Eric DeVolder, Borislav Petkov
  Cc: linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal, tglx,
	mingo, dave.hansen, hpa, nramas, thomas.lendacky, robh, efault,
	rppt, david, konrad.wilk, boris.ostrovsky


On 30/09/22 21:06, Eric DeVolder wrote:
>
>
> On 9/28/22 11:07, Borislav Petkov wrote:
>> On Tue, Sep 13, 2022 at 02:12:31PM -0500, Eric DeVolder wrote:
>>> This topic was discussed previously https://lkml.org/lkml/2022/3/3/372.
>>
>> Please do not use lkml.org to refer to lkml messages. We have a
>> perfectly fine archival system at lore.kernel.org. You simply do
>>
>> https://lore.kernel.org/r/<Message-ID>
>>
>> when you want to point to a previous mail.
>
> ok, thanks for pointing that out to me.
>>
>>> David points out that terminology is tricky here due to differing 
>>> behaviors.
>>> And perhaps that is your point in asking for guidance text. It can be
>>> complicated
>>
>> Which means you need an explanation how to use this even more.
>>
>> And why is CONFIG_CRASH_MAX_MEMORY_RANGES even a Kconfig item and not
>> something you discover from the hardware?
>
> No, is the short answer.
>
>>
>> Your help text talks about System RAM entries in /proc/iomem which means
>> that those entries are present somewhere in the kernel and you can read
>> them out and do the proper calculations dynamically instead of doing the
>> static CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES thing.
>
> The intent is to compute the max size buffer needed to contain a 
> maximum populated elfcorehdr, which is primarily based on the number 
> of CPUs and memory regions. Thus far I (and others involved) have not 
> found a kernel method to determine the maximum number of memory 
> regions possible (if you are aware of one, please let me know!). Thus 
> CONFIG_CRASH_MAX_MEMORY_RANGES was born (rather borrowed from 
> kexec-tools).
>
> So no dynamic computation is possible, yet.
>
>>
>>> , but it all comes down to System RAM entries.
>>>
>>> I could perhaps offer an overly simplified example such that for 
>>> 1GiB block
>>> size, for example, the CRASH_MAX_MEMORY_RANGES of 32768 would allow 
>>> for 32TiB
>>> of memory?
>>
>> Yes, and stick it somewhere in Documentation/admin-guide/kdump/ and
>> refer to it in that help text so that people can find it and read how to
>> use your new option.
>>
> ok
>
>>> The kbuf.bufsz value is obtained via a call to 
>>> prepare_elf_headers(); I can
>>> not initialize it at its declaration.
>>
>> Sorry, I meant this:
>>
>> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
>> index 8fc7d678ac72..ee6fd9f1b2b9 100644
>> --- a/arch/x86/kernel/crash.c
>> +++ b/arch/x86/kernel/crash.c
>> @@ -395,8 +395,9 @@ int crash_load_segments(struct kimage *image)
>>       if (ret)
>>           return ret;
>>   -    image->elf_headers = kbuf.buffer;
>> -    image->elf_headers_sz = kbuf.bufsz;
>> +    image->elf_headers    = kbuf.buffer;
>> +    image->elf_headers_sz    = kbuf.bufsz;
>> +    kbuf.memsz        = kbuf.bufsz;
>>     #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
>>       /* Ensure elfcorehdr segment large enough for hotplug changes */
>> @@ -407,9 +408,8 @@ int crash_load_segments(struct kimage *image)
>>       image->elf_headers_sz = kbuf.memsz;
>>       image->elfcorehdr_index = image->nr_segments;
>>       image->elfcorehdr_index_valid = true;
>> -#else
>> -    kbuf.memsz = kbuf.bufsz;
>>   #endif
>> +
>>       kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
>>       kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
>>       ret = kexec_add_buffer(&kbuf);
>>
> ok
>
>>> I'm at a loss as to what to do differently here. You've raised this 
>>> issue
>>> before and I went back and looked at the suggestions then and I 
>>> don't see
>>> how that applies to this situation. How is this situation different 
>>> than the
>>> #ifdef CONFIG_KEXEC_FILE that immediately precedes it?
>>
>> See the diff at the end. I'm not saying this is how you should do it
>> but it should give you a better idea. The logic being, the functions
>> in the .c file don't really need ifdeffery around them - you're adding
>> 1-2 functions and crash.c is not that big - so they can be built in
>> unconditionally. You'd need the ifdeffery *in the header only* when
>> crash.c is not being built.
> ok; I've overlooked that scenario.
>>
>> But I've done it with ifdeffery in the .c file now because yes, the
>> kexec code is a minefield of ifdeffery. Hell, there's ifdeffery even in
>> the headers for structs. Ifdeffery you don't really need. Someone should
>> clean that up and simplify this immensely.
>
> ok
>
>>
>>> Currently there is a concurrent effort for PPC support by Sourabh
>>> Jain, and in that effort arch_map_crash_pages() is using __va(paddr).
>>
>> Why?
>>
>>> I do not know the nuances between kmap_local_page() and __va() to
>>> answer the question.
>>
>> kmap_local_page() is a generic interface and it should work on any arch.
>>
>> And it is documented even:
>>
>> $ git grep kmap_local_page Documentation/
>>
>>> If kmap_local_page() works for all archs, then I'm happy to drop these
>>> arch_ variants and use it directly.
>>
>> Yes, pls do.
>
> I'll check with Sourabh to see if PPC can work with kmap_local_page().
I think kmap_local_page() is supported on PowerPC. But can you explain
why we need this function here; isn't the reserved memory already
available to use?

Thanks,
Sourabh Jain

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 3/7] crash: add generic infrastructure for crash hotplug support
  2022-10-03 17:51   ` Sourabh Jain
@ 2022-10-07 19:14     ` Eric DeVolder
  2022-10-17  6:45       ` Sourabh Jain
  2022-10-24  9:10       ` Baoquan He
  0 siblings, 2 replies; 57+ messages in thread
From: Eric DeVolder @ 2022-10-07 19:14 UTC (permalink / raw)
  To: Sourabh Jain, linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal
  Cc: tglx, mingo, bp, dave.hansen, hpa, nramas, thomas.lendacky, robh,
	efault, rppt, david, konrad.wilk, boris.ostrovsky



On 10/3/22 12:51, Sourabh Jain wrote:
> Hello Eric,
> 
> On 10/09/22 02:35, Eric DeVolder wrote:
>> CPU and memory change notifications are received in order to
>> regenerate the elfcorehdr.
>>
>> To support cpu hotplug, a callback is registered to capture the
>> CPUHP_AP_ONLINE_DYN online and offline events via
>> cpuhp_setup_state_nocalls().
>>
>> To support memory hotplug, a notifier is registered to capture the
>> MEM_ONLINE and MEM_OFFLINE events via register_memory_notifier().
>>
>> The cpu callback and memory notifiers call handle_hotplug_event()
>> which performs needed tasks and then dispatches the event to the
>> architecture specific arch_crash_handle_hotplug_event(). During the
>> process, the kexec_mutex is held.
>>
>> Signed-off-by: Eric DeVolder <eric.devolder@oracle.com>
>> Acked-by: Baoquan He <bhe@redhat.com>
>> ---
>>   include/linux/crash_core.h |   8 +++
>>   include/linux/kexec.h      |  26 +++++++
>>   kernel/crash_core.c        | 134 +++++++++++++++++++++++++++++++++++++
>>   3 files changed, 168 insertions(+)
>>
>> diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
>> index de62a722431e..a270f8660538 100644
>> --- a/include/linux/crash_core.h
>> +++ b/include/linux/crash_core.h
>> @@ -84,4 +84,12 @@ int parse_crashkernel_high(char *cmdline, unsigned long long system_ram,
>>   int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
>>           unsigned long long *crash_size, unsigned long long *crash_base);
>> +#define KEXEC_CRASH_HP_REMOVE_CPU        0
>> +#define KEXEC_CRASH_HP_ADD_CPU            1
>> +#define KEXEC_CRASH_HP_REMOVE_MEMORY        2
>> +#define KEXEC_CRASH_HP_ADD_MEMORY        3
>> +#define KEXEC_CRASH_HP_INVALID_CPU        -1U
>> +
>> +struct kimage;
>> +
>>   #endif /* LINUX_CRASH_CORE_H */
>> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
>> index 4eefa631e0ae..9597b41136ec 100644
>> --- a/include/linux/kexec.h
>> +++ b/include/linux/kexec.h
>> @@ -374,6 +374,13 @@ struct kimage {
>>       struct purgatory_info purgatory_info;
>>   #endif
>> +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
>> +    bool hotplug_event;
>> +    unsigned int offlinecpu;
>> +    bool elfcorehdr_index_valid;
>> +    int elfcorehdr_index;
>> +#endif
>> +
>>   #ifdef CONFIG_IMA_KEXEC
>>       /* Virtual address of IMA measurement buffer for kexec syscall */
>>       void *ima_buffer;
>> @@ -503,6 +510,25 @@ static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, g
>>   static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { }
>>   #endif
>> +#ifndef arch_map_crash_pages
>> +static inline void *arch_map_crash_pages(unsigned long paddr,
>> +        unsigned long size)
>> +{
>> +    return NULL;
>> +}
>> +#endif
>> +
>> +#ifndef arch_unmap_crash_pages
>> +static inline void arch_unmap_crash_pages(void **ptr) { }
>> +#endif
>> +
>> +#ifndef arch_crash_handle_hotplug_event
>> +static inline void arch_crash_handle_hotplug_event(struct kimage *image,
>> +        unsigned int hp_action)
>> +{
>> +}
>> +#endif
>> +
>>   #else /* !CONFIG_KEXEC_CORE */
>>   struct pt_regs;
>>   struct task_struct;
>> diff --git a/kernel/crash_core.c b/kernel/crash_core.c
>> index 8c648fd5897a..4b15d91f0b21 100644
>> --- a/kernel/crash_core.c
>> +++ b/kernel/crash_core.c
>> @@ -11,6 +11,8 @@
>>   #include <linux/vmalloc.h>
>>   #include <linux/sizes.h>
>>   #include <linux/kexec.h>
>> +#include <linux/memory.h>
>> +#include <linux/cpuhotplug.h>
>>   #include <asm/page.h>
>>   #include <asm/sections.h>
>> @@ -18,6 +20,7 @@
>>   #include <crypto/sha1.h>
>>   #include "kallsyms_internal.h"
>> +#include "kexec_internal.h"
>>   /* vmcoreinfo stuff */
>>   unsigned char *vmcoreinfo_data;
>> @@ -612,3 +615,134 @@ static int __init crash_save_vmcoreinfo_init(void)
>>   }
>>   subsys_initcall(crash_save_vmcoreinfo_init);
>> +
>> +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
>> +/*
>> + * To accurately reflect hot un/plug changes, the elfcorehdr (which
>> + * is passed to the crash kernel via the elfcorehdr= parameter)
>> + * must be updated with the new list of CPUs and memories.
>> + *
>> + * In order to make changes to elfcorehdr, two conditions are needed:
>> + * First, the segment containing the elfcorehdr must be large enough
>> + * to permit a growing number of resources. The elfcorehdr memory is
>> + * typically based on CONFIG_NR_CPUS and CONFIG_CRASH_MAX_MEMORY_RANGES.
>> + * Second, purgatory must explicitly exclude the elfcorehdr from the
>> + * list of segments it checks (since the elfcorehdr changes and thus
>> + * would require an update to purgatory itself to update the digest).
>> + */
>> +static void handle_hotplug_event(unsigned int hp_action, unsigned int cpu)
>> +{
>> +    /* Obtain lock while changing crash information */
>> +    mutex_lock(&kexec_mutex);
>> +
>> +    /* Check kdump is loaded */
>> +    if (kexec_crash_image) {
>> +        struct kimage *image = kexec_crash_image;
>> +
>> +        if (hp_action == KEXEC_CRASH_HP_ADD_CPU ||
>> +            hp_action == KEXEC_CRASH_HP_REMOVE_CPU)
>> +            pr_debug("crash hp: hp_action %u, cpu %u\n", hp_action, cpu);
>> +        else
>> +            pr_debug("crash hp: hp_action %u\n", hp_action);
>> +
>> +        /*
>> +         * When the struct kimage is allocated, it is wiped to zero, so
>> +         * the elfcorehdr_index_valid defaults to false. Find the
>> +         * segment containing the elfcorehdr, if not already found.
>> +         * This works for both the kexec_load and kexec_file_load paths.
>> +         */
>> +        if (!image->elfcorehdr_index_valid) {
>> +            unsigned char *ptr;
>> +            unsigned long mem, memsz;
>> +            unsigned int n;
>> +
>> +            for (n = 0; n < image->nr_segments; n++) {
>> +                mem = image->segment[n].mem;
>> +                memsz = image->segment[n].memsz;
>> +                ptr = arch_map_crash_pages(mem, memsz);
>> +                if (ptr) {
>> +                    /* The segment containing elfcorehdr */
>> +                    if (memcmp(ptr, ELFMAG, SELFMAG) == 0) {
>> +                        image->elfcorehdr_index = (int)n;
>> +                        image->elfcorehdr_index_valid = true;
>> +                    }
>> +                }
>> +                arch_unmap_crash_pages((void **)&ptr);
>> +            }
>> +        }
>> +
>> +        if (!image->elfcorehdr_index_valid) {
>> +            pr_err("crash hp: unable to locate elfcorehdr segment");
>> +            goto out;
>> +        }
>> +
>> +        /* Needed in order for the segments to be updated */
>> +        arch_kexec_unprotect_crashkres();
>> +
>> +        /* Flag to differentiate between normal load and hotplug */
>> +        image->hotplug_event = true;
>> +
>> +        /* Now invoke arch-specific update handler */
>> +        arch_crash_handle_hotplug_event(image, hp_action);
>> +
>> +        /* No longer handling a hotplug event */
>> +        image->hotplug_event = false;
>> +
>> +        /* Change back to read-only */
>> +        arch_kexec_protect_crashkres();
>> +    }
>> +
>> +out:
>> +    /* Release lock now that update complete */
>> +    mutex_unlock(&kexec_mutex);
>> +}
>> +
>> +static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v)
>> +{
>> +    switch (val) {
>> +    case MEM_ONLINE:
>> +        handle_hotplug_event(KEXEC_CRASH_HP_ADD_MEMORY, 0);
>> +        break;
>> +
>> +    case MEM_OFFLINE:
>> +        handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_MEMORY, 0);
>> +        break;
>> +    }
>> +    return NOTIFY_OK;
> 
> Can we pass the v (memory_notify) argument to the arch_crash_handle_hotplug_event function
> via handle_hotplug_event?
> 
> Because of the way memory hotplug is handled on PowerPC, it is hard to update the elfcorehdr
> without the memory_notify args.
> 
> On PowerPC the memblock data structure is used to prepare the elfcorehdr for kdump. Since the notifier
> used for the memory hotplug crash handler gets initiated before the memblock data structure update
> happens (as depicted below), the newly prepared elfcorehdr still holds the old memory regions.
> So if the system crashes with an obsolete elfcorehdr, makedumpfile fails to collect the vmcore.
> 
> Sequence of actions done on PowerPC to serve a memory hotplug event:
> 
>   Initiate memory hot remove
>            |
>            v
>   offline pages
>            |
>            v
>   initiate memory notify call chain
>   for MEM_OFFLINE event.
>   (same is used for crash update)
>            |
>            v
>   prepare new elfcorehdr for kdump using
>   memblock data structure
>            |
>            v
>   update memblock data structure
> 
> How will passing memory_notify to the arch crash hotplug handler help?
> 
> memory_notify holds the start PFN and page count; with that we can get
> the base address and size of the hot-unplugged memory and use them to
> keep the hot-unplugged memory region from being added to the elfcorehdr.
> 
> Thanks,
> Sourabh Jain
> 

Sourabh, let's see what Baoquan thinks.

Baoquan, are you OK with this request? I once had these parameters to the
crash hotplug handler and since they were unused at the time, you asked
that I remove them, which I did.

To accommodate this, how about this:

static void handle_hotplug_event(unsigned int hp_action, unsigned int cpu,
      unsigned long mem_start, unsigned long mem_size)

For CPU events, I would just pass zeros for mem_start/size. For memory events,
I would pass KEXEC_CRASH_HP_INVALID_CPU.
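
The memory notifier side would then look something like this (sketch,
mirroring the existing crash_memhp_notifier with the two new args
filled in):

	static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v)
	{
		struct memory_notify *mn = v;
		unsigned long start = PFN_PHYS(mn->start_pfn);
		unsigned long size  = mn->nr_pages << PAGE_SHIFT;

		switch (val) {
		case MEM_ONLINE:
			handle_hotplug_event(KEXEC_CRASH_HP_ADD_MEMORY,
					     KEXEC_CRASH_HP_INVALID_CPU, start, size);
			break;
		case MEM_OFFLINE:
			handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_MEMORY,
					     KEXEC_CRASH_HP_INVALID_CPU, start, size);
			break;
		}
		return NOTIFY_OK;
	}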

Thanks,
eric

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 3/7] crash: add generic infrastructure for crash hotplug support
  2022-10-04  6:38   ` Sourabh Jain
@ 2022-10-07 19:19     ` Eric DeVolder
  0 siblings, 0 replies; 57+ messages in thread
From: Eric DeVolder @ 2022-10-07 19:19 UTC (permalink / raw)
  To: Sourabh Jain, linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal
  Cc: tglx, mingo, bp, dave.hansen, hpa, nramas, thomas.lendacky, robh,
	efault, rppt, david, konrad.wilk, boris.ostrovsky



On 10/4/22 01:38, Sourabh Jain wrote:
> 
> On 10/09/22 02:35, Eric DeVolder wrote:
>> CPU and memory change notifications are received in order to
>> regenerate the elfcorehdr.
>>
>> To support cpu hotplug, a callback is registered to capture the
>> CPUHP_AP_ONLINE_DYN online and offline events via
>> cpuhp_setup_state_nocalls().
>>
>> To support memory hotplug, a notifier is registered to capture the
>> MEM_ONLINE and MEM_OFFLINE events via register_memory_notifier().
>>
>> The cpu callback and memory notifiers call handle_hotplug_event()
>> which performs needed tasks and then dispatches the event to the
>> architecture specific arch_crash_handle_hotplug_event(). During the
>> process, the kexec_mutex is held.
>>
>> Signed-off-by: Eric DeVolder <eric.devolder@oracle.com>
>> Acked-by: Baoquan He <bhe@redhat.com>
>> ---
>>   include/linux/crash_core.h |   8 +++
>>   include/linux/kexec.h      |  26 +++++++
>>   kernel/crash_core.c        | 134 +++++++++++++++++++++++++++++++++++++
>>   3 files changed, 168 insertions(+)
>>
>> diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
>> index de62a722431e..a270f8660538 100644
>> --- a/include/linux/crash_core.h
>> +++ b/include/linux/crash_core.h
>> @@ -84,4 +84,12 @@ int parse_crashkernel_high(char *cmdline, unsigned long long system_ram,
>>   int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
>>           unsigned long long *crash_size, unsigned long long *crash_base);
>> +#define KEXEC_CRASH_HP_REMOVE_CPU        0
>> +#define KEXEC_CRASH_HP_ADD_CPU            1
>> +#define KEXEC_CRASH_HP_REMOVE_MEMORY        2
>> +#define KEXEC_CRASH_HP_ADD_MEMORY        3
>> +#define KEXEC_CRASH_HP_INVALID_CPU        -1U
>> +
>> +struct kimage;
>> +
>>   #endif /* LINUX_CRASH_CORE_H */
>> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
>> index 4eefa631e0ae..9597b41136ec 100644
>> --- a/include/linux/kexec.h
>> +++ b/include/linux/kexec.h
>> @@ -374,6 +374,13 @@ struct kimage {
>>       struct purgatory_info purgatory_info;
>>   #endif
>> +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
>> +    bool hotplug_event;
>> +    unsigned int offlinecpu;
>> +    bool elfcorehdr_index_valid;
>> +    int elfcorehdr_index;
> 
> Do we really need elfcorehdr_index_valid to decide elfcorehdr_index holds a valid index?
No, as you point out, you can overload the index value itself.
(In fact I originally went this route but had trouble finding
the proper places to put the initialization code.)

However, the current approach has the advantage that the flag is
automatically zeroed, and thus set to its correct (false) value,
immediately upon kexec load without any additional code. As the
diff you have below indicates, there are several sites that need
to set the index to its invalid (-1) value to accomplish the same.

I prefer the index_valid approach, but if there is strong support
for overloading the index, then it can be changed.
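
For illustration, a minimal sketch of the two initialization styles being
compared (field names from this patch set):

  /* flag approach: the kimage allocation zeroes the struct, so the
   * flag already reads as 'not yet found' with no extra code */
  image->elfcorehdr_index_valid = false;  /* implicit */

  /* sentinel approach: every load path must remember to do this */
  image->elfcorehdr_index = -1;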

eric


> 
> How about initializing elfcorehdr_index to a negative number while loading the kdump kernel
> (or the kexec kernel if needed), for both the kexec_load and kexec_file_load cases, and
> treating that as the invalid index until the correct one is found?
> 
> Some thing like this:
> 
> diff --git a/kernel/crash_core.c b/kernel/crash_core.c
> index 5bc5159d9cb1..0cccdb2f7f26 100644
> --- a/kernel/crash_core.c
> +++ b/kernel/crash_core.c
> @@ -656,7 +656,7 @@ static void handle_hotplug_event(unsigned int hp_action, unsigned int cpu)
>                   * segment containing the elfcorehdr, if not already found.
>                   * This works for both the kexec_load and kexec_file_load paths.
>                   */
> -               if (!image->elfcorehdr_index_valid) {
> +               if (image->elfcorehdr_index < 0) {
>                          unsigned char *ptr;
>                          unsigned long mem, memsz;
>                          unsigned int n;
> diff --git a/kernel/kexec.c b/kernel/kexec.c
> index b5e40f069768..ed1c6a88879b 100644
> --- a/kernel/kexec.c
> +++ b/kernel/kexec.c
> @@ -156,6 +156,10 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
>          if (ret)
>                  goto out;
> 
> +       /* This flag check may be unnecessary */
> +       if (flags & KEXEC_ON_CRASH)
> +               image->elfcorehdr_index = -1;
> +
>          /* Install the new kernel and uninstall the old */
>          image = xchg(dest_image, image);
> 
> diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
> index d0c2661b3509..535dbc26930a 100644
> --- a/kernel/kexec_file.c
> +++ b/kernel/kexec_file.c
> @@ -400,6 +400,10 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
>          if (ret)
>                  goto out;
> 
> +       /* This flag check may be unnecessary */
> +       if (flags & KEXEC_FILE_ON_CRASH)
> +               image->elfcorehdr_index = -1;
> +
>          /*
>           * Free up any temporary buffers allocated which are not needed
>           * after image has been loaded
> 
> Thanks,
> Sourabh Jain
> 

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-09-19  7:06   ` Sourabh Jain
@ 2022-10-07 19:33     ` Eric DeVolder
  2022-10-17  6:54       ` Sourabh Jain
  0 siblings, 1 reply; 57+ messages in thread
From: Eric DeVolder @ 2022-10-07 19:33 UTC (permalink / raw)
  To: Sourabh Jain, linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal
  Cc: tglx, mingo, bp, dave.hansen, hpa, nramas, thomas.lendacky, robh,
	efault, rppt, david, konrad.wilk, boris.ostrovsky



On 9/19/22 02:06, Sourabh Jain wrote:
> 
> On 10/09/22 02:35, Eric DeVolder wrote:
>> For x86_64, when CPU or memory is hot un/plugged, the crash
>> elfcorehdr, which describes the CPUs and memory in the system,
>> must also be updated.
>>
>> When loading the crash kernel via kexec_load or kexec_file_load,
>> the elfcorehdr is identified at run time in
>> crash_core:handle_hotplug_event().
>>
>> To update the elfcorehdr for x86_64, a new elfcorehdr must be
>> generated from the available CPUs and memory. The new elfcorehdr
>> is prepared into a buffer, and then installed over the top of
>> the existing elfcorehdr.
>>
>> In the patch 'kexec: exclude elfcorehdr from the segment digest'
>> the need to update purgatory due to the change in elfcorehdr was
>> eliminated.  As a result, no changes to purgatory or boot_params
>> (as the elfcorehdr= kernel command line parameter pointer
>> remains unchanged and correct) are needed, just elfcorehdr.
>>
>> To accommodate a growing number of resources via hotplug, the
>> elfcorehdr segment must be sufficiently large to accommodate
>> changes; see the CRASH_MAX_MEMORY_RANGES configure item.
>>
>> With this change, crash hotplug for kexec_file_load syscall
>> is supported. The kexec_load syscall is also supported, but additionally
>> requires a corresponding change to the userspace kexec-tools.
>>
>> Signed-off-by: Eric DeVolder <eric.devolder@oracle.com>
>> Acked-by: Baoquan He <bhe@redhat.com>
>> ---
>>   arch/x86/Kconfig             |  11 ++++
>>   arch/x86/include/asm/kexec.h |  20 +++++++
>>   arch/x86/kernel/crash.c      | 102 +++++++++++++++++++++++++++++++++++
>>   3 files changed, 133 insertions(+)
>>
>> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
>> index f9920f1341c8..cdfc9b2fdf98 100644
>> --- a/arch/x86/Kconfig
>> +++ b/arch/x86/Kconfig
>> @@ -2056,6 +2056,17 @@ config CRASH_DUMP
>>         (CONFIG_RELOCATABLE=y).
>>         For more details see Documentation/admin-guide/kdump/kdump.rst
>> +config CRASH_MAX_MEMORY_RANGES
>> +    depends on CRASH_DUMP && KEXEC_FILE && (HOTPLUG_CPU || MEMORY_HOTPLUG)
>> +    int
>> +    default 32768
>> +    help
>> +      For the kexec_file_load path, specify the maximum number of
>> +      memory regions, eg. as represented by the 'System RAM' entries
>> +      in /proc/iomem, that the elfcorehdr buffer/segment can accommodate.
>> +      This value is combined with NR_CPUS and multiplied by Elf64_Phdr
>> +      size to determine the final buffer size.
>> +
>>   config KEXEC_JUMP
>>       bool "kexec jump"
>>       depends on KEXEC && HIBERNATION
>> diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
>> index a3760ca796aa..432073385b2d 100644
>> --- a/arch/x86/include/asm/kexec.h
>> +++ b/arch/x86/include/asm/kexec.h
>> @@ -212,6 +212,26 @@ typedef void crash_vmclear_fn(void);
>>   extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
>>   extern void kdump_nmi_shootdown_cpus(void);
>> +void *arch_map_crash_pages(unsigned long paddr, unsigned long size);
>> +#define arch_map_crash_pages arch_map_crash_pages
>> +
>> +void arch_unmap_crash_pages(void **ptr);
>> +#define arch_unmap_crash_pages arch_unmap_crash_pages
>> +
>> +void arch_crash_handle_hotplug_event(struct kimage *image,
>> +        unsigned int hp_action);
>> +#define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event
>> +
>> +#ifdef CONFIG_HOTPLUG_CPU
>> +static inline int crash_hotplug_cpu_support(void) { return 1; }
>> +#define crash_hotplug_cpu_support crash_hotplug_cpu_support
>> +#endif
>> +
>> +#ifdef CONFIG_MEMORY_HOTPLUG
>> +static inline int crash_hotplug_memory_support(void) { return 1; }
>> +#define crash_hotplug_memory_support crash_hotplug_memory_support
>> +#endif
>> +
>>   #endif /* __ASSEMBLY__ */
>>   #endif /* _ASM_X86_KEXEC_H */
>> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
>> index 9ceb93c176a6..8fc7d678ac72 100644
>> --- a/arch/x86/kernel/crash.c
>> +++ b/arch/x86/kernel/crash.c
>> @@ -25,6 +25,7 @@
>>   #include <linux/slab.h>
>>   #include <linux/vmalloc.h>
>>   #include <linux/memblock.h>
>> +#include <linux/highmem.h>
>>   #include <asm/processor.h>
>>   #include <asm/hardirq.h>
>> @@ -397,7 +398,18 @@ int crash_load_segments(struct kimage *image)
>>       image->elf_headers = kbuf.buffer;
>>       image->elf_headers_sz = kbuf.bufsz;
>> +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
>> +    /* Ensure elfcorehdr segment large enough for hotplug changes */
>> +    kbuf.memsz =
>> +        (CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES) *
>> +            sizeof(Elf64_Phdr);
>> +    /* Mark as usable to crash kernel, else crash kernel fails on boot */
>> +    image->elf_headers_sz = kbuf.memsz;
>> +    image->elfcorehdr_index = image->nr_segments;
>> +    image->elfcorehdr_index_valid = true;
>> +#else
>>       kbuf.memsz = kbuf.bufsz;
>> +#endif
>>       kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
>>       kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
>>       ret = kexec_add_buffer(&kbuf);
>> @@ -412,3 +424,93 @@ int crash_load_segments(struct kimage *image)
>>       return ret;
>>   }
>>   #endif /* CONFIG_KEXEC_FILE */
>> +
>> +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
>> +/*
>> + * NOTE: The addresses and sizes passed to this routine have
>> + * already been fully aligned on page boundaries. There is no
>> + * need for massaging the address or size.
>> + */
>> +void *arch_map_crash_pages(unsigned long paddr, unsigned long size)
>> +{
>> +    void *ptr = NULL;
>> +
>> +    if (size > 0) {
>> +        struct page *page = pfn_to_page(paddr >> PAGE_SHIFT);
>> +
>> +        ptr = kmap_local_page(page);
>> +    }
>> +
>> +    return ptr;
>> +}
>> +
>> +void arch_unmap_crash_pages(void **ptr)
>> +{
>> +    if (ptr) {
>> +        if (*ptr)
>> +            kunmap_local(*ptr);
>> +        *ptr = NULL;
>> +    }
>> +}
>> +
>> +/**
>> + * arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes
>> + * @image: the active struct kimage
>> + * @hp_action: the hot un/plug action being handled
>> + *
>> + * To accurately reflect hot un/plug changes, the new elfcorehdr
>> + * is prepared in a kernel buffer, and then it is written on top
>> + * of the existing/old elfcorehdr.
>> + */
>> +void arch_crash_handle_hotplug_event(struct kimage *image,
>> +    unsigned int hp_action)
>> +{
>> +    struct kexec_segment *ksegment;
>> +    unsigned char *ptr = NULL;
>> +    unsigned long elfsz = 0;
>> +    void *elfbuf = NULL;
>> +    unsigned long mem, memsz;
>> +
>> +    /*
>> +     * Elfcorehdr_index_valid checked in crash_core:handle_hotplug_event()
>> +     */
>> +    ksegment = &image->segment[image->elfcorehdr_index];
>> +    mem = ksegment->mem;
>> +    memsz = ksegment->memsz;
>> +
>> +    /*
>> +     * Create the new elfcorehdr reflecting the changes to CPU and/or
>> +     * memory resources.
>> +     */
>> +    if (prepare_elf_headers(image, &elfbuf, &elfsz)) {
>> +        pr_err("crash hp: unable to prepare elfcore headers");
>> +        goto out;
> 
> On PowerPC, the memblock structure is used to prepare the program headers for the
> memory regions of the elfcorehdr. Since the above arch-specific hotplug handler gets
> invoked when memory is marked offline (MEM_OFFLINE), which is before the memblock
> structure gets updated, the above handler may not work for the memory hotplug case
> on PowerPC.
> 
> Just wondering: which data structure is used to get the list of memory regions while
> preparing the program headers of the elfcorehdr on other architectures?
> 
> Thanks,
> Sourabh Jain

I think your request to report the memory block address, in the comments of patch 3/7 "crash: add generic
infrastructure", covers this scenario now.
Thanks,
eric

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-10-04  7:03           ` Sourabh Jain
@ 2022-10-07 19:56             ` Eric DeVolder
  0 siblings, 0 replies; 57+ messages in thread
From: Eric DeVolder @ 2022-10-07 19:56 UTC (permalink / raw)
  To: Sourabh Jain, Borislav Petkov
  Cc: linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal, tglx,
	mingo, dave.hansen, hpa, nramas, thomas.lendacky, robh, efault,
	rppt, david, konrad.wilk, boris.ostrovsky



On 10/4/22 02:03, Sourabh Jain wrote:
> 
> On 30/09/22 21:06, Eric DeVolder wrote:
>>
>>
>> On 9/28/22 11:07, Borislav Petkov wrote:
>>> On Tue, Sep 13, 2022 at 02:12:31PM -0500, Eric DeVolder wrote:
>>>> This topic was discussed previously https://lkml.org/lkml/2022/3/3/372.
>>>
>>> Please do not use lkml.org to refer to lkml messages. We have a
>>> perfectly fine archival system at lore.kernel.org. You simply do
>>>
>>> https://lore.kernel.org/r/<Message-ID>
>>>
>>> when you want to point to a previous mail.
>>
>> ok, thanks for pointing that out to me.
>>>
>>>> David points out that terminology is tricky here due to differing behaviors.
>>>> And perhaps that is your point in asking for guidance text. It can be
>>>> complicated
>>>
>>> Which means you need an explanation how to use this even more.
>>>
>>> And why is CONFIG_CRASH_MAX_MEMORY_RANGES even a Kconfig item and not
>>> something you discover from the hardware?
>>
>> No, is the short answer.
>>
>>>
>>> Your help text talks about System RAM entries in /proc/iomem which means
>>> that those entries are present somewhere in the kernel and you can read
>>> them out and do the proper calculations dynamically instead of doing the
>>> static CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES thing.
>>
>> The intent is to compute the max size buffer needed to contain a maximum populated elfcorehdr, 
>> which is primarily based on the number of CPUs and memory regions. Thus far I (and others 
>> involved) have not found a kernel method to determine the maximum number of memory regions 
>> possible (if you are aware of one, please let me know!). Thus CONFIG_CRASH_MAX_MEMORY_RANGES was 
>> born (rather borrowed from kexec-tools).
>>
>> So no dynamic computation is possible, yet.
> 
> Hello Eric,
> 
> How about allocating buffer space for the maximum number of program headers possible in an elfcorehdr?
> 
> image->elf_headers_sz = kbuf.memsz = PN_XNUM * sizeof(Elf64_Phdr);
> 
> PN_XNUM is part of linux/elf.h (include/uapi/linux/elf.h).
> 
> Refer below link for more details:
> https://man7.org/linux/man-pages/man5/elf.5.html
> 
> Thanks,
> Sourabh Jain
> 

Well, that is an idea. I'm not sure it is the answer yet, but if I do compute
a value, then that value needs to be checked against PN_XNUM so it still results
in a valid elfcorehdr.
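
A minimal sketch of such a check (my sketch; PN_XNUM is 0xffff per
include/uapi/linux/elf.h, and nr_cpus/nr_mem_ranges stand in for however
those counts would actually be obtained):

  /* One phdr per CPU note, one per memory range, plus one for VMCOREINFO */
  unsigned long nr_phdrs = nr_cpus + nr_mem_ranges + 1;

  if (nr_phdrs >= PN_XNUM) {
          pr_err("crash hp: %lu phdrs exceed e_phnum limit\n", nr_phdrs);
          return -EINVAL;
  }
  kbuf.memsz = nr_phdrs * sizeof(Elf64_Phdr);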

Thanks,
eric

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-10-04  9:10           ` Sourabh Jain
@ 2022-10-07 20:00             ` Eric DeVolder
  2022-10-12  4:55               ` Sourabh Jain
  0 siblings, 1 reply; 57+ messages in thread
From: Eric DeVolder @ 2022-10-07 20:00 UTC (permalink / raw)
  To: Sourabh Jain, Borislav Petkov
  Cc: linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal, tglx,
	mingo, dave.hansen, hpa, nramas, thomas.lendacky, robh, efault,
	rppt, david, konrad.wilk, boris.ostrovsky



On 10/4/22 04:10, Sourabh Jain wrote:
> 
> On 30/09/22 21:06, Eric DeVolder wrote:
>>
>>
>> On 9/28/22 11:07, Borislav Petkov wrote:
>>> On Tue, Sep 13, 2022 at 02:12:31PM -0500, Eric DeVolder wrote:
>>>> This topic was discussed previously https://lkml.org/lkml/2022/3/3/372.
>>>
>>> Please do not use lkml.org to refer to lkml messages. We have a
>>> perfectly fine archival system at lore.kernel.org. You simply do
>>>
>>> https://lore.kernel.org/r/<Message-ID>
>>>
>>> when you want to point to a previous mail.
>>
>> ok, thanks for pointing that out to me.
>>>
>>>> David points out that terminology is tricky here due to differing behaviors.
>>>> And perhaps that is your point in asking for guidance text. It can be
>>>> complicated
>>>
>>> Which means you need an explanation how to use this even more.
>>>
>>> And why is CONFIG_CRASH_MAX_MEMORY_RANGES even a Kconfig item and not
>>> something you discover from the hardware?
>>
>> No, is the short answer.
>>
>>>
>>> Your help text talks about System RAM entries in /proc/iomem which means
>>> that those entries are present somewhere in the kernel and you can read
>>> them out and do the proper calculations dynamically instead of doing the
>>> static CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES thing.
>>
>> The intent is to compute the max size buffer needed to contain a maximum populated elfcorehdr, 
>> which is primarily based on the number of CPUs and memory regions. Thus far I (and others 
>> involved) have not found a kernel method to determine the maximum number of memory regions 
>> possible (if you are aware of one, please let me know!). Thus CONFIG_CRASH_MAX_MEMORY_RANGES was 
>> born (rather borrowed from kexec-tools).
>>
>> So no dynamic computation is possible, yet.
>>
>>>
>>>> , but it all comes down to System RAM entries.
>>>>
>>>> I could perhaps offer an overly simplified example such that for 1GiB block
>>>> size, for example, the CRASH_MAX_MEMORY_RANGES of 32768 would allow for 32TiB
>>>> of memory?
>>>
>>> Yes, and stick it somewhere in Documentation/admin-guide/kdump/ and
>>> refer to it in that help text so that people can find it and read how to
>>> use your new option.
>>>
>> ok
>>
>>>> The kbuf.bufsz value is obtained via a call to prepare_elf_headers(); I can
>>>> not initialize it at its declaration.
>>>
>>> Sorry, I meant this:
>>>
>>> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
>>> index 8fc7d678ac72..ee6fd9f1b2b9 100644
>>> --- a/arch/x86/kernel/crash.c
>>> +++ b/arch/x86/kernel/crash.c
>>> @@ -395,8 +395,9 @@ int crash_load_segments(struct kimage *image)
>>>       if (ret)
>>>           return ret;
>>>   -    image->elf_headers = kbuf.buffer;
>>> -    image->elf_headers_sz = kbuf.bufsz;
>>> +    image->elf_headers    = kbuf.buffer;
>>> +    image->elf_headers_sz    = kbuf.bufsz;
>>> +    kbuf.memsz        = kbuf.bufsz;
>>>     #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
>>>       /* Ensure elfcorehdr segment large enough for hotplug changes */
>>> @@ -407,9 +408,8 @@ int crash_load_segments(struct kimage *image)
>>>       image->elf_headers_sz = kbuf.memsz;
>>>       image->elfcorehdr_index = image->nr_segments;
>>>       image->elfcorehdr_index_valid = true;
>>> -#else
>>> -    kbuf.memsz = kbuf.bufsz;
>>>   #endif
>>> +
>>>       kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
>>>       kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
>>>       ret = kexec_add_buffer(&kbuf);
>>>
>> ok
>>
>>>> I'm at a loss as to what to do differently here. You've raised this issue
>>>> before and I went back and looked at the suggestions then and I don't see
>>>> how that applies to this situation. How is this situation different from the
>>>> #ifdef CONFIG_KEXEC_FILE that immediately precedes it?
>>>
>>> See the diff at the end. I'm not saying this is how you should do it
>>> but it should give you a better idea. The logic being, the functions
>>> in the .c file don't really need ifdeffery around them - you're adding
>>> 1-2 functions and crash.c is not that big - so they can be built in
>>> unconditionally. You'd need the ifdeffery *in the header only* when
>>> crash.c is not being built.
>> ok; I've overlooked that scenario.
>>>
>>> But I've done it with ifdeffery in the .c file now because yes, the
>>> kexec code is a minefield of ifdeffery. Hell, there's ifdeffery even in
>>> the headers for structs. Ifdeffery you don't really need. Someone should
>>> clean that up and simplify this immensely.
>>
>> ok
>>
>>>
>>>> Currently there is a concurrent effort for PPC support by Sourabh
>>>> Jain, and in that effort arch_map_crash_pages() is using __va(paddr).
>>>
>>> Why?
>>>
>>>> I do not know the nuances between kmap_local_page() and __va() to
>>>> answer the question.
>>>
>>> kmap_local_page() is a generic interface and it should work on any arch.
>>>
>>> And it is documented even:
>>>
>>> $ git grep kmap_local_page Documentation/
>>>
>>>> If kmap_local_page() works for all archs, then I'm happy to drop these
>>>> arch_ variants and use it directly.
>>>
>>> Yes, pls do.
>>
>> I'll check with Sourabh to see if PPC can work with kmap_local_page().
> I think kmap_local_page() is supported on PowerPC. But can you explain why we need this
> function here; isn't the reserved memory already available to use?

On x86, attempts to access the elfcorehdr without mapping it did not work (resulted
in a fault).

Let me know if using kmap_local_page() in place of __va() in arch_map_crash_pages() works.
If it does, then I can eliminate arch_un/map_crash_pages() and use kmap_local_page()
directly.

Thanks,
eric
> 
> Thanks,
> Sourabh Jain

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-09-30 17:40               ` Borislav Petkov
@ 2022-10-08  2:35                 ` Baoquan He
  2022-10-12 17:46                   ` Borislav Petkov
  2022-10-12 16:20                 ` Eric DeVolder
  1 sibling, 1 reply; 57+ messages in thread
From: Baoquan He @ 2022-10-08  2:35 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Eric DeVolder, Oscar Salvador, Andrew Morton, david,
	linux-kernel, x86, kexec, ebiederm, dyoung, vgoyal, tglx, mingo,
	dave.hansen, hpa, nramas, thomas.lendacky, robh, efault, rppt,
	sourabhjain, linux-mm

On 09/30/22 at 07:40pm, Borislav Petkov wrote:
> On Fri, Sep 30, 2022 at 12:11:26PM -0500, Eric DeVolder wrote:
> > There is of course a way to enumerate the memory regions in use on the
> > machine, that is not what this code needs. In order to compute the maximum
> > buffer size needed (this buffer size is computed once), the count of the
> > maximum number of memory regions possible (even if not currently in use) is
> > what is needed.
> 
> Isn't that max number documented somewhere in memory hotplug docs?

Memory hotplug is not limited to a certain or maximum number of memory
regions, but by how large the linear mapping range is that physical
memory can be mapped into.

E.g. on x86_64, with 4-level page tables, it has a 64TB linear mapping
range by default. In principle, we can add 64TB of physical memory
into the system altogether from booting and memory hotplug. With
KASLR enabled, however, it has a 10TB linear mapping range by default, see
CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING. That means only 10TB of
physical memory is allowed to be added into the system.

For the Kconfig CRASH_MAX_MEMORY_RANGES Eric added, it's meaningful to
me to set a fixed value which is enough in reality. For extreme testing
with a special purpose, it could be broken easily; people need to decide
for themselves whether CONFIG_CRASH_MAX_MEMORY_RANGES should be enlarged or not.
E.g. on x86_64, if we boot a system with memory smaller than 64G, the
memory block size will be probed as 256M. Then we hot add many terabytes
of physical memory, every second memory block, after bootup with a
shell script. It would be easier to manipulate this with virtio-mem.
Please see the function probe_memory_block_size() on x86_64 for how the
memory block size is probed. However, I don't think such a system could
really exist in reality: booted up with a tiny memory, then a huge
amount of memory hot added sparsely.
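
As a worked example of that extreme case (my arithmetic): with a 256M block
size, hot adding 16T of memory spans 16T / 256M = 65536 blocks, and onlining
only every second block would leave up to 32768 discontiguous memory regions
for the elfcorehdr to describe.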

> 
> Because then you don't need that Kconfig item either. Imagine you're a
> distro kernel distributor and you want crash to work on all machines
> your kernel works.
> 
> So you go and set that number to max. And that would be the 99% of the
> kernel configs out there.
> 
> Which means, you can just set it to max without a Kconfig item.
> 
> > Oh, that would be an error of haste on my part. This should be:
> >   depends on CRASH_DUMP && MEMORY_HOTPLUG
> 
> You need a Kconfig item which enables all this gunk, as MEMORY_HOTPLUG is
> not an omnipresent feature. And that Kconfig item should depend on the
> other Kconfig items of the technology you need.
> 
> > Baoquan pointed me to:
> > 
> > https://lore.kernel.org/lkml/cover.1656659357.git.naveen.n.rao@linux.vnet.ibm.com/T/
> 
> In that thread says:
> 
> "- arch_kexec_apply_relocations_add() is only overridden by x86 and s390.
>   Retain the function prototype for those and move the weak
>   implementation into the header as a static inline for other
>   architectures."
> 
> So yes, that's even better.
> 
> -- 
> Regards/Gruss,
>     Boris.
> 
> https://people.kernel.org/tglx/notes-about-netiquette
> 


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-10-07 20:00             ` Eric DeVolder
@ 2022-10-12  4:55               ` Sourabh Jain
  2022-10-12 16:23                 ` Eric DeVolder
  0 siblings, 1 reply; 57+ messages in thread
From: Sourabh Jain @ 2022-10-12  4:55 UTC (permalink / raw)
  To: Eric DeVolder, Borislav Petkov
  Cc: linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal, tglx,
	mingo, dave.hansen, hpa, nramas, thomas.lendacky, robh, efault,
	rppt, david, konrad.wilk, boris.ostrovsky


On 08/10/22 01:30, Eric DeVolder wrote:
>
>
> On 10/4/22 04:10, Sourabh Jain wrote:
>>
>> On 30/09/22 21:06, Eric DeVolder wrote:
>>>
>>>
>>> On 9/28/22 11:07, Borislav Petkov wrote:
>>>> On Tue, Sep 13, 2022 at 02:12:31PM -0500, Eric DeVolder wrote:
>>>>> This topic was discussed previously 
>>>>> https://lkml.org/lkml/2022/3/3/372.
>>>>
>>>> Please do not use lkml.org to refer to lkml messages. We have a
>>>> perfectly fine archival system at lore.kernel.org. You simply do
>>>>
>>>> https://lore.kernel.org/r/<Message-ID>
>>>>
>>>> when you want to point to a previous mail.
>>>
>>> ok, thanks for pointing that out to me.
>>>>
>>>>> David points out that terminology is tricky here due to differing 
>>>>> behaviors.
>>>>> And perhaps that is your point in asking for guidance text. It can be
>>>>> complicated
>>>>
>>>> Which means you need an explanation how to use this even more.
>>>>
>>>> And why is CONFIG_CRASH_MAX_MEMORY_RANGES even a Kconfig item and not
>>>> something you discover from the hardware?
>>>
>>> No, is the short answer.
>>>
>>>>
>>>> Your help text talks about System RAM entries in /proc/iomem which 
>>>> means
>>>> that those entries are present somewhere in the kernel and you can 
>>>> read
>>>> them out and do the proper calculations dynamically instead of 
>>>> doing the
>>>> static CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES thing.
>>>
>>> The intent is to compute the max size buffer needed to contain a 
>>> maximum populated elfcorehdr, which is primarily based on the number 
>>> of CPUs and memory regions. Thus far I (and others involved) have 
>>> not found a kernel method to determine the maximum number of memory 
>>> regions possible (if you are aware of one, please let me know!). 
>>> Thus CONFIG_CRASH_MAX_MEMORY_RANGES was born (rather borrowed from 
>>> kexec-tools).
>>>
>>> So no dynamic computation is possible, yet.
>>>
>>>>
>>>>> , but it all comes down to System RAM entries.
>>>>>
>>>>> I could perhaps offer an overly simplified example such that for 
>>>>> 1GiB block
>>>>> size, for example, the CRASH_MAX_MEMORY_RANGES of 32768 would 
>>>>> allow for 32TiB
>>>>> of memory?
>>>>
>>>> Yes, and stick it somewhere in Documentation/admin-guide/kdump/ and
>>>> refer to it in that help text so that people can find it and read 
>>>> how to
>>>> use your new option.
>>>>
>>> ok
>>>
>>>>> The kbuf.bufsz value is obtained via a call to 
>>>>> prepare_elf_headers(); I can
>>>>> not initialize it at its declaration.
>>>>
>>>> Sorry, I meant this:
>>>>
>>>> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
>>>> index 8fc7d678ac72..ee6fd9f1b2b9 100644
>>>> --- a/arch/x86/kernel/crash.c
>>>> +++ b/arch/x86/kernel/crash.c
>>>> @@ -395,8 +395,9 @@ int crash_load_segments(struct kimage *image)
>>>>       if (ret)
>>>>           return ret;
>>>>   -    image->elf_headers = kbuf.buffer;
>>>> -    image->elf_headers_sz = kbuf.bufsz;
>>>> +    image->elf_headers    = kbuf.buffer;
>>>> +    image->elf_headers_sz    = kbuf.bufsz;
>>>> +    kbuf.memsz        = kbuf.bufsz;
>>>>     #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
>>>>       /* Ensure elfcorehdr segment large enough for hotplug changes */
>>>> @@ -407,9 +408,8 @@ int crash_load_segments(struct kimage *image)
>>>>       image->elf_headers_sz = kbuf.memsz;
>>>>       image->elfcorehdr_index = image->nr_segments;
>>>>       image->elfcorehdr_index_valid = true;
>>>> -#else
>>>> -    kbuf.memsz = kbuf.bufsz;
>>>>   #endif
>>>> +
>>>>       kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
>>>>       kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
>>>>       ret = kexec_add_buffer(&kbuf);
>>>>
>>> ok
>>>
>>>>> I'm at a loss as to what to do differently here. You've raised 
>>>>> this issue
>>>>> before and I went back and looked at the suggestions then and I 
>>>>> don't see
>>>>> how that applies to this situation. How is this situation
>>>>> different from the
>>>>> #ifdef CONFIG_KEXEC_FILE that immediately precedes it?
>>>>
>>>> See the diff at the end. I'm not saying this is how you should do it
>>>> but it should give you a better idea. The logic being, the functions
>>>> in the .c file don't really need ifdeffery around them - you're adding
>>>> 1-2 functions and crash.c is not that big - so they can be built in
>>>> unconditionally. You'd need the ifdeffery *in the header only* when
>>>> crash.c is not being built.
>>> ok; I've overlooked that scenario.
>>>>
>>>> But I've done it with ifdeffery in the .c file now because yes, the
>>>> kexec code is a minefield of ifdeffery. Hell, there's ifdeffery 
>>>> even in
>>>> the headers for structs. Ifdeffery you don't really need. Someone 
>>>> should
>>>> clean that up and simplify this immensely.
>>>
>>> ok
>>>
>>>>
>>>>> Currently there is a concurrent effort for PPC support by Sourabh
>>>>> Jain, and in that effort arch_map_crash_pages() is using __va(paddr).
>>>>
>>>> Why?
>>>>
>>>>> I do not know the nuances between kmap_local_page() and __va() to
>>>>> answer the question.
>>>>
>>>> kmap_local_page() is a generic interface and it should work on any 
>>>> arch.
>>>>
>>>> And it is documented even:
>>>>
>>>> $ git grep kmap_local_page Documentation/
>>>>
>>>>> If kmap_local_page() works for all archs, then I'm happy to drop 
>>>>> these
>>>>> arch_ variants and use it directly.
>>>>
>>>> Yes, pls do.
>>>
>>> I'll check with Sourabh to see if PPC can work with kmap_local_page().
>> I think kmap_local_page() is supported on PowerPC. But can you explain
>> why we need this
>> function here; isn't the reserved memory already available to use?
>
> On x86, attempts to access the elfcorehdr without mapping it did not 
> work (resulted
> in a fault).
>
> Let me know if using kmap_local_page() in place of __va() in
> arch_map_crash_pages() works.
> If it does, then I can eliminate arch_un/map_crash_pages() and use 
> kmap_local_page()
> directly.
Hello Eric,

At least on ppc64 we have the direct mapping available, and hence just by
applying the linear-map offset to the physical address (__va) we can get a
valid virtual address on powerpc. In short, we don't have to generate a
mapping again to access the reserved region.

Regardless, let's go with the kmap_local_page() API; it is supported on powerpc.
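
For illustration, a minimal sketch of the two mappings being compared
(my sketch; paddr is the physical address of the elfcorehdr segment):

  /* either, ppc64 style: the region is covered by the linear mapping,
   * so the direct-map conversion is enough */
  unsigned char *ptr = __va(paddr);

  /* or, generic style: map the page explicitly; on x86 accessing the
   * crash region without this resulted in a fault */
  unsigned char *ptr = kmap_local_page(pfn_to_page(paddr >> PAGE_SHIFT));
  /* ... read/write the elfcorehdr via ptr ... */
  kunmap_local(ptr);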

Thanks,
Sourabh Jain

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-09-30 17:40               ` Borislav Petkov
  2022-10-08  2:35                 ` Baoquan He
@ 2022-10-12 16:20                 ` Eric DeVolder
  2022-10-25 10:39                   ` Borislav Petkov
  1 sibling, 1 reply; 57+ messages in thread
From: Eric DeVolder @ 2022-10-12 16:20 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Oscar Salvador, Andrew Morton, david, linux-kernel, x86, kexec,
	ebiederm, dyoung, bhe, vgoyal, tglx, mingo, dave.hansen, hpa,
	nramas, thomas.lendacky, robh, efault, rppt, sourabhjain,
	linux-mm



On 9/30/22 12:40, Borislav Petkov wrote:
> On Fri, Sep 30, 2022 at 12:11:26PM -0500, Eric DeVolder wrote:
>> There is of course a way to enumerate the memory regions in use on the
>> machine, that is not what this code needs. In order to compute the maximum
>> buffer size needed (this buffer size is computed once), the count of the
>> maximum number of memory regions possible (even if not currently in use) is
>> what is needed.
> 
> Isn't that max number documented somewhere in memory hotplug docs?
> 
> Because then you don't need that Kconfig item either. Imagine you're a
> distro kernel distributor and you want crash to work on all machines
> your kernel works.
> 
> So you go and set that number to max. And that would be the 99% of the
> kernel configs out there.
> 
> Which means, you can just set it to max without a Kconfig item.
> 
>> Oh, that would be an error of haste on my part. This should be:
>>    depends on CRASH_DUMP && MEMORY_HOTPLUG
> 
> You need a Kconfig item which enables all this gunk, as MEMORY_HOTPLUG is
> not an omnipresent feature. And that Kconfig item should depend on the
> other Kconfig items of the technology you need.

I once had CONFIG_CRASH_HOTPLUG, but you disagreed.

https://lore.kernel.org/lkml/Ylgot+LUDQl+G%2F5N@zn.tnic/

 From there I simply went with

  #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)

which route do you prefer?

Thanks!
eric

> 
>> Baoquan pointed me to:
>>
>> https://lore.kernel.org/lkml/cover.1656659357.git.naveen.n.rao@linux.vnet.ibm.com/T/
> 
> In that thread says:
> 
> "- arch_kexec_apply_relocations_add() is only overridden by x86 and s390.
>    Retain the function prototype for those and move the weak
>    implementation into the header as a static inline for other
>    architectures."
> 
> So yes, that's even better.
> 

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-10-12  4:55               ` Sourabh Jain
@ 2022-10-12 16:23                 ` Eric DeVolder
  0 siblings, 0 replies; 57+ messages in thread
From: Eric DeVolder @ 2022-10-12 16:23 UTC (permalink / raw)
  To: Sourabh Jain, Borislav Petkov
  Cc: linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal, tglx,
	mingo, dave.hansen, hpa, nramas, thomas.lendacky, robh, efault,
	rppt, david, konrad.wilk, boris.ostrovsky



On 10/11/22 23:55, Sourabh Jain wrote:
> 
>>>>>> If kmap_local_page() works for all archs, then I'm happy to drop these
>>>>>> arch_ variants and use it directly.
>>>>>
>>>>> Yes, pls do.
>>>>
>>>> I'll check with Sourabh to see if PPC can work with kmap_local_page().
>>> I think kmap_local_page() is supported on PowerPC. But can you explain why we need this
>>> function here; isn't the reserved memory already available to use?
>>
>> On x86, attempts to access the elfcorehdr without mapping it did not work (resulted
>> in a fault).
>>
>> Let me know if using kmap_local_page() in place of __va() in arch_map_crash_pages() works.
>> If it does, then I can eliminate arch_un/map_crash_pages() and use kmap_local_page()
>> directly.
> Hello Eric,
> 
> At least on ppc64 we have the direct mapping available, and hence just by applying the
> linear-map offset to the physical address (__va) we can get a valid virtual address on
> powerpc. In short, we don't have to generate a mapping again to access the reserved region.
> 
> Regardless, let's go with the kmap_local_page() API; it is supported on powerpc.
> 
> Thanks,
> Sourabh Jain

Ok, I will go that route.
Thanks!
eric

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-10-08  2:35                 ` Baoquan He
@ 2022-10-12 17:46                   ` Borislav Petkov
  2022-10-12 20:19                     ` Eric DeVolder
  0 siblings, 1 reply; 57+ messages in thread
From: Borislav Petkov @ 2022-10-12 17:46 UTC (permalink / raw)
  To: Baoquan He
  Cc: Eric DeVolder, Oscar Salvador, Andrew Morton, david,
	linux-kernel, x86, kexec, ebiederm, dyoung, vgoyal, tglx, mingo,
	dave.hansen, hpa, nramas, thomas.lendacky, robh, efault, rppt,
	sourabhjain, linux-mm

On Sat, Oct 08, 2022 at 10:35:14AM +0800, Baoquan He wrote:
> Memory hotplug is not limited to a certain or maximum number of memory
> regions, but by how large the linear mapping range is that physical
> memory can be mapped into.

Memory hotplug is not limited by some abstract range but by the *actual*
possibility of how many DIMM slots on any motherboard can hotplug
memory. Certainly not 32K.

So you can choose a sane default which covers *all* actual systems out
there.

> For the Kconfig CRASH_MAX_MEMORY_RANGES Eric added, it's meaningful to
> me to set a fixed value which is enough in reality.

Yes, exactly.

> For extreme testing with a special purpose, it could be broken easily;
> people need to decide for themselves whether CONFIG_CRASH_MAX_MEMORY_RANGES
> should be enlarged or not.

I don't want for people to decide on one more thing where they have to
go and read a bunch of specs just to know what is a good value. So we
should set a sane, *practical* upper limit and simply go with it.

Everything else is testing stuff and if you test the kernel, then you
can change limits and values and so on as you want to.

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-10-12 17:46                   ` Borislav Petkov
@ 2022-10-12 20:19                     ` Eric DeVolder
  2022-10-12 20:41                       ` Borislav Petkov
  2022-10-12 20:42                       ` Eric DeVolder
  0 siblings, 2 replies; 57+ messages in thread
From: Eric DeVolder @ 2022-10-12 20:19 UTC (permalink / raw)
  To: Borislav Petkov, Baoquan He
  Cc: Oscar Salvador, Andrew Morton, david, linux-kernel, x86, kexec,
	ebiederm, dyoung, vgoyal, tglx, mingo, dave.hansen, hpa, nramas,
	thomas.lendacky, robh, efault, rppt, sourabhjain, linux-mm



On 10/12/22 12:46, Borislav Petkov wrote:
> On Sat, Oct 08, 2022 at 10:35:14AM +0800, Baoquan He wrote:
>> Memory hotplug is not limited to a certain or maximum number of memory
>> regions, but by how large the linear mapping range is that physical
>> memory can be mapped into.
> 
> Memory hotplug is not limited by some abstract range but by the *actual*
> possibility of how many DIMM slots on any motherboard can hotplug
> memory. Certainly not 32K.
> 
> So you can choose a sane default which covers *all* actual systems out
> there.


We run QEMU here with the ability for 1024 DIMM slots. A DIMM can be any
reasonable power-of-2 size, and then that DIMM is further divided into memblocks,
typically 128MiB.

So, for example, 1TiB requires 1024 DIMMs of 1GiB each with 128MiB memblocks, which results
in 8K possible memory regions. So just going to 4TiB reaches 32K memory regions.

This I can attest to for virtualized DIMMs; I am not sure about other memory hotplug
technologies like virtio-mem or Dynamic Memory. But it seems reasonable that those
technologies could also easily reach into these number ranges.
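
(Checking the arithmetic: 1TiB / 128MiB = 8192 memblocks, i.e. the 8K
possible regions; 4TiB / 128MiB = 32768, i.e. 32K.)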

Eric

> 
>> For the Kconfig CRASH_MAX_MEMORY_RANGES Eric added, it's meaningful to
>> me to set a fixed value which is enough in reality.
> 
> Yes, exactly.
> 
>> For extreme testing with a special purpose, it could be broken easily;
>> people need to decide for themselves whether CONFIG_CRASH_MAX_MEMORY_RANGES
>> should be enlarged or not.
> 
> I don't want for people to decide on one more thing where they have to
> go and read a bunch of specs just to know what is a good value. So we
> should set a sane, *practical* upper limit and simply go with it.
> 
> Everything else is testing stuff and if you test the kernel, then you
> can change limits and values and so on as you want to.
> 
> Thx.
> 

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-10-12 20:19                     ` Eric DeVolder
@ 2022-10-12 20:41                       ` Borislav Petkov
  2022-10-13  2:57                         ` Baoquan He
  2022-10-12 20:42                       ` Eric DeVolder
  1 sibling, 1 reply; 57+ messages in thread
From: Borislav Petkov @ 2022-10-12 20:41 UTC (permalink / raw)
  To: Eric DeVolder
  Cc: Baoquan He, Oscar Salvador, Andrew Morton, david, linux-kernel,
	x86, kexec, ebiederm, dyoung, vgoyal, tglx, mingo, dave.hansen,
	hpa, nramas, thomas.lendacky, robh, efault, rppt, sourabhjain,
	linux-mm

On Wed, Oct 12, 2022 at 03:19:19PM -0500, Eric DeVolder wrote:
> We run here QEMU with the ability for 1024 DIMM slots.

QEMU, haha.

What is the highest count of DIMM slots which are hotpluggable on a
real, *physical* system today? Are you saying you can have 1K DIMM slots
on a board?

I highly doubt that.

> So, for example, 1TiB requires 1024 DIMMs of 1GiB each with 128MiB
> memblocks, that results in 8K possible memory regions. So just going
> to 4TiB reaches 32K memory regions.

Lemme see if I understand this correctly: when a system like that
crashes, you want to kdump *all* those 4TiB in a vmcore? How long would
that dump take to complete? A day?

IOW, how does a realistic use case of this look like - not a QEMU one?

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-10-12 20:19                     ` Eric DeVolder
  2022-10-12 20:41                       ` Borislav Petkov
@ 2022-10-12 20:42                       ` Eric DeVolder
  1 sibling, 0 replies; 57+ messages in thread
From: Eric DeVolder @ 2022-10-12 20:42 UTC (permalink / raw)
  To: Borislav Petkov, Baoquan He
  Cc: Oscar Salvador, Andrew Morton, david, linux-kernel, x86, kexec,
	ebiederm, dyoung, vgoyal, tglx, mingo, dave.hansen, hpa, nramas,
	thomas.lendacky, robh, efault, rppt, sourabhjain, linux-mm



On 10/12/22 15:19, Eric DeVolder wrote:
> 
> 
> On 10/12/22 12:46, Borislav Petkov wrote:
>> On Sat, Oct 08, 2022 at 10:35:14AM +0800, Baoquan He wrote:
>>> Memory hotplug is not limited to a certain or maximum number of memory
>>> regions, but by how large the linear mapping range is that physical
>>> memory can be mapped into.
>>
>> Memory hotplug is not limited by some abstract range but by the *actual*
>> possibility of how many DIMM slots on any motherboard can hotplug
>> memory. Certainly not 32K.
>>
>> So you can choose a sane default which covers *all* actual systems out
>> there.
> 
> 
> We run QEMU here with the ability for 1024 DIMM slots. A DIMM can be any
> reasonable power-of-2 size, and then that DIMM is further divided into memblocks,
> typically 128MiB.
> 
> So, for example, 1TiB requires 1024 DIMMs of 1GiB each with 128MiB memblocks, which results
> in 8K possible memory regions. So just going to 4TiB reaches 32K memory regions.
> 
> This I can attest to for virtualized DIMMs; I am not sure about other memory hotplug
> technologies like virtio-mem or Dynamic Memory. But it seems reasonable that those
> technologies could also easily reach into these number ranges.
> 
> Eric

Oh, to be fair, if the above were fully populated, it would essentially coalesce
into a single reported region via crash_prepare_elf64_headers(). But in the pathological
case, where every other memblock was offlined, that would result in the need to
report half of the memory regions via the elfcorehdr.
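
(In numbers, assuming 128MiB memblocks: 4TiB / 128MiB = 32768 memblocks, so
offlining every other one leaves 16384 discontiguous regions to report.)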

$0.02.
eric

> 
>>
>>> For the Kconfig CRASH_MAX_MEMORY_RANGES Eric added, it's meaningful to
>>> me to set a fixed value which is enough in reality.
>>
>> Yes, exactly.
>>
>>> For extreme testing with a special purpose, it could be broken easily;
>>> people need to decide for themselves whether CONFIG_CRASH_MAX_MEMORY_RANGES
>>> should be enlarged or not.
>>
>> I don't want for people to decide on one more thing where they have to
>> go and read a bunch of specs just to know what is a good value. So we
>> should set a sane, *practical* upper limit and simply go with it.
>>
>> Everything else is testing stuff and if you test the kernel, then you
>> can change limits and values and so on as you want to.
>>
>> Thx.
>>

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-10-12 20:41                       ` Borislav Petkov
@ 2022-10-13  2:57                         ` Baoquan He
  2022-10-25 10:31                           ` Borislav Petkov
  0 siblings, 1 reply; 57+ messages in thread
From: Baoquan He @ 2022-10-13  2:57 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Eric DeVolder, Oscar Salvador, Andrew Morton, david,
	linux-kernel, x86, kexec, ebiederm, dyoung, vgoyal, tglx, mingo,
	dave.hansen, hpa, nramas, thomas.lendacky, robh, efault, rppt,
	sourabhjain, linux-mm

On 10/12/22 at 10:41pm, Borislav Petkov wrote:
> On Wed, Oct 12, 2022 at 03:19:19PM -0500, Eric DeVolder wrote:
> > We run here QEMU with the ability for 1024 DIMM slots.
> 
> QEMU, haha.
> 
> What is the highest count of DIMM slots which are hotpluggable on a
> real, *physical* system today? Are you saying you can have 1K DIMM slots
> on a board?

The concern about the range number is mainly on virt guest systems. On
bare metal systems, basically only very high end servers support memory hotplug.
I once visited a customer's lab and saw one server: it had 8 slots, and into
each slot a box containing about 20 CPUs and at most 2T of memory could be
plugged at one time. So people won't make too many slots for
hotplugging, since it's too expensive.

I checked the user space kexec code; the maximum memory range number was
raised for x86_64 because of an HPE SGI system. Since then, nobody has
complained about it. Please see the user space kexec-tools commit below, in
https://git.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git

The memory ranges may not all be made by different DIMM slots; some could be
firmware reservations, e.g. EFI/BIOS carving out physical memory, or the
CPU logical address space being occupied by PCI or other devices. I don't
have an HPE SGI system at hand to check.

commit 4a6d67d9e938a7accf128aff23f8ad4bda67f729
Author: Xunlei Pang <xlpang@redhat.com>
Date:   Thu Mar 23 19:16:59 2017 +0800

    x86: Support large number of memory ranges
    
    We got a problem on one SGI 64TB machine, the current kexec-tools
    failed to work due to the insufficient ranges(MAX_MEMORY_RANGES)
    allowed which is defined as 1024(less than the ranges on the machine).
    The kcore header is insufficient due to the same reason as well.
    
    To solve this, this patch simply doubles "MAX_MEMORY_RANGES" and
    "KCORE_ELF_HEADERS_SIZE".
    
    Signed-off-by: Xunlei Pang <xlpang@redhat.com>
    Tested-by: Frank Ramsay <frank.ramsay@hpe.com>
    Signed-off-by: Simon Horman <horms@verge.net.au>

diff --git a/kexec/arch/i386/kexec-x86.h b/kexec/arch/i386/kexec-x86.h
index 33df3524f4e2..51855f8db762 100644
--- a/kexec/arch/i386/kexec-x86.h
+++ b/kexec/arch/i386/kexec-x86.h
@@ -1,7 +1,7 @@
 #ifndef KEXEC_X86_H
 #define KEXEC_X86_H
 
-#define MAX_MEMORY_RANGES 1024
+#define MAX_MEMORY_RANGES 2048

> 
> I highly doubt that.

The questioning is reasonable; 32K does look like too much.

Now, CONFIG_NR_CPUS has a maximum of 8192, and the user space
kexec-tools has a maximum memory range number of 2048. We can
conservatively take the current 8192 + 2048 = 10K as the default value, or
take 8192 + 2048 * 2 = 12K, which allows twice the maximum memory range
number in kexec-tools. What do you think?
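
(For scale, my arithmetic: sizeof(Elf64_Phdr) is 56 bytes, so a 10K default
costs 10240 * 56 = 560KiB for the elfcorehdr segment, and 12K costs 672KiB.)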

> 
> > So, for example, 1TiB requires 1024 DIMMs of 1GiB each with 128MiB
> > memblocks, that results in 8K possible memory regions. So just going
> > to 4TiB reaches 32K memory regions.
> 
> Lemme see if I understand this correctly: when a system like that
> crashes, you want to kdump *all* those 4TiB in a vmcore? How long would
> that dump take to complete? A day?

That is not a problem. The time of vmcore dumping mainly depends on the
actual memory size, not on the number of memory ranges. When dumping the vmcore,
people use makedumpfile to filter zero pages, free pages, cache pages, or user
data pages according to configuration. If memory is huge, they can use
nr_cpus=x so the kdump kernel brings up multiple CPUs for multi-threaded
dumping. Kdump now supports dumping vmcores of more than 10 TB.
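
For illustration, a typical invocation of that filtering (my example, not
one from this thread):

  makedumpfile -l -d 31 --num-threads 4 /proc/vmcore /var/crash/vmcore

where -d 31 excludes zero, cache, user and free pages, -l selects lzo
compression, and --num-threads splits the page filtering across CPUs.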


^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 3/7] crash: add generic infrastructure for crash hotplug support
  2022-10-07 19:14     ` Eric DeVolder
@ 2022-10-17  6:45       ` Sourabh Jain
  2022-10-24  9:10       ` Baoquan He
  1 sibling, 0 replies; 57+ messages in thread
From: Sourabh Jain @ 2022-10-17  6:45 UTC (permalink / raw)
  To: Eric DeVolder, linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal
  Cc: tglx, mingo, bp, dave.hansen, hpa, nramas, thomas.lendacky, robh,
	efault, rppt, david, konrad.wilk, boris.ostrovsky


On 08/10/22 00:44, Eric DeVolder wrote:
>
>
> On 10/3/22 12:51, Sourabh Jain wrote:
>> Hello Eric,
>>
>> On 10/09/22 02:35, Eric DeVolder wrote:
>>> CPU and memory change notifications are received in order to
>>> regenerate the elfcorehdr.
>>>
>>> To support cpu hotplug, a callback is registered to capture the
>>> CPUHP_AP_ONLINE_DYN online and offline events via
>>> cpuhp_setup_state_nocalls().
>>>
>>> To support memory hotplug, a notifier is registered to capture the
>>> MEM_ONLINE and MEM_OFFLINE events via register_memory_notifier().
>>>
>>> The cpu callback and memory notifiers call handle_hotplug_event()
>>> which performs needed tasks and then dispatches the event to the
>>> architecture specific arch_crash_handle_hotplug_event(). During the
>>> process, the kexec_mutex is held.
>>>
>>> Signed-off-by: Eric DeVolder <eric.devolder@oracle.com>
>>> Acked-by: Baoquan He <bhe@redhat.com>
>>> ---
>>>   include/linux/crash_core.h |   8 +++
>>>   include/linux/kexec.h      |  26 +++++++
>>>   kernel/crash_core.c        | 134 +++++++++++++++++++++++++++++++++++++
>>>   3 files changed, 168 insertions(+)
>>>
>>> diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
>>> index de62a722431e..a270f8660538 100644
>>> --- a/include/linux/crash_core.h
>>> +++ b/include/linux/crash_core.h
>>> @@ -84,4 +84,12 @@ int parse_crashkernel_high(char *cmdline, 
>>> unsigned long long system_ram,
>>>   int parse_crashkernel_low(char *cmdline, unsigned long long 
>>> system_ram,
>>>           unsigned long long *crash_size, unsigned long long 
>>> *crash_base);
>>> +#define KEXEC_CRASH_HP_REMOVE_CPU        0
>>> +#define KEXEC_CRASH_HP_ADD_CPU            1
>>> +#define KEXEC_CRASH_HP_REMOVE_MEMORY        2
>>> +#define KEXEC_CRASH_HP_ADD_MEMORY        3
>>> +#define KEXEC_CRASH_HP_INVALID_CPU        -1U
>>> +
>>> +struct kimage;
>>> +
>>>   #endif /* LINUX_CRASH_CORE_H */
>>> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
>>> index 4eefa631e0ae..9597b41136ec 100644
>>> --- a/include/linux/kexec.h
>>> +++ b/include/linux/kexec.h
>>> @@ -374,6 +374,13 @@ struct kimage {
>>>       struct purgatory_info purgatory_info;
>>>   #endif
>>> +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
>>> +    bool hotplug_event;
>>> +    unsigned int offlinecpu;
>>> +    bool elfcorehdr_index_valid;
>>> +    int elfcorehdr_index;
>>> +#endif
>>> +
>>>   #ifdef CONFIG_IMA_KEXEC
>>>       /* Virtual address of IMA measurement buffer for kexec syscall */
>>>       void *ima_buffer;
>>> @@ -503,6 +510,25 @@ static inline int 
>>> arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, g
>>>   static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned 
>>> int pages) { }
>>>   #endif
>>> +#ifndef arch_map_crash_pages
>>> +static inline void *arch_map_crash_pages(unsigned long paddr,
>>> +        unsigned long size)
>>> +{
>>> +    return NULL;
>>> +}
>>> +#endif
>>> +
>>> +#ifndef arch_unmap_crash_pages
>>> +static inline void arch_unmap_crash_pages(void **ptr) { }
>>> +#endif
>>> +
>>> +#ifndef arch_crash_handle_hotplug_event
>>> +static inline void arch_crash_handle_hotplug_event(struct kimage 
>>> *image,
>>> +        unsigned int hp_action)
>>> +{
>>> +}
>>> +#endif
>>> +
>>>   #else /* !CONFIG_KEXEC_CORE */
>>>   struct pt_regs;
>>>   struct task_struct;
>>> diff --git a/kernel/crash_core.c b/kernel/crash_core.c
>>> index 8c648fd5897a..4b15d91f0b21 100644
>>> --- a/kernel/crash_core.c
>>> +++ b/kernel/crash_core.c
>>> @@ -11,6 +11,8 @@
>>>   #include <linux/vmalloc.h>
>>>   #include <linux/sizes.h>
>>>   #include <linux/kexec.h>
>>> +#include <linux/memory.h>
>>> +#include <linux/cpuhotplug.h>
>>>   #include <asm/page.h>
>>>   #include <asm/sections.h>
>>> @@ -18,6 +20,7 @@
>>>   #include <crypto/sha1.h>
>>>   #include "kallsyms_internal.h"
>>> +#include "kexec_internal.h"
>>>   /* vmcoreinfo stuff */
>>>   unsigned char *vmcoreinfo_data;
>>> @@ -612,3 +615,134 @@ static int __init 
>>> crash_save_vmcoreinfo_init(void)
>>>   }
>>>   subsys_initcall(crash_save_vmcoreinfo_init);
>>> +
>>> +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
>>> +/*
>>> + * To accurately reflect hot un/plug changes, the elfcorehdr (which
>>> + * is passed to the crash kernel via the elfcorehdr= parameter)
>>> + * must be updated with the new list of CPUs and memories.
>>> + *
>>> + * In order to make changes to elfcorehdr, two conditions are needed:
>>> + * First, the segment containing the elfcorehdr must be large enough
>>> + * to permit a growing number of resources. The elfcorehdr memory is
>>> + * typically based on CONFIG_NR_CPUS and 
>>> CONFIG_CRASH_MAX_MEMORY_RANGES.
>>> + * Second, purgatory must explicitly exclude the elfcorehdr from the
>>> + * list of segments it checks (since the elfcorehdr changes and thus
>>> + * would require an update to purgatory itself to update the digest).
>>> + */
>>> +static void handle_hotplug_event(unsigned int hp_action, unsigned 
>>> int cpu)
>>> +{
>>> +    /* Obtain lock while changing crash information */
>>> +    mutex_lock(&kexec_mutex);
>>> +
>>> +    /* Check kdump is loaded */
>>> +    if (kexec_crash_image) {
>>> +        struct kimage *image = kexec_crash_image;
>>> +
>>> +        if (hp_action == KEXEC_CRASH_HP_ADD_CPU ||
>>> +            hp_action == KEXEC_CRASH_HP_REMOVE_CPU)
>>> +            pr_debug("crash hp: hp_action %u, cpu %u\n", hp_action, 
>>> cpu);
>>> +        else
>>> +            pr_debug("crash hp: hp_action %u\n", hp_action);
>>> +
>>> +        /*
>>> +         * When the struct kimage is allocated, it is wiped to 
>>> zero, so
>>> +         * the elfcorehdr_index_valid defaults to false. Find the
>>> +         * segment containing the elfcorehdr, if not already found.
>>> +         * This works for both the kexec_load and kexec_file_load 
>>> paths.
>>> +         */
>>> +        if (!image->elfcorehdr_index_valid) {
>>> +            unsigned char *ptr;
>>> +            unsigned long mem, memsz;
>>> +            unsigned int n;
>>> +
>>> +            for (n = 0; n < image->nr_segments; n++) {
>>> +                mem = image->segment[n].mem;
>>> +                memsz = image->segment[n].memsz;
>>> +                ptr = arch_map_crash_pages(mem, memsz);
>>> +                if (ptr) {
>>> +                    /* The segment containing elfcorehdr */
>>> +                    if (memcmp(ptr, ELFMAG, SELFMAG) == 0) {
>>> +                        image->elfcorehdr_index = (int)n;
>>> +                        image->elfcorehdr_index_valid = true;
>>> +                    }
>>> +                }
>>> +                arch_unmap_crash_pages((void **)&ptr);
>>> +            }
>>> +        }
>>> +
>>> +        if (!image->elfcorehdr_index_valid) {
>>> +            pr_err("crash hp: unable to locate elfcorehdr segment");
>>> +            goto out;
>>> +        }
>>> +
>>> +        /* Needed in order for the segments to be updated */
>>> +        arch_kexec_unprotect_crashkres();
>>> +
>>> +        /* Flag to differentiate between normal load and hotplug */
>>> +        image->hotplug_event = true;
>>> +
>>> +        /* Now invoke arch-specific update handler */
>>> +        arch_crash_handle_hotplug_event(image, hp_action);
>>> +
>>> +        /* No longer handling a hotplug event */
>>> +        image->hotplug_event = false;
>>> +
>>> +        /* Change back to read-only */
>>> +        arch_kexec_protect_crashkres();
>>> +    }
>>> +
>>> +out:
>>> +    /* Release lock now that update complete */
>>> +    mutex_unlock(&kexec_mutex);
>>> +}
>>> +
>>> +static int crash_memhp_notifier(struct notifier_block *nb, unsigned 
>>> long val, void *v)
>>> +{
>>> +    switch (val) {
>>> +    case MEM_ONLINE:
>>> +        handle_hotplug_event(KEXEC_CRASH_HP_ADD_MEMORY, 0);
>>> +        break;
>>> +
>>> +    case MEM_OFFLINE:
>>> +        handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_MEMORY, 0);
>>> +        break;
>>> +    }
>>> +    return NOTIFY_OK;
>>
>> Can we pass the v (memory_notify) argument to the
>> arch_crash_handle_hotplug_event function
>> via handle_hotplug_event?
>>
>> Because of the way memory hotplug is handled on PowerPC, it is hard to
>> update the elfcorehdr
>> without the memory_notify args.
>>
>> On PowerPC the memblock data structure is used to prepare the elfcorehdr for
>> kdump. Since the notifier
>> used for the memory hotplug crash handler gets initiated before the
>> memblock data structure update
>> happens (as depicted below), the newly prepared elfcorehdr still
>> holds the old memory regions.
>> So if the system crashes with an obsolete elfcorehdr, makedumpfile fails
>> to collect the vmcore.
>>
>> Sequence of actions done on PowerPC to serve the memory hotplug:
>>
>>   Initiate memory hot remove
>>            |
>>            v
>>   offline pages
>>            |
>>            v
>>   initiate memory notify call chain
>>   for MEM_OFFLINE event.
>>   (same is used for crash update)
>>            |
>>            v
>>   prepare new elfcorehdr for kdump using
>>   memblock data structure
>>            |
>>            v
>>   update memblock data structure
>>
>> How will passing memory_notify to the arch crash hotplug handler help?
>>
>> memory_notify holds the start PFN and page count; with that we can get
>> the base address and size of the hot unplugged memory, and can use the
>> same to avoid adding the hot unplugged memory region to the elfcorehdr.
>>
>> Thanks,
>> Sourabh Jain
>>
>
> Sourabh, let's see what Baoquan thinks.
>
> Baoquan, are you OK with this request? I once had these parameters to the
> crash hotplug handler and since they were unused at the time, you asked
> that I remove them, which I did.
>
> To accommodate this, how about this:
>
> static void handle_hotplug_event(unsigned int hp_action, unsigned int cpu,
>      unsigned long mem_start, unsigned long mem_size)
>
> For CPU events, I would just pass zeros for mem_start/size. For memory
> events, I would pass KEXEC_CRASH_HP_INVALID_CPU.

How about passing the memory_notify struct as is and letting the
architecture handle the rest?
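
Something like this minimal sketch, reusing the handler names from this
series (the extra memory_notify parameter is hypothetical, not in v12;
NULL would be passed for CPU events):

  static int crash_memhp_notifier(struct notifier_block *nb,
                                  unsigned long val, void *v)
  {
          struct memory_notify *mhp = v;

          switch (val) {
          case MEM_ONLINE:
                  handle_hotplug_event(KEXEC_CRASH_HP_ADD_MEMORY, 0, mhp);
                  break;

          case MEM_OFFLINE:
                  handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_MEMORY, 0, mhp);
                  break;
          }
          return NOTIFY_OK;
  }

The arch handler could then recover the base address and size of the
hot un/plugged range itself:

  void arch_crash_handle_hotplug_event(struct kimage *image,
                                       unsigned int hp_action,
                                       struct memory_notify *mhp)
  {
          unsigned long base = PFN_PHYS(mhp->start_pfn);
          unsigned long size = mhp->nr_pages << PAGE_SHIFT;

          /* skip or include [base, base + size) when regenerating the elfcorehdr */
  }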

Thanks,
Sourabh Jain

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-10-07 19:33     ` Eric DeVolder
@ 2022-10-17  6:54       ` Sourabh Jain
  0 siblings, 0 replies; 57+ messages in thread
From: Sourabh Jain @ 2022-10-17  6:54 UTC (permalink / raw)
  To: Eric DeVolder, linux-kernel, x86, kexec, ebiederm, dyoung, bhe, vgoyal
  Cc: tglx, mingo, bp, dave.hansen, hpa, nramas, thomas.lendacky, robh,
	efault, rppt, david, konrad.wilk, boris.ostrovsky


On 08/10/22 01:03, Eric DeVolder wrote:
>
>
> On 9/19/22 02:06, Sourabh Jain wrote:
>>
>> On 10/09/22 02:35, Eric DeVolder wrote:
>>> For x86_64, when CPU or memory is hot un/plugged, the crash
>>> elfcorehdr, which describes the CPUs and memory in the system,
>>> must also be updated.
>>>
>>> When loading the crash kernel via kexec_load or kexec_file_load,
>>> the elfcorehdr is identified at run time in
>>> crash_core:handle_hotplug_event().
>>>
>>> To update the elfcorehdr for x86_64, a new elfcorehdr must be
>>> generated from the available CPUs and memory. The new elfcorehdr
>>> is prepared into a buffer, and then installed over the top of
>>> the existing elfcorehdr.
>>>
>>> In the patch 'kexec: exclude elfcorehdr from the segment digest'
>>> the need to update purgatory due to the change in elfcorehdr was
>>> eliminated.  As a result, no changes to purgatory or boot_params
>>> (as the elfcorehdr= kernel command line parameter pointer
>>> remains unchanged and correct) are needed, just elfcorehdr.
>>>
>>> To accommodate a growing number of resources via hotplug, the
>>> elfcorehdr segment must be sufficiently large to accommodate
>>> changes; see the CRASH_MAX_MEMORY_RANGES config item.
>>>
>>> With this change, crash hotplug for the kexec_file_load syscall
>>> is supported. The kexec_load syscall is also supported, but
>>> requires a corresponding change to the userspace kexec-tools.
>>>
>>> Signed-off-by: Eric DeVolder <eric.devolder@oracle.com>
>>> Acked-by: Baoquan He <bhe@redhat.com>
>>> ---
>>>   arch/x86/Kconfig             |  11 ++++
>>>   arch/x86/include/asm/kexec.h |  20 +++++++
>>>   arch/x86/kernel/crash.c      | 102 +++++++++++++++++++++++++++++++++++
>>>   3 files changed, 133 insertions(+)
>>>
>>> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
>>> index f9920f1341c8..cdfc9b2fdf98 100644
>>> --- a/arch/x86/Kconfig
>>> +++ b/arch/x86/Kconfig
>>> @@ -2056,6 +2056,17 @@ config CRASH_DUMP
>>>         (CONFIG_RELOCATABLE=y).
>>>         For more details see Documentation/admin-guide/kdump/kdump.rst
>>> +config CRASH_MAX_MEMORY_RANGES
>>> +    depends on CRASH_DUMP && KEXEC_FILE && (HOTPLUG_CPU || MEMORY_HOTPLUG)
>>> +    int
>>> +    default 32768
>>> +    help
>>> +      For the kexec_file_load path, specify the maximum number of
>>> +      memory regions, e.g. as represented by the 'System RAM' entries
>>> +      in /proc/iomem, that the elfcorehdr buffer/segment can accommodate.
>>> +      This value is combined with NR_CPUS and multiplied by Elf64_Phdr
>>> +      size to determine the final buffer size.
>>> +
>>>   config KEXEC_JUMP
>>>       bool "kexec jump"
>>>       depends on KEXEC && HIBERNATION
>>> diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
>>> index a3760ca796aa..432073385b2d 100644
>>> --- a/arch/x86/include/asm/kexec.h
>>> +++ b/arch/x86/include/asm/kexec.h
>>> @@ -212,6 +212,26 @@ typedef void crash_vmclear_fn(void);
>>>   extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
>>>   extern void kdump_nmi_shootdown_cpus(void);
>>> +void *arch_map_crash_pages(unsigned long paddr, unsigned long size);
>>> +#define arch_map_crash_pages arch_map_crash_pages
>>> +
>>> +void arch_unmap_crash_pages(void **ptr);
>>> +#define arch_unmap_crash_pages arch_unmap_crash_pages
>>> +
>>> +void arch_crash_handle_hotplug_event(struct kimage *image,
>>> +        unsigned int hp_action);
>>> +#define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event
>>> +
>>> +#ifdef CONFIG_HOTPLUG_CPU
>>> +static inline int crash_hotplug_cpu_support(void) { return 1; }
>>> +#define crash_hotplug_cpu_support crash_hotplug_cpu_support
>>> +#endif
>>> +
>>> +#ifdef CONFIG_MEMORY_HOTPLUG
>>> +static inline int crash_hotplug_memory_support(void) { return 1; }
>>> +#define crash_hotplug_memory_support crash_hotplug_memory_support
>>> +#endif
>>> +
>>>   #endif /* __ASSEMBLY__ */
>>>   #endif /* _ASM_X86_KEXEC_H */
>>> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
>>> index 9ceb93c176a6..8fc7d678ac72 100644
>>> --- a/arch/x86/kernel/crash.c
>>> +++ b/arch/x86/kernel/crash.c
>>> @@ -25,6 +25,7 @@
>>>   #include <linux/slab.h>
>>>   #include <linux/vmalloc.h>
>>>   #include <linux/memblock.h>
>>> +#include <linux/highmem.h>
>>>   #include <asm/processor.h>
>>>   #include <asm/hardirq.h>
>>> @@ -397,7 +398,18 @@ int crash_load_segments(struct kimage *image)
>>>       image->elf_headers = kbuf.buffer;
>>>       image->elf_headers_sz = kbuf.bufsz;
>>> +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
>>> +    /* Ensure elfcorehdr segment large enough for hotplug changes */
>>> +    kbuf.memsz =
>>> +        (CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES) *
>>> +            sizeof(Elf64_Phdr);
>>> +    /* Mark as usable to crash kernel, else crash kernel fails on boot */
>>> +    image->elf_headers_sz = kbuf.memsz;
>>> +    image->elfcorehdr_index = image->nr_segments;
>>> +    image->elfcorehdr_index_valid = true;
>>> +#else
>>>       kbuf.memsz = kbuf.bufsz;
>>> +#endif
>>>       kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
>>>       kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
>>>       ret = kexec_add_buffer(&kbuf);
>>> @@ -412,3 +424,93 @@ int crash_load_segments(struct kimage *image)
>>>       return ret;
>>>   }
>>>   #endif /* CONFIG_KEXEC_FILE */
>>> +
>>> +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
>>> +/*
>>> + * NOTE: The addresses and sizes passed to this routine have
>>> + * already been fully aligned on page boundaries. There is no
>>> + * need for massaging the address or size.
>>> + */
>>> +void *arch_map_crash_pages(unsigned long paddr, unsigned long size)
>>> +{
>>> +    void *ptr = NULL;
>>> +
>>> +    if (size > 0) {
>>> +        struct page *page = pfn_to_page(paddr >> PAGE_SHIFT);
>>> +
>>> +        ptr = kmap_local_page(page);
>>> +    }
>>> +
>>> +    return ptr;
>>> +}
>>> +
>>> +void arch_unmap_crash_pages(void **ptr)
>>> +{
>>> +    if (ptr) {
>>> +        if (*ptr)
>>> +            kunmap_local(*ptr);
>>> +        *ptr = NULL;
>>> +    }
>>> +}
>>> +
>>> +/**
>>> + * arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes
>>> + * @image: the active struct kimage
>>> + * @hp_action: the hot un/plug action being handled
>>> + *
>>> + * To accurately reflect hot un/plug changes, the new elfcorehdr
>>> + * is prepared in a kernel buffer, and then it is written on top
>>> + * of the existing/old elfcorehdr.
>>> + */
>>> +void arch_crash_handle_hotplug_event(struct kimage *image,
>>> +    unsigned int hp_action)
>>> +{
>>> +    struct kexec_segment *ksegment;
>>> +    unsigned char *ptr = NULL;
>>> +    unsigned long elfsz = 0;
>>> +    void *elfbuf = NULL;
>>> +    unsigned long mem, memsz;
>>> +
>>> +    /*
>>> +     * Elfcorehdr_index_valid checked in crash_core:handle_hotplug_event()
>>> +     */
>>> +    ksegment = &image->segment[image->elfcorehdr_index];
>>> +    mem = ksegment->mem;
>>> +    memsz = ksegment->memsz;
>>> +
>>> +    /*
>>> +     * Create the new elfcorehdr reflecting the changes to CPU and/or
>>> +     * memory resources.
>>> +     */
>>> +    if (prepare_elf_headers(image, &elfbuf, &elfsz)) {
>>> +        pr_err("crash hp: unable to prepare elfcore headers");
>>> +        goto out;
>>
>> On PowerPC, the memblock structure is used to prepare the program
>> headers for the memory regions of the elfcorehdr. Since the above
>> arch-specific hotplug handler gets invoked when memory is marked
>> offline (MEM_OFFLINE), which is before the memblock structure gets
>> updated, on PowerPC the above handler may not work for the memory
>> hotplug case.
>>
>> Just wondering, which data structure is used to get the list of
>> memory regions while preparing the program headers for the memory
>> regions of the elfcorehdr on other architectures?
>>
>> Thanks,
>> Sourabh Jain
>
> I think your request to report the memory block address in the comments of 
> patch 3/7 "crash: add generic infrastructure" covers this scenario now.
Yes, the requested changes will make it easy for PowerPC to recreate the elfcorehdr.

Thanks,
Sourabh Jain

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 3/7] crash: add generic infrastructure for crash hotplug support
  2022-10-07 19:14     ` Eric DeVolder
  2022-10-17  6:45       ` Sourabh Jain
@ 2022-10-24  9:10       ` Baoquan He
  2022-10-26  7:00         ` Sourabh Jain
  1 sibling, 1 reply; 57+ messages in thread
From: Baoquan He @ 2022-10-24  9:10 UTC (permalink / raw)
  To: Eric DeVolder, Sourabh Jain
  Cc: linux-kernel, x86, kexec, ebiederm, dyoung, vgoyal, tglx, mingo,
	bp, dave.hansen, hpa, nramas, thomas.lendacky, robh, efault,
	rppt, david, konrad.wilk, boris.ostrovsky

Hi Eric, Sourabh,

On 10/07/22 at 02:14pm, Eric DeVolder wrote:
> 
> 
> On 10/3/22 12:51, Sourabh Jain wrote:
> > Hello Eric,
> > 
> > On 10/09/22 02:35, Eric DeVolder wrote:
......
> > > +static void handle_hotplug_event(unsigned int hp_action, unsigned int cpu)
> > > +{
> > > +    /* Obtain lock while changing crash information */
> > > +    mutex_lock(&kexec_mutex);
> > > +
> > > +    /* Check kdump is loaded */
> > > +    if (kexec_crash_image) {
> > > +        struct kimage *image = kexec_crash_image;
> > > +
> > > +        if (hp_action == KEXEC_CRASH_HP_ADD_CPU ||
> > > +            hp_action == KEXEC_CRASH_HP_REMOVE_CPU)
> > > +            pr_debug("crash hp: hp_action %u, cpu %u\n", hp_action, cpu);
> > > +        else
> > > +            pr_debug("crash hp: hp_action %u\n", hp_action);
> > > +
> > > +        /*
> > > +         * When the struct kimage is allocated, it is wiped to zero, so
> > > +         * the elfcorehdr_index_valid defaults to false. Find the
> > > +         * segment containing the elfcorehdr, if not already found.
> > > +         * This works for both the kexec_load and kexec_file_load paths.
> > > +         */
> > > +        if (!image->elfcorehdr_index_valid) {
> > > +            unsigned char *ptr;
> > > +            unsigned long mem, memsz;
> > > +            unsigned int n;
> > > +
> > > +            for (n = 0; n < image->nr_segments; n++) {
> > > +                mem = image->segment[n].mem;
> > > +                memsz = image->segment[n].memsz;
> > > +                ptr = arch_map_crash_pages(mem, memsz);
> > > +                if (ptr) {
> > > +                    /* The segment containing elfcorehdr */
> > > +                    if (memcmp(ptr, ELFMAG, SELFMAG) == 0) {
> > > +                        image->elfcorehdr_index = (int)n;
> > > +                        image->elfcorehdr_index_valid = true;
> > > +                    }
> > > +                }
> > > +                arch_unmap_crash_pages((void **)&ptr);
> > > +            }
> > > +        }
> > > +
> > > +        if (!image->elfcorehdr_index_valid) {
> > > +            pr_err("crash hp: unable to locate elfcorehdr segment");
> > > +            goto out;
> > > +        }
> > > +
> > > +        /* Needed in order for the segments to be updated */
> > > +        arch_kexec_unprotect_crashkres();
> > > +
> > > +        /* Flag to differentiate between normal load and hotplug */
> > > +        image->hotplug_event = true;
> > > +
> > > +        /* Now invoke arch-specific update handler */
> > > +        arch_crash_handle_hotplug_event(image, hp_action);
> > > +
> > > +        /* No longer handling a hotplug event */
> > > +        image->hotplug_event = false;
> > > +
> > > +        /* Change back to read-only */
> > > +        arch_kexec_protect_crashkres();
> > > +    }
> > > +
> > > +out:
> > > +    /* Release lock now that update complete */
> > > +    mutex_unlock(&kexec_mutex);
> > > +}
> > > +
> > > +static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v)
> > > +{
> > > +    switch (val) {
> > > +    case MEM_ONLINE:
> > > +        handle_hotplug_event(KEXEC_CRASH_HP_ADD_MEMORY, 0);
> > > +        break;
> > > +
> > > +    case MEM_OFFLINE:
> > > +        handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_MEMORY, 0);
> > > +        break;
> > > +    }
> > > +    return NOTIFY_OK;
> > 
> > Can we pass the v (memory_notify) argument to the arch_crash_handle_hotplug_event
> > function via handle_hotplug_event?
> > 
> > Because of the way memory hotplug is handled on PowerPC, it is hard to update the
> > elfcorehdr without the memory_notify args.
> > 
> > On PowerPC the memblock data structure is used to prepare the elfcorehdr for kdump.
> > Since the notifier used for the memory hotplug crash handler gets initiated before the
> > memblock data structure update happens (as depicted below), the newly prepared
> > elfcorehdr still holds the old memory regions. So if the system crashes with an
> > obsolete elfcorehdr, makedumpfile fails to collect the vmcore.
> > 
> > Sequence of actions done on PowerPC to serve the memory hotplug:
> > 
> >   Initiate memory hot remove
> >            |
> >            v
> >   offline pages
> >            |
> >            v
> >   initiate memory notify call chain
> >   for MEM_OFFLINE event.
> >   (same is used for crash update)
> >            |
> >            v
> >   prepare new elfcorehdr for kdump using
> >   memblock data structure
> >            |
> >            v
> >   update memblock data structure
> > 
> > How will passing memory_notify to the arch crash hotplug handler help?
> > 
> > memory_notify holds the start PFN and page count; with that we can get
> > the base address and size of the hot unplugged memory, and can use the
> > same to avoid adding the hot unplugged memory region to the elfcorehdr.
> > 
> > Thanks,
> > Sourabh Jain
> > 
> 
> Sourabh, let's see what Baoquan thinks.
> 
> Baoquan, are you OK with this request? I once had these parameters to the
> crash hotplug handler and since they were unused at the time, you asked
> that I remove them, which I did.

Sorry to have missed this mail. I thought the two of you were talking
about something, and didn't notice this question was directed to me.

I think there are two ways to solve the issue Sourabh raised:
1) make handle_hotplug_event() take and pass down the memory_notify as
Sourabh said, or the hp_action plus mem_start|size as Eric suggested. I
have to admit I haven't carefully checked which one is better.

2) leave the current code as is since it's aiming at x86 only. Later
Sourabh can modify the code according to his needs on ppc. This gives
a satisfying "why" for each code change.

I personally like the 2nd way, while I'd also like to see the 1st one
if the code change and log are convincing to reviewers.

> 
> To accommodate this, how about this:
> 
> static void handle_hotplug_event(unsigned int hp_action, unsigned int cpu,
>      unsigned long mem_start, unsigned long mem_size)
> 
> For CPU events, I would just pass zeros for mem_start/size. For memory events,
> I would pass KEXEC_CRASH_HP_INVALID_CPU.
> 
> Thanks,
> eric


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-10-13  2:57                         ` Baoquan He
@ 2022-10-25 10:31                           ` Borislav Petkov
  2022-10-26 14:48                             ` Baoquan He
  0 siblings, 1 reply; 57+ messages in thread
From: Borislav Petkov @ 2022-10-25 10:31 UTC (permalink / raw)
  To: Baoquan He
  Cc: Eric DeVolder, Oscar Salvador, Andrew Morton, david,
	linux-kernel, x86, kexec, ebiederm, dyoung, vgoyal, tglx, mingo,
	dave.hansen, hpa, nramas, thomas.lendacky, robh, efault, rppt,
	sourabhjain, linux-mm

On Thu, Oct 13, 2022 at 10:57:28AM +0800, Baoquan He wrote:
> The concern to range number mainly is on Virt guest systems.

And why would virt emulate 1K hotpluggable DIMM slots and not emulate a
real machine?

> On baremetal systems, basically only very high end servers support
> memory hotplug. I once visited a customer's lab and saw one server;
> it has 8 slots, and on each slot a box containing about 20 cpus and 2T
> of memory at most can be plugged in at one time. So people won't make
> too many slots for hotplugging since it's too expensive.

There you have it - the persuading argument.

> I checked user space kexec code, the maximum memory range number is
> honored to x86_64 because of a HPE SGI system. After that, nobody
> complains about it. Please see below user space kexec-tools commit in
> https://git.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git
> 
> The memory ranges may be not all made by different DIMM slots, could be
> firmware reservatoin, e.g efi/BIOS diggged out physical memory,
			    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

I don't know what that means.

If it is firmware crap, you want to exclude that from kdump anyway.

> Now CONFIG_NR_CPUS has the maximum number as 8192. And user space
> kexec-tools has maximum memory range number as 2048. We can take
> the current 8192 + 2048  = 10K as default value conservatively. Or
> take 8192 + 2048 * 2 = 12K which is two times the maximum memory range
> number in kexec-tools. What do you think?

I still think that we should stick to reality and support what is
possible not what is potentially and theoretically there.

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-10-12 16:20                 ` Eric DeVolder
@ 2022-10-25 10:39                   ` Borislav Petkov
  0 siblings, 0 replies; 57+ messages in thread
From: Borislav Petkov @ 2022-10-25 10:39 UTC (permalink / raw)
  To: Eric DeVolder
  Cc: Oscar Salvador, Andrew Morton, david, linux-kernel, x86, kexec,
	ebiederm, dyoung, bhe, vgoyal, tglx, mingo, dave.hansen, hpa,
	nramas, thomas.lendacky, robh, efault, rppt, sourabhjain,
	linux-mm

On Wed, Oct 12, 2022 at 11:20:59AM -0500, Eric DeVolder wrote:
> I once had CONFIG_CRASH_HOTPLUG, but you disagreed.
> 
> https://lore.kernel.org/lkml/Ylgot+LUDQl+G%2F5N@zn.tnic/
> 
> From there I simply went with
> 
>  #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
> 
> which route do you prefer?

If you do a single Kconfig item which depends on those two, it probably
is cleaner this way. And if the max memory ranges are hardcoded you
don't need the other prompt asking the user something she most likely
doesn't know how to answer properly.

That is, unless you wanna have that crash hotplug built in all the time.

Because CONFIG_HOTPLUG_CPU is pretty much always enabled so you might
just as well add the crash hotplug support unconditionally, without any
Kconfig ifdeffery whatsoever except CONFIG_MEMORY_HOTPLUG as that is
special and not present on the majority of hardware.

But on a plain simple laptop or workstation which has CPU hotplug, would
it make sense for the crash ranges to get updated too when CPUs are
offlined?

If so, I think you want this code present there too, without a Kconfig
item.

If this is server-only anyway, then a single Kconfig item sounds like
not such a bad idea.

I hope that makes some sense.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 3/7] crash: add generic infrastructure for crash hotplug support
  2022-10-24  9:10       ` Baoquan He
@ 2022-10-26  7:00         ` Sourabh Jain
  0 siblings, 0 replies; 57+ messages in thread
From: Sourabh Jain @ 2022-10-26  7:00 UTC (permalink / raw)
  To: Baoquan He, Eric DeVolder
  Cc: linux-kernel, x86, kexec, ebiederm, dyoung, vgoyal, tglx, mingo,
	bp, dave.hansen, hpa, nramas, thomas.lendacky, robh, efault,
	rppt, david, konrad.wilk, boris.ostrovsky

Hello Baoquan,

On 24/10/22 14:40, Baoquan He wrote:
> Hi Eric, Sourabh,
>
> On 10/07/22 at 02:14pm, Eric DeVolder wrote:
>>
>> On 10/3/22 12:51, Sourabh Jain wrote:
>>> Hello Eric,
>>>
>>> On 10/09/22 02:35, Eric DeVolder wrote:
> ......
>>>> +static void handle_hotplug_event(unsigned int hp_action, unsigned int cpu)
>>>> +{
>>>> +    /* Obtain lock while changing crash information */
>>>> +    mutex_lock(&kexec_mutex);
>>>> +
>>>> +    /* Check kdump is loaded */
>>>> +    if (kexec_crash_image) {
>>>> +        struct kimage *image = kexec_crash_image;
>>>> +
>>>> +        if (hp_action == KEXEC_CRASH_HP_ADD_CPU ||
>>>> +            hp_action == KEXEC_CRASH_HP_REMOVE_CPU)
>>>> +            pr_debug("crash hp: hp_action %u, cpu %u\n", hp_action, cpu);
>>>> +        else
>>>> +            pr_debug("crash hp: hp_action %u\n", hp_action);
>>>> +
>>>> +        /*
>>>> +         * When the struct kimage is allocated, it is wiped to zero, so
>>>> +         * the elfcorehdr_index_valid defaults to false. Find the
>>>> +         * segment containing the elfcorehdr, if not already found.
>>>> +         * This works for both the kexec_load and kexec_file_load paths.
>>>> +         */
>>>> +        if (!image->elfcorehdr_index_valid) {
>>>> +            unsigned char *ptr;
>>>> +            unsigned long mem, memsz;
>>>> +            unsigned int n;
>>>> +
>>>> +            for (n = 0; n < image->nr_segments; n++) {
>>>> +                mem = image->segment[n].mem;
>>>> +                memsz = image->segment[n].memsz;
>>>> +                ptr = arch_map_crash_pages(mem, memsz);
>>>> +                if (ptr) {
>>>> +                    /* The segment containing elfcorehdr */
>>>> +                    if (memcmp(ptr, ELFMAG, SELFMAG) == 0) {
>>>> +                        image->elfcorehdr_index = (int)n;
>>>> +                        image->elfcorehdr_index_valid = true;
>>>> +                    }
>>>> +                }
>>>> +                arch_unmap_crash_pages((void **)&ptr);
>>>> +            }
>>>> +        }
>>>> +
>>>> +        if (!image->elfcorehdr_index_valid) {
>>>> +            pr_err("crash hp: unable to locate elfcorehdr segment");
>>>> +            goto out;
>>>> +        }
>>>> +
>>>> +        /* Needed in order for the segments to be updated */
>>>> +        arch_kexec_unprotect_crashkres();
>>>> +
>>>> +        /* Flag to differentiate between normal load and hotplug */
>>>> +        image->hotplug_event = true;
>>>> +
>>>> +        /* Now invoke arch-specific update handler */
>>>> +        arch_crash_handle_hotplug_event(image, hp_action);
>>>> +
>>>> +        /* No longer handling a hotplug event */
>>>> +        image->hotplug_event = false;
>>>> +
>>>> +        /* Change back to read-only */
>>>> +        arch_kexec_protect_crashkres();
>>>> +    }
>>>> +
>>>> +out:
>>>> +    /* Release lock now that update complete */
>>>> +    mutex_unlock(&kexec_mutex);
>>>> +}
>>>> +
>>>> +static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v)
>>>> +{
>>>> +    switch (val) {
>>>> +    case MEM_ONLINE:
>>>> +        handle_hotplug_event(KEXEC_CRASH_HP_ADD_MEMORY, 0);
>>>> +        break;
>>>> +
>>>> +    case MEM_OFFLINE:
>>>> +        handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_MEMORY, 0);
>>>> +        break;
>>>> +    }
>>>> +    return NOTIFY_OK;
>>> Can we pass the v (memory_notify) argument to the arch_crash_handle_hotplug_event
>>> function via handle_hotplug_event?
>>>
>>> Because of the way memory hotplug is handled on PowerPC, it is hard to update the
>>> elfcorehdr without the memory_notify args.
>>>
>>> On PowerPC the memblock data structure is used to prepare the elfcorehdr for kdump.
>>> Since the notifier used for the memory hotplug crash handler gets initiated before the
>>> memblock data structure update happens (as depicted below), the newly prepared
>>> elfcorehdr still holds the old memory regions. So if the system crashes with an
>>> obsolete elfcorehdr, makedumpfile fails to collect the vmcore.
>>>
>>> Sequence of actions done on PowerPC to serve the memory hotplug:
>>>
>>>    Initiate memory hot remove
>>>             |
>>>             v
>>>    offline pages
>>>             |
>>>             v
>>>    initiate memory notify call chain
>>>    for MEM_OFFLINE event.
>>>    (same is used for crash update)
>>>             |
>>>             v
>>>    prepare new elfcorehdr for kdump using
>>>    memblock data structure
>>>             |
>>>             v
>>>    update memblock data structure
>>>
>>> How will passing memory_notify to the arch crash hotplug handler help?
>>>
>>> memory_notify holds the start PFN and page count; with that we can get
>>> the base address and size of the hot unplugged memory, and can use the
>>> same to avoid adding the hot unplugged memory region to the elfcorehdr.
>>>
>>> Thanks,
>>> Sourabh Jain
>>>
>> Sourabh, let's see what Baoquan thinks.
>>
>> Baoquan, are you OK with this request? I once had these parameters to the
>> crash hotplug handler and since they were unused at the time, you asked
>> that I remove them, which I did.
> Sorry to have missed this mail. I thought the two of you were talking
> about something, and didn't notice this question was directed to me.
>
> I think there are two ways to solve the issue Sourabh raised:
> 1) make handle_hotplug_event() take and pass down the memory_notify as
> Sourabh said, or the hp_action plus mem_start|size as Eric suggested. I
> have to admit I haven't carefully checked which one is better.
>
> 2) leave the current code as is since it's aiming at x86 only. Later
> Sourabh can modify the code according to his needs on ppc. This gives
> a satisfying "why" for each code change.
>
> I personally like the 2nd way, while I'd also like to see the 1st one
> if the code change and log are convincing to reviewers.

Ok, let's go with the second approach. I will introduce a patch in the
PowerPC series to update the handle_hotplug_event function signature and
justify the change.

Thanks,
Sourabh Jain


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-10-25 10:31                           ` Borislav Petkov
@ 2022-10-26 14:48                             ` Baoquan He
  2022-10-26 14:54                               ` David Hildenbrand
  2022-10-27 19:24                               ` Eric DeVolder
  0 siblings, 2 replies; 57+ messages in thread
From: Baoquan He @ 2022-10-26 14:48 UTC (permalink / raw)
  To: Borislav Petkov, david
  Cc: Eric DeVolder, Oscar Salvador, Andrew Morton, linux-kernel, x86,
	kexec, ebiederm, dyoung, vgoyal, tglx, mingo, dave.hansen, hpa,
	nramas, thomas.lendacky, robh, efault, rppt, sourabhjain,
	linux-mm

On 10/25/22 at 12:31pm, Borislav Petkov wrote:
> On Thu, Oct 13, 2022 at 10:57:28AM +0800, Baoquan He wrote:
> > The concern to range number mainly is on Virt guest systems.
> 
> And why would virt emulate 1K hotpluggable DIMM slots and not emulate a
> real machine?

Well, currently, memory hotplug is an important feature on virt systems
to dynamically grow/shrink the memory of the system. If we only emulated
a real machine, it would be no different from a bare metal system.

IIRC, the balloon driver or the virtio-mem feature can add a memory
board, e.g. 1G with a block size of 128M, so 8 blocks added. When
shrinking this 1G of memory later, it takes a best-effort approach to
hot removing memory, meaning any occupied memory block is kept in place.
In the end we may only be able to remove every second block, 4 blocks
altogether, and the remaining un-removed blocks then produce 4 separate
memory regions. Like this, a virt guest could have many memory regions
in the kernel after memory is added/removed.

If I am wrong, please correct me, David.
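
To illustrate the fragmentation with the 1G/128M example above, a rough
sketch:

  after hot add:    [B0][B1][B2][B3][B4][B5][B6][B7]  -> 1 'System RAM' region
  after hot remove: [B0]    [B2]    [B4]    [B6]      -> 4 separate regions

i.e. one PT_LOAD phdr in the elfcorehdr becomes four.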

> 
> > On baremetal systems, basically only very high end servers support
> > memory hotplug. I once visited a customer's lab and saw one server;
> > it has 8 slots, and on each slot a box containing about 20 cpus and 2T
> > of memory at most can be plugged in at one time. So people won't make
> > too many slots for hotplugging since it's too expensive.
> 
> There you have it - the persuading argument.
> 
> > I checked user space kexec code, the maximum memory range number is
> > honored to x86_64 because of a HPE SGI system. After that, nobody
> > complains about it. Please see below user space kexec-tools commit in
> > https://git.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git
> > 
> > The memory ranges may be not all made by different DIMM slots, could be
> > firmware reservatoin, e.g efi/BIOS diggged out physical memory,
> 			    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> 
> I don't know what that means.
> 
> If it is firmware crap, you want to exclude that from kdump anyway.

Yes, now assume we have a HPE SGI system and it has memory hotplug
capability. The system itself already has more than 1024 memory regions.
Then when we hot add an extra memory board, we want to include the newly
added memory regions in the elfcorehdr so that they will be dumped out
in the kdump kernel.

That's why I earlier suggested 2048 for the number of memory regions.

> 
> > Now CONFIG_NR_CPUS has the maximum number as 8192. And user space
> > kexec-tools has maximum memory range number as 2048. We can take
> > the current 8192 + 2048  = 10K as default value conservatively. Or
> > take 8192 + 2048 * 2 = 12K which is two times the maximum memory range
> > number in kexec-tools. What do you think?
> 
> I still think that we should stick to reality and support what is
> possible not what is potentially and theoretically there.

Yes, agree. We should try to get a number which satisfies needs in
reality.

For Kconfig CRASH_MAX_MEMORY_RANGES in this patch, I have three items to
suggest:

1) the name is not good; it doesn't reflect the fact that it's the
number of program headers of the elfcorehdr, which includes the cpu
note numbers and memory region numbers.

2) default cpu number, I suggest 512 or 1024. The biggest number I
have ever seen in reality is 384. On virt systems, it won't be too big.
Below is abstracted from arch/x86/Kconfig. A smaller one is also OK; we
can enlarge it when people really have a super machine and run into the
problem.

   config NR_CPUS_DEFAULT
           int
           depends on X86_64
           default 8192 if  MAXSMP
           default   64 if  SMP
           default    1 if !SMP

3) For memory regions, I would suggest 2048. Likewise, a smaller value is
also fine; we can enlarge it when a real system runs into this.

I made a draft here for reference, with my understanding. Please feel
free to change it.

+config CRASH_ELF_CORE_PHDRS_NUM
+       depends on CRASH_DUMP && KEXEC_FILE && (HOTPLUG_CPU || MEMORY_HOTPLUG)
+       int
+       default 3072
+       help
+         For the kexec_file_load path, specify the default number of
+         phdrs for the vmcore, e.g. the memory regions represented by the
+         'System RAM' entries in /proc/iomem, plus the cpu notes of each
+         present cpu stored in /sys/devices/system/cpu/cpuX/crash_notes.
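
(For scale: 3072 phdrs at 56 bytes per Elf64_Phdr comes to about 168KiB
for the elfcorehdr buffer.)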

Thanks


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-10-26 14:48                             ` Baoquan He
@ 2022-10-26 14:54                               ` David Hildenbrand
  2022-10-27 13:52                                 ` Baoquan He
  2022-10-27 19:24                               ` Eric DeVolder
  1 sibling, 1 reply; 57+ messages in thread
From: David Hildenbrand @ 2022-10-26 14:54 UTC (permalink / raw)
  To: Baoquan He, Borislav Petkov
  Cc: Eric DeVolder, Oscar Salvador, Andrew Morton, linux-kernel, x86,
	kexec, ebiederm, dyoung, vgoyal, tglx, mingo, dave.hansen, hpa,
	nramas, thomas.lendacky, robh, efault, rppt, sourabhjain,
	linux-mm

On 26.10.22 16:48, Baoquan He wrote:
> On 10/25/22 at 12:31pm, Borislav Petkov wrote:
>> On Thu, Oct 13, 2022 at 10:57:28AM +0800, Baoquan He wrote:
>>> The concern to range number mainly is on Virt guest systems.
>>
>> And why would virt emulate 1K hotpluggable DIMM slots and not emulate a
>> real machine?

IIRC, ACPI only allows for 256 slots. PPC dlpar might provide more.

> 
> Well, currently, memory hotplug is an important feature on virt systems
> to dynamically grow/shrink the memory of the system. If we only emulated
> a real machine, it would be no different from a bare metal system.
> 
> IIRC, the balloon driver or the virtio-mem feature can add a memory
> board, e.g. 1G with a block size of 128M, so 8 blocks added. When
> shrinking this 1G of memory later, it takes a best-effort approach to
> hot removing memory, meaning any occupied memory block is kept in place.
> In the end we may only be able to remove every second block, 4 blocks
> altogether, and the remaining un-removed blocks then produce 4 separate
> memory regions. Like this, a virt guest could have many memory regions
> in the kernel after memory is added/removed.
> 
> If I am wrong, please correct me, David.

Yes, virtio-mem (but also PPC dlpar) can result in many individual 
memory blocks with holes in between after hotunplug. Hotplug OTOH, 
usually tries to "plug" these holes and reduce the total number of 
memory blocks. It might be rare that our range will be heavily 
fragmented after unplug, but it's certainly possible.

[...]

> 
> Yes, now assume we have a HPE SGI system and it has memory hotplug
> capability. The system itself already has more than 1024 memory regions.
> Then when we hot add an extra memory board, we want to include the newly
> added memory regions in the elfcorehdr so that they will be dumped out
> in the kdump kernel.
> 
> That's why I earlier suggested 2048 for the number of memory regions.

The more the better, unless "it hurts". Assuming a single memory block 
is 128 MiB, that would be 256 GiB.

Usually, on big systems, the memory block size is 2 GiB. So 4 TiB.

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-10-26 14:54                               ` David Hildenbrand
@ 2022-10-27 13:52                                 ` Baoquan He
  2022-10-27 19:28                                   ` Eric DeVolder
  0 siblings, 1 reply; 57+ messages in thread
From: Baoquan He @ 2022-10-27 13:52 UTC (permalink / raw)
  To: David Hildenbrand, Borislav Petkov, Eric DeVolder
  Cc: Oscar Salvador, Andrew Morton, linux-kernel, x86, kexec,
	ebiederm, dyoung, vgoyal, tglx, mingo, dave.hansen, hpa, nramas,
	thomas.lendacky, robh, efault, rppt, sourabhjain, linux-mm

On 10/26/22 at 04:54pm, David Hildenbrand wrote:
> On 26.10.22 16:48, Baoquan He wrote:
> > On 10/25/22 at 12:31pm, Borislav Petkov wrote:
> > > On Thu, Oct 13, 2022 at 10:57:28AM +0800, Baoquan He wrote:
> > > > The concern to range number mainly is on Virt guest systems.
> > > 
> > > And why would virt emulate 1K hotpluggable DIMM slots and not emulate a
> > > real machine?
> 
> IIRC, ACPI only allows for 256 slots. PPC dlpar might provide more.
> 
> > 
> > Well, currently, memory hotplug is an important feature on virt systems
> > to dynamically grow/shrink the memory of the system. If we only emulated
> > a real machine, it would be no different from a bare metal system.
> > 
> > IIRC, the balloon driver or the virtio-mem feature can add a memory
> > board, e.g. 1G with a block size of 128M, so 8 blocks added. When
> > shrinking this 1G of memory later, it takes a best-effort approach to
> > hot removing memory, meaning any occupied memory block is kept in place.
> > In the end we may only be able to remove every second block, 4 blocks
> > altogether, and the remaining un-removed blocks then produce 4 separate
> > memory regions. Like this, a virt guest could have many memory regions
> > in the kernel after memory is added/removed.
> > 
> > If I am wrong, please correct me, David.
> 
> Yes, virtio-mem (but also PPC dlpar) can result in many individual memory
> blocks with holes in between after hotunplug. Hotplug OTOH, usually tries to
> "plug" these holes and reduce the total number of memory blocks. It might be
> rare that our range will be heavily fragmented after unplug, but it's
> certainly possible.
> 
> [...]
> 
> > 
> > Yes, now assume we have a HPE SGI system and it has memory hotplug
> > capability. The system itself already has more than 1024 memory regions.
> > Then when we hot add an extra memory board, we want to include the newly
> > added memory regions in the elfcorehdr so that they will be dumped out
> > in the kdump kernel.
> > 
> > That's why I earlier suggested 2048 for the number of memory regions.
> 
> The more the better, unless "it hurts". Assuming a single memory block is
> 128 MiB, that would be 256 GiB.
> 
> Usually, on big systems, the memory block size is 2 GiB. So 4 TiB.

Thanks a lot for these valuable inputs, David.

Hi Boris, Eric

So what's your suggested value for the Kconfig option?

1) cpu number, 1024?
2) memory regions, 2048?

About below draft, any comment? We can decide a value based on our
knowledge, can adjust later if any real system has more than the number.

+config CRASH_ELF_CORE_PHDRS_NUM
+       depends on CRASH_DUMP && KEXEC_FILE && (HOTPLUG_CPU || MEMORY_HOTPLUG)
+       int
+       default 3072
+       help
+         For the kexec_file_load path, specify the default number of
+         phdrs for the vmcore, e.g. the memory regions represented by the
+         'System RAM' entries in /proc/iomem, plus the cpu notes of each
+         present cpu stored in /sys/devices/system/cpu/cpuX/crash_notes.

Thanks
Baoquan


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-10-26 14:48                             ` Baoquan He
  2022-10-26 14:54                               ` David Hildenbrand
@ 2022-10-27 19:24                               ` Eric DeVolder
  2022-10-28 10:19                                 ` Borislav Petkov
  1 sibling, 1 reply; 57+ messages in thread
From: Eric DeVolder @ 2022-10-27 19:24 UTC (permalink / raw)
  To: Baoquan He, Borislav Petkov, david
  Cc: Oscar Salvador, Andrew Morton, linux-kernel, x86, kexec,
	ebiederm, dyoung, vgoyal, tglx, mingo, dave.hansen, hpa, nramas,
	thomas.lendacky, robh, efault, rppt, sourabhjain, linux-mm



On 10/26/22 09:48, Baoquan He wrote:
> On 10/25/22 at 12:31pm, Borislav Petkov wrote:
>> On Thu, Oct 13, 2022 at 10:57:28AM +0800, Baoquan He wrote:
>>> The concern to range number mainly is on Virt guest systems.
>>
>> And why would virt emulate 1K hotpluggable DIMM slots and not emulate a
>> real machine?
> 
> Well, currently, memory hotplug is an important feature on virt systems
> to dynamically grow/shrink the memory of the system. If we only emulated
> a real machine, it would be no different from a bare metal system.
> 
> IIRC, the balloon driver or the virtio-mem feature can add a memory
> board, e.g. 1G with a block size of 128M, so 8 blocks added. When
> shrinking this 1G of memory later, it takes a best-effort approach to
> hot removing memory, meaning any occupied memory block is kept in place.
> In the end we may only be able to remove every second block, 4 blocks
> altogether, and the remaining un-removed blocks then produce 4 separate
> memory regions. Like this, a virt guest could have many memory regions
> in the kernel after memory is added/removed.
> 
> If I am wrong, please correct me, David.
> 
>>
>>> On baremetal systems, basically only very high end servers support
>>> memory hotplug. I once visited a customer's lab and saw one server;
>>> it has 8 slots, and on each slot a box containing about 20 cpus and 2T
>>> of memory at most can be plugged in at one time. So people won't make
>>> too many slots for hotplugging since it's too expensive.
>>
>> There you have it - the persuading argument.

So after re-reading the exchanges, many times, I realized that I have introduced confusion 
by using "hotplug", and specifically by using "memory hotplug" and DIMMs in the same breath, and thus 
perhaps equated hotplug with ACPI DIMMs in these discussions.

Allow me to state that "hotplug" in this patch series refers to CPU and memory hot un/plug, and does 
*not* explicitly refer to any particular underlying technology to generate CPU and/or memory hot 
un/plug events.

For example, I have been using DIMMs as my example because that has been my test vehicle for 
exercising this code; as such, I think the discussion cornered itself into a real-world-vs-virt 
debate about DIMMs, with no end in sight. To be plain, this patch series does not intend to 
convey or change anything specific about ACPI DIMMs.

In reality, when I state "hotplug" in these patches, I am talking generically and therefore 
inclusive of any technology that can hot un/plug CPUs or memory. For memory specifically, this 
includes ACPI DIMMs (whether baremetal or QEMU), ballooning, virtio-mem, PPC dlpar (per David), 
Microsoft DynamicMemory, and the upcoming CXL.mem technology. Probably others that I am not aware of. 
Any of these technologies can add or remove memory from a bare metal and/or virtual system.

I apologize.

What is important is the number of memory regions (i.e. /proc/iomem entries) that can be considered 
to be the maximum. There is no kernel definition of such. The need to identify a maximum number is 
so that the buffer containing the elfcorehdr can be sized and allocated at kdump load time, *once*. 
This elfcorehdr buffer is then modified/updated repeatedly as hot un/plug events occur. It is *not* 
re-allocated on each hot un/plug event; that is what the current solution does, sort of.

>>
>>> I checked user space kexec code, the maximum memory range number is
>>> honored to x86_64 because of a HPE SGI system. After that, nobody
>>> complains about it. Please see below user space kexec-tools commit in
>>> https://git.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git
>>>
>>> The memory ranges may be not all made by different DIMM slots, could be
>>> firmware reservatoin, e.g efi/BIOS diggged out physical memory,
>> 			    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>>
>> I don't know what that means.
>>
>> If it is firmware crap, you want to exclude that from kdump anyway.
> 
> Yes, now assume we have a HPE SGI system and it has memory hotplug
> capability. The system itself already has more than 1024 memory regions.
> Then when we hot add an extra memory board, we want to include the newly
> added memory regions in the elfcorehdr so that they will be dumped out
> in the kdump kernel.
> 
> That's why I earlier suggested 2048 for the number of memory regions.
> 
>>
>>> Now CONFIG_NR_CPUS has the maximum number as 8192. And user space
>>> kexec-tools has maximum memory range number as 2048. We can take
>>> the current 8192 + 2048  = 10K as default value conservatively. Or
>>> take 8192 + 2048 * 2 = 12K which is two times the maximum memory range
>>> number in kexec-tools. What do you think?
>>
>> I still think that we should stick to reality and support what is
>> possible not what is potentially and theoretically there.
> 
> Yes, agree. We should try to get a number which satisfies needs in
> reality.


> 
> For Kconfig CRASH_MAX_MEMORY_RANGES in this patch, I have three items to
> suggest:
> 
> 1) the name is not good; it doesn't reflect the fact that it's the
> number of program headers of the elfcorehdr, which includes the cpu
> note numbers and memory region numbers.

The total number of program headers is, generally speaking, the number of CPUs and the number of 
memory regions (plus one for VMCOREINFO and maybe a few others). The NR_CPUS_DEFAULT conveys the 
maximum number of CPUs possible, and likewise the CRASH_MAX_MEMORY_RANGES is intended to convey the 
maximum number of memory regions possible.

It is not misnamed, imho; rather, I think that due to the confusion I outlined above, it has been misunderstood.

> 
> 2) default cpu number, I suggest 512 or 1024. The biggest number I
> have ever seen in reality is 384. On virt systems, it won't be too big.
> Below is abstracted from arch/x86/Kconfig. A smaller one is also OK; we
> can enlarge it when people really have a super machine and run into the
> problem.
> 
>     config NR_CPUS_DEFAULT
>             int
>             depends on X86_64
>             default 8192 if  MAXSMP
>             default   64 if  SMP
>             default    1 if !SMP

I'm all for making this a sane number, I'm just not sure this patch series is the place to do this?

> 
> 3) For memory regions, I would suggest 2048. Likewise, a smaller value is
> also fine; we can enlarge it when a real system runs into this.

As David points out, if the memblock size is 128MiB, then 2048 allows for 256GiB, which I believe to 
be too low for a maximum default. I'd at least target 1TiB with a 128MiB memblock size, which would 
put the number at 8192.

Should the memblock size be 2GiB, for really big systems, then 8192 entries allow handling 16TiB.

With sizeof(Elf64_Phdr) at 56 bytes, that means the elfcorehdr buffer/memory segment is roughly 
448KiB.

Be aware, in reality, that if the system was fully populated, it would not actually consume all 8192 
phdrs. Rather /proc/iomem would essentially show a large contiguous address space which would 
require just a single phdr. The reason to consider having the larger number of phdrs is so that if 
the memory becomes fragmented due to hot un/plug events, then you need the phdrs to record the 
sparse mapping. The pathological case is that every other memblock is offlined/removed; not likely, but 
possible.


> 
> I made a draft here for reference, with my undertanding. Please feel
> free to change it.
> 
> +config CRASH_ELF_CORE_PHDRS_NUM
> +       depends on CRASH_DUMP && KEXEC_FILE && (HOTPLUG_CPU || MEMORY_HOTPLUG)
> +       int
> +       default 3072
> +	help
> +         For the kexec_file_load path, specify the default number of
> +         phdrs for the vmcore, e.g. the memory regions represented by the
> +         'System RAM' entries in /proc/iomem, plus the cpu notes of each
> +         present cpu stored in /sys/devices/system/cpu/cpuX/crash_notes.
> 
> Thanks
> 

I'd prefer keeping CRASH_MAX_MEMORY_RANGES as that allows the maximum phdr number value to be 
reflective of CPUs and/or memory; not all systems support both CPU and memory hotplug. For example, 
I have queued up this change to reflect this:

     if (IS_ENABLED(CONFIG_HOTPLUG_CPU) || IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
         /* Ensure elfcorehdr segment large enough for hotplug changes */
         unsigned long pnum = 2; /* VMCOREINFO and kernel_map */
         if (IS_ENABLED(CONFIG_HOTPLUG_CPU))
             pnum += CONFIG_NR_CPUS_DEFAULT;
         if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG))
             pnum += CONFIG_CRASH_MAX_MEMORY_RANGES;
         if (pnum < (unsigned long)PN_XNUM) {
             kbuf.memsz = pnum * sizeof(Elf64_Phdr);
             kbuf.memsz += sizeof(Elf64_Ehdr);
             image->elfcorehdr_index = image->nr_segments;
             image->elfcorehdr_index_valid = true;
             /* Mark as usable to crash kernel, else crash kernel fails on boot */
             image->elf_headers_sz = kbuf.memsz;
         } else {
             pr_err("number of Phdrs %lu exceeds max %lu\n", pnum, (unsigned long
         }
     }
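
For a sense of scale, assuming MAXSMP (NR_CPUS_DEFAULT of 8192) and the 
CRASH_MAX_MEMORY_RANGES default of 32768 from this series: pnum = 2 + 8192 + 32768 = 40962, 
which is well below PN_XNUM (0xffff), and kbuf.memsz works out to 40962 * 56 + 64 bytes, 
roughly 2.2MiB for the elfcorehdr segment.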

I hope this helps clarify my intentions with this patch series.
Thanks,
eric

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-10-27 13:52                                 ` Baoquan He
@ 2022-10-27 19:28                                   ` Eric DeVolder
  2022-10-29  4:27                                     ` Baoquan He
  0 siblings, 1 reply; 57+ messages in thread
From: Eric DeVolder @ 2022-10-27 19:28 UTC (permalink / raw)
  To: Baoquan He, David Hildenbrand, Borislav Petkov
  Cc: Oscar Salvador, Andrew Morton, linux-kernel, x86, kexec,
	ebiederm, dyoung, vgoyal, tglx, mingo, dave.hansen, hpa, nramas,
	thomas.lendacky, robh, efault, rppt, sourabhjain, linux-mm



On 10/27/22 08:52, Baoquan He wrote:
> On 10/26/22 at 04:54pm, David Hildenbrand wrote:
>> On 26.10.22 16:48, Baoquan He wrote:
>>> On 10/25/22 at 12:31pm, Borislav Petkov wrote:
>>>> On Thu, Oct 13, 2022 at 10:57:28AM +0800, Baoquan He wrote:
>>>>> The concern to range number mainly is on Virt guest systems.
>>>>
>>>> And why would virt emulate 1K hotpluggable DIMM slots and not emulate a
>>>> real machine?
>>
>> IIRC, ACPI only allows for 256 slots. PPC dlpar might provide more.
>>
>>>
>>> Well, currently, memory hotplug is an important feature on virt systems
>>> to dynamically grow/shrink the memory of the system. If we only emulated
>>> a real machine, it would be no different from a bare metal system.
>>>
>>> IIRC, the balloon driver or the virtio-mem feature can add a memory
>>> board, e.g. 1G with a block size of 128M, so 8 blocks added. When
>>> shrinking this 1G of memory later, it takes a best-effort approach to
>>> hot removing memory, meaning any occupied memory block is kept in place.
>>> In the end we may only be able to remove every second block, 4 blocks
>>> altogether, and the remaining un-removed blocks then produce 4 separate
>>> memory regions. Like this, a virt guest could have many memory regions
>>> in the kernel after memory is added/removed.
>>>
>>> If I am wrong, please correct me, David.
>>
>> Yes, virtio-mem (but also PPC dlpar) can result in many individual memory
>> blocks with holes in between after hotunplug. Hotplug OTOH, usually tries to
>> "plug" these holes and reduce the total number of memory blocks. It might be
>> rare that our range will be heavily fragmented after unplug, but it's
>> certainly possible.
>>
>> [...]
>>
>>>
>>> Yes, now assume we have a HPE SGI system and it has memory hotplug
>>> capability. The system itself already has more than 1024 memory regions.
>>> Then when we hot add an extra memory board, we want to include the newly
>>> added memory regions in the elfcorehdr so that they will be dumped out
>>> in the kdump kernel.
>>>
>>> That's why I earlier suggested 2048 for the number of memory regions.
>>
>> The more the better, unless "it hurts". Assuming a single memory block is
>> 128 MiB, that would be 256 GiB.
>>
>> Usually, on big systems, the memory block size is 2 GiB. So 4 TiB.
> 
> Thanks a lot for these valuable inputs, David.
> 
> Hi Boris, Eric
> 
> So what's your suggested value for the Kconfig option?
> 
> 1) cpu number, 1024?
> 2) memory regions, 2048?
> 
> About below draft, any comment? We can decide a value based on our
> knowledge, can adjust later if any real system has more than the number.
> 
> +config CRASH_ELF_CORE_PHDRS_NUM
> +       depends on CRASH_DUMP && KEXEC_FILE && (HOTPLUG_CPU || MEMORY_HOTPLUG)
> +       int
> +       default 3072
> +       help
> +         For the kexec_file_load path, specify the default number of
> +         phdrs for the vmcore, e.g. the memory regions represented by the
> +         'System RAM' entries in /proc/iomem, plus the cpu notes of each
> +         present cpu stored in /sys/devices/system/cpu/cpuX/crash_notes.
> 
> Thanks
> Baoquan
> 

I prefer to keep CRASH_MAX_MEMORY_RANGES, as explained in my response to your message on October 26.
eric

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-10-27 19:24                               ` Eric DeVolder
@ 2022-10-28 10:19                                 ` Borislav Petkov
  2022-10-28 15:29                                   ` Eric DeVolder
  0 siblings, 1 reply; 57+ messages in thread
From: Borislav Petkov @ 2022-10-28 10:19 UTC (permalink / raw)
  To: Eric DeVolder
  Cc: Baoquan He, david, Oscar Salvador, Andrew Morton, linux-kernel,
	x86, kexec, ebiederm, dyoung, vgoyal, tglx, mingo, dave.hansen,
	hpa, nramas, thomas.lendacky, robh, efault, rppt, sourabhjain,
	linux-mm

On Thu, Oct 27, 2022 at 02:24:11PM -0500, Eric DeVolder wrote:
> Be aware, in reality, that if the system was fully populated, it would not
> actually consume all 8192 phdrs. Rather /proc/iomem would essentially show a
> large contiguous address space which would require just a single phdr.

Then that from below:

	pnum += CONFIG_CRASH_MAX_MEMORY_RANGES;

which would then end up allocating 8192 phdrs, would be a total waste.

So why don't you make that number dynamic then?

You start with something sensible:

	total_num_pheaders = num_online_cpus() + "some number of regions" + "some few others"

I.e., a number which is a good compromise on the majority of machines.

Then, on hotplug events you count how many new regions are coming in
and when you reach the total_num_pheaders number, you double it (or some
other increase strategy), reallocate the ELF header buffers etc. needed
for kdump and you're good.

This way, you don't waste memory unnecessarily on the majority of
systems and those who need more, get to allocate more.
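
Roughly, with made-up names just to show the shape:

	if (nr_cpus + nr_mem_regions > total_num_pheaders) {
		total_num_pheaders *= 2;
		/* realloc the ELF header buffer, regenerate the phdrs */
	}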

> I'd prefer keeping CRASH_MAX_MEMORY_RANGES as that allows the maximum phdr
> number value to be reflective of CPUs and/or memory; not all systems support
> both CPU and memory hotplug. For example, I have queued up this change to
> reflect this:
> 
>     if (IS_ENABLED(CONFIG_HOTPLUG_CPU) || IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {

If you're going to keep CRASH_MAX_MEMORY_RANGES, then you can test only
that thing as it expresses the dependency on CONFIG_HOTPLUG_CPU and
CONFIG_MEMORY_HOTPLUG already.

If you end up making the number dynamic, then you could make that a
different Kconfig item which contains all that crash code as most of the
people won't need it anyway.

Hmm?

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-10-28 10:19                                 ` Borislav Petkov
@ 2022-10-28 15:29                                   ` Eric DeVolder
  2022-10-28 17:06                                     ` Borislav Petkov
  0 siblings, 1 reply; 57+ messages in thread
From: Eric DeVolder @ 2022-10-28 15:29 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Baoquan He, david, Oscar Salvador, Andrew Morton, linux-kernel,
	x86, kexec, ebiederm, dyoung, vgoyal, tglx, mingo, dave.hansen,
	hpa, nramas, thomas.lendacky, robh, efault, rppt, sourabhjain,
	linux-mm



On 10/28/22 05:19, Borislav Petkov wrote:
> On Thu, Oct 27, 2022 at 02:24:11PM -0500, Eric DeVolder wrote:
>> Be aware, in reality, that if the system was fully populated, it would not
>> actually consume all 8192 phdrs. Rather /proc/iomem would essentially show a
>> large contiguous address space which would require just a single phdr.
> 
> Then that from below:
> 
> 	pnum += CONFIG_CRASH_MAX_MEMORY_RANGES;
> 
> which would then end up allocating 8192 phdrs, would be a total waste.
> 
> So why don't you make that number dynamic then?
> 
> You start with something sensible:
> 
> 	total_num_pheaders = num_online_cpus() + "some number of regions" + "some few others"
> 
> I.e., a number which is a good compromise on the majority of machines.
> 
> Then, on hotplug events you count how many new regions are coming in
> and when you reach the total_num_pheaders number, you double it (or some
> other increase strategy), reallocate the ELF header buffers etc. needed
> for kdump and you're good.
> 
> This way, you don't waste memory unnecessarily on the majority of
> systems and those who need more, get to allocate more.

This patch series sizes and allocates the memory buffer/segment for the elfcorehdr once, at kdump 
load time.

Dynamically resizing the elfcorehdr memory buffer/segment causes the following ripple 
effects:

  - Permitting resizing of the elfcorehdr requires a means to "allocate" a new, larger buffer from 
within the crash kernel reserved area. There is no allocator today; currently it is a kind of 
one-pass placement process that happens at load time (see the sketch after this list). The critical 
side effect of allocating a new elfcorehdr memory buffer/segment is that it creates a new address 
for the elfcorehdr.

  - The elfcorehdr is passed to the crash kernel via the elfcorehdr= kernel cmdline option. As such, 
a dynamic change to the size of the elfcorehdr necessarily invites a change of address of that 
buffer, and therefore requires rewriting the crash kernel cmdline to reflect the new elfcorehdr 
buffer address.

  - A change to the cmdline also invites a possible change of address of the buffer containing the 
cmdline, and thus a change to the x86 boot_params, which contains the cmdline pointer.

  - A change to the cmdline and/or boot_params, which are *not* excluded from the hash/digest, means 
that the purgatory hash/digest needs to be recomputed, and purgatory re-linked with the new 
hash/digest and replaced.
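
To illustrate the one-pass placement mentioned in the first point, here is a rough sketch with 
made-up names (not the actual kexec code, just the shape of the idea):

#include <stddef.h>
#include <stdint.h>

/* The reserved crashkernel region; segments are placed once, never freed. */
struct crash_region {
	uintptr_t base;   /* start of reserved crashkernel memory */
	size_t    size;   /* total reserved bytes */
	size_t    offset; /* high-water mark; only ever grows */
};

/* Place a segment at the next free, aligned offset; align must be a
 * power of two, and there is no free(). */
static uintptr_t crash_place(struct crash_region *r, size_t len, size_t align)
{
	uintptr_t addr = (r->base + r->offset + align - 1) &
			 ~((uintptr_t)align - 1);

	if (addr + len > r->base + r->size)
		return 0;               /* region exhausted */

	r->offset = addr + len - r->base;
	return addr;                    /* fixed for the lifetime of the load */
}

Because placement only ever moves forward, a resized elfcorehdr cannot be grown in place; it would 
have to be placed again, at a new address.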

A fair amount of work, but I have had this working in the past, around the v1 patch series 
timeframe. However, it only worked for the kexec_file_load() syscall, where all the needed pieces of 
information are available; for kexec_load(), it isn't possible to relink purgatory, as by that point 
purgatory is but a user-space binary blob.

It was feedback on v1/v2 that pointed out that by excluding the elfcorehdr from the hash/digest, 
the "change of address" problem with the elfcorehdr buffer/segment goes away, which in turn negates 
the need to introduce an allocator for the crash kernel reserved space, rewrite the crash kernel 
cmdline with a new elfcorehdr address, update boot_params with a new cmdline, and re-link and 
replace purgatory with the updated digest. It also enables this hotplug effort to support the 
kexec_load() syscall as well.
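
Roughly, the exclusion amounts to this (hypothetical types and names, not the actual kexec digest 
code):

#include <stddef.h>

/* One loaded kexec segment; is_elfcorehdr is marked once at load time. */
struct segment {
	const void *buf;
	size_t      len;
	int         is_elfcorehdr;
};

/* Feed every segment except the elfcorehdr into the digest. */
static void digest_segments(const struct segment *segs, int nr,
			    void (*update)(const void *, size_t))
{
	int i;

	for (i = 0; i < nr; i++) {
		if (segs[i].is_elfcorehdr)
			continue;  /* may be rewritten on hotplug; skip it */
		update(segs[i].buf, segs[i].len);
	}
}

With the elfcorehdr outside the digest, its contents (and address) can change on hotplug without 
invalidating the verified image.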

So it is with this in mind that I suggest we stay with the statically sized elfcorehdr buffer.

If that can be agreed upon, then it is "just a matter" of picking a useful elfcorehdr size. 
Currently that size is derived from NR_DEFAULT_CPUS and CRASH_MAX_MEMORY_RANGES. So there is still 
the CRASH_MAX_MEMORY_RANGES knob to help dial in a size, should there be some issue with the default 
value.

Or if there is a desire to drop computing the size from NR_DEFAULT_CPUS and CRASH_MAX_MEMORY_RANGES 
and simply go with CRASH_HOTPLUG_ELFCOREHDR_SZ, which directly specifies the buffer size, then I'm 
also good with that.

I still owe a much better explanation of how to size the elfcorehdr. I can use the comments and 
ideas from the discussion to provide the necessary insight when choosing this value, whether that be 
CRASH_MAX_MEMORY_RANGES or CRASH_HOTPLUG_ELFCOREHDR_SZ.


> 
>> I'd prefer keeping CRASH_MAX_MEMORY_RANGES as that allows the maximum phdr
>> number value to be reflective of CPUs and/or memory; not all systems support
>> both CPU and memory hotplug. For example, I have queued up this change to
>> reflect this:
>>
>>      if (IS_ENABLED(CONFIG_HOTPLUG_CPU) || IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
> 
> If you're going to keep CRASH_MAX_MEMORY_RANGES, then you can test only
> that thing as it expresses the dependency on CONFIG_HOTPLUG_CPU and
> CONFIG_MEMORY_HOTPLUG already.
> 
> If you end up making the number dynamic, then you could make that a
> different Kconfig item which contains all that crash code as most of the
> people won't need it anyway.

It is my intention to correct the CRASH_MAX_MEMORY_RANGES dependencies (if we keep it) like so:

config CRASH_MAX_MEMORY_RANGES
     depends on CRASH_DUMP && KEXEC_FILE && MEMORY_HOTPLUG

CRASH_MAX_MEMORY_RANGES should never have had CPU_HOTPLUG as a dependency; that was a cut-and-paste 
error on my part.

> 
> Hmm?
> 

Thank you for the time and thought on this topic!
eric

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-10-28 15:29                                   ` Eric DeVolder
@ 2022-10-28 17:06                                     ` Borislav Petkov
  2022-10-28 19:26                                       ` Eric DeVolder
  0 siblings, 1 reply; 57+ messages in thread
From: Borislav Petkov @ 2022-10-28 17:06 UTC (permalink / raw)
  To: Eric DeVolder
  Cc: Baoquan He, david, Oscar Salvador, Andrew Morton, linux-kernel,
	x86, kexec, ebiederm, dyoung, vgoyal, tglx, mingo, dave.hansen,
	hpa, nramas, thomas.lendacky, robh, efault, rppt, sourabhjain,
	linux-mm

On Fri, Oct 28, 2022 at 10:29:45AM -0500, Eric DeVolder wrote:
> So it is with this in mind that I suggest we stay with the statically sized elfcorehdr buffer.
> 
> If that can be agreed upon, then it is "just a matter" of picking a useful
> elfcorehdr size. Currently that size is derived from NR_DEFAULT_CPUS and
> CRASH_MAX_MEMORY_RANGES. So there is still the CRASH_MAX_MEMORY_RANGES knob
> to help dial in a size, should there be some issue with the default
> value.

Let's see

        kbuf.memsz =
                (CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES) *
                        sizeof(Elf64_Phdr);

which, IINM, is

	(8192 + 32768) * 56

which is something like 2M.

(CONFIG_NR_CPUS_DEFAULT = 8192 - this is because of MAXSMP which gets
set on distro kernels)

Now, since the userspace kexec-tools uses 2048 for max memory ranges, that
size becomes smaller - around half a MB. And since y'all wanna be on the
safe side, you can quadruple it and have

	(8192 + 8192) * 56

which is still under a megabyte. And that's fine, I guess, on a big
server.

> Or if there is a desire to drop computing the size from NR_DEFAULT_CPUS and

I think you should leave the dependency on the Kconfig size so that
smaller machines which are configured this way don't end up wasting
memory unnecessarily.

> It is my intention to correct the CRASH_MAX_MEMORY_RANGES dependencies (if we keep it) like so:
> 
> config CRASH_MAX_MEMORY_RANGES
>     depends on CRASH_DUMP && KEXEC_FILE && MEMORY_HOTPLUG

Yes, but don't leave it to the user to decide what number to choose
- choose a high enough number, explain why you've chosen this with a
comment and that's it.

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-10-28 17:06                                     ` Borislav Petkov
@ 2022-10-28 19:26                                       ` Eric DeVolder
  2022-10-28 20:30                                         ` Borislav Petkov
  0 siblings, 1 reply; 57+ messages in thread
From: Eric DeVolder @ 2022-10-28 19:26 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Baoquan He, david, Oscar Salvador, Andrew Morton, linux-kernel,
	x86, kexec, ebiederm, dyoung, vgoyal, tglx, mingo, dave.hansen,
	hpa, nramas, thomas.lendacky, robh, efault, rppt, sourabhjain,
	linux-mm



On 10/28/22 12:06, Borislav Petkov wrote:
> On Fri, Oct 28, 2022 at 10:29:45AM -0500, Eric DeVolder wrote:
>> So it is with this in mind that I suggest we stay with the statically sized elfcorehdr buffer.
>>
>> If that can be agreed upon, then it is "just a matter" of picking a useful
>> elfcorehdr size. Currently that size is derived from NR_DEFAULT_CPUS and
>> CRASH_MAX_MEMORY_RANGES. So there is still the CRASH_MAX_MEMORY_RANGES knob
>> to help dial in a size, should there be some issue with the default
>> value.
> 
> Let's see
> 
>          kbuf.memsz =
>                  (CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES) *
>                          sizeof(Elf64_Phdr);
> 
> which, IINM, is
> 
> 	(8192 + 32768) * 56
> 
> which is something like 2M.
> 
> (CONFIG_NR_CPUS_DEFAULT = 8192 - this is because of MAXSMP which gets
> set on distro kernels)
> 
> Now, since the userspace kexec-tools uses 2048 for max memory ranges, that
> size becomes smaller - around half a MB. And since y'all wanna be on the
> safe side, you can quadruple it and have
> 
> 	(8192 + 8192) * 56
> 
> which is still under a megabyte. And that's fine, I guess, on a big
> server.

Excellent, I'll set CRASH_MAX_MEMORY_RANGES to 8192! That seems quite a fair trade-off of elfcorehdr 
size vs. system size (i.e. 1 TiB w/ a 128 MiB memblock size).

> 
>> Or if there is a desire to drop computing the size from NR_DEFAULT_CPUS and
> 
> I think you should leave the dependency on the Kconfig size so that
> smaller machines which are configured this way don't end up wasting
> memory unnecessarily.

Excellent, I'll leave the computation as NR_DEFAULT_CPUS + CRASH_MAX_MEMORY_RANGES.

> 
>> It is my intention to correct the CRASH_MAX_MEMORY_RANGES dependencies (if we keep it) like so:
>>
>> config CRASH_MAX_MEMORY_RANGES
>>      depends on CRASH_DUMP && KEXEC_FILE && MEMORY_HOTPLUG
> 
> Yes, but don't leave it to the user to decide what number to choose
> - choose a high enough number, explain why you've chosen this with a
> comment and that's it.

I currently have the Kconfig item as:

config CRASH_MAX_MEMORY_RANGES
     depends on CRASH_DUMP && KEXEC_FILE && MEMORY_HOTPLUG
     int
     default 8192
     help
       For the kexec_file_load path, specify the maximum number of
       memory regions, e.g. as represented by the 'System RAM' entries
       in /proc/iomem, that the elfcorehdr buffer/segment can accommodate.
       This value is combined with NR_CPUS and multiplied by Elf64_Phdr
       size to determine the final buffer size.

I'll work to provide a better explanation of the 8192 number.

Thank you!
eric

> 
> Thx.
> 

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-10-28 19:26                                       ` Eric DeVolder
@ 2022-10-28 20:30                                         ` Borislav Petkov
  2022-10-28 20:34                                           ` Eric DeVolder
  2022-10-28 21:22                                           ` Eric DeVolder
  0 siblings, 2 replies; 57+ messages in thread
From: Borislav Petkov @ 2022-10-28 20:30 UTC (permalink / raw)
  To: Eric DeVolder
  Cc: Baoquan He, david, Oscar Salvador, Andrew Morton, linux-kernel,
	x86, kexec, ebiederm, dyoung, vgoyal, tglx, mingo, dave.hansen,
	hpa, nramas, thomas.lendacky, robh, efault, rppt, sourabhjain,
	linux-mm

On Fri, Oct 28, 2022 at 02:26:58PM -0500, Eric DeVolder wrote:
> config CRASH_MAX_MEMORY_RANGES
>     depends on CRASH_DUMP && KEXEC_FILE && MEMORY_HOTPLUG
>     int
>     default 8192
>     help
>       For the kexec_file_load path, specify the maximum number of
>       memory regions, e.g. as represented by the 'System RAM' entries
>       in /proc/iomem, that the elfcorehdr buffer/segment can accommodate.
>       This value is combined with NR_CPUS and multiplied by Elf64_Phdr
>       size to determine the final buffer size.

No, do this:

config CRASH_MEMORY_HOTPLUG_SUPPORT
    def_bool y
    depends on CRASH_DUMP && KEXEC_FILE && MEMORY_HOTPLUG
    help
      Help text explaining what this feature is

this thing will simply get enabled when the user enables MEMORY_HOTPLUG
and CRASH_DUMP.

and then you do in the code:

/*
 * A comment explaining how the 8192 value has been selected.
 */
#define CRASH_MAX_MEMORY_RANGES	8192

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-10-28 20:30                                         ` Borislav Petkov
@ 2022-10-28 20:34                                           ` Eric DeVolder
  2022-10-28 21:22                                           ` Eric DeVolder
  1 sibling, 0 replies; 57+ messages in thread
From: Eric DeVolder @ 2022-10-28 20:34 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Baoquan He, david, Oscar Salvador, Andrew Morton, linux-kernel,
	x86, kexec, ebiederm, dyoung, vgoyal, tglx, mingo, dave.hansen,
	hpa, nramas, thomas.lendacky, robh, efault, rppt, sourabhjain,
	linux-mm



On 10/28/22 15:30, Borislav Petkov wrote:
> On Fri, Oct 28, 2022 at 02:26:58PM -0500, Eric DeVolder wrote:
>> config CRASH_MAX_MEMORY_RANGES
>>      depends on CRASH_DUMP && KEXEC_FILE && MEMORY_HOTPLUG
>>      int
>>      default 8192
>>      help
>>        For the kexec_file_load path, specify the maximum number of
>>        memory regions, e.g. as represented by the 'System RAM' entries
>>        in /proc/iomem, that the elfcorehdr buffer/segment can accommodate.
>>        This value is combined with NR_CPUS and multiplied by Elf64_Phdr
>>        size to determine the final buffer size.
> 
> No, do this:
> 
> config CRASH_MEMORY_HOTPLUG_SUPPORT
>      def_bool y
>      depends on CRASH_DUMP && KEXEC_FILE && MEMORY_HOTPLUG
>      help
>        Help text explaining what this feature is
> 
> this thing will simply get enabled when the user enables MEMORY_HOTPLUG
> and CRASH_DUMP.
> 
> and then you do in the code:
> 
> /*
>   * A comment explaining how the 8192 value has been selected.
>   */
> #define CRASH_MAX_MEMORY_RANGES	8192
> 
> Thx.
> 
ok, will do!
thanks!
eric

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-10-28 20:30                                         ` Borislav Petkov
  2022-10-28 20:34                                           ` Eric DeVolder
@ 2022-10-28 21:22                                           ` Eric DeVolder
  2022-10-28 22:19                                             ` Borislav Petkov
  1 sibling, 1 reply; 57+ messages in thread
From: Eric DeVolder @ 2022-10-28 21:22 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Baoquan He, david, Oscar Salvador, Andrew Morton, linux-kernel,
	x86, kexec, ebiederm, dyoung, vgoyal, tglx, mingo, dave.hansen,
	hpa, nramas, thomas.lendacky, robh, efault, rppt, sourabhjain,
	linux-mm



On 10/28/22 15:30, Borislav Petkov wrote:
> On Fri, Oct 28, 2022 at 02:26:58PM -0500, Eric DeVolder wrote:
>> config CRASH_MAX_MEMORY_RANGES
>>      depends on CRASH_DUMP && KEXEC_FILE && MEMORY_HOTPLUG
>>      int
>>      default 8192
>>      help
>>        For the kexec_file_load path, specify the maximum number of
>>        memory regions, e.g. as represented by the 'System RAM' entries
>>        in /proc/iomem, that the elfcorehdr buffer/segment can accommodate.
>>        This value is combined with NR_CPUS and multiplied by Elf64_Phdr
>>        size to determine the final buffer size.
> 
> No, do this:
> 
> config CRASH_MEMORY_HOTPLUG_SUPPORT
>      def_bool y
>      depends on CRASH_DUMP && KEXEC_FILE && MEMORY_HOTPLUG
>      help
>        Help text explaining what this feature is
> 
> this thing will simply get enabled when the user enables MEMORY_HOTPLUG
> and CRASH_DUMP.
> 
> and then you do in the code:
> 
> /*
>   * A comment explaining how the 8192 value has been selected.
>   */
> #define CRASH_MAX_MEMORY_RANGES	8192
> 
> Thx.
> 

How is this comment?

/*
  * For the kexec_file_load() syscall path, specify the maximum number of
  * memory regions that the elfcorehdr buffer/segment can accommodate.
  * These regions are obtained via walk_system_ram_res(); e.g. the
  * 'System RAM' entries in /proc/iomem.
  * This value is combined with NR_CPUS and multiplied by sizeof(Elf64_Phdr)
  * to determine the final elfcorehdr memory buffer/segment size.
  * The value 8192, for example, covers a (sparsely populated) 1 TiB system
  * with a 128 MiB memblock size, while resulting in an elfcorehdr
  * memory buffer/segment size under 1 MiB.
  */
#define CRASH_MAX_MEMORY_RANGES 8192
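
And for reference, the resulting segment size can be sanity-checked with a throwaway user-space 
sketch (assuming sizeof(Elf64_Phdr) is 56 on x86_64):

#include <elf.h>
#include <stdio.h>

#define NR_CPUS_DEFAULT         8192  /* MAXSMP distro default, per above */
#define CRASH_MAX_MEMORY_RANGES 8192

int main(void)
{
	/* Same formula as above: phdr slots for CPU notes plus memory regions. */
	size_t memsz = (NR_CPUS_DEFAULT + CRASH_MAX_MEMORY_RANGES) *
		       sizeof(Elf64_Phdr);

	/* 16384 * 56 = 917504 bytes, i.e. 896 KiB -- under 1 MiB */
	printf("elfcorehdr segment: %zu bytes (%zu KiB)\n",
	       memsz, memsz / 1024);
	return 0;
}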

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-10-28 21:22                                           ` Eric DeVolder
@ 2022-10-28 22:19                                             ` Borislav Petkov
  0 siblings, 0 replies; 57+ messages in thread
From: Borislav Petkov @ 2022-10-28 22:19 UTC (permalink / raw)
  To: Eric DeVolder
  Cc: Baoquan He, david, Oscar Salvador, Andrew Morton, linux-kernel,
	x86, kexec, ebiederm, dyoung, vgoyal, tglx, mingo, dave.hansen,
	hpa, nramas, thomas.lendacky, robh, efault, rppt, sourabhjain,
	linux-mm

On Fri, Oct 28, 2022 at 04:22:54PM -0500, Eric DeVolder wrote:
> /*
>  * For the kexec_file_load() syscall path, specify the maximum number of
>  * memory regions that the elfcorehdr buffer/segment can accommodate.
>  * These regions are obtained via walk_system_ram_res(); e.g. the
>  * 'System RAM' entries in /proc/iomem.
>  * This value is combined with NR_CPUS and multiplied by sizeof(Elf64_Phdr)

NR_CPUS_DEFAULT

>  * to determine the final elfcorehdr memory buffer/segment size.
>  * The value 8192, for example, covers a (sparsely populated) 1 TiB system
>  * with a 128 MiB memblock size, while resulting in an elfcorehdr
>  * memory buffer/segment size under 1 MiB.

... and it is a sane choice trying to accommodate both actual bare-metal
and VM configurations."

Yeah, it's a good start.

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support
  2022-10-27 19:28                                   ` Eric DeVolder
@ 2022-10-29  4:27                                     ` Baoquan He
  0 siblings, 0 replies; 57+ messages in thread
From: Baoquan He @ 2022-10-29  4:27 UTC (permalink / raw)
  To: Eric DeVolder
  Cc: David Hildenbrand, Borislav Petkov, Oscar Salvador,
	Andrew Morton, linux-kernel, x86, kexec, ebiederm, dyoung,
	vgoyal, tglx, mingo, dave.hansen, hpa, nramas, thomas.lendacky,
	robh, efault, rppt, sourabhjain, linux-mm

On 10/27/22 at 02:28pm, Eric DeVolder wrote:
> 
> 
> On 10/27/22 08:52, Baoquan He wrote:
> > On 10/26/22 at 04:54pm, David Hildenbrand wrote:
> > > On 26.10.22 16:48, Baoquan He wrote:
> > > > On 10/25/22 at 12:31pm, Borislav Petkov wrote:
> > > > > On Thu, Oct 13, 2022 at 10:57:28AM +0800, Baoquan He wrote:
> > > > > > The concern about the range number is mainly on virt guest systems.
> > > > > 
> > > > > And why would virt emulate 1K hotpluggable DIMM slots and not emulate a
> > > > > real machine?
> > > 
> > > IIRC, ACPI only allows for 256 slots. PPC dlpar might provide more.
> > > 
> > > > 
> > > > Well, currently, mem hotplug is an important feature on virt systems to
> > > > dynamically grow/shrink memory on the system. If it only emulated a real
> > > > machine, it would be no different from a bare metal system.
> > > > 
> > > > IIRC, the balloon driver or virtio-mem feature can add a memory board, e.g.
> > > > 1G with a 128M block size, so 8 blocks added. When shrinking this 1G memory
> > > > later, it takes a best-effort approach to hot removing memory, meaning any
> > > > occupied memory block is kept in place. In the end we might only be able to
> > > > remove every second block, 4 blocks altogether. The remaining un-removed
> > > > blocks then produce 4 separate memory regions. Like this, a virt guest
> > > > could have many memory regions in the kernel after memory has been
> > > > added/removed.
> > > > 
> > > > If I am wrong, please correct me, David.
> > > 
> > > Yes, virtio-mem (but also PPC dlpar) can result in many individual memory
> > > blocks with holes in between after hotunplug. Hotplug, OTOH, usually tries to
> > > "plug" these holes and reduce the total number of memory blocks. It might be
> > > rare that our range will be heavily fragmented after unplug, but it's
> > > certainly possible.
> > > 
> > > [...]
> > > 
> > > > 
> > > > Yes, now assume we have an HPE SGI system with memory hotplug
> > > > capability. The system itself already has more than 1024 memory
> > > > regions. Then when we hot add an extra memory board, we want to include
> > > > the newly added memory regions in the elfcorehdr so that they will be
> > > > dumped out in the kdump kernel.
> > > > 
> > > > That's why I earlier suggested 2048 for the number of memory regions.
> > > 
> > > The more the better, unless "it hurts". Assuming a single memory block is
> > > 128 MiB, that would be 256 GiB.
> > > 
> > > Usually, on big systems, the memory block size is 2 GiB. So 4 TiB.
> > 
> > Thanks a lot for these valuable inputs, David.
> > 
> > Hi Boris, Eric
> > 
> > So what's your suggested value for the Kconfig option?
> > 
> > 1) cpu number, 1024?
> > 2) memory regions, 2048?
> > 
> > Any comment about the draft below? We can decide a value based on our
> > knowledge, and adjust later if any real system exceeds that number.
> > 
> > +config CRASH_ELF_CORE_PHDRS_NUM
> > +       depends on CRASH_DUMP && KEXEC_FILE && (HOTPLUG_CPU || MEMORY_HOTPLUG)
> > +       int
> > +       default 3072
> > +       help
> > +         For the kexec_file_load path, specify the default number of
> > +         phdrs for the vmcore. E.g. the memory regions represented by the
> > +         'System RAM' entries in /proc/iomem, the cpu notes of each
> > +         present cpu stored in /sys/devices/system/cpu/cpuX/crash_notes.
> > 
> > Thanks
> > Baoquan
> > 
> 
> I prefer to keep CRASH_MAX_MEMORY_RANGES, as explained in my response to your message on October 26.
> eric

Ah, sorry, I mixed it up with NR_CPUS. I went on an office outing
yesterday; glad to see you and Boris have reached agreement on the code
change and value. Thanks.


> 


^ permalink raw reply	[flat|nested] 57+ messages in thread

end of thread, other threads:[~2022-10-29  4:28 UTC | newest]

Thread overview: 57+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-09-09 21:05 [PATCH v12 0/7] crash: Kernel handling of CPU and memory hot un/plug Eric DeVolder
2022-09-09 21:05 ` [PATCH v12 1/7] crash: move crash_prepare_elf64_headers Eric DeVolder
2022-09-09 21:05 ` [PATCH v12 2/7] crash: prototype change for crash_prepare_elf64_headers Eric DeVolder
2022-09-09 21:05 ` [PATCH v12 3/7] crash: add generic infrastructure for crash hotplug support Eric DeVolder
2022-10-03 17:51   ` Sourabh Jain
2022-10-07 19:14     ` Eric DeVolder
2022-10-17  6:45       ` Sourabh Jain
2022-10-24  9:10       ` Baoquan He
2022-10-26  7:00         ` Sourabh Jain
2022-10-04  6:38   ` Sourabh Jain
2022-10-07 19:19     ` Eric DeVolder
2022-09-09 21:05 ` [PATCH v12 4/7] kexec: exclude elfcorehdr from the segment digest Eric DeVolder
2022-09-09 21:05 ` [PATCH v12 5/7] kexec: exclude hot remove cpu from elfcorehdr notes Eric DeVolder
2022-09-09 21:05 ` [PATCH v12 6/7] crash: memory and cpu hotplug sysfs attributes Eric DeVolder
2022-09-09 21:05 ` [PATCH v12 7/7] x86/crash: Add x86 crash hotplug support Eric DeVolder
2022-09-12  6:52   ` Borislav Petkov
2022-09-13 19:12     ` Eric DeVolder
2022-09-26 19:19       ` Eric DeVolder
2022-09-28 16:07       ` Borislav Petkov
2022-09-28 16:38         ` Borislav Petkov
2022-09-30 15:36         ` Eric DeVolder
2022-09-30 16:50           ` Borislav Petkov
2022-09-30 17:11             ` Eric DeVolder
2022-09-30 17:40               ` Borislav Petkov
2022-10-08  2:35                 ` Baoquan He
2022-10-12 17:46                   ` Borislav Petkov
2022-10-12 20:19                     ` Eric DeVolder
2022-10-12 20:41                       ` Borislav Petkov
2022-10-13  2:57                         ` Baoquan He
2022-10-25 10:31                           ` Borislav Petkov
2022-10-26 14:48                             ` Baoquan He
2022-10-26 14:54                               ` David Hildenbrand
2022-10-27 13:52                                 ` Baoquan He
2022-10-27 19:28                                   ` Eric DeVolder
2022-10-29  4:27                                     ` Baoquan He
2022-10-27 19:24                               ` Eric DeVolder
2022-10-28 10:19                                 ` Borislav Petkov
2022-10-28 15:29                                   ` Eric DeVolder
2022-10-28 17:06                                     ` Borislav Petkov
2022-10-28 19:26                                       ` Eric DeVolder
2022-10-28 20:30                                         ` Borislav Petkov
2022-10-28 20:34                                           ` Eric DeVolder
2022-10-28 21:22                                           ` Eric DeVolder
2022-10-28 22:19                                             ` Borislav Petkov
2022-10-12 20:42                       ` Eric DeVolder
2022-10-12 16:20                 ` Eric DeVolder
2022-10-25 10:39                   ` Borislav Petkov
2022-10-04  7:03           ` Sourabh Jain
2022-10-07 19:56             ` Eric DeVolder
2022-10-04  9:10           ` Sourabh Jain
2022-10-07 20:00             ` Eric DeVolder
2022-10-12  4:55               ` Sourabh Jain
2022-10-12 16:23                 ` Eric DeVolder
2022-09-19  7:06   ` Sourabh Jain
2022-10-07 19:33     ` Eric DeVolder
2022-10-17  6:54       ` Sourabh Jain
2022-09-12  3:47 ` [PATCH v12 0/7] crash: Kernel handling of CPU and memory hot un/plug Baoquan He

This is a public inbox; see mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).