* [RFC PATCH v2 1/3] crash: export dev memmap header to vmcoreinfo
2023-04-27 10:18 [RFC PATCH v2 0/3] pmem memmap dump support Li Zhijian
@ 2023-04-27 10:18 ` Li Zhijian
2023-04-27 10:18 ` [RFC PATCH v2 2/3] drivers/nvdimm: export memmap of namespace " Li Zhijian
` (6 subsequent siblings)
7 siblings, 0 replies; 18+ messages in thread
From: Li Zhijian @ 2023-04-27 10:18 UTC (permalink / raw)
To: x86, nvdimm, kexec
Cc: linux-kernel, y-goto, yangx.jy, ruansy.fnst, Li Zhijian,
Baoquan He, Vivek Goyal, Dave Young
Introduce a symbol and export it to vmcoreinfo. With this variable,
dumping applications such as makedumpfile are able to restore the
linked list that contains the memmap regions located in the
device.
With this mechanism, nvdimm/pmem, which allows placing the memmap in the
device, is able to export its memmap (page array) to the kdump kernel via
vmcoreinfo.
CC: Baoquan He <bhe@redhat.com>
CC: Vivek Goyal <vgoyal@redhat.com>
CC: Dave Young <dyoung@redhat.com>
Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
---
include/linux/crash_core.h | 8 +++++
kernel/crash_core.c | 61 ++++++++++++++++++++++++++++++++++++++
2 files changed, 69 insertions(+)
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index de62a722431e..05ec2777f4fd 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -84,4 +84,12 @@ int parse_crashkernel_high(char *cmdline, unsigned long long system_ram,
int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
unsigned long long *crash_size, unsigned long long *crash_base);
+#ifdef CONFIG_CRASH_CORE
+void devm_memmap_vmcore_delete(void *match);
+void devm_memmap_vmcore_update(void *match, u64 pfn, u64 npfn, bool dev);
+#else
+#define devm_memmap_vmcore_delete(match) do {} while (0)
+#define devm_memmap_vmcore_update(match, pfn, npfn, dev) do {} while (0)
+#endif
+
#endif /* LINUX_CRASH_CORE_H */
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 755f5f08ab38..f28cbd98f28b 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -401,6 +401,61 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void)
}
EXPORT_SYMBOL(paddr_vmcoreinfo_note);
+struct devm_memmap_vmcore {
+ struct list_head entry;
+ unsigned long start;
+ unsigned long end;
+ void *match;
+};
+
+static struct devm_memmap_vmcore devm_memmap_vmcore_head = {
+ .entry = LIST_HEAD_INIT(devm_memmap_vmcore_head.entry),
+};
+static DEFINE_MUTEX(devm_memmap_vmcore_mutex);
+
+static void devm_memmap_vmcore_add(void *match, u64 pfn, u64 npfn)
+{
+ struct devm_memmap_vmcore *metadata;
+
+ metadata = kzalloc(sizeof(*metadata), GFP_KERNEL);
+ if (!metadata) {
+ pr_err("No enough memory");
+ return;
+ }
+
+ metadata->start = pfn;
+ metadata->end = pfn + npfn;
+ metadata->match = match;
+
+ mutex_lock(&devm_memmap_vmcore_mutex);
+ list_add(&metadata->entry, &devm_memmap_vmcore_head.entry);
+ mutex_unlock(&devm_memmap_vmcore_mutex);
+}
+
+void devm_memmap_vmcore_delete(void *match)
+{
+ struct devm_memmap_vmcore *metadata;
+
+ mutex_lock(&devm_memmap_vmcore_mutex);
+ list_for_each_entry(metadata, &devm_memmap_vmcore_head.entry, entry) {
+ if (metadata->match == match) {
+ list_del(&metadata->entry);
+ kfree(metadata);
+ break;
+ }
+ }
+ mutex_unlock(&devm_memmap_vmcore_mutex);
+}
+EXPORT_SYMBOL_GPL(devm_memmap_vmcore_delete);
+
+void devm_memmap_vmcore_update(void *match, u64 start_pfn, u64 npfn, bool dev)
+{
+ devm_memmap_vmcore_delete(match);
+ if (dev)
+ devm_memmap_vmcore_add(match, start_pfn, npfn);
+}
+EXPORT_SYMBOL_GPL(devm_memmap_vmcore_update);
+
static int __init crash_save_vmcoreinfo_init(void)
{
vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
@@ -436,6 +491,12 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_SYMBOL(contig_page_data);
#endif
#ifdef CONFIG_SPARSEMEM
+ VMCOREINFO_SYMBOL(devm_memmap_vmcore_head);
+ VMCOREINFO_STRUCT_SIZE(devm_memmap_vmcore);
+ VMCOREINFO_OFFSET(devm_memmap_vmcore, entry);
+ VMCOREINFO_OFFSET(devm_memmap_vmcore, start);
+ VMCOREINFO_OFFSET(devm_memmap_vmcore, end);
+
VMCOREINFO_SYMBOL_ARRAY(mem_section);
VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
VMCOREINFO_STRUCT_SIZE(mem_section);
--
2.29.2
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [RFC PATCH v2 2/3] drivers/nvdimm: export memmap of namespace to vmcoreinfo
2023-04-27 10:18 [RFC PATCH v2 0/3] pmem memmap dump support Li Zhijian
2023-04-27 10:18 ` [RFC PATCH v2 1/3] crash: export dev memmap header to vmcoreinfo Li Zhijian
@ 2023-04-27 10:18 ` Li Zhijian
2023-04-27 22:50 ` Ira Weiny
2023-04-27 10:18 ` [RFC PATCH v2 3/3] resource, crash: Make kexec_file_load support pmem Li Zhijian
` (5 subsequent siblings)
7 siblings, 1 reply; 18+ messages in thread
From: Li Zhijian @ 2023-04-27 10:18 UTC (permalink / raw)
To: x86, nvdimm, kexec
Cc: linux-kernel, y-goto, yangx.jy, ruansy.fnst, Li Zhijian,
Dan Williams, Vishal Verma, Dave Jiang, Ira Weiny
Each namespace has its own memmap; it will be updated when the
namespace is initialized/created, updated, or deleted.
CC: Dan Williams <dan.j.williams@intel.com>
CC: Vishal Verma <vishal.l.verma@intel.com>
CC: Dave Jiang <dave.jiang@intel.com>
CC: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
---
drivers/nvdimm/namespace_devs.c | 2 ++
drivers/nvdimm/pfn_devs.c | 3 +++
2 files changed, 5 insertions(+)
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index c60ec0b373c5..096203e6203f 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -8,6 +8,7 @@
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/nd.h>
+#include <linux/crash_core.h>
#include "nd-core.h"
#include "pmem.h"
#include "pfn.h"
@@ -853,6 +854,7 @@ static ssize_t size_store(struct device *dev,
if (rc == 0 && val == 0 && is_namespace_pmem(dev)) {
struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+ devm_memmap_vmcore_delete(to_ndns(dev));
kfree(nspm->uuid);
nspm->uuid = NULL;
}
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index af7d9301520c..80076996b2da 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -9,6 +9,7 @@
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/crash_core.h>
#include "nd-core.h"
#include "pfn.h"
#include "nd.h"
@@ -716,6 +717,8 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
} else
return -ENXIO;
+ devm_memmap_vmcore_update(ndns, altmap->base_pfn, PHYS_PFN(offset),
+ nd_pfn->mode == PFN_MODE_PMEM);
return 0;
}
--
2.29.2
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [RFC PATCH v2 2/3] drivers/nvdimm: export memmap of namespace to vmcoreinfo
2023-04-27 10:18 ` [RFC PATCH v2 2/3] drivers/nvdimm: export memmap of namespace " Li Zhijian
@ 2023-04-27 22:50 ` Ira Weiny
2023-04-28 7:01 ` Zhijian Li (Fujitsu)
0 siblings, 1 reply; 18+ messages in thread
From: Ira Weiny @ 2023-04-27 22:50 UTC (permalink / raw)
To: Li Zhijian, x86, nvdimm, kexec
Cc: linux-kernel, y-goto, yangx.jy, ruansy.fnst, Li Zhijian,
Dan Williams, Vishal Verma, Dave Jiang, Ira Weiny
Li Zhijian wrote:
> Each namespace has its own memmap, it will be udpated when
> namespace initializing/creating, updating, and deleting.
>
> CC: Dan Williams <dan.j.williams@intel.com>
> CC: Vishal Verma <vishal.l.verma@intel.com>
> CC: Dave Jiang <dave.jiang@intel.com>
> CC: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
> ---
> drivers/nvdimm/namespace_devs.c | 2 ++
> drivers/nvdimm/pfn_devs.c | 3 +++
> 2 files changed, 5 insertions(+)
>
> diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
> index c60ec0b373c5..096203e6203f 100644
> --- a/drivers/nvdimm/namespace_devs.c
> +++ b/drivers/nvdimm/namespace_devs.c
> @@ -8,6 +8,7 @@
> #include <linux/slab.h>
> #include <linux/list.h>
> #include <linux/nd.h>
> +#include <linux/crash_core.h>
> #include "nd-core.h"
> #include "pmem.h"
> #include "pfn.h"
> @@ -853,6 +854,7 @@ static ssize_t size_store(struct device *dev,
> if (rc == 0 && val == 0 && is_namespace_pmem(dev)) {
> struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
>
> + devm_memmap_vmcore_delete(to_ndns(dev));
This seems like an odd place to put this. Could you explain the reasoning
more?
Ira
> kfree(nspm->uuid);
> nspm->uuid = NULL;
> }
> diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
> index af7d9301520c..80076996b2da 100644
> --- a/drivers/nvdimm/pfn_devs.c
> +++ b/drivers/nvdimm/pfn_devs.c
> @@ -9,6 +9,7 @@
> #include <linux/slab.h>
> #include <linux/fs.h>
> #include <linux/mm.h>
> +#include <linux/crash_core.h>
> #include "nd-core.h"
> #include "pfn.h"
> #include "nd.h"
> @@ -716,6 +717,8 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
> } else
> return -ENXIO;
>
> + devm_memmap_vmcore_update(ndns, altmap->base_pfn, PHYS_PFN(offset),
> + nd_pfn->mode == PFN_MODE_PMEM);
> return 0;
> }
>
> --
> 2.29.2
>
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC PATCH v2 2/3] drivers/nvdimm: export memmap of namespace to vmcoreinfo
2023-04-27 22:50 ` Ira Weiny
@ 2023-04-28 7:01 ` Zhijian Li (Fujitsu)
0 siblings, 0 replies; 18+ messages in thread
From: Zhijian Li (Fujitsu) @ 2023-04-28 7:01 UTC (permalink / raw)
To: Ira Weiny, x86, nvdimm, kexec
Cc: linux-kernel, Yasunori Gotou (Fujitsu), Xiao Yang (Fujitsu),
Shiyang Ruan (Fujitsu),
Dan Williams, Vishal Verma, Dave Jiang
On 28/04/2023 06:50, Ira Weiny wrote:
> Li Zhijian wrote:
>> Each namespace has its own memmap, it will be udpated when
>> namespace initializing/creating, updating, and deleting.
>>
>> CC: Dan Williams <dan.j.williams@intel.com>
>> CC: Vishal Verma <vishal.l.verma@intel.com>
>> CC: Dave Jiang <dave.jiang@intel.com>
>> CC: Ira Weiny <ira.weiny@intel.com>
>> Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
>> ---
>> drivers/nvdimm/namespace_devs.c | 2 ++
>> drivers/nvdimm/pfn_devs.c | 3 +++
>> 2 files changed, 5 insertions(+)
>>
>> diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
>> index c60ec0b373c5..096203e6203f 100644
>> --- a/drivers/nvdimm/namespace_devs.c
>> +++ b/drivers/nvdimm/namespace_devs.c
>> @@ -8,6 +8,7 @@
>> #include <linux/slab.h>
>> #include <linux/list.h>
>> #include <linux/nd.h>
>> +#include <linux/crash_core.h>
>> #include "nd-core.h"
>> #include "pmem.h"
>> #include "pfn.h"
>> @@ -853,6 +854,7 @@ static ssize_t size_store(struct device *dev,
>> if (rc == 0 && val == 0 && is_namespace_pmem(dev)) {
>> struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
>>
>> + devm_memmap_vmcore_delete(to_ndns(dev));
>
> This seems like an odd place to put this. Could you explain the reasoning
> more?
>
Ira,
Users who want to manage the namespace of pmem usually use the 'ndctl' command. The following cases
would touch the memmap of the namespace.
a. create namespace 'ndctl create-namespace -f -e namespace0.0 --mode=fsdax -s $(((1024+16)<<20)) -M dev'
b. change namespace size 'ndctl create-namespace -f -e namespace0.0 --mode=fsdax -s $(((1024)<<20)) -M dev'
c. change memmap location 'ndctl create-namespace -f -e namespace0.0 --mode=fsdax -s $(((1024+16)<<20)) -M mem'
d. destroy namespace 'ndctl destroy-namespace -f namespace0.0'
Unlike the former 3 cases, case d will not invoke '__nvdimm_setup_pfn()'. Instead, ndctl
just does something like 'echo 0 >/sys/bus/nd/devices/namespace0.0/size'.
We have to delete this namespace from devm_memmap_vmcore in this case. So this is an odd place,
but it works. I tried to find a place pairing with __nvdimm_setup_pfn(), but I failed
in the end. If you have any good ideas, please let me know :)
Thanks
Zhijian
> Ira
>
>> kfree(nspm->uuid);
>> nspm->uuid = NULL;
>> }
>> diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
>> index af7d9301520c..80076996b2da 100644
>> --- a/drivers/nvdimm/pfn_devs.c
>> +++ b/drivers/nvdimm/pfn_devs.c
>> @@ -9,6 +9,7 @@
>> #include <linux/slab.h>
>> #include <linux/fs.h>
>> #include <linux/mm.h>
>> +#include <linux/crash_core.h>
>> #include "nd-core.h"
>> #include "pfn.h"
>> #include "nd.h"
>> @@ -716,6 +717,8 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
>> } else
>> return -ENXIO;
>>
>> + devm_memmap_vmcore_update(ndns, altmap->base_pfn, PHYS_PFN(offset),
>> + nd_pfn->mode == PFN_MODE_PMEM);
>> return 0;
>> }
>>
>> --
>> 2.29.2
>>
>
>
^ permalink raw reply [flat|nested] 18+ messages in thread
* [RFC PATCH v2 3/3] resource, crash: Make kexec_file_load support pmem
2023-04-27 10:18 [RFC PATCH v2 0/3] pmem memmap dump support Li Zhijian
2023-04-27 10:18 ` [RFC PATCH v2 1/3] crash: export dev memmap header to vmcoreinfo Li Zhijian
2023-04-27 10:18 ` [RFC PATCH v2 2/3] drivers/nvdimm: export memmap of namespace " Li Zhijian
@ 2023-04-27 10:18 ` Li Zhijian
2023-04-27 11:39 ` Greg Kroah-Hartman
2023-04-27 20:41 ` Jane Chu
2023-04-27 10:18 ` [RFC PATCH v2 kexec-tools] kexec: Add and mark pmem region into PT_LOADs Li Zhijian
` (4 subsequent siblings)
7 siblings, 2 replies; 18+ messages in thread
From: Li Zhijian @ 2023-04-27 10:18 UTC (permalink / raw)
To: x86, nvdimm, kexec
Cc: linux-kernel, y-goto, yangx.jy, ruansy.fnst, Li Zhijian,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Eric Biederman, Takashi Iwai, Baoquan He,
Vlastimil Babka, Sean Christopherson, Jonathan Cameron,
Greg Kroah-Hartman, Andy Shevchenko, Dan Williams,
Rafael J. Wysocki, Ira Weiny, Raul E Rangel, Colin Foster,
Vishal Verma
It does:
1. Add pmem region into PT_LOADs of vmcore
2. Mark pmem region's p_flags as PF_DEV
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Ingo Molnar <mingo@redhat.com>
CC: Borislav Petkov <bp@alien8.de>
CC: Dave Hansen <dave.hansen@linux.intel.com>
CC: "H. Peter Anvin" <hpa@zytor.com>
CC: Eric Biederman <ebiederm@xmission.com>
CC: Takashi Iwai <tiwai@suse.de>
CC: Baoquan He <bhe@redhat.com>
CC: Vlastimil Babka <vbabka@suse.cz>
CC: Sean Christopherson <seanjc@google.com>
CC: Jonathan Cameron <Jonathan.Cameron@huawei.com>
CC: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
CC: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
CC: Dan Williams <dan.j.williams@intel.com>
CC: "Rafael J. Wysocki" <rafael@kernel.org>
CC: Ira Weiny <ira.weiny@intel.com>
CC: Raul E Rangel <rrangel@chromium.org>
CC: Colin Foster <colin.foster@in-advantage.com>
CC: Vishal Verma <vishal.l.verma@intel.com>
CC: x86@kernel.org
Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
---
arch/x86/kernel/crash.c | 2 ++
include/linux/ioport.h | 3 +++
kernel/kexec_file.c | 10 ++++++++++
kernel/resource.c | 11 +++++++++++
4 files changed, 26 insertions(+)
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index cdd92ab43cda..dc9d03083565 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -178,6 +178,7 @@ static struct crash_mem *fill_up_crash_elf_data(void)
if (!nr_ranges)
return NULL;
+ walk_pmem_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback);
/*
* Exclusion of crash region and/or crashk_low_res may cause
* another range split. So add extra two slots here.
@@ -243,6 +244,7 @@ static int prepare_elf_headers(struct kimage *image, void **addr,
ret = walk_system_ram_res(0, -1, cmem, prepare_elf64_ram_headers_callback);
if (ret)
goto out;
+ walk_pmem_res(0, -1, cmem, prepare_elf64_ram_headers_callback);
/* Exclude unwanted mem ranges */
ret = elf_header_exclude_ranges(cmem);
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 25d768d48970..bde88a47cc1a 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -331,6 +331,9 @@ extern int
walk_system_ram_res(u64 start, u64 end, void *arg,
int (*func)(struct resource *, void *));
extern int
+walk_pmem_res(u64 start, u64 end, void *arg,
+ int (*func)(struct resource *, void *));
+extern int
walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end,
void *arg, int (*func)(struct resource *, void *));
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index f1a0e4e3fb5c..e79ceaee2926 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -29,6 +29,8 @@
#include <linux/vmalloc.h>
#include "kexec_internal.h"
+#define PF_DEV (1 << 4)
+
#ifdef CONFIG_KEXEC_SIG
static bool sig_enforce = IS_ENABLED(CONFIG_KEXEC_SIG_FORCE);
@@ -1221,6 +1223,12 @@ int crash_exclude_mem_range(struct crash_mem *mem,
return 0;
}
+static bool is_pmem_range(u64 start, u64 size)
+{
+ return REGION_INTERSECTS == region_intersects(start, size,
+ IORESOURCE_MEM, IORES_DESC_PERSISTENT_MEMORY);
+}
+
int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
void **addr, unsigned long *sz)
{
@@ -1302,6 +1310,8 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
phdr->p_type = PT_LOAD;
phdr->p_flags = PF_R|PF_W|PF_X;
+ if (is_pmem_range(mstart, mend - mstart))
+ phdr->p_flags |= PF_DEV;
phdr->p_offset = mstart;
phdr->p_paddr = mstart;
diff --git a/kernel/resource.c b/kernel/resource.c
index b1763b2fd7ef..f3f1ce6fc384 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -431,6 +431,17 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
func);
}
+/*
+ * This function calls the @func callback against all memory ranges, which
+ * are ranges marked as IORESOURCE_MEM and IORES_DESC_PERSISTENT_MEMORY.
+ */
+int walk_pmem_res(u64 start, u64 end, void *arg,
+ int (*func)(struct resource *, void *))
+{
+ return __walk_iomem_res_desc(start, end, IORESOURCE_MEM,
+ IORES_DESC_PERSISTENT_MEMORY, arg, func);
+}
+
/*
* This function calls the @func callback against all memory ranges, which
* are ranges marked as IORESOURCE_MEM and IORESOUCE_BUSY.
--
2.29.2
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [RFC PATCH v2 3/3] resource, crash: Make kexec_file_load support pmem
2023-04-27 10:18 ` [RFC PATCH v2 3/3] resource, crash: Make kexec_file_load support pmem Li Zhijian
@ 2023-04-27 11:39 ` Greg Kroah-Hartman
2023-04-28 7:36 ` Zhijian Li (Fujitsu)
2023-04-27 20:41 ` Jane Chu
1 sibling, 1 reply; 18+ messages in thread
From: Greg Kroah-Hartman @ 2023-04-27 11:39 UTC (permalink / raw)
To: Li Zhijian
Cc: x86, nvdimm, kexec, linux-kernel, y-goto, yangx.jy, ruansy.fnst,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Eric Biederman, Takashi Iwai, Baoquan He,
Vlastimil Babka, Sean Christopherson, Jonathan Cameron,
Andy Shevchenko, Dan Williams, Rafael J. Wysocki, Ira Weiny,
Raul E Rangel, Colin Foster, Vishal Verma
On Thu, Apr 27, 2023 at 06:18:34PM +0800, Li Zhijian wrote:
> It does:
> 1. Add pmem region into PT_LOADs of vmcore
> 2. Mark pmem region's p_flags as PF_DEV
I'm sorry, but I can not parse this changelog.
Please take a look at the kernel documentation for how to write a good
changelog message so that we can properly review the change you wish to
have accepted.
thanks,
greg k-h
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC PATCH v2 3/3] resource, crash: Make kexec_file_load support pmem
2023-04-27 11:39 ` Greg Kroah-Hartman
@ 2023-04-28 7:36 ` Zhijian Li (Fujitsu)
0 siblings, 0 replies; 18+ messages in thread
From: Zhijian Li (Fujitsu) @ 2023-04-28 7:36 UTC (permalink / raw)
To: Greg Kroah-Hartman
Cc: x86, nvdimm, kexec, linux-kernel, Yasunori Gotou (Fujitsu),
Xiao Yang (Fujitsu), Shiyang Ruan (Fujitsu),
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Eric Biederman, Takashi Iwai, Baoquan He,
Vlastimil Babka, Sean Christopherson, Jonathan Cameron,
Andy Shevchenko, Dan Williams, Rafael J. Wysocki, Ira Weiny,
Raul E Rangel, Colin Foster, Vishal Verma
Greg,
Sorry for this *BAD* changelog; this patch is mostly a *HACK* of resource.c currently.
Please allow me to rewrite it here.
Only the regions described by the PT_LOADs of /proc/vmcore are dumpable/readable by dumping applications.
Previously, on x86/x86_64 only system RAM resources were injected into the PT_LOADs.
So in order to make the entire pmem resource dumpable/readable, we need to add the pmem regions
into the PT_LOADs of /proc/vmcore.
Here we introduce a new API, walk_pmem_res(), to walk the pmem regions first. Further, we will also
mark the pmem regions with an extra p_flags bit, PF_DEV, when they are added into the PT_LOADs.
Then the dumping applications are able to know whether a region is pmem or not according to this flag
and take special actions accordingly.
Thanks
Zhijian
On 27/04/2023 19:39, Greg Kroah-Hartman wrote:
> On Thu, Apr 27, 2023 at 06:18:34PM +0800, Li Zhijian wrote:
>> It does:
>> 1. Add pmem region into PT_LOADs of vmcore
>> 2. Mark pmem region's p_flags as PF_DEV
>
> I'm sorry, but I can not parse this changelog.
>
> Please take a look at the kernel documentation for how to write a good
> changelog message so that we can properly review the change you wish to
> have accepted.
>
> thanks,
>
> greg k-h
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC PATCH v2 3/3] resource, crash: Make kexec_file_load support pmem
2023-04-27 10:18 ` [RFC PATCH v2 3/3] resource, crash: Make kexec_file_load support pmem Li Zhijian
2023-04-27 11:39 ` Greg Kroah-Hartman
@ 2023-04-27 20:41 ` Jane Chu
2023-04-28 7:10 ` Zhijian Li (Fujitsu)
1 sibling, 1 reply; 18+ messages in thread
From: Jane Chu @ 2023-04-27 20:41 UTC (permalink / raw)
To: Li Zhijian, x86, nvdimm, kexec
Cc: linux-kernel, y-goto, yangx.jy, ruansy.fnst, Thomas Gleixner,
Ingo Molnar, Borislav Petkov, Dave Hansen, H. Peter Anvin,
Eric Biederman, Takashi Iwai, Baoquan He, Vlastimil Babka,
Sean Christopherson, Jonathan Cameron, Greg Kroah-Hartman,
Andy Shevchenko, Dan Williams, Rafael J. Wysocki, Ira Weiny,
Raul E Rangel, Colin Foster, Vishal Verma
> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
> index cdd92ab43cda..dc9d03083565 100644
> --- a/arch/x86/kernel/crash.c
> +++ b/arch/x86/kernel/crash.c
> @@ -178,6 +178,7 @@ static struct crash_mem *fill_up_crash_elf_data(void)
> if (!nr_ranges)
> return NULL;
>
> + walk_pmem_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback);
So this will overwrite 'nr_ranges' produced by the previous
walk_system_ram_res() call, sure it's correct?
Regards,
-jane
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC PATCH v2 3/3] resource, crash: Make kexec_file_load support pmem
2023-04-27 20:41 ` Jane Chu
@ 2023-04-28 7:10 ` Zhijian Li (Fujitsu)
0 siblings, 0 replies; 18+ messages in thread
From: Zhijian Li (Fujitsu) @ 2023-04-28 7:10 UTC (permalink / raw)
To: Jane Chu, x86, nvdimm, kexec
Cc: linux-kernel, Yasunori Gotou (Fujitsu), Xiao Yang (Fujitsu),
Shiyang Ruan (Fujitsu),
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
H. Peter Anvin, Eric Biederman, Takashi Iwai, Baoquan He,
Vlastimil Babka, Sean Christopherson, Jonathan Cameron,
Greg Kroah-Hartman, Andy Shevchenko, Dan Williams,
Rafael J. Wysocki, Ira Weiny, Raul E Rangel, Colin Foster,
Vishal Verma
Jane,
On 28/04/2023 04:41, Jane Chu wrote:
>> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
>> index cdd92ab43cda..dc9d03083565 100644
>> --- a/arch/x86/kernel/crash.c
>> +++ b/arch/x86/kernel/crash.c
>> @@ -178,6 +178,7 @@ static struct crash_mem *fill_up_crash_elf_data(void)
>> if (!nr_ranges)
>> return NULL;
>> + walk_pmem_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback);
>
> So this will overwrite 'nr_ranges' produced by the previous walk_system_ram_res() call, sure it's correct?
It depends on how the callback of walk_system_ram_res() handles 'nr_ranges', so it's safe for this change IMHO.
163 static int get_nr_ram_ranges_callback(struct resource *res, void *arg)
164 {
165 unsigned int *nr_ranges = arg;
166
167 (*nr_ranges)++;
168 return 0;
169 }
170
171 /* Gather all the required information to prepare elf headers for ram regions */
172 static struct crash_mem *fill_up_crash_elf_data(void)
173 {
174 unsigned int nr_ranges = 0;
175 struct crash_mem *cmem;
176
177 walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback);
178 if (!nr_ranges)
179 return NULL;
180
181 walk_pmem_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback);
At last, nr_ranges = #ram_res + #pmem_res.
Thanks
Zhijian
>
> Regards,
> -jane
^ permalink raw reply [flat|nested] 18+ messages in thread
* [RFC PATCH v2 kexec-tools] kexec: Add and mark pmem region into PT_LOADs
2023-04-27 10:18 [RFC PATCH v2 0/3] pmem memmap dump support Li Zhijian
` (2 preceding siblings ...)
2023-04-27 10:18 ` [RFC PATCH v2 3/3] resource, crash: Make kexec_file_load support pmem Li Zhijian
@ 2023-04-27 10:18 ` Li Zhijian
2023-04-27 10:18 ` [RFC PATCH v2 makedumpfile 1/3] elf_info.c: Introduce is_pmem_pt_load_range Li Zhijian
` (3 subsequent siblings)
7 siblings, 0 replies; 18+ messages in thread
From: Li Zhijian @ 2023-04-27 10:18 UTC (permalink / raw)
To: x86, nvdimm, kexec
Cc: linux-kernel, y-goto, yangx.jy, ruansy.fnst, Li Zhijian,
Baoquan He, Vivek Goyal, Dave Young
It does:
1. Add pmem region into PT_LOADs of vmcore
2. Mark pmem region's p_flags as PF_DEV
CC: Baoquan He <bhe@redhat.com>
CC: Vivek Goyal <vgoyal@redhat.com>
CC: Dave Young <dyoung@redhat.com>
Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
---
kexec/crashdump-elf.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/kexec/crashdump-elf.c b/kexec/crashdump-elf.c
index b8bb686a17ca..ab257e825187 100644
--- a/kexec/crashdump-elf.c
+++ b/kexec/crashdump-elf.c
@@ -25,6 +25,8 @@ do { \
} while(0)
#endif
+#define PF_DEV (1 << 4)
+
/* Prepares the crash memory headers and stores in supplied buffer. */
int FUNC(struct kexec_info *info,
struct crash_elf_info *elf_info,
@@ -199,7 +201,7 @@ int FUNC(struct kexec_info *info,
* A seprate program header for Backup Region*/
for (i = 0; i < ranges; i++, range++) {
unsigned long long mstart, mend;
- if (range->type != RANGE_RAM)
+ if (range->type != RANGE_RAM && range->type != RANGE_PMEM)
continue;
mstart = range->start;
mend = range->end;
@@ -209,6 +211,8 @@ int FUNC(struct kexec_info *info,
bufp += sizeof(PHDR);
phdr->p_type = PT_LOAD;
phdr->p_flags = PF_R|PF_W|PF_X;
+ if (range->type == RANGE_PMEM)
+ phdr->p_flags |= PF_DEV;
phdr->p_offset = mstart;
if (mstart == info->backup_src_start
--
2.29.2
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [RFC PATCH v2 makedumpfile 1/3] elf_info.c: Introduce is_pmem_pt_load_range
2023-04-27 10:18 [RFC PATCH v2 0/3] pmem memmap dump support Li Zhijian
` (3 preceding siblings ...)
2023-04-27 10:18 ` [RFC PATCH v2 kexec-tools] kexec: Add and mark pmem region into PT_LOADs Li Zhijian
@ 2023-04-27 10:18 ` Li Zhijian
2023-04-27 10:18 ` [RFC PATCH v2 makedumpfile 2/3] makedumpfile.c: Exclude all pmem pages Li Zhijian
` (2 subsequent siblings)
7 siblings, 0 replies; 18+ messages in thread
From: Li Zhijian @ 2023-04-27 10:18 UTC (permalink / raw)
To: x86, nvdimm, kexec
Cc: linux-kernel, y-goto, yangx.jy, ruansy.fnst, Li Zhijian,
Baoquan He, Vivek Goyal, Dave Young
It checks BIT(4) of the Elf64_Phdr p_flags; currently only the lower 3 bits are used
by ELF. In kexec-tools, we extend BIT(4) to indicate whether the region is pmem or not.
dump_Elf_load: phys_start phys_end virt_start virt_end is_pmem
dump_Elf_load: LOAD[ 0] 6b800000 6e42c000 ffffffffbcc00000 ffffffffbf82c000 false
dump_Elf_load: LOAD[ 1] 1000 9fc00 ffff975980001000 ffff97598009fc00 false
dump_Elf_load: LOAD[ 2] 100000 7f000000 ffff975980100000 ffff9759ff000000 false
dump_Elf_load: LOAD[ 3] bf000000 bffd7000 ffff975a3f000000 ffff975a3ffd7000 false
dump_Elf_load: LOAD[ 4] 100000000 140000000 ffff975a80000000 ffff975ac0000000 false
dump_Elf_load: LOAD[ 5] 140000000 23e200000 ffff975ac0000000 ffff975bbe200000 true
CC: Baoquan He <bhe@redhat.com>
CC: Vivek Goyal <vgoyal@redhat.com>
CC: Dave Young <dyoung@redhat.com>
Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
---
elf_info.c | 31 +++++++++++++++++++++++++++----
elf_info.h | 1 +
2 files changed, 28 insertions(+), 4 deletions(-)
diff --git a/elf_info.c b/elf_info.c
index bc24083655d6..41b36b2804d2 100644
--- a/elf_info.c
+++ b/elf_info.c
@@ -43,6 +43,7 @@ struct pt_load_segment {
unsigned long long phys_end;
unsigned long long virt_start;
unsigned long long virt_end;
+ int is_pmem;
};
static int nr_cpus; /* number of cpu */
@@ -153,6 +154,8 @@ check_elf_format(int fd, char *filename, int *phnum, unsigned int *num_load)
return FALSE;
}
+#define PF_DEV (1 << 4)
+
static int
dump_Elf_load(Elf64_Phdr *prog, int num_load)
{
@@ -170,17 +173,37 @@ dump_Elf_load(Elf64_Phdr *prog, int num_load)
pls->virt_end = pls->virt_start + prog->p_memsz;
pls->file_offset = prog->p_offset;
pls->file_size = prog->p_filesz;
+ pls->is_pmem = !!(prog->p_flags & PF_DEV);
if (num_load == 0)
- DEBUG_MSG("%8s %16s %16s %16s %16s\n", "",
- "phys_start", "phys_end", "virt_start", "virt_end");
+ DEBUG_MSG("%8s %16s %16s %16s %16s %8s\n", "",
+ "phys_start", "phys_end", "virt_start", "virt_end",
+ "is_pmem");
- DEBUG_MSG("LOAD[%2d] %16llx %16llx %16llx %16llx\n", num_load,
- pls->phys_start, pls->phys_end, pls->virt_start, pls->virt_end);
+ DEBUG_MSG("LOAD[%2d] %16llx %16llx %16llx %16llx %8s\n", num_load,
+ pls->phys_start, pls->phys_end, pls->virt_start, pls->virt_end,
+ pls->is_pmem ? "true": "false");
return TRUE;
}
+int is_pmem_pt_load_range(unsigned long long start, unsigned long long end)
+{
+ int i;
+ struct pt_load_segment *pls;
+
+ for (i = 0; i < num_pt_loads; i++) {
+ pls = &pt_loads[i];
+ if (pls->is_pmem && pls->phys_start == NOT_PADDR)
+ return TRUE;
+ if (pls->is_pmem && pls->phys_start != NOT_PADDR &&
+ pls->phys_start <= start && pls->phys_end >= end)
+ return TRUE;
+ }
+
+ return FALSE;
+}
+
static off_t
offset_next_note(void *note)
{
diff --git a/elf_info.h b/elf_info.h
index d5416b32cdd7..a08d59a331f6 100644
--- a/elf_info.h
+++ b/elf_info.h
@@ -64,6 +64,7 @@ int get_pt_load_extents(int idx,
off_t *file_offset,
off_t *file_size);
unsigned int get_num_pt_loads(void);
+int is_pmem_pt_load_range(unsigned long long start, unsigned long long end);
void set_nr_cpus(int num);
int get_nr_cpus(void);
--
2.29.2
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [RFC PATCH v2 makedumpfile 2/3] makedumpfile.c: Exclude all pmem pages
2023-04-27 10:18 [RFC PATCH v2 0/3] pmem memmap dump support Li Zhijian
` (4 preceding siblings ...)
2023-04-27 10:18 ` [RFC PATCH v2 makedumpfile 1/3] elf_info.c: Introduce is_pmem_pt_load_range Li Zhijian
@ 2023-04-27 10:18 ` Li Zhijian
2023-04-27 10:18 ` [RFC PATCH v2 makedumpfile 3/3] makedumpfile.c: Allow excluding metadata of pmem region Li Zhijian
2023-04-28 18:59 ` [RFC PATCH v2 0/3] pmem memmap dump support Dan Williams
7 siblings, 0 replies; 18+ messages in thread
From: Li Zhijian @ 2023-04-27 10:18 UTC (permalink / raw)
To: x86, nvdimm, kexec
Cc: linux-kernel, y-goto, yangx.jy, ruansy.fnst, Li Zhijian,
Baoquan He, Vivek Goyal, Dave Young
Generally, the pmem is too large to be suitable for dumping. Further, only
the namespace of the pmem is dumpable, but currently we have no
idea of the exact layout of the namespace in pmem.
CC: Baoquan He <bhe@redhat.com>
CC: Vivek Goyal <vgoyal@redhat.com>
CC: Dave Young <dyoung@redhat.com>
Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
---
makedumpfile.c | 18 ++++++++++++++----
1 file changed, 14 insertions(+), 4 deletions(-)
diff --git a/makedumpfile.c b/makedumpfile.c
index f40368364cf3..98c3b8c7ced9 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -100,6 +100,7 @@ mdf_pfn_t pfn_user;
mdf_pfn_t pfn_free;
mdf_pfn_t pfn_hwpoison;
mdf_pfn_t pfn_offline;
+mdf_pfn_t pfn_pmem_userdata;
mdf_pfn_t pfn_elf_excluded;
mdf_pfn_t num_dumped;
@@ -6326,6 +6327,7 @@ __exclude_unnecessary_pages(unsigned long mem_map,
unsigned int order_offset, dtor_offset;
unsigned long flags, mapping, private = 0;
unsigned long compound_dtor, compound_head = 0;
+ unsigned int is_pmem;
/*
* If a multi-page exclusion is pending, do it first
@@ -6377,6 +6379,13 @@ __exclude_unnecessary_pages(unsigned long mem_map,
continue;
}
+ is_pmem = is_pmem_pt_load_range(pfn << PAGESHIFT(), (pfn + 1) << PAGESHIFT());
+ if (is_pmem) {
+ pfn_pmem_userdata++;
+ clear_bit_on_2nd_bitmap_for_kernel(pfn, cycle);
+ continue;
+ }
+
index_pg = pfn % PGMM_CACHED;
pcache = page_cache + (index_pg * SIZE(page));
@@ -8084,7 +8093,7 @@ write_elf_pages_cyclic(struct cache_data *cd_header, struct cache_data *cd_page)
*/
if (info->flag_cyclic) {
pfn_zero = pfn_cache = pfn_cache_private = 0;
- pfn_user = pfn_free = pfn_hwpoison = pfn_offline = 0;
+ pfn_user = pfn_free = pfn_hwpoison = pfn_offline = pfn_pmem_userdata = 0;
pfn_memhole = info->max_mapnr;
}
@@ -9422,7 +9431,7 @@ write_kdump_pages_and_bitmap_cyclic(struct cache_data *cd_header, struct cache_d
* Reset counter for debug message.
*/
pfn_zero = pfn_cache = pfn_cache_private = 0;
- pfn_user = pfn_free = pfn_hwpoison = pfn_offline = 0;
+ pfn_user = pfn_free = pfn_hwpoison = pfn_offline = pfn_pmem_userdata = 0;
pfn_memhole = info->max_mapnr;
/*
@@ -10370,7 +10379,7 @@ print_report(void)
*/
pfn_original = info->max_mapnr - pfn_memhole;
- pfn_excluded = pfn_zero + pfn_cache + pfn_cache_private
+ pfn_excluded = pfn_zero + pfn_cache + pfn_cache_private + pfn_pmem_userdata
+ pfn_user + pfn_free + pfn_hwpoison + pfn_offline;
REPORT_MSG("\n");
@@ -10387,6 +10396,7 @@ print_report(void)
REPORT_MSG(" Free pages : 0x%016llx\n", pfn_free);
REPORT_MSG(" Hwpoison pages : 0x%016llx\n", pfn_hwpoison);
REPORT_MSG(" Offline pages : 0x%016llx\n", pfn_offline);
+ REPORT_MSG(" pmem userdata pages : 0x%016llx\n", pfn_pmem_userdata);
REPORT_MSG(" Remaining pages : 0x%016llx\n",
pfn_original - pfn_excluded);
@@ -10426,7 +10436,7 @@ print_mem_usage(void)
*/
pfn_original = info->max_mapnr - pfn_memhole;
- pfn_excluded = pfn_zero + pfn_cache + pfn_cache_private
+ pfn_excluded = pfn_zero + pfn_cache + pfn_cache_private + pfn_pmem_userdata
+ pfn_user + pfn_free + pfn_hwpoison + pfn_offline;
shrinking = (pfn_original - pfn_excluded) * 100;
shrinking = shrinking / pfn_original;
--
2.29.2
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [RFC PATCH v2 makedumpfile 3/3] makedumpfile.c: Allow excluding metadata of pmem region
2023-04-27 10:18 [RFC PATCH v2 0/3] pmem memmap dump support Li Zhijian
` (5 preceding siblings ...)
2023-04-27 10:18 ` [RFC PATCH v2 makedumpfile 2/3] makedumpfile.c: Exclude all pmem pages Li Zhijian
@ 2023-04-27 10:18 ` Li Zhijian
2023-04-28 18:59 ` [RFC PATCH v2 0/3] pmem memmap dump support Dan Williams
7 siblings, 0 replies; 18+ messages in thread
From: Li Zhijian @ 2023-04-27 10:18 UTC (permalink / raw)
To: x86, nvdimm, kexec
Cc: linux-kernel, y-goto, yangx.jy, ruansy.fnst, Li Zhijian,
Baoquan He, Vivek Goyal, Dave Young
Extend the -d option (BIT(5)) to allow users to exclude metadata
CC: Baoquan He <bhe@redhat.com>
CC: Vivek Goyal <vgoyal@redhat.com>
CC: Dave Young <dyoung@redhat.com>
Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
---
makedumpfile.c | 143 ++++++++++++++++++++++++++++++++++++++++++++++---
makedumpfile.h | 10 +++-
2 files changed, 145 insertions(+), 8 deletions(-)
diff --git a/makedumpfile.c b/makedumpfile.c
index 98c3b8c7ced9..e4d036505311 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -100,6 +100,7 @@ mdf_pfn_t pfn_user;
mdf_pfn_t pfn_free;
mdf_pfn_t pfn_hwpoison;
mdf_pfn_t pfn_offline;
+mdf_pfn_t pfn_pmem_metadata;
mdf_pfn_t pfn_pmem_userdata;
mdf_pfn_t pfn_elf_excluded;
@@ -1623,6 +1624,7 @@ get_symbol_info(void)
SYMBOL_INIT(mem_map, "mem_map");
SYMBOL_INIT(vmem_map, "vmem_map");
SYMBOL_INIT(mem_section, "mem_section");
+ SYMBOL_INIT(devm_memmap_vmcore_head, "devm_memmap_vmcore_head");
SYMBOL_INIT(pkmap_count, "pkmap_count");
SYMBOL_INIT_NEXT(pkmap_count_next, "pkmap_count");
SYMBOL_INIT(system_utsname, "system_utsname");
@@ -1727,6 +1729,11 @@ get_structure_info(void)
info->flag_use_count = FALSE;
}
+ SIZE_INIT(devm_memmap_vmcore, "devm_memmap_vmcore");
+ OFFSET_INIT(devm_memmap_vmcore.entry, "devm_memmap_vmcore", "entry");
+ OFFSET_INIT(devm_memmap_vmcore.start, "devm_memmap_vmcore", "start");
+ OFFSET_INIT(devm_memmap_vmcore.end, "devm_memmap_vmcore", "end");
+
OFFSET_INIT(page.mapping, "page", "mapping");
OFFSET_INIT(page._mapcount, "page", "_mapcount");
OFFSET_INIT(page.private, "page", "private");
@@ -2757,6 +2764,7 @@ read_vmcoreinfo(void)
READ_SYMBOL("mem_map", mem_map);
READ_SYMBOL("vmem_map", vmem_map);
READ_SYMBOL("mem_section", mem_section);
+ READ_SYMBOL("devm_memmap_vmcore_head", devm_memmap_vmcore_head);
READ_SYMBOL("pkmap_count", pkmap_count);
READ_SYMBOL("pkmap_count_next", pkmap_count_next);
READ_SYMBOL("system_utsname", system_utsname);
@@ -2805,6 +2813,7 @@ read_vmcoreinfo(void)
READ_STRUCTURE_SIZE("pageflags", pageflags);
READ_STRUCTURE_SIZE("vmemmap_backing", vmemmap_backing);
READ_STRUCTURE_SIZE("mmu_psize_def", mmu_psize_def);
+ READ_STRUCTURE_SIZE("devm_memmap_vmcore", devm_memmap_vmcore);
READ_MEMBER_OFFSET("page.flags", page.flags);
@@ -2852,6 +2861,9 @@ read_vmcoreinfo(void)
READ_MEMBER_OFFSET("mmu_psize_def.shift", mmu_psize_def.shift);
READ_MEMBER_OFFSET("cpu_spec.mmu_features", cpu_spec.mmu_features);
READ_MEMBER_OFFSET("uts_namespace.name", uts_namespace.name);
+ READ_MEMBER_OFFSET("devm_memmap_vmcore.entry", devm_memmap_vmcore.entry);
+ READ_MEMBER_OFFSET("devm_memmap_vmcore.start", devm_memmap_vmcore.start);
+ READ_MEMBER_OFFSET("devm_memmap_vmcore.end", devm_memmap_vmcore.end);
READ_STRUCTURE_SIZE("printk_log", printk_log);
READ_STRUCTURE_SIZE("printk_ringbuffer", printk_ringbuffer);
@@ -3210,6 +3222,88 @@ pgdat4:
return SYMBOL(contig_page_data);
}
+struct devm_memmap_entry {
+ unsigned long start, end;
+ struct devm_memmap_entry *next;
+};
+
+static struct devm_memmap_entry *devm_memmap;
+
+static void pmem_add_next(unsigned long start, unsigned long end)
+{
+ struct devm_memmap_entry *tail = devm_memmap, *node;
+
+ node = calloc(1, sizeof(*node));
+ if (!node)
+ return;
+
+ node->start = start;
+ node->end = end;
+ node->next = NULL;
+
+ if (!devm_memmap) {
+ devm_memmap = node;
+ return;
+ }
+
+ while (tail->next)
+ tail = tail->next;
+
+ tail->next = node;
+}
+
+static void dump_pmem_metadata(int i)
+{
+ long head_next;
+ long entry, head = SYMBOL(devm_memmap_vmcore_head);
+ long devm_entry, start_p, end_p;
+ unsigned long start, end;
+ static int cnt = 0;
+
+ entry = head;
+
+ if (head == NOT_FOUND_SYMBOL ||
+ OFFSET(list_head.next) == NOT_FOUND_STRUCTURE ||
+ OFFSET(devm_memmap_vmcore.start) == NOT_FOUND_STRUCTURE ||
+ OFFSET(devm_memmap_vmcore.end) == NOT_FOUND_STRUCTURE ||
+ OFFSET(devm_memmap_vmcore.entry) == NOT_FOUND_STRUCTURE)
+ return;
+
+ MSG("list_head.next: %ld\n", OFFSET(list_head.next));
+ MSG("devm_memmap_vmcore.start: %ld\n", OFFSET(devm_memmap_vmcore.start));
+ MSG("devm_memmap_vmcore.end: %ld\n", OFFSET(devm_memmap_vmcore.end));
+ MSG("devm_memmap_vmcore.entry: %ld\n", OFFSET(devm_memmap_vmcore.entry));
+
+again:
+ if (!readmem(VADDR, entry + OFFSET(list_head.next), &head_next, sizeof(head_next)))
+ return;
+
+ if (head_next == head) {
+ return;
+ }
+
+ entry = head_next;
+
+ devm_entry = entry - OFFSET(devm_memmap_vmcore.entry);
+ start_p = devm_entry + OFFSET(devm_memmap_vmcore.start);
+ end_p = devm_entry + OFFSET(devm_memmap_vmcore.end);
+
+ if (!readmem(VADDR, start_p, &start, sizeof(unsigned long))) {
+ goto fail;
+ }
+
+ if (!readmem(VADDR, end_p, &end, sizeof(unsigned long))) {
+ goto fail;
+ }
+
+ MSG("devm_memmap_vmcore[%d]: addr: %lx, [%lx - %lx )\n", cnt++, devm_entry, start, end);
+ pmem_add_next(start, end);
+ goto again;
+
+fail:
+ return;
+}
+
void
dump_mem_map(mdf_pfn_t pfn_start, mdf_pfn_t pfn_end,
unsigned long mem_map, int num_mm)
@@ -3728,6 +3822,7 @@ get_mem_section(unsigned int mem_section_size, unsigned long *mem_maps,
return FALSE;
}
+ dump_pmem_metadata(1);
/*
* There was a report that the first validation wrongly returned TRUE
* with -x vmlinux and SPARSEMEM_EXTREME v2 on s390x, so skip it.
@@ -6311,6 +6406,30 @@ exclude_range(mdf_pfn_t *counter, mdf_pfn_t pfn, mdf_pfn_t endpfn,
}
}
+static int is_pmem_metadata_range(unsigned long start, unsigned long end)
+{
+ struct devm_memmap_entry *head = devm_memmap;
+
+ while (head) {
+ if (head->start <= start && head->end >= end)
+ return TRUE;
+ head = head->next;
+ }
+
+ return FALSE;
+}
+
+static void cleanup_pmem_metadata(void)
+{
+ struct devm_memmap_entry *head = devm_memmap;
+
+ while (head) {
+ struct devm_memmap_entry *next = head->next;
+ free(head);
+ head = next;
+ }
+}
+
int
__exclude_unnecessary_pages(unsigned long mem_map,
mdf_pfn_t pfn_start, mdf_pfn_t pfn_end, struct cycle *cycle)
@@ -6381,9 +6500,17 @@ __exclude_unnecessary_pages(unsigned long mem_map,
is_pmem = is_pmem_pt_load_range(pfn << PAGESHIFT(), (pfn + 1) << PAGESHIFT());
if (is_pmem) {
- pfn_pmem_userdata++;
- clear_bit_on_2nd_bitmap_for_kernel(pfn, cycle);
- continue;
+ if (is_pmem_metadata_range(pfn, pfn + 1)) {
+ if (info->dump_level & DL_EXCLUDE_PMEM_META) {
+ pfn_pmem_metadata++;
+ clear_bit_on_2nd_bitmap_for_kernel(pfn, cycle);
+ continue;
+ }
+ } else {
+ pfn_pmem_userdata++;
+ clear_bit_on_2nd_bitmap_for_kernel(pfn, cycle);
+ continue;
+ }
}
index_pg = pfn % PGMM_CACHED;
@@ -8092,7 +8219,7 @@ write_elf_pages_cyclic(struct cache_data *cd_header, struct cache_data *cd_page)
* Reset counter for debug message.
*/
if (info->flag_cyclic) {
- pfn_zero = pfn_cache = pfn_cache_private = 0;
+ pfn_zero = pfn_cache = pfn_cache_private = pfn_pmem_metadata = 0;
pfn_user = pfn_free = pfn_hwpoison = pfn_offline = pfn_pmem_userdata = 0;
pfn_memhole = info->max_mapnr;
}
@@ -9430,7 +9557,7 @@ write_kdump_pages_and_bitmap_cyclic(struct cache_data *cd_header, struct cache_d
/*
* Reset counter for debug message.
*/
- pfn_zero = pfn_cache = pfn_cache_private = 0;
+ pfn_zero = pfn_cache = pfn_cache_private = pfn_pmem_metadata = 0;
pfn_user = pfn_free = pfn_hwpoison = pfn_offline = pfn_pmem_userdata = 0;
pfn_memhole = info->max_mapnr;
@@ -10380,7 +10507,7 @@ print_report(void)
pfn_original = info->max_mapnr - pfn_memhole;
pfn_excluded = pfn_zero + pfn_cache + pfn_cache_private + pfn_pmem_userdata
- + pfn_user + pfn_free + pfn_hwpoison + pfn_offline;
+ + pfn_user + pfn_free + pfn_hwpoison + pfn_offline + pfn_pmem_metadata;
REPORT_MSG("\n");
REPORT_MSG("Original pages : 0x%016llx\n", pfn_original);
@@ -10396,6 +10523,7 @@ print_report(void)
REPORT_MSG(" Free pages : 0x%016llx\n", pfn_free);
REPORT_MSG(" Hwpoison pages : 0x%016llx\n", pfn_hwpoison);
REPORT_MSG(" Offline pages : 0x%016llx\n", pfn_offline);
+ REPORT_MSG(" pmem metadata pages : 0x%016llx\n", pfn_pmem_metadata);
REPORT_MSG(" pmem userdata pages : 0x%016llx\n", pfn_pmem_userdata);
REPORT_MSG(" Remaining pages : 0x%016llx\n",
pfn_original - pfn_excluded);
@@ -10437,7 +10565,7 @@ print_mem_usage(void)
pfn_original = info->max_mapnr - pfn_memhole;
pfn_excluded = pfn_zero + pfn_cache + pfn_cache_private + pfn_pmem_userdata
- + pfn_user + pfn_free + pfn_hwpoison + pfn_offline;
+ + pfn_user + pfn_free + pfn_hwpoison + pfn_offline + pfn_pmem_metadata;
shrinking = (pfn_original - pfn_excluded) * 100;
shrinking = shrinking / pfn_original;
total_size = info->page_size * pfn_original;
@@ -12403,6 +12531,7 @@ out:
}
}
free_elf_info();
+ cleanup_pmem_metadata();
return retcd;
}
diff --git a/makedumpfile.h b/makedumpfile.h
index 21dec7d1145c..790fa698bb0e 100644
--- a/makedumpfile.h
+++ b/makedumpfile.h
@@ -206,7 +206,7 @@ test_bit(int nr, unsigned long addr)
* Dump Level
*/
#define MIN_DUMP_LEVEL (0)
-#define MAX_DUMP_LEVEL (31)
+#define MAX_DUMP_LEVEL (63)
#define NUM_ARRAY_DUMP_LEVEL (MAX_DUMP_LEVEL + 1) /* enough to allocate
all the dump_level */
#define DL_EXCLUDE_ZERO (0x001) /* Exclude Pages filled with Zeros */
@@ -216,6 +216,7 @@ test_bit(int nr, unsigned long addr)
with Private Pages */
#define DL_EXCLUDE_USER_DATA (0x008) /* Exclude UserProcessData Pages */
#define DL_EXCLUDE_FREE (0x010) /* Exclude Free Pages */
+#define DL_EXCLUDE_PMEM_META (0x020) /* Exclude pmem metadata Pages */
/*
@@ -1711,6 +1712,7 @@ struct symbol_table {
unsigned long long mem_map;
unsigned long long vmem_map;
unsigned long long mem_section;
+ unsigned long long devm_memmap_vmcore_head;
unsigned long long pkmap_count;
unsigned long long pkmap_count_next;
unsigned long long system_utsname;
@@ -1817,6 +1819,7 @@ struct size_table {
long node_memblk_s;
long nodemask_t;
long printk_log;
+ long devm_memmap_vmcore;
/*
* for lockless printk ringbuffer
@@ -1896,6 +1899,11 @@ struct offset_table {
long next;
long prev;
} list_head;
+ struct devm_memmap_vmcore {
+ long entry;
+ long start;
+ long end;
+ } devm_memmap_vmcore;
struct node_memblk_s {
long start_paddr;
long size;
--
2.29.2
^ permalink raw reply related [flat|nested] 18+ messages in thread
* RE: [RFC PATCH v2 0/3] pmem memmap dump support
2023-04-27 10:18 [RFC PATCH v2 0/3] pmem memmap dump support Li Zhijian
` (6 preceding siblings ...)
2023-04-27 10:18 ` [RFC PATCH v2 makedumpfile 3/3] makedumpfile.c: Allow excluding metadata of pmem region Li Zhijian
@ 2023-04-28 18:59 ` Dan Williams
2023-05-08 9:45 ` Zhijian Li (Fujitsu)
7 siblings, 1 reply; 18+ messages in thread
From: Dan Williams @ 2023-04-28 18:59 UTC (permalink / raw)
To: Li Zhijian, x86, nvdimm, kexec
Cc: linux-kernel, y-goto, yangx.jy, ruansy.fnst
Li Zhijian wrote:
> Hello folks,
>
> About 2 months ago, we posted our first RFC[3] and received your kindly feedback. Thank you :)
> Now, I'm back with the code.
>
> Currently, this RFC has already implemented to supported case D*. And the case A&B is disabled
> deliberately in makedumpfile. It includes changes in 3 source code as below:
I think the reason this patchkit is difficult to follow is that it
spends a lot of time describing a chosen solution, but not enough time
describing the problem and the tradeoffs.
For example why is updating /proc/vmcore with pmem metadata the chosen
solution? Why not leave the kernel out of it and have makedumpfile
tooling aware of how to parse persistent memory namespace info-blocks
and retrieve that dump itself? This is what I proposed here:
http://lore.kernel.org/r/641484f7ef780_a52e2940@dwillia2-mobl3.amr.corp.intel.com.notmuch
...but never got an answer, or I missed the answer.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC PATCH v2 0/3] pmem memmap dump support
2023-04-28 18:59 ` [RFC PATCH v2 0/3] pmem memmap dump support Dan Williams
@ 2023-05-08 9:45 ` Zhijian Li (Fujitsu)
2023-05-10 10:41 ` Zhijian Li (Fujitsu)
0 siblings, 1 reply; 18+ messages in thread
From: Zhijian Li (Fujitsu) @ 2023-05-08 9:45 UTC (permalink / raw)
To: Dan Williams, x86, nvdimm, kexec
Cc: linux-kernel, Yasunori Gotou (Fujitsu), Xiao Yang (Fujitsu),
Shiyang Ruan (Fujitsu)
Dan,
On 29/04/2023 02:59, Dan Williams wrote:
> Li Zhijian wrote:
>> Hello folks,
>>
>> About 2 months ago, we posted our first RFC[3] and received your kindly feedback. Thank you :)
>> Now, I'm back with the code.
>>
>> Currently, this RFC has already implemented to supported case D*. And the case A&B is disabled
>> deliberately in makedumpfile. It includes changes in 3 source code as below:
>
> I think the reason this patchkit is difficult to follow is that it
> spends a lot of time describing a chosen solution, but not enough time
> describing the problem and the tradeoffs.
>
> For example why is updating /proc/vmcore with pmem metadata the chosen
> solution? Why not leave the kernel out of it and have makedumpfile
> tooling aware of how to parse persistent memory namespace info-blocks
> and retrieve that dump itself? This is what I proposed here:
>
> http://lore.kernel.org/r/641484f7ef780_a52e2940@dwillia2-mobl3.amr.corp.intel.com.notmuch
Sorry for the late reply. I'm just back from the vacation.
And sorry again for missing your previous *important* information in V1.
Your proposal also sounds to me like it needs fewer kernel changes, but more coupling between ndctl and the makedumpfile tools.
In my current understanding, it will include the following source changes.
-----------+-------------------------------------------------------------------+
Source | changes |
-----------+-------------------------------------------------------------------+
I. | 1. enter force_raw in kdump kernel automatically(avoid metadata being updated again)|
kernel | |
| 2. mark the whole pmem's PT_LOAD for kexec_file_load(2) syscall |
-----------+-------------------------------------------------------------------+
II. kexec- | 1. mark the whole pmem's PT_LOAD for kexe_load(2) syscall |
tool | |
-----------+-------------------------------------------------------------------+
III. | 1. parse the infoblock and calculate the boundaries of userdata and metadata |
makedump- | 2. skip pmem userdata region |
file | 3. exclude pmem metadata region if needed |
-----------+-------------------------------------------------------------------+
I will try rewrite it with your proposal ASAP
Thanks again
Thanks
Zhijian
>
> ...but never got an answer, or I missed the answer.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC PATCH v2 0/3] pmem memmap dump support
2023-05-08 9:45 ` Zhijian Li (Fujitsu)
@ 2023-05-10 10:41 ` Zhijian Li (Fujitsu)
2023-05-25 5:36 ` Li, Zhijian
0 siblings, 1 reply; 18+ messages in thread
From: Zhijian Li (Fujitsu) @ 2023-05-10 10:41 UTC (permalink / raw)
To: Dan Williams, x86, nvdimm, kexec
Cc: linux-kernel, Yasunori Gotou (Fujitsu), Xiao Yang (Fujitsu),
Shiyang Ruan (Fujitsu)
Hi Dan
on 5/8/2023 5:45 PM, Zhijian Li (Fujitsu) wrote:
> Dan,
>
>
> On 29/04/2023 02:59, Dan Williams wrote:
>> Li Zhijian wrote:
>>> Hello folks,
>>>
>>> About 2 months ago, we posted our first RFC[3] and received your kindly feedback. Thank you :)
>>> Now, I'm back with the code.
>>>
>>> Currently, this RFC has already implemented to supported case D*. And the case A&B is disabled
>>> deliberately in makedumpfile. It includes changes in 3 source code as below:
>> I think the reason this patchkit is difficult to follow is that it
>> spends a lot of time describing a chosen solution, but not enough time
>> describing the problem and the tradeoffs.
>>
>> For example why is updating /proc/vmcore with pmem metadata the chosen
>> solution? Why not leave the kernel out of it and have makedumpfile
>> tooling aware of how to parse persistent memory namespace info-blocks
>> and retrieve that dump itself? This is what I proposed here:
>>
>> http://lore.kernel.org/r/641484f7ef780_a52e2940@dwillia2-mobl3.amr.corp.intel.com.notmuch
> Sorry for the late reply. I'm just back from the vacation.
> And sorry again for missing your previous *important* information in V1.
>
> Your proposal also sounds to me with less kernel changes, but more ndctl coupling with makedumpfile tools.
> In my current understanding, it will includes following source changes.
The kernel and makedumpfile have been updated. It's still at an early stage, but in order to make sure I'm following your proposal,
I want to share the changes with you early. Alternatively, you can refer to my github for the full details.
https://github.com/zhijianli88/makedumpfile/commit/8ebfe38c015cfca0545cb3b1d7a6cc9a58fc9bb3
If I'm going the wrong way, feel free to let me know :)
>
> -----------+-------------------------------------------------------------------+
> Source | changes |
> -----------+-------------------------------------------------------------------+
> I. | 1. enter force_raw in kdump kernel automatically(avoid metadata being updated again)|
kernel should adapt it so that the metadata of pmem will be updated again in the kdump kernel:
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index c60ec0b373c5..2e59be8b9c78 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -8,6 +8,7 @@
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/nd.h>
+#include <linux/crash_dump.h>
#include "nd-core.h"
#include "pmem.h"
#include "pfn.h"
@@ -1504,6 +1505,8 @@ struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev)
return ERR_PTR(-ENODEV);
}
+ if (is_kdump_kernel())
+ ndns->force_raw = true;
return ndns;
}
EXPORT_SYMBOL(nvdimm_namespace_common_probe);
> kernel | |
> | 2. mark the whole pmem's PT_LOAD for kexec_file_load(2) syscall |
> -----------+-------------------------------------------------------------------+
> II. kexec- | 1. mark the whole pmem's PT_LOAD for kexe_load(2) syscall |
> tool | |
> -----------+-------------------------------------------------------------------+
> III. | 1. parse the infoblock and calculate the boundaries of userdata and metadata |
> makedump- | 2. skip pmem userdata region |
> file | 3. exclude pmem metadata region if needed |
> -----------+-------------------------------------------------------------------+
>
> I will try rewrite it with your proposal ASAP
inspect_pmem_namespace() will walk the namespaces and read their resource.start and infoblock. With this
information, we can calculate the boundaries of userdata and metadata easily. But currently these changes are
strongly coupled with ndctl/pmem, which looks a bit messy and ugly.
============makedumpfile=======
diff --git a/Makefile b/Makefile
index a289e41ef44d..4b4ded639cfd 100644
--- a/Makefile
+++ b/Makefile
@@ -50,7 +50,7 @@ OBJ_PART=$(patsubst %.c,%.o,$(SRC_PART))
SRC_ARCH = arch/arm.c arch/arm64.c arch/x86.c arch/x86_64.c arch/ia64.c arch/ppc64.c arch/s390x.c arch/ppc.c arch/sparc64.c arch/mips64.c arch/loongarch64.c
OBJ_ARCH=$(patsubst %.c,%.o,$(SRC_ARCH))
-LIBS = -ldw -lbz2 -ldl -lelf -lz
+LIBS = -ldw -lbz2 -ldl -lelf -lz -lndctl
ifneq ($(LINKTYPE), dynamic)
LIBS := -static $(LIBS) -llzma
endif
diff --git a/makedumpfile.c b/makedumpfile.c
index 98c3b8c7ced9..db68d05a29f9 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -27,6 +27,8 @@
#include <limits.h>
#include <assert.h>
#include <zlib.h>
+#include <sys/types.h>
+#include <ndctl/libndctl.h>
+
+#define INFOBLOCK_SZ (8192)
+#define SZ_4K (4096)
+#define PFN_SIG_LEN 16
+
+typedef uint64_t u64;
+typedef int64_t s64;
+typedef uint32_t u32;
+typedef int32_t s32;
+typedef uint16_t u16;
+typedef int16_t s16;
+typedef uint8_t u8;
+typedef int8_t s8;
+
+typedef int64_t le64;
+typedef int32_t le32;
+typedef int16_t le16;
+
+struct pfn_sb {
+ u8 signature[PFN_SIG_LEN];
+ u8 uuid[16];
+ u8 parent_uuid[16];
+ le32 flags;
+ le16 version_major;
+ le16 version_minor;
+ le64 dataoff; /* relative to namespace_base + start_pad */
+ le64 npfns;
+ le32 mode;
+ /* minor-version-1 additions for section alignment */
+ le32 start_pad;
+ le32 end_trunc;
+ /* minor-version-2 record the base alignment of the mapping */
+ le32 align;
+ /* minor-version-3 guarantee the padding and flags are zero */
+ /* minor-version-4 record the page size and struct page size */
+ le32 page_size;
+ le16 page_struct_size;
+ u8 padding[3994];
+ le64 checksum;
+};
+
+static int nd_read_infoblock_dataoff(struct ndctl_namespace *ndns)
+{
+ int fd, rc;
+ char path[50];
+ char buf[INFOBLOCK_SZ + 1];
+ struct pfn_sb *pfn_sb = (struct pfn_sb *)(buf + SZ_4K);
+
+ sprintf(path, "/dev/%s", ndctl_namespace_get_block_device(ndns));
+
+ fd = open(path, O_RDONLY|O_EXCL);
+ if (fd < 0)
+ return -1;
+
+
+ rc = read(fd, buf, INFOBLOCK_SZ);
+ if (rc < INFOBLOCK_SZ) {
+ return -1;
+ }
+
+ return pfn_sb->dataoff;
+}
+
+int inspect_pmem_namespace(void)
+{
+ struct ndctl_ctx *ctx;
+ struct ndctl_bus *bus;
+ int rc = -1;
+
+ fprintf(stderr, "\n\ninspect_pmem_namespace!!\n\n");
+ rc = ndctl_new(&ctx);
+ if (rc)
+ return -1;
+
+ ndctl_bus_foreach(ctx, bus) {
+ struct ndctl_region *region;
+
+ ndctl_region_foreach(bus, region) {
+ struct ndctl_namespace *ndns;
+
+ ndctl_namespace_foreach(region, ndns) {
+ enum ndctl_namespace_mode mode;
+ long long start, end_metadata;
+
+ mode = ndctl_namespace_get_mode(ndns);
+ /* kdump kernel should set force_raw, mode become *safe* */
+ if (mode == NDCTL_NS_MODE_SAFE) {
+ fprintf(stderr, "Only raw can be dumpable\n");
+ continue;
+ }
+
+ start = ndctl_namespace_get_resource(ndns);
+ end_metadata = nd_read_infoblock_dataoff(ndns);
+
+ /* metadata really starts from 2M alignment */
+ if (start != ULLONG_MAX && end_metadata > 2 * 1024 * 1024) // 2M
+ pmem_add_next(start, end_metadata);
+ }
+ }
+ }
+
+ ndctl_unref(ctx);
+ return 0;
+}
+
Thanks
Zhijian
>
> Thanks again
>
> Thanks
> Zhijian
>
>> ...but never got an answer, or I missed the answer.
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [RFC PATCH v2 0/3] pmem memmap dump support
2023-05-10 10:41 ` Zhijian Li (Fujitsu)
@ 2023-05-25 5:36 ` Li, Zhijian
0 siblings, 0 replies; 18+ messages in thread
From: Li, Zhijian @ 2023-05-25 5:36 UTC (permalink / raw)
To: Dan Williams, bhe
Cc: linux-kernel, Yasunori Gotou (Fujitsu), Xiao Yang (Fujitsu),
Shiyang Ruan (Fujitsu),
x86, nvdimm, kexec
Ping
Baoquan, Dan
Sorry to bother you again.
Could you further comment a word or two on this set?
Thanks
Zhijian
on 5/10/2023 6:41 PM, Zhijian Li (Fujitsu) wrote:
> Hi Dan
>
>
> on 5/8/2023 5:45 PM, Zhijian Li (Fujitsu) wrote:
>> Dan,
>>
>>
>> On 29/04/2023 02:59, Dan Williams wrote:
>>> Li Zhijian wrote:
>>>> Hello folks,
>>>>
>>>> About 2 months ago, we posted our first RFC[3] and received your kindly feedback. Thank you :)
>>>> Now, I'm back with the code.
>>>>
>>>> Currently, this RFC has already implemented to supported case D*. And the case A&B is disabled
>>>> deliberately in makedumpfile. It includes changes in 3 source code as below:
>>> I think the reason this patchkit is difficult to follow is that it
>>> spends a lot of time describing a chosen solution, but not enough time
>>> describing the problem and the tradeoffs.
>>>
>>> For example why is updating /proc/vmcore with pmem metadata the chosen
>>> solution? Why not leave the kernel out of it and have makedumpfile
>>> tooling aware of how to parse persistent memory namespace info-blocks
>>> and retrieve that dump itself? This is what I proposed here:
>>>
>>> http://lore.kernel.org/r/641484f7ef780_a52e2940@dwillia2-mobl3.amr.corp.intel.com.notmuch
>> Sorry for the late reply. I'm just back from the vacation.
>> And sorry again for missing your previous *important* information in V1.
>>
>> Your proposal also sounds to me with less kernel changes, but more ndctl coupling with makedumpfile tools.
>> In my current understanding, it will includes following source changes.
> The kernel and makedumpfile has updated. It's still in a early stage, but in order to make sure I'm following your proposal.
> i want to share the changes with you early. Alternatively, you are able to refer to my github for the full details.
> https://github.com/zhijianli88/makedumpfile/commit/8ebfe38c015cfca0545cb3b1d7a6cc9a58fc9bb3
>
> If I'm going the wrong way, feel free to let me know :)
>
>
>> -----------+-------------------------------------------------------------------+
>> Source | changes |
>> -----------+-------------------------------------------------------------------+
>> I. | 1. enter force_raw in kdump kernel automatically(avoid metadata being updated again)|
> kernel should adapt it so that the metadata of pmem will be updated again in the kdump kernel:
>
> diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
> index c60ec0b373c5..2e59be8b9c78 100644
> --- a/drivers/nvdimm/namespace_devs.c
> +++ b/drivers/nvdimm/namespace_devs.c
> @@ -8,6 +8,7 @@
> #include <linux/slab.h>
> #include <linux/list.h>
> #include <linux/nd.h>
> +#include <linux/crash_dump.h>
> #include "nd-core.h"
> #include "pmem.h"
> #include "pfn.h"
> @@ -1504,6 +1505,8 @@ struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev)
> return ERR_PTR(-ENODEV);
> }
>
> + if (is_kdump_kernel())
> + ndns->force_raw = true;
> return ndns;
> }
> EXPORT_SYMBOL(nvdimm_namespace_common_probe);
>
>> kernel | |
>> | 2. mark the whole pmem's PT_LOAD for kexec_file_load(2) syscall |
>> -----------+-------------------------------------------------------------------+
>> II. kexec- | 1. mark the whole pmem's PT_LOAD for kexe_load(2) syscall |
>> tool | |
>> -----------+-------------------------------------------------------------------+
>> III. | 1. parse the infoblock and calculate the boundaries of userdata and metadata |
>> makedump- | 2. skip pmem userdata region |
>> file | 3. exclude pmem metadata region if needed |
>> -----------+-------------------------------------------------------------------+
>>
>> I will try rewrite it with your proposal ASAP
> inspect_pmem_namespace() will walk the namespaces and the read its resource.start and infoblock. With this
> information, we can calculate the boundaries of userdata and metadata easily. But currently this changes are
> strongly coupling with the ndctl/pmem which looks a bit messy and ugly.
>
> ============makedumpfile=======
>
> diff --git a/Makefile b/Makefile
> index a289e41ef44d..4b4ded639cfd 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -50,7 +50,7 @@ OBJ_PART=$(patsubst %.c,%.o,$(SRC_PART))
> SRC_ARCH = arch/arm.c arch/arm64.c arch/x86.c arch/x86_64.c arch/ia64.c arch/ppc64.c arch/s390x.c arch/ppc.c arch/sparc64.c arch/mips64.c arch/loongarch64.c
> OBJ_ARCH=$(patsubst %.c,%.o,$(SRC_ARCH))
>
> -LIBS = -ldw -lbz2 -ldl -lelf -lz
> +LIBS = -ldw -lbz2 -ldl -lelf -lz -lndctl
> ifneq ($(LINKTYPE), dynamic)
> LIBS := -static $(LIBS) -llzma
> endif
> diff --git a/makedumpfile.c b/makedumpfile.c
> index 98c3b8c7ced9..db68d05a29f9 100644
> --- a/makedumpfile.c
> +++ b/makedumpfile.c
> @@ -27,6 +27,8 @@
> #include <limits.h>
> #include <assert.h>
> #include <zlib.h>
> +#include <sys/types.h>
> +#include <ndctl/libndctl.h>
>
> +
> +#define INFOBLOCK_SZ (8192)
> +#define SZ_4K (4096)
> +#define PFN_SIG_LEN 16
> +
> +typedef uint64_t u64;
> +typedef int64_t s64;
> +typedef uint32_t u32;
> +typedef int32_t s32;
> +typedef uint16_t u16;
> +typedef int16_t s16;
> +typedef uint8_t u8;
> +typedef int8_t s8;
> +
> +typedef int64_t le64;
> +typedef int32_t le32;
> +typedef int16_t le16;
> +
> +struct pfn_sb {
> + u8 signature[PFN_SIG_LEN];
> + u8 uuid[16];
> + u8 parent_uuid[16];
> + le32 flags;
> + le16 version_major;
> + le16 version_minor;
> + le64 dataoff; /* relative to namespace_base + start_pad */
> + le64 npfns;
> + le32 mode;
> + /* minor-version-1 additions for section alignment */
> + le32 start_pad;
> + le32 end_trunc;
> + /* minor-version-2 record the base alignment of the mapping */
> + le32 align;
> + /* minor-version-3 guarantee the padding and flags are zero */
> + /* minor-version-4 record the page size and struct page size */
> + le32 page_size;
> + le16 page_struct_size;
> + u8 padding[3994];
> + le64 checksum;
> +};
> +
> +static int nd_read_infoblock_dataoff(struct ndctl_namespace *ndns)
> +{
> + int fd, rc;
> + char path[50];
> + char buf[INFOBLOCK_SZ + 1];
> + struct pfn_sb *pfn_sb = (struct pfn_sb *)(buf + SZ_4K);
> +
> + sprintf(path, "/dev/%s", ndctl_namespace_get_block_device(ndns));
> +
> + fd = open(path, O_RDONLY|O_EXCL);
> + if (fd < 0)
> + return -1;
> +
> +
> + rc = read(fd, buf, INFOBLOCK_SZ);
> + if (rc < INFOBLOCK_SZ) {
> + return -1;
> + }
> +
> + return pfn_sb->dataoff;
> +}
> +
> +int inspect_pmem_namespace(void)
> +{
> + struct ndctl_ctx *ctx;
> + struct ndctl_bus *bus;
> + int rc = -1;
> +
> + fprintf(stderr, "\n\ninspect_pmem_namespace!!\n\n");
> + rc = ndctl_new(&ctx);
> + if (rc)
> + return -1;
> +
> + ndctl_bus_foreach(ctx, bus) {
> + struct ndctl_region *region;
> +
> + ndctl_region_foreach(bus, region) {
> + struct ndctl_namespace *ndns;
> +
> + ndctl_namespace_foreach(region, ndns) {
> + enum ndctl_namespace_mode mode;
> + long long start, end_metadata;
> +
> + mode = ndctl_namespace_get_mode(ndns);
> + /* kdump kernel should set force_raw, mode become *safe* */
> + if (mode == NDCTL_NS_MODE_SAFE) {
> + fprintf(stderr, "Only raw can be dumpable\n");
> + continue;
> + }
> +
> + start = ndctl_namespace_get_resource(ndns);
> + end_metadata = nd_read_infoblock_dataoff(ndns);
> +
> + /* metadata really starts from 2M alignment */
> + if (start != ULLONG_MAX && end_metadata > 2 * 1024 * 1024) // 2M
> + pmem_add_next(start, end_metadata);
> + }
> + }
> + }
> +
> + ndctl_unref(ctx);
> + return 0;
> +}
> +
>
> Thanks
> Zhijian
>
>
>
>> Thanks again
>>
>> Thanks
>> Zhijian
>>
>>> ...but never got an answer, or I missed the answer.
>> _______________________________________________
>> kexec mailing list
>> kexec@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/kexec
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 18+ messages in thread