[v2,4/4] efi: Fix handling of multiple efi_fake_mem= entries
diff mbox series

Message ID 157782987865.367056.15199592105978588123.stgit@dwillia2-desk3.amr.corp.intel.com
State Superseded
Headers show
Series
  • efi: Fix handling of multiple efi_fake_mem= entries
Related show

Commit Message

Dan Williams Dec. 31, 2019, 10:04 p.m. UTC
Dave noticed that when specifying multiple efi_fake_mem= entries only
the last entry was successfully being reflected in the efi memory map.
This is due to the fact that the efi_memmap_insert() is being called
multiple times, but on successive invocations the insertion should be
applied to the last new memmap rather than the original map at
efi_fake_memmap() entry.

Rework efi_fake_memmap() to install the new memory map after each
efi_fake_mem= entry is parsed.

This also fixes an issue in efi_fake_memmap() that caused it to litter
emtpy entries into the end of the efi memory map. The empty entry causes
efi_memmap_insert() to attempt more memmap splits / copies than
efi_memmap_split_count() accounted for when sizing the new map.

    BUG: unable to handle page fault for address: ffffffffff281000
    [..]
    RIP: 0010:efi_memmap_insert+0x11d/0x191
    [..]
    Call Trace:
     ? bgrt_init+0xbe/0xbe
     ? efi_arch_mem_reserve+0x1cb/0x228
     ? acpi_parse_bgrt+0xa/0xd
     ? acpi_table_parse+0x86/0xb8
     ? acpi_boot_init+0x494/0x4e3
     ? acpi_parse_x2apic+0x87/0x87
     ? setup_acpi_sci+0xa2/0xa2
     ? setup_arch+0x8db/0x9e1
     ? start_kernel+0x6a/0x547
     ? secondary_startup_64+0xb6/0xc0

Commit af1648984828 "x86/efi: Update e820 with reserved EFI boot
services data to fix kexec breakage" is listed in Fixes: since it
introduces more occurrences where efi_memmap_insert() is invoked after
an efi_fake_mem= configuration has been parsed. Previously the side
effects of vestigial empty entries were benign, but with commit
af1648984828 that follow-on efi_memmap_insert() invocation triggers the
above crash signature.

Fixes: 0f96a99dab36 ("efi: Add 'efi_fake_mem' boot option")
Fixes: af1648984828 ("x86/efi: Update e820 with reserved EFI boot services...")
Link: https://lore.kernel.org/r/20191231014630.GA24942@dhcp-128-65.nay.redhat.com
Reported-by: Dave Young <dyoung@redhat.com>
Cc: Taku Izumi <izumi.taku@jp.fujitsu.com>
Cc: Michael Weiser <michael@weiser.dinsnail.net>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/firmware/efi/fake_mem.c |   32 +++++++++++++++++---------------
 drivers/firmware/efi/memmap.c   |    2 +-
 include/linux/efi.h             |    2 ++
 3 files changed, 20 insertions(+), 16 deletions(-)

Comments

Dave Young Jan. 1, 2020, 4:51 a.m. UTC | #1
Hi Dan,
On 12/31/19 at 02:04pm, Dan Williams wrote:
> Dave noticed that when specifying multiple efi_fake_mem= entries only
> the last entry was successfully being reflected in the efi memory map.
> This is due to the fact that the efi_memmap_insert() is being called
> multiple times, but on successive invocations the insertion should be
> applied to the last new memmap rather than the original map at
> efi_fake_memmap() entry.
> 
> Rework efi_fake_memmap() to install the new memory map after each
> efi_fake_mem= entry is parsed.
> 
> This also fixes an issue in efi_fake_memmap() that caused it to litter
> emtpy entries into the end of the efi memory map. The empty entry causes
> efi_memmap_insert() to attempt more memmap splits / copies than
> efi_memmap_split_count() accounted for when sizing the new map.
> 
>     BUG: unable to handle page fault for address: ffffffffff281000
>     [..]
>     RIP: 0010:efi_memmap_insert+0x11d/0x191
>     [..]
>     Call Trace:
>      ? bgrt_init+0xbe/0xbe
>      ? efi_arch_mem_reserve+0x1cb/0x228
>      ? acpi_parse_bgrt+0xa/0xd
>      ? acpi_table_parse+0x86/0xb8
>      ? acpi_boot_init+0x494/0x4e3
>      ? acpi_parse_x2apic+0x87/0x87
>      ? setup_acpi_sci+0xa2/0xa2
>      ? setup_arch+0x8db/0x9e1
>      ? start_kernel+0x6a/0x547
>      ? secondary_startup_64+0xb6/0xc0
> 
> Commit af1648984828 "x86/efi: Update e820 with reserved EFI boot
> services data to fix kexec breakage" is listed in Fixes: since it
> introduces more occurrences where efi_memmap_insert() is invoked after
> an efi_fake_mem= configuration has been parsed. Previously the side
> effects of vestigial empty entries were benign, but with commit
> af1648984828 that follow-on efi_memmap_insert() invocation triggers the
> above crash signature.
> 
> Fixes: 0f96a99dab36 ("efi: Add 'efi_fake_mem' boot option")
> Fixes: af1648984828 ("x86/efi: Update e820 with reserved EFI boot services...")
> Link: https://lore.kernel.org/r/20191231014630.GA24942@dhcp-128-65.nay.redhat.com
> Reported-by: Dave Young <dyoung@redhat.com>
> Cc: Taku Izumi <izumi.taku@jp.fujitsu.com>
> Cc: Michael Weiser <michael@weiser.dinsnail.net>
> Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@kernel.org>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
>  drivers/firmware/efi/fake_mem.c |   32 +++++++++++++++++---------------
>  drivers/firmware/efi/memmap.c   |    2 +-
>  include/linux/efi.h             |    2 ++
>  3 files changed, 20 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c
> index 7e53e5520548..68d752d8af21 100644
> --- a/drivers/firmware/efi/fake_mem.c
> +++ b/drivers/firmware/efi/fake_mem.c
> @@ -34,26 +34,17 @@ static int __init cmp_fake_mem(const void *x1, const void *x2)
>  	return 0;
>  }
>  
> -void __init efi_fake_memmap(void)
> +static void __init efi_fake_range(struct efi_mem_range *efi_range)
>  {
>  	int new_nr_map = efi.memmap.nr_map;
>  	efi_memory_desc_t *md;
>  	phys_addr_t new_memmap_phy;
>  	unsigned long flags = 0;
>  	void *new_memmap;
> -	int i;
> -
> -	if (!efi_enabled(EFI_MEMMAP) || !nr_fake_mem)
> -		return;
>  
>  	/* count up the number of EFI memory descriptor */
> -	for (i = 0; i < nr_fake_mem; i++) {
> -		for_each_efi_memory_desc(md) {
> -			struct range *r = &efi_fake_mems[i].range;
> -
> -			new_nr_map += efi_memmap_split_count(md, r);
> -		}
> -	}
> +	for_each_efi_memory_desc(md)
> +		new_nr_map += efi_memmap_split_count(md, &efi_range->range);

I have another concern here :(

THe efi_memmap_split_count mean to only split for a specific md, and you
can see arch/x86/platform/efi/quirks.c about the use:
        if (addr + size > md.phys_addr + (md.num_pages << EFI_PAGE_SHIFT)) {
                pr_err("Region spans EFI memory descriptors, %pa\n", &addr);
                return;
        }

Any memory region to be inserted but spans different md will be
rejected.  So the memmap insert logic seems does not support the
spanned ranges.  I did not find a case two contiguous same type ranges
eg. two "Conventional memory", if have they should have been merged. 

So maybe just use same way as the quirks.c here to find the valid md first
then get the split count?

Otherwise I tested the series bootup test passed.

BTW, another issue about fakemem,  currently it only works with normal
physical boot,  in case of kexec reboot the kernel only aware of EFI
runtime memory ranges, we do not pass other types in memmap.  But maybe
we can live with it considering fake mem is only for debugging purpose.

Thanks
Dave
Dan Williams Jan. 1, 2020, 5:04 a.m. UTC | #2
On Tue, Dec 31, 2019 at 8:52 PM Dave Young <dyoung@redhat.com> wrote:
>
> Hi Dan,
> On 12/31/19 at 02:04pm, Dan Williams wrote:
> > Dave noticed that when specifying multiple efi_fake_mem= entries only
> > the last entry was successfully being reflected in the efi memory map.
> > This is due to the fact that the efi_memmap_insert() is being called
> > multiple times, but on successive invocations the insertion should be
> > applied to the last new memmap rather than the original map at
> > efi_fake_memmap() entry.
> >
> > Rework efi_fake_memmap() to install the new memory map after each
> > efi_fake_mem= entry is parsed.
> >
> > This also fixes an issue in efi_fake_memmap() that caused it to litter
> > emtpy entries into the end of the efi memory map. The empty entry causes
> > efi_memmap_insert() to attempt more memmap splits / copies than
> > efi_memmap_split_count() accounted for when sizing the new map.
> >
> >     BUG: unable to handle page fault for address: ffffffffff281000
> >     [..]
> >     RIP: 0010:efi_memmap_insert+0x11d/0x191
> >     [..]
> >     Call Trace:
> >      ? bgrt_init+0xbe/0xbe
> >      ? efi_arch_mem_reserve+0x1cb/0x228
> >      ? acpi_parse_bgrt+0xa/0xd
> >      ? acpi_table_parse+0x86/0xb8
> >      ? acpi_boot_init+0x494/0x4e3
> >      ? acpi_parse_x2apic+0x87/0x87
> >      ? setup_acpi_sci+0xa2/0xa2
> >      ? setup_arch+0x8db/0x9e1
> >      ? start_kernel+0x6a/0x547
> >      ? secondary_startup_64+0xb6/0xc0
> >
> > Commit af1648984828 "x86/efi: Update e820 with reserved EFI boot
> > services data to fix kexec breakage" is listed in Fixes: since it
> > introduces more occurrences where efi_memmap_insert() is invoked after
> > an efi_fake_mem= configuration has been parsed. Previously the side
> > effects of vestigial empty entries were benign, but with commit
> > af1648984828 that follow-on efi_memmap_insert() invocation triggers the
> > above crash signature.
> >
> > Fixes: 0f96a99dab36 ("efi: Add 'efi_fake_mem' boot option")
> > Fixes: af1648984828 ("x86/efi: Update e820 with reserved EFI boot services...")
> > Link: https://lore.kernel.org/r/20191231014630.GA24942@dhcp-128-65.nay.redhat.com
> > Reported-by: Dave Young <dyoung@redhat.com>
> > Cc: Taku Izumi <izumi.taku@jp.fujitsu.com>
> > Cc: Michael Weiser <michael@weiser.dinsnail.net>
> > Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> > Cc: Thomas Gleixner <tglx@linutronix.de>
> > Cc: Ingo Molnar <mingo@kernel.org>
> > Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> > ---
> >  drivers/firmware/efi/fake_mem.c |   32 +++++++++++++++++---------------
> >  drivers/firmware/efi/memmap.c   |    2 +-
> >  include/linux/efi.h             |    2 ++
> >  3 files changed, 20 insertions(+), 16 deletions(-)
> >
> > diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c
> > index 7e53e5520548..68d752d8af21 100644
> > --- a/drivers/firmware/efi/fake_mem.c
> > +++ b/drivers/firmware/efi/fake_mem.c
> > @@ -34,26 +34,17 @@ static int __init cmp_fake_mem(const void *x1, const void *x2)
> >       return 0;
> >  }
> >
> > -void __init efi_fake_memmap(void)
> > +static void __init efi_fake_range(struct efi_mem_range *efi_range)
> >  {
> >       int new_nr_map = efi.memmap.nr_map;
> >       efi_memory_desc_t *md;
> >       phys_addr_t new_memmap_phy;
> >       unsigned long flags = 0;
> >       void *new_memmap;
> > -     int i;
> > -
> > -     if (!efi_enabled(EFI_MEMMAP) || !nr_fake_mem)
> > -             return;
> >
> >       /* count up the number of EFI memory descriptor */
> > -     for (i = 0; i < nr_fake_mem; i++) {
> > -             for_each_efi_memory_desc(md) {
> > -                     struct range *r = &efi_fake_mems[i].range;
> > -
> > -                     new_nr_map += efi_memmap_split_count(md, r);
> > -             }
> > -     }
> > +     for_each_efi_memory_desc(md)
> > +             new_nr_map += efi_memmap_split_count(md, &efi_range->range);
>
> I have another concern here :(
>
> THe efi_memmap_split_count mean to only split for a specific md, and you
> can see arch/x86/platform/efi/quirks.c about the use:
>         if (addr + size > md.phys_addr + (md.num_pages << EFI_PAGE_SHIFT)) {
>                 pr_err("Region spans EFI memory descriptors, %pa\n", &addr);
>                 return;
>         }
>
> Any memory region to be inserted but spans different md will be
> rejected.  So the memmap insert logic seems does not support the
> spanned ranges.  I did not find a case two contiguous same type ranges
> eg. two "Conventional memory", if have they should have been merged.
>
> So maybe just use same way as the quirks.c here to find the valid md first
> then get the split count?

I don't immediately see why it would be a problem to just let the md
loop that efi_fake_memmap() performs try to split multiple entries. It
may end up with more splits than necessary in which case we'll need
that piece from my original patch to clean those up. Thanks for the
heads up, I'll give it a try and see what shakes out. Are you seeing
any misbehavior on your end?

>
> Otherwise I tested the series bootup test passed.
>
> BTW, another issue about fakemem,  currently it only works with normal
> physical boot,  in case of kexec reboot the kernel only aware of EFI
> runtime memory ranges, we do not pass other types in memmap.  But maybe
> we can live with it considering fake mem is only for debugging purpose.

Does kexec preserve iomem? I.e. as long as the initial translation of
efi entries to e820, and resulting resource tree, is preserved by
successive kexec cycles then I think we're ok.
Dave Young Jan. 1, 2020, 6:15 a.m. UTC | #3
On 12/31/19 at 09:04pm, Dan Williams wrote:
> On Tue, Dec 31, 2019 at 8:52 PM Dave Young <dyoung@redhat.com> wrote:
> >
> > Hi Dan,
> > On 12/31/19 at 02:04pm, Dan Williams wrote:
> > > Dave noticed that when specifying multiple efi_fake_mem= entries only
> > > the last entry was successfully being reflected in the efi memory map.
> > > This is due to the fact that the efi_memmap_insert() is being called
> > > multiple times, but on successive invocations the insertion should be
> > > applied to the last new memmap rather than the original map at
> > > efi_fake_memmap() entry.
> > >
> > > Rework efi_fake_memmap() to install the new memory map after each
> > > efi_fake_mem= entry is parsed.
> > >
> > > This also fixes an issue in efi_fake_memmap() that caused it to litter
> > > emtpy entries into the end of the efi memory map. The empty entry causes
> > > efi_memmap_insert() to attempt more memmap splits / copies than
> > > efi_memmap_split_count() accounted for when sizing the new map.
> > >
> > >     BUG: unable to handle page fault for address: ffffffffff281000
> > >     [..]
> > >     RIP: 0010:efi_memmap_insert+0x11d/0x191
> > >     [..]
> > >     Call Trace:
> > >      ? bgrt_init+0xbe/0xbe
> > >      ? efi_arch_mem_reserve+0x1cb/0x228
> > >      ? acpi_parse_bgrt+0xa/0xd
> > >      ? acpi_table_parse+0x86/0xb8
> > >      ? acpi_boot_init+0x494/0x4e3
> > >      ? acpi_parse_x2apic+0x87/0x87
> > >      ? setup_acpi_sci+0xa2/0xa2
> > >      ? setup_arch+0x8db/0x9e1
> > >      ? start_kernel+0x6a/0x547
> > >      ? secondary_startup_64+0xb6/0xc0
> > >
> > > Commit af1648984828 "x86/efi: Update e820 with reserved EFI boot
> > > services data to fix kexec breakage" is listed in Fixes: since it
> > > introduces more occurrences where efi_memmap_insert() is invoked after
> > > an efi_fake_mem= configuration has been parsed. Previously the side
> > > effects of vestigial empty entries were benign, but with commit
> > > af1648984828 that follow-on efi_memmap_insert() invocation triggers the
> > > above crash signature.
> > >
> > > Fixes: 0f96a99dab36 ("efi: Add 'efi_fake_mem' boot option")
> > > Fixes: af1648984828 ("x86/efi: Update e820 with reserved EFI boot services...")
> > > Link: https://lore.kernel.org/r/20191231014630.GA24942@dhcp-128-65.nay.redhat.com
> > > Reported-by: Dave Young <dyoung@redhat.com>
> > > Cc: Taku Izumi <izumi.taku@jp.fujitsu.com>
> > > Cc: Michael Weiser <michael@weiser.dinsnail.net>
> > > Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> > > Cc: Thomas Gleixner <tglx@linutronix.de>
> > > Cc: Ingo Molnar <mingo@kernel.org>
> > > Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> > > ---
> > >  drivers/firmware/efi/fake_mem.c |   32 +++++++++++++++++---------------
> > >  drivers/firmware/efi/memmap.c   |    2 +-
> > >  include/linux/efi.h             |    2 ++
> > >  3 files changed, 20 insertions(+), 16 deletions(-)
> > >
> > > diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c
> > > index 7e53e5520548..68d752d8af21 100644
> > > --- a/drivers/firmware/efi/fake_mem.c
> > > +++ b/drivers/firmware/efi/fake_mem.c
> > > @@ -34,26 +34,17 @@ static int __init cmp_fake_mem(const void *x1, const void *x2)
> > >       return 0;
> > >  }
> > >
> > > -void __init efi_fake_memmap(void)
> > > +static void __init efi_fake_range(struct efi_mem_range *efi_range)
> > >  {
> > >       int new_nr_map = efi.memmap.nr_map;
> > >       efi_memory_desc_t *md;
> > >       phys_addr_t new_memmap_phy;
> > >       unsigned long flags = 0;
> > >       void *new_memmap;
> > > -     int i;
> > > -
> > > -     if (!efi_enabled(EFI_MEMMAP) || !nr_fake_mem)
> > > -             return;
> > >
> > >       /* count up the number of EFI memory descriptor */
> > > -     for (i = 0; i < nr_fake_mem; i++) {
> > > -             for_each_efi_memory_desc(md) {
> > > -                     struct range *r = &efi_fake_mems[i].range;
> > > -
> > > -                     new_nr_map += efi_memmap_split_count(md, r);
> > > -             }
> > > -     }
> > > +     for_each_efi_memory_desc(md)
> > > +             new_nr_map += efi_memmap_split_count(md, &efi_range->range);
> >
> > I have another concern here :(
> >
> > THe efi_memmap_split_count mean to only split for a specific md, and you
> > can see arch/x86/platform/efi/quirks.c about the use:
> >         if (addr + size > md.phys_addr + (md.num_pages << EFI_PAGE_SHIFT)) {
> >                 pr_err("Region spans EFI memory descriptors, %pa\n", &addr);
> >                 return;
> >         }
> >
> > Any memory region to be inserted but spans different md will be
> > rejected.  So the memmap insert logic seems does not support the
> > spanned ranges.  I did not find a case two contiguous same type ranges
> > eg. two "Conventional memory", if have they should have been merged.
> >
> > So maybe just use same way as the quirks.c here to find the valid md first
> > then get the split count?
> 
> I don't immediately see why it would be a problem to just let the md
> loop that efi_fake_memmap() performs try to split multiple entries. It
> may end up with more splits than necessary in which case we'll need
> that piece from my original patch to clean those up. Thanks for the
> heads up, I'll give it a try and see what shakes out. Are you seeing
> any misbehavior on your end?

Just some worries, but I did not see any misbehaviors :)

> 
> >
> > Otherwise I tested the series bootup test passed.
> >
> > BTW, another issue about fakemem,  currently it only works with normal
> > physical boot,  in case of kexec reboot the kernel only aware of EFI
> > runtime memory ranges, we do not pass other types in memmap.  But maybe
> > we can live with it considering fake mem is only for debugging purpose.
> 
> Does kexec preserve iomem? I.e. as long as the initial translation of
> efi entries to e820, and resulting resource tree, is preserved by
> successive kexec cycles then I think we're ok.

It will not preserve them automatically, but that can be fixed if
needed.

There are two places:
1. the in kernel loader, we can do similar with below commit (for Soft
Reseved instead):
commit 980621daf368f2b9aa69c7ea01baa654edb7577b
Author: Lianbo Jiang <lijiang@redhat.com>
Date:   Tue Apr 23 09:30:07 2019 +0800

    x86/crash: Add e820 reserved ranges to kdump kernel's e820 table

2. the userspace loader, in kexec-tools:
It only parse and pass "Reserved" for the time being, also need handle
the Soft Reserved" part as well.

Thanks
Dave
Dave Young Jan. 1, 2020, 6:20 a.m. UTC | #4
> > Does kexec preserve iomem? I.e. as long as the initial translation of
> > efi entries to e820, and resulting resource tree, is preserved by
> > successive kexec cycles then I think we're ok.
> 
> It will not preserve them automatically, but that can be fixed if
> needed.
> 
> There are two places:
> 1. the in kernel loader, we can do similar with below commit (for Soft
> Reseved instead):
> commit 980621daf368f2b9aa69c7ea01baa654edb7577b
> Author: Lianbo Jiang <lijiang@redhat.com>
> Date:   Tue Apr 23 09:30:07 2019 +0800
> 
>     x86/crash: Add e820 reserved ranges to kdump kernel's e820 table

Oops, that is for kdump only, for kexec, should update the kexec e820
table.  But before doing that we need first to see if this is necessary. 

Thanks
Dave
Dan Williams Jan. 1, 2020, 6:36 p.m. UTC | #5
On Tue, Dec 31, 2019 at 10:21 PM Dave Young <dyoung@redhat.com> wrote:
>
> > > Does kexec preserve iomem? I.e. as long as the initial translation of
> > > efi entries to e820, and resulting resource tree, is preserved by
> > > successive kexec cycles then I think we're ok.
> >
> > It will not preserve them automatically, but that can be fixed if
> > needed.
> >
> > There are two places:
> > 1. the in kernel loader, we can do similar with below commit (for Soft
> > Reseved instead):
> > commit 980621daf368f2b9aa69c7ea01baa654edb7577b
> > Author: Lianbo Jiang <lijiang@redhat.com>
> > Date:   Tue Apr 23 09:30:07 2019 +0800
> >
> >     x86/crash: Add e820 reserved ranges to kdump kernel's e820 table
>
> Oops, that is for kdump only, for kexec, should update the kexec e820
> table.  But before doing that we need first to see if this is necessary.

We can cross that bridge later, but I expect it will eventually be
necessary. The soft-reservation facility will become more prevalent as
more platforms ship with DRAM differentiated memory ranges, like
high-bandwidth-memory, and the system needs to reserve it from general
kernel allocations. See commit 262b45ae3ab4 "x86/efi: EFI soft
reservation to E820 enumeration" and commit fe3e5e65c06e "efi:
Enumerate EFI_MEMORY_SP" for more details.
Dave Young Jan. 2, 2020, 2:21 a.m. UTC | #6
On 01/01/20 at 10:36am, Dan Williams wrote:
> On Tue, Dec 31, 2019 at 10:21 PM Dave Young <dyoung@redhat.com> wrote:
> >
> > > > Does kexec preserve iomem? I.e. as long as the initial translation of
> > > > efi entries to e820, and resulting resource tree, is preserved by
> > > > successive kexec cycles then I think we're ok.
> > >
> > > It will not preserve them automatically, but that can be fixed if
> > > needed.
> > >
> > > There are two places:
> > > 1. the in kernel loader, we can do similar with below commit (for Soft
> > > Reseved instead):
> > > commit 980621daf368f2b9aa69c7ea01baa654edb7577b
> > > Author: Lianbo Jiang <lijiang@redhat.com>
> > > Date:   Tue Apr 23 09:30:07 2019 +0800
> > >
> > >     x86/crash: Add e820 reserved ranges to kdump kernel's e820 table
> >
> > Oops, that is for kdump only, for kexec, should update the kexec e820
> > table.  But before doing that we need first to see if this is necessary.
> 
> We can cross that bridge later, but I expect it will eventually be
> necessary. The soft-reservation facility will become more prevalent as
> more platforms ship with DRAM differentiated memory ranges, like
> high-bandwidth-memory, and the system needs to reserve it from general
> kernel allocations. See commit 262b45ae3ab4 "x86/efi: EFI soft
> reservation to E820 enumeration" and commit fe3e5e65c06e "efi:
> Enumerate EFI_MEMORY_SP" for more details.

Ok, agreed the EFI_MEMORY_SP should be preserved across kexec reboot,
I think those firmware provided EFI_MEMORY_SP should be persistent
because the e820 table is just a copy.   But I have no such hardware
to test,  could you do a test to confirm if possible?

The test steps should be:
# -s means to use kexec_file_load syscall
kexec -s -l bzImage --initrd initramfs-file --reuse-cmdline 
reboot

Maybe this should be fine for the time being.  And
leave the faked mem only works once during the physical boot?

Thanks
Dave

Patch
diff mbox series

diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c
index 7e53e5520548..68d752d8af21 100644
--- a/drivers/firmware/efi/fake_mem.c
+++ b/drivers/firmware/efi/fake_mem.c
@@ -34,26 +34,17 @@  static int __init cmp_fake_mem(const void *x1, const void *x2)
 	return 0;
 }
 
-void __init efi_fake_memmap(void)
+static void __init efi_fake_range(struct efi_mem_range *efi_range)
 {
 	int new_nr_map = efi.memmap.nr_map;
 	efi_memory_desc_t *md;
 	phys_addr_t new_memmap_phy;
 	unsigned long flags = 0;
 	void *new_memmap;
-	int i;
-
-	if (!efi_enabled(EFI_MEMMAP) || !nr_fake_mem)
-		return;
 
 	/* count up the number of EFI memory descriptor */
-	for (i = 0; i < nr_fake_mem; i++) {
-		for_each_efi_memory_desc(md) {
-			struct range *r = &efi_fake_mems[i].range;
-
-			new_nr_map += efi_memmap_split_count(md, r);
-		}
-	}
+	for_each_efi_memory_desc(md)
+		new_nr_map += efi_memmap_split_count(md, &efi_range->range);
 
 	/* allocate memory for new EFI memmap */
 	new_memmap_phy = efi_memmap_alloc(new_nr_map, &flags);
@@ -64,17 +55,28 @@  void __init efi_fake_memmap(void)
 	new_memmap = early_memremap(new_memmap_phy,
 				    efi.memmap.desc_size * new_nr_map);
 	if (!new_memmap) {
-		memblock_free(new_memmap_phy, efi.memmap.desc_size * new_nr_map);
+		__efi_memmap_free(new_memmap_phy,
+				efi.memmap.desc_size * new_nr_map, flags);
 		return;
 	}
 
-	for (i = 0; i < nr_fake_mem; i++)
-		efi_memmap_insert(&efi.memmap, new_memmap, &efi_fake_mems[i]);
+	efi_memmap_insert(&efi.memmap, new_memmap, efi_range);
 
 	/* swap into new EFI memmap */
 	early_memunmap(new_memmap, efi.memmap.desc_size * new_nr_map);
 
 	efi_memmap_install(new_memmap_phy, new_nr_map, flags);
+}
+
+void __init efi_fake_memmap(void)
+{
+	int i;
+
+	if (!efi_enabled(EFI_MEMMAP) || !nr_fake_mem)
+		return;
+
+	for (i = 0; i < nr_fake_mem; i++)
+		efi_fake_range(&efi_fake_mems[i]);
 
 	/* print new EFI memmap */
 	efi_print_memmap();
diff --git a/drivers/firmware/efi/memmap.c b/drivers/firmware/efi/memmap.c
index 188ab3cd5c52..de66c2a0e8f8 100644
--- a/drivers/firmware/efi/memmap.c
+++ b/drivers/firmware/efi/memmap.c
@@ -29,7 +29,7 @@  static phys_addr_t __init __efi_memmap_alloc_late(unsigned long size)
 	return PFN_PHYS(page_to_pfn(p));
 }
 
-static void __init __efi_memmap_free(u64 phys, unsigned long size, unsigned long flags)
+void __init __efi_memmap_free(u64 phys, unsigned long size, unsigned long flags)
 {
 	if (WARN_ON(slab_is_available() && (flags & EFI_MEMMAP_MEMBLOCK)))
 		return;
diff --git a/include/linux/efi.h b/include/linux/efi.h
index fa2668a992ae..6ae31e064321 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1061,6 +1061,8 @@  extern void __iomem *efi_lookup_mapped_addr(u64 phys_addr);
 
 extern phys_addr_t __init efi_memmap_alloc(unsigned int num_entries,
 		unsigned long *flags);
+extern void __efi_memmap_free(u64 phys, unsigned long size,
+		unsigned long flags);
 extern int __init efi_memmap_init_early(struct efi_memory_map_data *data);
 extern int __init efi_memmap_init_late(phys_addr_t addr, unsigned long size);
 extern void __init efi_memmap_unmap(void);