[PATCHv2,1/3] x86, ptdump: Add section for EFI runtime services
diff mbox series

Message ID 1411313216-2641-2-git-send-email-minipli@googlemail.com
State New, archived
Headers show
Series
  • x86, ptdump: a EFI related fix + enhancements
Related show

Commit Message

Mathias Krause Sept. 21, 2014, 3:26 p.m. UTC
In commit 3891a04aafd6 ("x86-64, espfix: Don't leak bits 31:16 of %esp
returning..") the "ESPFix Area" was added to the page table dump special
sections. That area, though, has a limited amount of entries printed.

The EFI runtime services are, unfortunately, located in-between the
espfix area and the high kernel memory mapping. Due to the enforced
limitation for the espfix area, the EFI mappings won't be printed in the
page table dump.

To make the ESP runtime service mappings visible again, provide them a
dedicated entry.

Signed-off-by: Mathias Krause <minipli@googlemail.com>
Cc: Matt Fleming <matt.fleming@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
---
v2: same as v1

 arch/x86/include/asm/pgtable_64_types.h |    2 ++
 arch/x86/mm/dump_pagetables.c           |    3 +++
 arch/x86/platform/efi/efi_64.c          |    3 +--
 3 files changed, 6 insertions(+), 2 deletions(-)

Comments

Matt Fleming Oct. 3, 2014, 1:47 p.m. UTC | #1
On Sun, 21 Sep, at 05:26:54PM, Mathias Krause wrote:
> In commit 3891a04aafd6 ("x86-64, espfix: Don't leak bits 31:16 of %esp
> returning..") the "ESPFix Area" was added to the page table dump special
> sections. That area, though, has a limited amount of entries printed.
> 
> The EFI runtime services are, unfortunately, located in-between the
> espfix area and the high kernel memory mapping. Due to the enforced
> limitation for the espfix area, the EFI mappings won't be printed in the
> page table dump.
> 
> To make the ESP runtime service mappings visible again, provide them a
> dedicated entry.
> 
> Signed-off-by: Mathias Krause <minipli@googlemail.com>
> Cc: Matt Fleming <matt.fleming@intel.com>
> Cc: H. Peter Anvin <hpa@zytor.com>
> ---
> v2: same as v1
> 
>  arch/x86/include/asm/pgtable_64_types.h |    2 ++
>  arch/x86/mm/dump_pagetables.c           |    3 +++
>  arch/x86/platform/efi/efi_64.c          |    3 +--
>  3 files changed, 6 insertions(+), 2 deletions(-)
 
Looks OK to me. Borislav?

> diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
> index 7166e25ecb57..602b6028c5b6 100644
> --- a/arch/x86/include/asm/pgtable_64_types.h
> +++ b/arch/x86/include/asm/pgtable_64_types.h
> @@ -63,6 +63,8 @@ typedef struct { pteval_t pte; } pte_t;
>  #define MODULES_LEN   (MODULES_END - MODULES_VADDR)
>  #define ESPFIX_PGD_ENTRY _AC(-2, UL)
>  #define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT)
> +#define EFI_VA_START	 ( -4 * (_AC(1, UL) << 30))
> +#define EFI_VA_END	 (-68 * (_AC(1, UL) << 30))
>  
>  #define EARLY_DYNAMIC_PAGE_TABLES	64
>  
> diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
> index 95a427e57887..1a8053d1012e 100644
> --- a/arch/x86/mm/dump_pagetables.c
> +++ b/arch/x86/mm/dump_pagetables.c
> @@ -76,6 +76,9 @@ static struct addr_marker address_markers[] = {
>  # ifdef CONFIG_X86_ESPFIX64
>  	{ ESPFIX_BASE_ADDR,	"ESPfix Area", 16 },
>  # endif
> +# ifdef CONFIG_EFI
> +	{ EFI_VA_END,		"EFI Runtime Services" },
> +# endif
>  	{ __START_KERNEL_map,   "High Kernel Mapping" },
>  	{ MODULES_VADDR,        "Modules" },
>  	{ MODULES_END,          "End Modules" },
> diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
> index 290d397e1dd9..899c7f17ad85 100644
> --- a/arch/x86/platform/efi/efi_64.c
> +++ b/arch/x86/platform/efi/efi_64.c
> @@ -48,8 +48,7 @@ static unsigned long efi_flags __initdata;
>   * We allocate runtime services regions bottom-up, starting from -4G, i.e.
>   * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G.
>   */
> -static u64 efi_va	= -4 * (1UL << 30);
> -#define EFI_VA_END	(-68 * (1UL << 30))
> +static u64 efi_va = EFI_VA_START;
>  
>  /*
>   * Scratch space used for switching the pagetable in the EFI stub
> -- 
> 1.7.10.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
Borislav Petkov Oct. 7, 2014, 3:01 p.m. UTC | #2
On Fri, Oct 03, 2014 at 02:47:07PM +0100, Matt Fleming wrote:
> Looks OK to me. Borislav?

It needs more work AFAICT because with it, espfix area gets cut off
prematurely:

...
[    0.134611] ---[ Vmemmap ]---
[    0.135003] 0xffffea0000000000-0xffffea0002000000          32M     RW         PSE GLB NX pmd
[    0.136743] 0xffffea0002000000-0xffffea0040000000         992M                           pmd
[    0.138091] 0xffffea0040000000-0xffffea8000000000         511G                           pud
[    0.139611] 0xffffea8000000000-0xffffff0000000000       20992G                           pgd
[    0.140610] ---[ ESPfix Area ]---
[    0.141003] 0xffffff0000000000-0xffffff8000000000         512G                           pgd
[    0.142614] 0xffffff8000000000-0xffffffef00000000         444G                           pud
[    0.144088] ---[ EFI Runtime Services ]---
[    0.144722] 0xffffffef00000000-0xfffffffec0000000          63G                           pud
[    0.146090] 0xfffffffec0000000-0xfffffffefbe00000         958M                           pmd
[    0.147613] 0xfffffffefbe00000-0xfffffffefbfe0000        1920K                           pte
[    0.149003] 0xfffffffefbfe0000-0xfffffffefc000000         128K     RW                 x  pte
[    0.150484] 0xfffffffefc000000-0xfffffffefc065000         404K                           pte
[    0.151612] 0xfffffffefc065000-0xfffffffefc200000        1644K     RW                 x  pte
[    0.153285] 0xfffffffefc200000-0xfffffffefc400000           2M     RW         PSE     x  pmd
[    0.154721] 0xfffffffefc400000-0xfffffffefc5e0000        1920K     RW                 x  pte
...

and I think we want to see something more from the espfix area (this is
what we have now):

[    0.138086] ---[ ESPfix Area ]---
[    0.138590] 0xffffff0000000000-0xffffff8000000000         512G                           pgd
[    0.140099] 0xffffff8000000000-0xfffffffec0000000         507G                           pud
[    0.141444] 0xfffffffec0000000-0xfffffffefbe00000         958M                           pmd
[    0.142597] 0xfffffffefbe00000-0xfffffffefbfe0000        1920K                           pte
[    0.144086] 0xfffffffefbfe0000-0xfffffffefc000000         128K     RW                 x  pte
[    0.145545] 0xfffffffefc000000-0xfffffffefc065000         404K                           pte
[    0.146597] 0xfffffffefc065000-0xfffffffefc200000        1644K     RW                 x  pte
[    0.148346] 0xfffffffefc200000-0xfffffffefc400000           2M     RW         PSE     x  pmd
[    0.149776] 0xfffffffefc400000-0xfffffffefc5e0000        1920K     RW                 x  pte
[    0.151347] 0xfffffffefc5e0000-0xfffffffefc631000         324K                           pte
[    0.152593] 0xfffffffefc631000-0xfffffffefc655000         144K     RW                 x  pte
[    0.154143] 0xfffffffefc655000-0xfffffffefc801000        1712K                           pte
[    0.155437] 0xfffffffefc801000-0xfffffffefc831000         192K     RW                 x  pte
[    0.157004] 0xfffffffefc831000-0xfffffffefc881000         320K                           pte
[    0.158088] 0xfffffffefc881000-0xfffffffefca01000        1536K     RW                 x  pte
[    0.159712] 0xfffffffefca01000-0xfffffffefcb34000        1228K                           pte
[    0.161117] ... 36 entries skipped ...

But yeah, this issue needs to be addressed one way or the other as the
espfix dump skips the runtime services.

And frankly, I don't see where we're setting that ->max_lines thing but
it sounds like a promising thing to use. :)

Thanks.
Mathias Krause Oct. 7, 2014, 5:07 p.m. UTC | #3
On Tue, Oct 07, 2014 at 05:01:32PM +0200, Borislav Petkov wrote:
> On Fri, Oct 03, 2014 at 02:47:07PM +0100, Matt Fleming wrote:
> > Looks OK to me. Borislav?
> 
> It needs more work AFAICT because with it, espfix area gets cut off
> prematurely:
> 

I don't think so. See below...

> ...
> [    0.134611] ---[ Vmemmap ]---
> [    0.135003] 0xffffea0000000000-0xffffea0002000000          32M     RW         PSE GLB NX pmd
> [    0.136743] 0xffffea0002000000-0xffffea0040000000         992M                           pmd
> [    0.138091] 0xffffea0040000000-0xffffea8000000000         511G                           pud
> [    0.139611] 0xffffea8000000000-0xffffff0000000000       20992G                           pgd
> [    0.140610] ---[ ESPfix Area ]---
> [    0.141003] 0xffffff0000000000-0xffffff8000000000         512G                           pgd
> [    0.142614] 0xffffff8000000000-0xffffffef00000000         444G                           pud
> [    0.144088] ---[ EFI Runtime Services ]---
> [    0.144722] 0xffffffef00000000-0xfffffffec0000000          63G                           pud
> [    0.146090] 0xfffffffec0000000-0xfffffffefbe00000         958M                           pmd
> [    0.147613] 0xfffffffefbe00000-0xfffffffefbfe0000        1920K                           pte
> [    0.149003] 0xfffffffefbfe0000-0xfffffffefc000000         128K     RW                 x  pte
> [    0.150484] 0xfffffffefc000000-0xfffffffefc065000         404K                           pte
> [    0.151612] 0xfffffffefc065000-0xfffffffefc200000        1644K     RW                 x  pte
> [    0.153285] 0xfffffffefc200000-0xfffffffefc400000           2M     RW         PSE     x  pmd
> [    0.154721] 0xfffffffefc400000-0xfffffffefc5e0000        1920K     RW                 x  pte
> ...
> 
> and I think we want to see something more from the espfix area (this is
> what we have now):
> 
> [    0.138086] ---[ ESPfix Area ]---
> [    0.138590] 0xffffff0000000000-0xffffff8000000000         512G                           pgd
> [    0.140099] 0xffffff8000000000-0xfffffffec0000000         507G                           pud
> [    0.141444] 0xfffffffec0000000-0xfffffffefbe00000         958M                           pmd
> [    0.142597] 0xfffffffefbe00000-0xfffffffefbfe0000        1920K                           pte
> [    0.144086] 0xfffffffefbfe0000-0xfffffffefc000000         128K     RW                 x  pte
> [    0.145545] 0xfffffffefc000000-0xfffffffefc065000         404K                           pte
> [    0.146597] 0xfffffffefc065000-0xfffffffefc200000        1644K     RW                 x  pte
> [    0.148346] 0xfffffffefc200000-0xfffffffefc400000           2M     RW         PSE     x  pmd
> [    0.149776] 0xfffffffefc400000-0xfffffffefc5e0000        1920K     RW                 x  pte
> [    0.151347] 0xfffffffefc5e0000-0xfffffffefc631000         324K                           pte
> [    0.152593] 0xfffffffefc631000-0xfffffffefc655000         144K     RW                 x  pte
> [    0.154143] 0xfffffffefc655000-0xfffffffefc801000        1712K                           pte
> [    0.155437] 0xfffffffefc801000-0xfffffffefc831000         192K     RW                 x  pte
> [    0.157004] 0xfffffffefc831000-0xfffffffefc881000         320K                           pte
> [    0.158088] 0xfffffffefc881000-0xfffffffefca01000        1536K     RW                 x  pte
> [    0.159712] 0xfffffffefca01000-0xfffffffefcb34000        1228K                           pte
> [    0.161117] ... 36 entries skipped ...

What you can see here are actually the EFI runtime service mappings, not
the ESP fix area. Check the addresses and compare them. You should find
similarities ;) And, in fact, the EFI mappings are incomplete in the
second dump, i.e. the vanilla kernel one, because of the enforced limit
for the ESP fix area.

So, in your examples are actually *no* ESP fix area mappings as those
would be r/o. In fact, I think, the above dumps are the result of a
CONFIG_EFI_PGT_DUMP enabled kernel that dumps the page table after
setting up the EFI mappings. There are no ESP fix mappings in this dump
because those are only set up after the EFI runtime service mappings.
See the following code in init/main:

  #ifdef CONFIG_X86
      if (efi_enabled(EFI_RUNTIME_SERVICES))
          efi_enter_virtual_mode();
  #endif
  #ifdef CONFIG_X86_ESPFIX64
      /* Should be run before the first non-init thread is created */
      init_espfix_bsp();
  #endif

To get a more complete view of the mappings, have a look at the debugfs
file /sys/kernel/debug/kernel_page_tables.

For v3.17 I get (notice the missing EFI mappings):
...
---[ Vmemmap ]---
0xffffea0000000000-0xffffff0000000000          21T                           pgd
---[ ESPfix Area ]---
0xffffff0000000000-0xffffff3b00000000         236G                           pud
0xffffff3b00000000-0xffffff3b0000e000          56K                           pte
0xffffff3b0000e000-0xffffff3b0000f000           4K     ro             GLB NX pte
0xffffff3b0000f000-0xffffff3b0001e000          60K                           pte
0xffffff3b0001e000-0xffffff3b0001f000           4K     ro             GLB NX pte
0xffffff3b0001f000-0xffffff3b0002e000          60K                           pte
0xffffff3b0002e000-0xffffff3b0002f000           4K     ro             GLB NX pte
0xffffff3b0002f000-0xffffff3b0003e000          60K                           pte
0xffffff3b0003e000-0xffffff3b0003f000           4K     ro             GLB NX pte
0xffffff3b0003f000-0xffffff3b0004e000          60K                           pte
0xffffff3b0004e000-0xffffff3b0004f000           4K     ro             GLB NX pte
0xffffff3b0004f000-0xffffff3b0005e000          60K                           pte
0xffffff3b0005e000-0xffffff3b0005f000           4K     ro             GLB NX pte
0xffffff3b0005f000-0xffffff3b0006e000          60K                           pte
0xffffff3b0006e000-0xffffff3b0006f000           4K     ro             GLB NX pte
0xffffff3b0006f000-0xffffff3b0007e000          60K                           pte
... 131165 entries skipped ... 
---[ High Kernel Mapping ]---
0xffffffff80000000-0xffffffff81000000          16M                           pmd
...

For v3.17 plus this patch I get this:
...
---[ Vmemmap ]---
0xffffea0000000000-0xffffff0000000000          21T                           pgd
---[ ESPfix Area ]---
0xffffff0000000000-0xffffff5600000000         344G                           pud
0xffffff5600000000-0xffffff5600005000          20K                           pte
0xffffff5600005000-0xffffff5600006000           4K     ro             GLB NX pte
0xffffff5600006000-0xffffff5600015000          60K                           pte
0xffffff5600015000-0xffffff5600016000           4K     ro             GLB NX pte
0xffffff5600016000-0xffffff5600025000          60K                           pte
0xffffff5600025000-0xffffff5600026000           4K     ro             GLB NX pte
0xffffff5600026000-0xffffff5600035000          60K                           pte
0xffffff5600035000-0xffffff5600036000           4K     ro             GLB NX pte
0xffffff5600036000-0xffffff5600045000          60K                           pte
0xffffff5600045000-0xffffff5600046000           4K     ro             GLB NX pte
0xffffff5600046000-0xffffff5600055000          60K                           pte
0xffffff5600055000-0xffffff5600056000           4K     ro             GLB NX pte
0xffffff5600056000-0xffffff5600065000          60K                           pte
0xffffff5600065000-0xffffff5600066000           4K     ro             GLB NX pte
0xffffff5600066000-0xffffff5600075000          60K                           pte
... 131059 entries skipped ... 
---[ EFI Runtime Services ]---
0xffffffef00000000-0xfffffffec0000000          63G                           pud
0xfffffffec0000000-0xfffffffef8800000         904M                           pmd
0xfffffffef8800000-0xfffffffef89d0000        1856K                           pte
0xfffffffef89d0000-0xfffffffef8a00000         192K     RW                 x  pte
0xfffffffef8a00000-0xfffffffef8a75000         468K                           pte
0xfffffffef8a75000-0xfffffffef8c00000        1580K     RW                 x  pte
0xfffffffef8c00000-0xfffffffef8e00000           2M     RW         PSE     x  pmd
0xfffffffef8e00000-0xfffffffef8fd0000        1856K     RW                 x  pte
0xfffffffef8fd0000-0xfffffffef9041000         452K                           pte
0xfffffffef9041000-0xfffffffef9065000         144K     RW                 x  pte
0xfffffffef9065000-0xfffffffef9211000        1712K                           pte
0xfffffffef9211000-0xfffffffef9241000         192K     RW                 x  pte
0xfffffffef9241000-0xfffffffef9291000         320K                           pte
0xfffffffef9291000-0xfffffffef9411000        1536K     RW                 x  pte
0xfffffffef9411000-0xfffffffef9529000        1120K                           pte
0xfffffffef9529000-0xfffffffef9600000         860K     RW                 x  pte
0xfffffffef9600000-0xfffffffefa400000          14M     RW         PSE     x  pmd
0xfffffffefa400000-0xfffffffefa491000         580K     RW                 x  pte
0xfffffffefa491000-0xfffffffefa528000         604K                           pte
0xfffffffefa528000-0xfffffffefa529000           4K     RW                 x  pte
0xfffffffefa529000-0xfffffffefa708000        1916K                           pte
0xfffffffefa708000-0xfffffffefa728000         128K     RW                 x  pte
0xfffffffefa728000-0xfffffffefa907000        1916K                           pte
0xfffffffefa907000-0xfffffffefa908000           4K     RW                 x  pte
0xfffffffefa908000-0xfffffffefab06000        2040K                           pte
0xfffffffefab06000-0xfffffffefab07000           4K     RW                 x  pte
0xfffffffefab07000-0xfffffffefad05000        2040K                           pte
0xfffffffefad05000-0xfffffffefad06000           4K     RW                 x  pte
0xfffffffefad06000-0xfffffffefae07000        1028K                           pte
0xfffffffefae07000-0xfffffffefaf05000        1016K     RW                 x  pte
0xfffffffefaf05000-0xfffffffefb005000           1M                           pte
0xfffffffefb005000-0xfffffffefb007000           8K     RW                 x  pte
0xfffffffefb007000-0xfffffffefb1ea000        1932K                           pte
0xfffffffefb1ea000-0xfffffffefb205000         108K     RW                 x  pte
0xfffffffefb205000-0xfffffffefb3e1000        1904K                           pte
0xfffffffefb3e1000-0xfffffffefb3ea000          36K     RW                 x  pte
0xfffffffefb3ea000-0xfffffffefb5cf000        1940K                           pte
0xfffffffefb5cf000-0xfffffffefb600000         196K     RW                 x  pte
0xfffffffefb600000-0xfffffffefb800000           2M     RW         PSE     x  pmd
0xfffffffefb800000-0xfffffffefb9e1000        1924K     RW                 x  pte
0xfffffffefb9e1000-0xfffffffefbb26000        1300K                           pte
0xfffffffefbb26000-0xfffffffefbbcf000         676K     RW                 x  pte
0xfffffffefbbcf000-0xfffffffefbc80000         708K                           pte
0xfffffffefbc80000-0xfffffffefbd26000         664K     RW                 x  pte
0xfffffffefbd26000-0xfffffffefbe7d000        1372K                           pte
0xfffffffefbe7d000-0xfffffffefbe80000          12K     RW                 x  pte
0xfffffffefbe80000-0xfffffffefc05e000        1912K                           pte
0xfffffffefc05e000-0xfffffffefc07d000         124K     RW                 x  pte
0xfffffffefc07d000-0xfffffffefc237000        1768K                           pte
0xfffffffefc237000-0xfffffffefc25e000         156K     RW                 x  pte
0xfffffffefc25e000-0xfffffffefc434000        1880K                           pte
0xfffffffefc434000-0xfffffffefc437000          12K     RW                 x  pte
0xfffffffefc437000-0xfffffffefc62e000        2012K                           pte
0xfffffffefc62e000-0xfffffffefc634000          24K     RW                 x  pte
0xfffffffefc634000-0xfffffffefc82c000        2016K                           pte
0xfffffffefc634000-0xfffffffefc82c000        2016K                           pte
0xfffffffefc82c000-0xfffffffefc82e000           8K     RW                 x  pte
0xfffffffefc82e000-0xfffffffefca2a000        2032K                           pte
0xfffffffefca2a000-0xfffffffefca2c000           8K     RW                 x  pte
0xfffffffefca2c000-0xfffffffefcc28000        2032K                           pte
0xfffffffefcc28000-0xfffffffefcc2a000           8K     RW                 x  pte
0xfffffffefcc2a000-0xfffffffefce15000        1964K                           pte
0xfffffffefce15000-0xfffffffefce28000          76K     RW                 x  pte
0xfffffffefce28000-0xfffffffefd012000        1960K                           pte
0xfffffffefd012000-0xfffffffefd015000          12K     RW                 x  pte
0xfffffffefd015000-0xfffffffefd20e000        2020K                           pte
0xfffffffefd20e000-0xfffffffefd212000          16K     RW                 x  pte
0xfffffffefd212000-0xfffffffefd40d000        2028K                           pte
0xfffffffefd40d000-0xfffffffefd40e000           4K     RW                 x  pte
0xfffffffefd40e000-0xfffffffefd5e9000        1900K                           pte
0xfffffffefd5e9000-0xfffffffefd60d000         144K     RW                 x  pte
0xfffffffefd60d000-0xfffffffefd7e7000        1896K                           pte
0xfffffffefd7e7000-0xfffffffefd7e9000           8K     RW                 x  pte
0xfffffffefd7e9000-0xfffffffefd9e0000        2012K                           pte
0xfffffffefd9e0000-0xfffffffefd9e7000          28K     RW                 x  pte
0xfffffffefd9e7000-0xfffffffefdbdf000        2016K                           pte
0xfffffffefdbdf000-0xfffffffefdbe0000           4K     RW                 x  pte
0xfffffffefdbe0000-0xfffffffefddce000        1976K                           pte
0xfffffffefddce000-0xfffffffefdddf000          68K     RW                 x  pte
0xfffffffefdddf000-0xfffffffefdfcd000        1976K                           pte
0xfffffffefdfcd000-0xfffffffefdfce000           4K     RW                 x  pte
0xfffffffefdfce000-0xfffffffefe1af000        1924K                           pte
0xfffffffefe1af000-0xfffffffefe1cd000         120K     RW                 x  pte
0xfffffffefe1cd000-0xfffffffefe3ae000        1924K                           pte
0xfffffffefe3ae000-0xfffffffefe3af000           4K     RW                 x  pte
0xfffffffefe3af000-0xfffffffefe5a8000        2020K                           pte
0xfffffffefe5a8000-0xfffffffefe5ae000          24K     RW                 x  pte
0xfffffffefe5ae000-0xfffffffefe7a5000        2012K                           pte
0xfffffffefe7a5000-0xfffffffefe7a8000          12K     RW                 x  pte
0xfffffffefe7a8000-0xfffffffefe96d000        1812K                           pte
0xfffffffefe96d000-0xfffffffefe9a5000         224K     RW                 x  pte
0xfffffffefe9a5000-0xfffffffefeb69000        1808K                           pte
0xfffffffefeb69000-0xfffffffefeb6d000          16K     RW                 x  pte
0xfffffffefeb6d000-0xfffffffefed65000        2016K                           pte
0xfffffffefed65000-0xfffffffefed69000          16K     RW                 x  pte
0xfffffffefed69000-0xfffffffefef5f000        2008K                           pte
0xfffffffefef5f000-0xfffffffefef65000          24K     RW                 x  pte
0xfffffffefef65000-0xfffffffeff0dc000        1500K                           pte
0xfffffffeff0dc000-0xfffffffeff15f000         524K     RW                 x  pte
0xfffffffeff15f000-0xfffffffeff261000        1032K                           pte
0xfffffffeff261000-0xfffffffeff2dc000         492K     RW                 x  pte
0xfffffffeff2dc000-0xfffffffeff45f000        1548K                           pte
0xfffffffeff45f000-0xfffffffeff460000           4K     RW                 x  pte
0xfffffffeff460000-0xfffffffeff600000        1664K                           pte
0xfffffffeff600000-0xfffffffeff620000         128K     RW                 x  pte
0xfffffffeff620000-0xfffffffeff800000        1920K                           pte
0xfffffffeff800000-0xffffffff00000000           8M     RW         PSE     x  pmd
0xffffffff00000000-0xffffffff80000000           2G                           pud
---[ High Kernel Mapping ]---
...

The ESP fix area is trimmed, as expected. The EFI runtime service area
is, beside being rather lengthy, complete. Also, as expected.

> 
> But yeah, this issue needs to be addressed one way or the other as the
> espfix dump skips the runtime services.
> 
> And frankly, I don't see where we're setting that ->max_lines thing but
> it sounds like a promising thing to use. :)

It's the third parameter in the address_markers[] array in
arch/x86/mm/dump_pagetables.c:

  # ifdef CONFIG_X86_ESPFIX64
    { ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
  # endif

So, it's 16 entries for the ESP fix area. If it's set to 0, as it is for
all the other entries, no limitation applies.


Thanks,
Mathias

> 
> Thanks.
> 
> -- 
> Regards/Gruss,
>     Boris.
> --
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Borislav Petkov Oct. 8, 2014, 3:17 p.m. UTC | #4
On Tue, Oct 07, 2014 at 07:07:48PM +0200, Mathias Krause wrote:
> What you can see here are actually the EFI runtime service mappings, not
> the ESP fix area. Check the addresses and compare them. You should find
> similarities ;) And, in fact, the EFI mappings are incomplete in the
> second dump, i.e. the vanilla kernel one, because of the enforced limit
> for the ESP fix area.
> 
> So, in your examples are actually *no* ESP fix area mappings as those
> would be r/o. In fact, I think, the above dumps are the result of a
> CONFIG_EFI_PGT_DUMP enabled kernel that dumps the page table after
> setting up the EFI mappings. There are no ESP fix mappings in this dump
> because those are only set up after the EFI runtime service mappings.

Ok, I think I know what the deal is:

So, the ptdump we do to dmesg very early at boot is the EFI pagetable which
shouldn't have espfix mappings...

> See the following code in init/main:
> 
>   #ifdef CONFIG_X86
>       if (efi_enabled(EFI_RUNTIME_SERVICES))
>           efi_enter_virtual_mode();
>   #endif
>   #ifdef CONFIG_X86_ESPFIX64
>       /* Should be run before the first non-init thread is created */
>       init_espfix_bsp();
>   #endif

... exactly because of this: we're setting up the EFI mappings in
the EFI page table before we do the espfix mappings in the *kernel*
pagetable which is a separate one.

So, if we have to be really correct about it, the first dump to dmesg
which comes down the efi_enter_virtual_mode() path shouldn't contain the
espfix area at all.

Later dumps from debugfs cannot select the EFI pagetable so they should
not be dumping the EFI runtime services.

I don't have a good idea about how to do that right now though, maybe
the address markers should have flags or so...

Thanks.
Mathias Krause Oct. 8, 2014, 9:58 p.m. UTC | #5
On 8 October 2014 17:17, Borislav Petkov <bp@alien8.de> wrote:
> On Tue, Oct 07, 2014 at 07:07:48PM +0200, Mathias Krause wrote:
>> What you can see here are actually the EFI runtime service mappings, not
>> the ESP fix area. Check the addresses and compare them. You should find
>> similarities ;) And, in fact, the EFI mappings are incomplete in the
>> second dump, i.e. the vanilla kernel one, because of the enforced limit
>> for the ESP fix area.
>>
>> So, in your examples are actually *no* ESP fix area mappings as those
>> would be r/o. In fact, I think, the above dumps are the result of a
>> CONFIG_EFI_PGT_DUMP enabled kernel that dumps the page table after
>> setting up the EFI mappings. There are no ESP fix mappings in this dump
>> because those are only set up after the EFI runtime service mappings.
>
> Ok, I think I know what the deal is:
>
> So, the ptdump we do to dmesg very early at boot is the EFI pagetable which
> shouldn't have espfix mappings...
>
>> See the following code in init/main:
>>
>>   #ifdef CONFIG_X86
>>       if (efi_enabled(EFI_RUNTIME_SERVICES))
>>           efi_enter_virtual_mode();
>>   #endif
>>   #ifdef CONFIG_X86_ESPFIX64
>>       /* Should be run before the first non-init thread is created */
>>       init_espfix_bsp();
>>   #endif
>
> ... exactly because of this: we're setting up the EFI mappings in
> the EFI page table before we do the espfix mappings in the *kernel*
> pagetable which is a separate one.

Well, that is only partly correct. The call chain in efi_map_regions()
[ -> efi_map_region() -> __map_region() -> kernel_map_pages_in_pgd()
-> ..."magic"... ] does not only map the EFI regions in
trampoline_pgd, but also in kernel page table, i.e. init_level4_pgt.
That can easily be shown by looking at the kernel_page_tables debugfs
file on a running system. You'll notice large RWX portions covering
the "phys" mappings in the "Low Kernel Mapping" area and the "virt"
mappings in the "EFI Runtime Services" area. Now reboot with "noefi"
and see those be gone.

>
> So, if we have to be really correct about it, the first dump to dmesg
> which comes down the efi_enter_virtual_mode() path shouldn't contain the
> espfix area at all.

Correct -- because init_espfix_bsp() hadn't been called by then. Nice,
we agree on this, at least ;)

>
> Later dumps from debugfs cannot select the EFI pagetable so they should
> not be dumping the EFI runtime services.

Well, beside the debugfs file is always using init_level4_pgt, reality
shows the EFI mappings are visible there, too. So why omit them?

>
> I don't have a good idea about how to do that right now though, maybe
> the address markers should have flags or so...

Well, maybe I got it all wrong and there should be no EFI mappings in
the kernel page table at all? If so, how about fixing
kernel_map_pages_in_pgd() to not do so? It's you're code after all...
;)

Thanks,
Mathias

>
> Thanks.
>
> --
> Regards/Gruss,
>     Boris.
>
> Sent from a fat crate under my desk. Formatting is fine.
> --
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Borislav Petkov Oct. 8, 2014, 10:26 p.m. UTC | #6
On Wed, Oct 08, 2014 at 11:58:20PM +0200, Mathias Krause wrote:
> Well, that is only partly correct. The call chain in efi_map_regions()
> [ -> efi_map_region() -> __map_region() -> kernel_map_pages_in_pgd()
> -> ..."magic"... ] does not only map the EFI regions in
> trampoline_pgd, but also in kernel page table, i.e. init_level4_pgt.

No, this is completely correct. If it isn't, then it needs to be. We
can't have EFI mappings in the kernel page table for a reason.

EFI mappings only land in trampoline_pgd, not in the kernel page table,
.i.e *not* in init_level4_pgt. Look at what the first argument of every
invocation of kernel_map_pages_in_pgd() is.

> That can easily be shown by looking at the kernel_page_tables debugfs
> file on a running system. You'll notice large RWX portions covering
> the "phys" mappings in the "Low Kernel Mapping" area and the "virt"
> mappings in the "EFI Runtime Services" area. Now reboot with "noefi"
> and see those be gone.

You need to show me - I don't see them here, in my guest.

> Well, beside the debugfs file is always using init_level4_pgt, reality
> shows the EFI mappings are visible there, too. So why omit them?

Again, you need to show me - I don't see any EFI mappings in my setup
here when cat-ting /sys/kernel/debug/kernel_page_tables

> Well, maybe I got it all wrong and there should be no EFI mappings in
> the kernel page table at all? If so, how about fixing
> kernel_map_pages_in_pgd() to not do so? It's you're code after all...
> ;)

Well, if you can show me where kernel_map_pages_in_pgd() is called with
init_level4_pgt as a first argument, I'd gladly fix it.

The 3 calls to it in 3.17 are all in efi_64.c and everytime it is
real_mode_header->trampoline_pgd that gets handed down:

arch/x86/platform/efi/efi_64.c:161:     if (kernel_map_pages_in_pgd(pgd, pa_memmap, pa_memmap, num_pages, _PAGE_NX)) {
arch/x86/platform/efi/efi_64.c:187:     if (kernel_map_pages_in_pgd(pgd, text >> PAGE_SHIFT, text, npages, 0)) {
arch/x86/platform/efi/efi_64.c:210:     if (kernel_map_pages_in_pgd(pgd, md->phys_addr, va, md->num_pages, pf))

So show me please what exactly you're seeing.
Mathias Krause Oct. 12, 2014, 12:55 p.m. UTC | #7
On Thu, Oct 09, 2014 at 12:26:19AM +0200, Borislav Petkov wrote:
> On Wed, Oct 08, 2014 at 11:58:20PM +0200, Mathias Krause wrote:
> > Well, that is only partly correct. The call chain in efi_map_regions()
> > [ -> efi_map_region() -> __map_region() -> kernel_map_pages_in_pgd()
> > -> ..."magic"... ] does not only map the EFI regions in
> > trampoline_pgd, but also in kernel page table, i.e. init_level4_pgt.
> 
> No, this is completely correct. If it isn't, then it needs to be. We
> can't have EFI mappings in the kernel page table for a reason.

What would be the reason for not having the EFI mappings in kernel page
table? Don't get me wrong, I don't want those either, but are there
other reasons beside you(?) and me not liking rwx mappings of firmware
code and data in the kernel address space?

> EFI mappings only land in trampoline_pgd, not in the kernel page table,
> .i.e *not* in init_level4_pgt. Look at what the first argument of every
> invocation of kernel_map_pages_in_pgd() is.

I can see the first argument of kernel_map_pages_in_pgd() but that
doesn't mean the EFI mappings wont be added to the kernel page table as
well. In fact, they are -- as I've shown you multiple times already and
figured the reason why, meanwhile. The reason lies in how trampoline_pgd
gets set up in arch/x86/realmode/init.c:

  trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
  trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd;
  trampoline_pgd[511] = init_level4_pgt[511].pgd;

This means, trampoline_pgd[0] is effectively just an alias for
init_level4_pgt[pgd_index(__PAGE_OFFSET)], trampoline_pgd[511] one for
init_level4_pgt[511].

So, when adding the EFI physical mappings to trampoline_pgd[0], we're
actually messing with init_level4_pgt[pgd_index(__PAGE_OFFSET)]. When
adding the virtual mappings, we're messing with init_level4_pgt[511]. So
we *are*, in fact, adding the EFI mappings to the kernel page table.

There's a lengthy comment in arch/x86/platform/efi/efi.c that mentions
the duplication of pgd entries -- and therefore whole hierarchies --
between trampoline_pgd and init_level4_pgt. And, ironically, that
comment is yours from earlier this year. Looks like you forgot about
that in the meantime ;)

> 
> > That can easily be shown by looking at the kernel_page_tables debugfs
> > file on a running system. You'll notice large RWX portions covering
> > the "phys" mappings in the "Low Kernel Mapping" area and the "virt"
> > mappings in the "EFI Runtime Services" area. Now reboot with "noefi"
> > and see those be gone.
> 
> You need to show me - I don't see them here, in my guest.

I thought I did so in my previous emails when showing you the content of
my /sys/kernel/debug/kernel_page_tables file. I even highlighted the EFI
mappings in your dumps -- wrongly labeled as "ESPfix Area". But see
below...

> 
> > Well, beside the debugfs file is always using init_level4_pgt, reality
> > shows the EFI mappings are visible there, too. So why omit them?
> 
> Again, you need to show me - I don't see any EFI mappings in my setup
> here when cat-ting /sys/kernel/debug/kernel_page_tables
> 

Three prerequisites:

1/ Have you applied the patch marking the EFI mappings as "EFI Runtime
   Services"? If not, they will be hidden behind the "ESPfix Area".
2/ Is the guest you've run your tests on EFI enabled? If not, you wont
   see any EFI mappings.
3/ Did you put "noefi" in your kernel command line? If so, no mappings
   either.

After checking the above, the "EFI Runtime Services" area should contain
a few rwx EFI mappings.

> > Well, maybe I got it all wrong and there should be no EFI mappings in
> > the kernel page table at all? If so, how about fixing
> > kernel_map_pages_in_pgd() to not do so? It's you're code after all...
> > ;)
> 
> Well, if you can show me where kernel_map_pages_in_pgd() is called with
> init_level4_pgt as a first argument, I'd gladly fix it.

It's not. But that's not the point. It's the sharing of pgd hierarchies
of trampoline_pgd with init_level4_pgt I've explained above that makes
mappings in the former apply to the latter as well.

> 
> The 3 calls to it in 3.17 are all in efi_64.c and everytime it is
> real_mode_header->trampoline_pgd that gets handed down:
> 
> arch/x86/platform/efi/efi_64.c:161:     if (kernel_map_pages_in_pgd(pgd, pa_memmap, pa_memmap, num_pages, _PAGE_NX)) {
> arch/x86/platform/efi/efi_64.c:187:     if (kernel_map_pages_in_pgd(pgd, text >> PAGE_SHIFT, text, npages, 0)) {
> arch/x86/platform/efi/efi_64.c:210:     if (kernel_map_pages_in_pgd(pgd, md->phys_addr, va, md->num_pages, pf))
> 
> So show me please what exactly you're seeing.

I see the EFI mappings in the kernel address space, i.e. through
init_level4_pgt. As those are rwx, they can easily be greped for.

Compare this (EFI enabled qemu system)..:

bbox:~# grep -e '---\|RW.*x' /sys/kernel/debug/kernel_page_tables
---[ User Space ]---
---[ Kernel Space ]---
---[ Low Kernel Mapping ]---
0xffff880000800000-0xffff880001000000           8M     RW         PSE GLB x  pmd
0xffff880001800000-0xffff880001a00000           2M     RW         PSE GLB x  pmd
0xffff880001a00000-0xffff880001a74000         464K     RW             GLB x  pte
0xffff88001c000000-0xffff88001c020000         128K     RW             GLB x  pte
0xffff88001e061000-0xffff88001e25e000        2036K     RW             GLB x  pte
0xffff88001e25e000-0xffff88001e27d000         124K     RW                 x  pte
0xffff88001e27d000-0xffff88001e280000          12K     RW             GLB x  pte
0xffff88001e280000-0xffff88001e3cf000        1340K     RW                 x  pte
0xffff88001e3cf000-0xffff88001e400000         196K     RW             GLB x  pte
0xffff88001e400000-0xffff88001e600000           2M     RW         PSE GLB x  pmd
0xffff88001e600000-0xffff88001e7e1000        1924K     RW             GLB x  pte
0xffff88001e7e1000-0xffff88001e7ea000          36K     RW                 x  pte
0xffff88001e7ea000-0xffff88001e905000        1132K     RW             GLB x  pte
0xffff88001e905000-0xffff88001e906000           4K     RW                 x  pte
0xffff88001e906000-0xffff88001e907000           4K     RW             GLB x  pte
0xffff88001e907000-0xffff88001e908000           4K     RW                 x  pte
0xffff88001e908000-0xffff88001e928000         128K     RW             GLB x  pte
0xffff88001e928000-0xffff88001e929000           4K     RW                 x  pte
0xffff88001e929000-0xffff88001ea00000         860K     RW             GLB x  pte
0xffff88001ea00000-0xffff88001f800000          14M     RW         PSE GLB x  pmd
0xffff88001f800000-0xffff88001fa11000        2116K     RW             GLB x  pte
0xffff88001fa11000-0xffff88001fa65000         336K     RW                 x  pte
0xffff88001fa75000-0xffff88001fc00000        1580K     RW             GLB x  pte
0xffff88001fc00000-0xffff88001fe00000           2M     RW         PSE GLB x  pmd
0xffff88001fe00000-0xffff88001ffd0000        1856K     RW             GLB x  pte
0xffff88001ffd0000-0xffff880020000000         192K     RW                 x  pte
---[ vmalloc() Area ]---
---[ Vmemmap ]---
---[ ESPfix Area ]---
---[ EFI Runtime Services ]---
0xfffffffef93d0000-0xfffffffef9400000         192K     RW                 x  pte
0xfffffffef9475000-0xfffffffef9600000        1580K     RW                 x  pte
0xfffffffef9600000-0xfffffffef9800000           2M     RW         PSE     x  pmd
0xfffffffef9800000-0xfffffffef99d0000        1856K     RW                 x  pte
0xfffffffef9a41000-0xfffffffef9a65000         144K     RW                 x  pte
0xfffffffef9c11000-0xfffffffef9c41000         192K     RW                 x  pte
0xfffffffef9c91000-0xfffffffef9e11000        1536K     RW                 x  pte
0xfffffffef9f29000-0xfffffffefa000000         860K     RW                 x  pte
0xfffffffefa000000-0xfffffffefae00000          14M     RW         PSE     x  pmd
0xfffffffefae00000-0xfffffffefae91000         580K     RW                 x  pte
0xfffffffefaf28000-0xfffffffefaf29000           4K     RW                 x  pte
0xfffffffefb108000-0xfffffffefb128000         128K     RW                 x  pte
0xfffffffefb307000-0xfffffffefb308000           4K     RW                 x  pte
0xfffffffefb506000-0xfffffffefb507000           4K     RW                 x  pte
0xfffffffefb705000-0xfffffffefb706000           4K     RW                 x  pte
0xfffffffefb807000-0xfffffffefb905000        1016K     RW                 x  pte
0xfffffffefba05000-0xfffffffefba07000           8K     RW                 x  pte
0xfffffffefbbea000-0xfffffffefbc05000         108K     RW                 x  pte
0xfffffffefbde1000-0xfffffffefbdea000          36K     RW                 x  pte
0xfffffffefbfcf000-0xfffffffefc000000         196K     RW                 x  pte
0xfffffffefc000000-0xfffffffefc200000           2M     RW         PSE     x  pmd
0xfffffffefc200000-0xfffffffefc3e1000        1924K     RW                 x  pte
0xfffffffefc526000-0xfffffffefc5cf000         676K     RW                 x  pte
0xfffffffefc680000-0xfffffffefc726000         664K     RW                 x  pte
0xfffffffefc87d000-0xfffffffefc880000          12K     RW                 x  pte
0xfffffffefca5e000-0xfffffffefca7d000         124K     RW                 x  pte
0xfffffffefcc37000-0xfffffffefcc5e000         156K     RW                 x  pte
0xfffffffefce34000-0xfffffffefce37000          12K     RW                 x  pte
0xfffffffefd02e000-0xfffffffefd034000          24K     RW                 x  pte
0xfffffffefd22c000-0xfffffffefd22e000           8K     RW                 x  pte
0xfffffffefd42a000-0xfffffffefd42c000           8K     RW                 x  pte
0xfffffffefd628000-0xfffffffefd62a000           8K     RW                 x  pte
0xfffffffefd815000-0xfffffffefd828000          76K     RW                 x  pte
0xfffffffefda12000-0xfffffffefda15000          12K     RW                 x  pte
0xfffffffefdc0e000-0xfffffffefdc12000          16K     RW                 x  pte
0xfffffffefde0d000-0xfffffffefde0e000           4K     RW                 x  pte
0xfffffffefdfe9000-0xfffffffefe00d000         144K     RW                 x  pte
0xfffffffefe1e7000-0xfffffffefe1e9000           8K     RW                 x  pte
0xfffffffefe3e0000-0xfffffffefe3e7000          28K     RW                 x  pte
0xfffffffefe5df000-0xfffffffefe5e0000           4K     RW                 x  pte
0xfffffffefe7ce000-0xfffffffefe7df000          68K     RW                 x  pte
0xfffffffefe9cd000-0xfffffffefe9ce000           4K     RW                 x  pte
0xfffffffefebb8000-0xfffffffefebcd000          84K     RW                 x  pte
0xfffffffefedb6000-0xfffffffefedb8000           8K     RW                 x  pte
0xfffffffefefb0000-0xfffffffefefb6000          24K     RW                 x  pte
0xfffffffeff1a6000-0xfffffffeff1b0000          40K     RW                 x  pte
0xfffffffeff2de000-0xfffffffeff3a6000         800K     RW                 x  pte
0xfffffffeff461000-0xfffffffeff4de000         500K     RW                 x  pte
0xfffffffeff600000-0xfffffffeff620000         128K     RW                 x  pte
0xfffffffeff800000-0xffffffff00000000           8M     RW         PSE     x  pmd
---[ High Kernel Mapping ]---
0xffffffff81a74000-0xffffffff81c00000        1584K     RW             GLB x  pte
---[ Modules ]---
---[ End Modules ]---

..with that (same system booted with "noefi"):

bbox:~# grep -e '---\|RW.*x' /sys/kernel/debug/kernel_page_tables
---[ User Space ]---
---[ Kernel Space ]---
---[ Low Kernel Mapping ]---
---[ vmalloc() Area ]---
---[ Vmemmap ]---
---[ ESPfix Area ]---
---[ EFI Runtime Services ]---
---[ High Kernel Mapping ]---
0xffffffff81a74000-0xffffffff81c00000        1584K     RW             GLB x  pte
---[ Modules ]---
---[ End Modules ]---

The first grep shows the physical EFI mappings in the "Low Kernel
Mapping" area and the virtual ones in the "EFI Runtime Services" area.
The second grep has none as the EFI runtime services are disabled in
this case -- no EFI memory regions will be (re)mapped.

The writable mapping in the "High Kernel Mapping" for both dumps is
probably the heap as it starts right after __brk_limit -- so not EFI
related, probably just another bug ;)


Regards,
Mathias

> 
> -- 
> Regards/Gruss,
>     Boris.
> 
> Sent from a fat crate under my desk. Formatting is fine.
> --
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Borislav Petkov Oct. 28, 2014, 6:57 p.m. UTC | #8
On Sun, Oct 12, 2014 at 02:55:15PM +0200, Mathias Krause wrote:

...

> There's a lengthy comment in arch/x86/platform/efi/efi.c that mentions
> the duplication of pgd entries -- and therefore whole hierarchies --
> between trampoline_pgd and init_level4_pgt. And, ironically, that
> comment is yours from earlier this year. Looks like you forgot about
> that in the meantime ;)

Absolutely - you can see how crazy I am about UEFI :-)

Ok, thanks for refreshing this for me, your patch is good, so

Acked-by: Borislav Petkov <bp@suse.de>

What this whole story shows, however, is that the EFI mappings are in
fact in the kernel page table and this shouldn't be IMO - I'd like to
very much have them split because otherwise there's no need to switch
page tables at all. And besides, having UEFI in its own address space is
a good thing in itself anyway.

So, I've already hacked up something to have a completely separate EFI
page table - need to find out why it doesn't work yet but qemu was
b0rked until recently so we had to deal with that first... bla bla.

Thanks :-)
Mathias Krause Oct. 28, 2014, 7:48 p.m. UTC | #9
On 28 October 2014 19:57, Borislav Petkov <bp@alien8.de> wrote:
> [...]
>
> Ok, thanks for refreshing this for me, your patch is good, so
>
> Acked-by: Borislav Petkov <bp@suse.de>

Thanks. But as you said, the EFI mappings shouldn't be in the kernel's
page table in the first place, so I'd rather see a patch doing that
instead. But, in the meantime, this patch is valid, as it shows the
"status quo".

> What this whole story shows, however, is that the EFI mappings are in
> fact in the kernel page table and this shouldn't be IMO - I'd like to
> very much have them split because otherwise there's no need to switch
> page tables at all.

Indeed.

> And besides, having UEFI in its own address space is
> a good thing in itself anyway.
>
> So, I've already hacked up something to have a completely separate EFI
> page table - need to find out why it doesn't work yet

I tried so too but failed early as well. I tried putting the EFI
virtual mappings not in trampoline_pgd[511] but trampoline_pgd[510].
However, that didn't work out. I got page faults when trying to invoke
EFI functions, as, apparently, efi.systab was only mapped in the EFI
page table but not the kernel's page table -- at least not at the same
address. So when efi_call_virt() tries to dereference
efi.systab->runtime->f, it just traps.
I tried to hack around that by fiddling with get_systab_virt_addr() to
make it point to the direct mapping for the phys_addr but failed on
the first few attempts to get the math right. Then I noticed it was
way to late to hack EFI code and fell asleep. Next day I just gave up
and 'git reset --hard HEAD'. :(

> but qemu was
> b0rked until recently so we had to deal with that first... bla bla.

Debian's version of qemu + OVMF works fine here. Probably slightly
outdated but still good enough for testing EFI stuff ;)


Regards,
Mathias
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Borislav Petkov Oct. 28, 2014, 8:13 p.m. UTC | #10
On Tue, Oct 28, 2014 at 08:48:23PM +0100, Mathias Krause wrote:
> I tried so too but failed early as well. I tried putting the EFI
> virtual mappings not in trampoline_pgd[511] but trampoline_pgd[510].
> However, that didn't work out. I got page faults when trying to invoke
> EFI functions, as, apparently, efi.systab was only mapped in the EFI
> page table but not the kernel's page table -- at least not at the same
> address. So when efi_call_virt() tries to dereference
> efi.systab->runtime->f, it just traps.
> I tried to hack around that by fiddling with get_systab_virt_addr() to
> make it point to the direct mapping for the phys_addr but failed on
> the first few attempts to get the math right. Then I noticed it was
> way to late to hack EFI code and fell asleep. Next day I just gave up
> and 'git reset --hard HEAD'. :(

I know exactly what you mean. The nasty thing about it is, debugging
this is not trivial as once you switch to another page table, you don't
have a #PF handler and the guest triple-faults. All of the above issues
I've encountered already while hacking on the current code :-) *Cringe*

I want to do experimentation with tracing page faults in KVM with the
nested PF handling in hw turned off so that I can see all #PFs. Maybe
that'll tell me something more, we'll see.

> Debian's version of qemu + OVMF works fine here. Probably slightly
> outdated but still good enough for testing EFI stuff ;)

Yeah, Paolo fixed it already:

https://lkml.kernel.org/r/1414420306-2771-2-git-send-email-pbonzini@redhat.com

I think we want to keep qemu+kvm functioning :-)

Thanks.
Mathias Krause Oct. 28, 2014, 9:14 p.m. UTC | #11
On 28 October 2014 21:13, Borislav Petkov <bp@alien8.de> wrote:
> On Tue, Oct 28, 2014 at 08:48:23PM +0100, Mathias Krause wrote:
>> I tried so too but failed early as well. I tried putting the EFI
>> virtual mappings not in trampoline_pgd[511] but trampoline_pgd[510].
>> However, that didn't work out. I got page faults when trying to invoke
>> EFI functions, as, apparently, efi.systab was only mapped in the EFI
>> page table but not the kernel's page table -- at least not at the same
>> address. So when efi_call_virt() tries to dereference
>> efi.systab->runtime->f, it just traps.
>> I tried to hack around that by fiddling with get_systab_virt_addr() to
>> make it point to the direct mapping for the phys_addr but failed on
>> the first few attempts to get the math right. Then I noticed it was
>> way to late to hack EFI code and fell asleep. Next day I just gave up
>> and 'git reset --hard HEAD'. :(
>
> I know exactly what you mean. The nasty thing about it is, debugging
> this is not trivial as once you switch to another page table, you don't
> have a #PF handler and the guest triple-faults. All of the above issues
> I've encountered already while hacking on the current code :-) *Cringe*

Mapping the kernel into the EFI page table may help ;) Then the
kernel's #PF handler would be present and able to print a register
dump, at least.

So, assuming you're not mapping the EFI virtual mappings below the
pgd[511] hierarchy, making pgd[511] equal init_level4_pgt[511] should
help in this case. In fact, you need to map portions of the kernel
into the EFI page table anyway. Otherwise the EFI code wouldn't be
able to access, e.g., the data it should write to NVRAM. So the EFI
code would just trap and trigger a #PF -- and because of the missing
#PF handler, a #DF -- and because of the missing #DF handler the
triple fault. ;)

>
> I want to do experimentation with tracing page faults in KVM with the
> nested PF handling in hw turned off so that I can see all #PFs. Maybe
> that'll tell me something more, we'll see.

Oh, well. Have fun with that! I would take the "map kernel into EFI
page table" route instead. ;)

>
>> Debian's version of qemu + OVMF works fine here. Probably slightly
>> outdated but still good enough for testing EFI stuff ;)
>
> Yeah, Paolo fixed it already:
>
> https://lkml.kernel.org/r/1414420306-2771-2-git-send-email-pbonzini@redhat.com
>
> I think we want to keep qemu+kvm functioning :-)

Of course! It makes testing a lot easier.


Cheers,
Mathias
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Borislav Petkov Oct. 28, 2014, 9:26 p.m. UTC | #12
On Tue, Oct 28, 2014 at 10:14:25PM +0100, Mathias Krause wrote:
> Oh, well. Have fun with that! I would take the "map kernel into EFI
> page table" route instead. ;)

Actually, I want to try to keep them completely separate and sync only
before an EFI RT call for function arguments. And then remove PGDs after
I return from it. We'll see how it all works out.
Mathias Krause Oct. 28, 2014, 9:49 p.m. UTC | #13
On 28 October 2014 22:26, Borislav Petkov <bp@alien8.de> wrote:
> On Tue, Oct 28, 2014 at 10:14:25PM +0100, Mathias Krause wrote:
>> Oh, well. Have fun with that! I would take the "map kernel into EFI
>> page table" route instead. ;)
>
> Actually, I want to try to keep them completely separate and sync only
> before an EFI RT call for function arguments.

Sync only data or kernel code, too?

> And then remove PGDs after
> I return from it. We'll see how it all works out.

That shouldn't be needed as you're switching away from the EFI page
table, so its entries wouldn't be effective any more anyway.

Really, I'd just map the EFI RT service virtual mappings "somewhere"
but at pgd[511] and have pgd[511] initially set up as
init_level4_pgt[511]. Then, thing should "just work"(TM).

If you fear the EFI code might do harm to the kernel code/data, then
you've lost anyway. Nothing will prevent the EFI code from doing nasty
things -- like setting up its own mappings to tamper with the kernel's
memory.

Regards,
Mathias
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Borislav Petkov Oct. 28, 2014, 10:07 p.m. UTC | #14
On Tue, Oct 28, 2014 at 10:49:36PM +0100, Mathias Krause wrote:
> Sync only data or kernel code, too?

Data only should be enough.

> Really, I'd just map the EFI RT service virtual mappings "somewhere"
> but at pgd[511] and have pgd[511] initially set up as
> init_level4_pgt[511]. Then, thing should "just work"(TM).

Nah, nothing with EFI just works :-)

> If you fear the EFI code might do harm to the kernel code/data, then
> you've lost anyway. Nothing will prevent the EFI code from doing nasty
> things -- like setting up its own mappings to tamper with the kernel's
> memory.

Sure, nothing would surprise me anymore. However, I'd prefer to not be
an enabler. :)
Mathias Krause Oct. 29, 2014, 8:06 a.m. UTC | #15
On 28 October 2014 23:07, Borislav Petkov <bp@alien8.de> wrote:
> On Tue, Oct 28, 2014 at 10:49:36PM +0100, Mathias Krause wrote:
>> Sync only data or kernel code, too?
>
> Data only should be enough.

No, not really. This is why:

1/ When setting up the virtual mapping via a call to the
SetVirtualAddressMap EFI service we're running with interrupts
enabled, as we skip to disable them in efi_call_phys_prolog(). This
means, IRQs might happen and, in fact, they *do* happen. I know for
sure, because I had to debug an issue with a "wbinvd" instruction
within that EFI code delaying execution long enough, that the timer
interrupt triggers. So we should be prepared to handle those by having
the kernel mapped during the EFI call or bad things will happen.
2/ When the EFI code traps, e.g., because the EFI code is buggy or the
arguments the kernel provides are borked, not having a trap handler
for #PF, #GP, #UD, you name it, will make the system just triple fault
and reset. From a user perspective, that's not nice. ;)

So, having the kernel mapped during EFI calls is kinda required. I
rather prefer seeing the system panic with a backtrace instead of just
triple faulting.

>> Really, I'd just map the EFI RT service virtual mappings "somewhere"
>> but at pgd[511] and have pgd[511] initially set up as
>> init_level4_pgt[511]. Then, thing should "just work"(TM).
>
> Nah, nothing with EFI just works :-)

Yeah, learned it the hard way, too ;)

>
>> If you fear the EFI code might do harm to the kernel code/data, then
>> you've lost anyway. Nothing will prevent the EFI code from doing nasty
>> things -- like setting up its own mappings to tamper with the kernel's
>> memory.
>
> Sure, nothing would surprise me anymore. However, I'd prefer to not be
> an enabler. :)

We have the EFI code 'n data mapped RWX into the kernel address space
at predictable addresses, now. It can't get any worse ;)


Regards,
Mathias
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Matt Fleming Oct. 29, 2014, 2:20 p.m. UTC | #16
On Tue, 28 Oct, at 10:14:25PM, Mathias Krause wrote:
> 
> Mapping the kernel into the EFI page table may help ;) Then the
> kernel's #PF handler would be present and able to print a register
> dump, at least.
 
The kernel is already mapped into the EFI page table.

> So, assuming you're not mapping the EFI virtual mappings below the
> pgd[511] hierarchy, making pgd[511] equal init_level4_pgt[511] should
> help in this case. In fact, you need to map portions of the kernel
> into the EFI page table anyway. Otherwise the EFI code wouldn't be
> able to access, e.g., the data it should write to NVRAM. So the EFI
> code would just trap and trigger a #PF -- and because of the missing
> #PF handler, a #DF -- and because of the missing #DF handler the
> triple fault. ;)
 
Exactly.

We don't setup a separate page table for EFI calls for any kind of
isolation, we do it to make use of the existing 1:1 mappings in
trampoline_pgd because some firmware directly reference physical
addresses at runtime. It actually doesn't work too well in practice,
because you soon hit other issues on those firmware, but there you go.

So the fact that we have EFI mappings in init_level4_pgt[] isn't
indicative of any kind of bug, it's potentially a bit unclean, but
that's about it.
Matt Fleming Oct. 29, 2014, 2:22 p.m. UTC | #17
On Tue, 28 Oct, at 08:48:23PM, Mathias Krause wrote:
> On 28 October 2014 19:57, Borislav Petkov <bp@alien8.de> wrote:
> > [...]
> >
> > Ok, thanks for refreshing this for me, your patch is good, so
> >
> > Acked-by: Borislav Petkov <bp@suse.de>
> 
> Thanks. But as you said, the EFI mappings shouldn't be in the kernel's
> page table in the first place, so I'd rather see a patch doing that
> instead. But, in the meantime, this patch is valid, as it shows the
> "status quo".

Now, everyone agrees this patch is OK? There are no comments that still
need addressing?
Mathias Krause Oct. 29, 2014, 3:19 p.m. UTC | #18
On 29 October 2014 15:20, Matt Fleming <matt@console-pimps.org> wrote:
> On Tue, 28 Oct, at 10:14:25PM, Mathias Krause wrote:
>>
>> Mapping the kernel into the EFI page table may help ;) Then the
>> kernel's #PF handler would be present and able to print a register
>> dump, at least.
>
> The kernel is already mapped into the EFI page table.

I was referring to Boris' ongoing work, trying to completely separate
the EFI page table from the kernel's. He was hinting to only map the
data parts of the kernel into the EFI page table and only for the
actual EFI call. But that's not such a good idea, IMHO, as explained
below.

>
>> So, assuming you're not mapping the EFI virtual mappings below the
>> pgd[511] hierarchy, making pgd[511] equal init_level4_pgt[511] should
>> help in this case. In fact, you need to map portions of the kernel
>> into the EFI page table anyway. Otherwise the EFI code wouldn't be
>> able to access, e.g., the data it should write to NVRAM. So the EFI
>> code would just trap and trigger a #PF -- and because of the missing
>> #PF handler, a #DF -- and because of the missing #DF handler the
>> triple fault. ;)
>
> Exactly.

>
> We don't setup a separate page table for EFI calls for any kind of
> isolation, we do it to make use of the existing 1:1 mappings in
> trampoline_pgd because some firmware directly reference physical
> addresses at runtime.

Ah, that makes sense now. I though we need those only for the
SetVirtualAddressMap transition.

> It actually doesn't work too well in practice,
> because you soon hit other issues on those firmware, but there you go.
>
> So the fact that we have EFI mappings in init_level4_pgt[] isn't
> indicative of any kind of bug, it's potentially a bit unclean, but
> that's about it.

Well, not only unclean but ugly, because of the RWX mappings. That's
all I was complaining about. I tried to make those r/o and nx during
normal operation and only change the attributes to RWX for the EFI
call but unfortunately set_memory_{x,nx,ro,rw} don't like to be called
with interrupts/preemption disabled.
Maybe moving the EFI virtual mappings to another pgd slot will make it
possible as in this case only the pgd entry needs to be modified. But
I leave those experiments to Boris. I had enough "fun" with EFI
already ;)

Regards,
Mathias
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Mathias Krause Oct. 29, 2014, 3:22 p.m. UTC | #19
On 29 October 2014 15:22, Matt Fleming <matt@console-pimps.org> wrote:
> On Tue, 28 Oct, at 08:48:23PM, Mathias Krause wrote:
>> On 28 October 2014 19:57, Borislav Petkov <bp@alien8.de> wrote:
>> > [...]
>> >
>> > Ok, thanks for refreshing this for me, your patch is good, so
>> >
>> > Acked-by: Borislav Petkov <bp@suse.de>
>>
>> Thanks. But as you said, the EFI mappings shouldn't be in the kernel's
>> page table in the first place, so I'd rather see a patch doing that
>> instead. But, in the meantime, this patch is valid, as it shows the
>> "status quo".
>
> Now, everyone agrees this patch is OK? There are no comments that still
> need addressing?

The patch was fine from the beginning. The discussion kind of hijacked
the thread to argue about the underlying issue. So, the patch is still
good for me to be merged.

Thanks,
Mathias
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Mathias Krause Nov. 11, 2014, 9:59 p.m. UTC | #20
On 29 October 2014 15:22, Matt Fleming <matt@console-pimps.org> wrote:
> On Tue, 28 Oct, at 08:48:23PM, Mathias Krause wrote:
>> On 28 October 2014 19:57, Borislav Petkov <bp@alien8.de> wrote:
>> > [...]
>> >
>> > Ok, thanks for refreshing this for me, your patch is good, so
>> >
>> > Acked-by: Borislav Petkov <bp@suse.de>
>>
>> Thanks. But as you said, the EFI mappings shouldn't be in the kernel's
>> page table in the first place, so I'd rather see a patch doing that
>> instead. But, in the meantime, this patch is valid, as it shows the
>> "status quo".
>
> Now, everyone agrees this patch is OK? There are no comments that still
> need addressing?

Matt, do you mind to take this patch (and only this patch) through the EFI tree?
There were objections to patch 2 and 3 of this series from the x86
folks so I won't re-do those.

Regards,
Mathias
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Matt Fleming Nov. 11, 2014, 10:32 p.m. UTC | #21
On Tue, 11 Nov, at 10:59:07PM, Mathias Krause wrote:
> 
> Matt, do you mind to take this patch (and only this patch) through the EFI tree?
> There were objections to patch 2 and 3 of this series from the x86
> folks so I won't re-do those.

Applied with Borislav's ACK, thanks Mathias.

Patch
diff mbox series

diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 7166e25ecb57..602b6028c5b6 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -63,6 +63,8 @@  typedef struct { pteval_t pte; } pte_t;
 #define MODULES_LEN   (MODULES_END - MODULES_VADDR)
 #define ESPFIX_PGD_ENTRY _AC(-2, UL)
 #define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT)
+#define EFI_VA_START	 ( -4 * (_AC(1, UL) << 30))
+#define EFI_VA_END	 (-68 * (_AC(1, UL) << 30))
 
 #define EARLY_DYNAMIC_PAGE_TABLES	64
 
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 95a427e57887..1a8053d1012e 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -76,6 +76,9 @@  static struct addr_marker address_markers[] = {
 # ifdef CONFIG_X86_ESPFIX64
 	{ ESPFIX_BASE_ADDR,	"ESPfix Area", 16 },
 # endif
+# ifdef CONFIG_EFI
+	{ EFI_VA_END,		"EFI Runtime Services" },
+# endif
 	{ __START_KERNEL_map,   "High Kernel Mapping" },
 	{ MODULES_VADDR,        "Modules" },
 	{ MODULES_END,          "End Modules" },
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 290d397e1dd9..899c7f17ad85 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -48,8 +48,7 @@  static unsigned long efi_flags __initdata;
  * We allocate runtime services regions bottom-up, starting from -4G, i.e.
  * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G.
  */
-static u64 efi_va	= -4 * (1UL << 30);
-#define EFI_VA_END	(-68 * (1UL << 30))
+static u64 efi_va = EFI_VA_START;
 
 /*
  * Scratch space used for switching the pagetable in the EFI stub