linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] x86/mm: warn on W+x mappings
@ 2015-10-02 19:29 Stephen Smalley
  2015-10-02 20:44 ` Kees Cook
  2015-10-03 11:27 ` Ingo Molnar
  0 siblings, 2 replies; 36+ messages in thread
From: Stephen Smalley @ 2015-10-02 19:29 UTC (permalink / raw)
  To: x86; +Cc: linux-kernel, keescook, Stephen Smalley

Warn on any residual W+x mappings after setting NX
if DEBUG_WX is enabled.  Introduce a separate
X86_PTDUMP_CORE config that enables the code for
dumping the page tables without enabling the debugfs
interface, so that DEBUG_WX can be enabled without
exposing the debugfs interface.  Switch EFI_PGT_DUMP
to using X86_PTDUMP_CORE so that it also does not require
enabling the debugfs interface.

On success:
x86/mm: Checked W+X mappings: passed, no W+X pages found.

On failure:
------------[ cut here ]------------
WARNING: CPU: 1 PID: 1 at arch/x86/mm/dump_pagetables.c:226 note_page+0x610/0x7b0()
x86/mm: Found insecure W+X mapping at address ffffffff81755000/__stop___ex_table+0xfa8/0xabfa8
Modules linked in:
CPU: 1 PID: 1 Comm: swapper/0 Tainted: G        W       4.3.0-rc3+ #19
 0000000000000000 00000000e96b193f ffff88042c5dbd48 ffffffff81380a5f
 ffff88042c5dbd90 ffff88042c5dbd80 ffffffff8109d3f2 80000000000001e1
 0000000000000003 ffff88042c5dbe90 ffff88042c5dbe90 0000000000000000
Call Trace:
 [<ffffffff81380a5f>] dump_stack+0x44/0x55
 [<ffffffff8109d3f2>] warn_slowpath_common+0x82/0xc0
 [<ffffffff8109d48c>] warn_slowpath_fmt+0x5c/0x80
 [<ffffffff8106cfc9>] ? note_page+0x5c9/0x7b0
 [<ffffffff8106d010>] note_page+0x610/0x7b0
 [<ffffffff8106d409>] ptdump_walk_pgd_level_core+0x259/0x3c0
 [<ffffffff8106d5a7>] ptdump_walk_pgd_level_checkwx+0x17/0x20
 [<ffffffff81063905>] mark_rodata_ro+0xf5/0x100
 [<ffffffff817415a0>] ? rest_init+0x80/0x80
 [<ffffffff817415bd>] kernel_init+0x1d/0xe0
 [<ffffffff8174cd1f>] ret_from_fork+0x3f/0x70
 [<ffffffff817415a0>] ? rest_init+0x80/0x80
---[ end trace a1f23a1e42a2ac76 ]---
x86/mm: Checked W+X mappings: FAILED, 171 W+X pages found.

Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov>
---
v2 addresses Kees' concern about being able to enable this check
without enabling the debugfs interface, and reworks the output to
present failure and success in the manner suggested by Ingo.

 arch/x86/Kconfig.debug         | 19 ++++++++++++++++++-
 arch/x86/include/asm/pgtable.h |  7 +++++++
 arch/x86/mm/Makefile           |  2 +-
 arch/x86/mm/dump_pagetables.c  | 42 +++++++++++++++++++++++++++++++++++++++++-
 arch/x86/mm/init_64.c          |  2 ++
 5 files changed, 69 insertions(+), 3 deletions(-)

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index d8c0d32..c6fe16b 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -65,10 +65,14 @@ config EARLY_PRINTK_EFI
 	  This is useful for kernel debugging when your machine crashes very
 	  early before the console code is initialized.
 
+config X86_PTDUMP_CORE
+	def_bool n
+
 config X86_PTDUMP
 	bool "Export kernel pagetable layout to userspace via debugfs"
 	depends on DEBUG_KERNEL
 	select DEBUG_FS
+	select X86_PTDUMP_CORE
 	---help---
 	  Say Y here if you want to show the kernel pagetable layout in a
 	  debugfs file. This information is only useful for kernel developers
@@ -79,13 +83,26 @@ config X86_PTDUMP
 
 config EFI_PGT_DUMP
 	bool "Dump the EFI pagetable"
-	depends on EFI && X86_PTDUMP
+	depends on EFI
+	select X86_PTDUMP_CORE
 	---help---
 	  Enable this if you want to dump the EFI page table before
 	  enabling virtual mode. This can be used to debug miscellaneous
 	  issues with the mapping of the EFI runtime regions into that
 	  table.
 
+config DEBUG_WX
+	bool "Warn on W+X mappings at boot"
+	select X86_PTDUMP_CORE
+	---help---
+	  Generate a warning if any W+X mappings are found at boot.
+	  This is useful for discovering cases where the kernel is leaving
+	  W+X mappings after applying NX, as such mappings are a security risk.
+	  Look for a message in dmesg output like this:
+	  x86/mm: Checked W+X mappings: passed, no W+X pages found.
+	  or like this:
+	  x86/mm: Checked W+X mappings: FAILED, <N> W+X pages found.
+
 config DEBUG_RODATA
 	bool "Write protect kernel read-only data structures"
 	default y
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 867da5b..f2b6bed 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -19,6 +19,13 @@
 #include <asm/x86_init.h>
 
 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
+void ptdump_walk_pgd_level_checkwx(void);
+
+#ifdef CONFIG_DEBUG_WX
+#define debug_checkwx() ptdump_walk_pgd_level_checkwx()
+#else
+#define debug_checkwx() do { } while (0)
+#endif
 
 /*
  * ZERO_PAGE is a global shared page that is always zero: used
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index a482d10..65c47fd 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -14,7 +14,7 @@ obj-$(CONFIG_SMP)		+= tlb.o
 obj-$(CONFIG_X86_32)		+= pgtable_32.o iomap_32.o
 
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
-obj-$(CONFIG_X86_PTDUMP)	+= dump_pagetables.o
+obj-$(CONFIG_X86_PTDUMP_CORE)	+= dump_pagetables.o
 
 obj-$(CONFIG_HIGHMEM)		+= highmem_32.o
 
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index f0cedf3..19c64af 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -32,6 +32,8 @@ struct pg_state {
 	const struct addr_marker *marker;
 	unsigned long lines;
 	bool to_dmesg;
+	bool check_wx;
+	unsigned long wx_pages;
 };
 
 struct addr_marker {
@@ -214,6 +216,16 @@ static void note_page(struct seq_file *m, struct pg_state *st,
 		const char *unit = units;
 		unsigned long delta;
 		int width = sizeof(unsigned long) * 2;
+		pgprotval_t pr = pgprot_val(st->current_prot);
+
+		if (st->check_wx && (pr & _PAGE_RW) && !(pr & _PAGE_NX)) {
+			WARN_ONCE(1,
+				  "x86/mm: Found insecure W+X mapping at address %p/%pS\n",
+				  (void *)st->start_address,
+				  (void *)st->start_address);
+			st->wx_pages += (st->current_address -
+					 st->start_address) / PAGE_SIZE;
+		}
 
 		/*
 		 * Now print the actual finished series
@@ -344,7 +356,8 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
 #define pgd_none(a)  pud_none(__pud(pgd_val(a)))
 #endif
 
-void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
+static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
+				       bool checkwx)
 {
 #ifdef CONFIG_X86_64
 	pgd_t *start = (pgd_t *) &init_level4_pgt;
@@ -359,6 +372,10 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
 		st.to_dmesg = true;
 	}
 
+	st.check_wx = checkwx;
+	if (checkwx)
+		st.wx_pages = 0;
+
 	for (i = 0; i < PTRS_PER_PGD; i++) {
 		st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
 		if (!pgd_none(*start)) {
@@ -378,8 +395,26 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
 	/* Flush out the last page */
 	st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
 	note_page(m, &st, __pgprot(0), 0);
+	if (!checkwx)
+		return;
+	if (st.wx_pages)
+		pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n",
+			st.wx_pages);
+	else
+		pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n");
+}
+
+void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
+{
+	ptdump_walk_pgd_level_core(m, pgd, false);
 }
 
+void ptdump_walk_pgd_level_checkwx(void)
+{
+	ptdump_walk_pgd_level_core(NULL, NULL, true);
+}
+
+#ifdef CONFIG_X86_PTDUMP
 static int ptdump_show(struct seq_file *m, void *v)
 {
 	ptdump_walk_pgd_level(m, NULL);
@@ -397,10 +432,13 @@ static const struct file_operations ptdump_fops = {
 	.llseek		= seq_lseek,
 	.release	= single_release,
 };
+#endif
 
 static int pt_dump_init(void)
 {
+#ifdef CONFIG_X86_PTDUMP
 	struct dentry *pe;
+#endif
 
 #ifdef CONFIG_X86_32
 	/* Not a compile-time constant on x86-32 */
@@ -412,10 +450,12 @@ static int pt_dump_init(void)
 	address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
 #endif
 
+#ifdef CONFIG_X86_PTDUMP
 	pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL,
 				 &ptdump_fops);
 	if (!pe)
 		return -ENOMEM;
+#endif
 
 	return 0;
 }
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 30564e2..f8b1573 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1150,6 +1150,8 @@ void mark_rodata_ro(void)
 	free_init_pages("unused kernel",
 			(unsigned long) __va(__pa_symbol(rodata_end)),
 			(unsigned long) __va(__pa_symbol(_sdata)));
+
+	debug_checkwx();
 }
 
 #endif
-- 
2.1.0


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-02 19:29 [PATCH v2] x86/mm: warn on W+x mappings Stephen Smalley
@ 2015-10-02 20:44 ` Kees Cook
  2015-10-03 11:27 ` Ingo Molnar
  1 sibling, 0 replies; 36+ messages in thread
From: Kees Cook @ 2015-10-02 20:44 UTC (permalink / raw)
  To: Stephen Smalley; +Cc: x86, LKML

On Fri, Oct 2, 2015 at 12:29 PM, Stephen Smalley <sds@tycho.nsa.gov> wrote:
> Warn on any residual W+x mappings after setting NX
> if DEBUG_WX is enabled.  Introduce a separate
> X86_PTDUMP_CORE config that enables the code for
> dumping the page tables without enabling the debugfs
> interface, so that DEBUG_WX can be enabled without
> exposing the debugfs interface.  Switch EFI_PGT_DUMP
> to using X86_PTDUMP_CORE so that it also does not require
> enabling the debugfs interface.
>
> On success:
> x86/mm: Checked W+X mappings: passed, no W+X pages found.
>
> On failure:
> ------------[ cut here ]------------
> WARNING: CPU: 1 PID: 1 at arch/x86/mm/dump_pagetables.c:226 note_page+0x610/0x7b0()
> x86/mm: Found insecure W+X mapping at address ffffffff81755000/__stop___ex_table+0xfa8/0xabfa8
> Modules linked in:
> CPU: 1 PID: 1 Comm: swapper/0 Tainted: G        W       4.3.0-rc3+ #19
>  0000000000000000 00000000e96b193f ffff88042c5dbd48 ffffffff81380a5f
>  ffff88042c5dbd90 ffff88042c5dbd80 ffffffff8109d3f2 80000000000001e1
>  0000000000000003 ffff88042c5dbe90 ffff88042c5dbe90 0000000000000000
> Call Trace:
>  [<ffffffff81380a5f>] dump_stack+0x44/0x55
>  [<ffffffff8109d3f2>] warn_slowpath_common+0x82/0xc0
>  [<ffffffff8109d48c>] warn_slowpath_fmt+0x5c/0x80
>  [<ffffffff8106cfc9>] ? note_page+0x5c9/0x7b0
>  [<ffffffff8106d010>] note_page+0x610/0x7b0
>  [<ffffffff8106d409>] ptdump_walk_pgd_level_core+0x259/0x3c0
>  [<ffffffff8106d5a7>] ptdump_walk_pgd_level_checkwx+0x17/0x20
>  [<ffffffff81063905>] mark_rodata_ro+0xf5/0x100
>  [<ffffffff817415a0>] ? rest_init+0x80/0x80
>  [<ffffffff817415bd>] kernel_init+0x1d/0xe0
>  [<ffffffff8174cd1f>] ret_from_fork+0x3f/0x70
>  [<ffffffff817415a0>] ? rest_init+0x80/0x80
> ---[ end trace a1f23a1e42a2ac76 ]---
> x86/mm: Checked W+X mappings: FAILED, 171 W+X pages found.
>
> Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov>

This looks good to me. I'd like to (separately) merge the x86, arm,
and arm64 ptdump logic so that this can be common infrastructure. Then
the other arch would gain this as well.

Acked-by: Kees Cook <keescook@chromium.org>

-Kees

> ---
> v2 addresses Kees' concern about being able to enable this check
> without enabling the debugfs interface, and reworks the output to
> present failure and success in the manner suggested by Ingo.
>
>  arch/x86/Kconfig.debug         | 19 ++++++++++++++++++-
>  arch/x86/include/asm/pgtable.h |  7 +++++++
>  arch/x86/mm/Makefile           |  2 +-
>  arch/x86/mm/dump_pagetables.c  | 42 +++++++++++++++++++++++++++++++++++++++++-
>  arch/x86/mm/init_64.c          |  2 ++
>  5 files changed, 69 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
> index d8c0d32..c6fe16b 100644
> --- a/arch/x86/Kconfig.debug
> +++ b/arch/x86/Kconfig.debug
> @@ -65,10 +65,14 @@ config EARLY_PRINTK_EFI
>           This is useful for kernel debugging when your machine crashes very
>           early before the console code is initialized.
>
> +config X86_PTDUMP_CORE
> +       def_bool n
> +
>  config X86_PTDUMP
>         bool "Export kernel pagetable layout to userspace via debugfs"
>         depends on DEBUG_KERNEL
>         select DEBUG_FS
> +       select X86_PTDUMP_CORE
>         ---help---
>           Say Y here if you want to show the kernel pagetable layout in a
>           debugfs file. This information is only useful for kernel developers
> @@ -79,13 +83,26 @@ config X86_PTDUMP
>
>  config EFI_PGT_DUMP
>         bool "Dump the EFI pagetable"
> -       depends on EFI && X86_PTDUMP
> +       depends on EFI
> +       select X86_PTDUMP_CORE
>         ---help---
>           Enable this if you want to dump the EFI page table before
>           enabling virtual mode. This can be used to debug miscellaneous
>           issues with the mapping of the EFI runtime regions into that
>           table.
>
> +config DEBUG_WX
> +       bool "Warn on W+X mappings at boot"
> +       select X86_PTDUMP_CORE
> +       ---help---
> +         Generate a warning if any W+X mappings are found at boot.
> +         This is useful for discovering cases where the kernel is leaving
> +         W+X mappings after applying NX, as such mappings are a security risk.
> +         Look for a message in dmesg output like this:
> +         x86/mm: Checked W+X mappings: passed, no W+X pages found.
> +         or like this:
> +         x86/mm: Checked W+X mappings: FAILED, <N> W+X pages found.
> +
>  config DEBUG_RODATA
>         bool "Write protect kernel read-only data structures"
>         default y
> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
> index 867da5b..f2b6bed 100644
> --- a/arch/x86/include/asm/pgtable.h
> +++ b/arch/x86/include/asm/pgtable.h
> @@ -19,6 +19,13 @@
>  #include <asm/x86_init.h>
>
>  void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
> +void ptdump_walk_pgd_level_checkwx(void);
> +
> +#ifdef CONFIG_DEBUG_WX
> +#define debug_checkwx() ptdump_walk_pgd_level_checkwx()
> +#else
> +#define debug_checkwx() do { } while (0)
> +#endif
>
>  /*
>   * ZERO_PAGE is a global shared page that is always zero: used
> diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
> index a482d10..65c47fd 100644
> --- a/arch/x86/mm/Makefile
> +++ b/arch/x86/mm/Makefile
> @@ -14,7 +14,7 @@ obj-$(CONFIG_SMP)             += tlb.o
>  obj-$(CONFIG_X86_32)           += pgtable_32.o iomap_32.o
>
>  obj-$(CONFIG_HUGETLB_PAGE)     += hugetlbpage.o
> -obj-$(CONFIG_X86_PTDUMP)       += dump_pagetables.o
> +obj-$(CONFIG_X86_PTDUMP_CORE)  += dump_pagetables.o
>
>  obj-$(CONFIG_HIGHMEM)          += highmem_32.o
>
> diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
> index f0cedf3..19c64af 100644
> --- a/arch/x86/mm/dump_pagetables.c
> +++ b/arch/x86/mm/dump_pagetables.c
> @@ -32,6 +32,8 @@ struct pg_state {
>         const struct addr_marker *marker;
>         unsigned long lines;
>         bool to_dmesg;
> +       bool check_wx;
> +       unsigned long wx_pages;
>  };
>
>  struct addr_marker {
> @@ -214,6 +216,16 @@ static void note_page(struct seq_file *m, struct pg_state *st,
>                 const char *unit = units;
>                 unsigned long delta;
>                 int width = sizeof(unsigned long) * 2;
> +               pgprotval_t pr = pgprot_val(st->current_prot);
> +
> +               if (st->check_wx && (pr & _PAGE_RW) && !(pr & _PAGE_NX)) {
> +                       WARN_ONCE(1,
> +                                 "x86/mm: Found insecure W+X mapping at address %p/%pS\n",
> +                                 (void *)st->start_address,
> +                                 (void *)st->start_address);
> +                       st->wx_pages += (st->current_address -
> +                                        st->start_address) / PAGE_SIZE;
> +               }
>
>                 /*
>                  * Now print the actual finished series
> @@ -344,7 +356,8 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
>  #define pgd_none(a)  pud_none(__pud(pgd_val(a)))
>  #endif
>
> -void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
> +static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
> +                                      bool checkwx)
>  {
>  #ifdef CONFIG_X86_64
>         pgd_t *start = (pgd_t *) &init_level4_pgt;
> @@ -359,6 +372,10 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
>                 st.to_dmesg = true;
>         }
>
> +       st.check_wx = checkwx;
> +       if (checkwx)
> +               st.wx_pages = 0;
> +
>         for (i = 0; i < PTRS_PER_PGD; i++) {
>                 st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
>                 if (!pgd_none(*start)) {
> @@ -378,8 +395,26 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
>         /* Flush out the last page */
>         st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
>         note_page(m, &st, __pgprot(0), 0);
> +       if (!checkwx)
> +               return;
> +       if (st.wx_pages)
> +               pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n",
> +                       st.wx_pages);
> +       else
> +               pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n");
> +}
> +
> +void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
> +{
> +       ptdump_walk_pgd_level_core(m, pgd, false);
>  }
>
> +void ptdump_walk_pgd_level_checkwx(void)
> +{
> +       ptdump_walk_pgd_level_core(NULL, NULL, true);
> +}
> +
> +#ifdef CONFIG_X86_PTDUMP
>  static int ptdump_show(struct seq_file *m, void *v)
>  {
>         ptdump_walk_pgd_level(m, NULL);
> @@ -397,10 +432,13 @@ static const struct file_operations ptdump_fops = {
>         .llseek         = seq_lseek,
>         .release        = single_release,
>  };
> +#endif
>
>  static int pt_dump_init(void)
>  {
> +#ifdef CONFIG_X86_PTDUMP
>         struct dentry *pe;
> +#endif
>
>  #ifdef CONFIG_X86_32
>         /* Not a compile-time constant on x86-32 */
> @@ -412,10 +450,12 @@ static int pt_dump_init(void)
>         address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
>  #endif
>
> +#ifdef CONFIG_X86_PTDUMP
>         pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL,
>                                  &ptdump_fops);
>         if (!pe)
>                 return -ENOMEM;
> +#endif
>
>         return 0;
>  }
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index 30564e2..f8b1573 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -1150,6 +1150,8 @@ void mark_rodata_ro(void)
>         free_init_pages("unused kernel",
>                         (unsigned long) __va(__pa_symbol(rodata_end)),
>                         (unsigned long) __va(__pa_symbol(_sdata)));
> +
> +       debug_checkwx();
>  }
>
>  #endif
> --
> 2.1.0
>



-- 
Kees Cook
Chrome OS Security

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-02 19:29 [PATCH v2] x86/mm: warn on W+x mappings Stephen Smalley
  2015-10-02 20:44 ` Kees Cook
@ 2015-10-03 11:27 ` Ingo Molnar
  2015-10-05 19:13   ` Stephen Smalley
  1 sibling, 1 reply; 36+ messages in thread
From: Ingo Molnar @ 2015-10-03 11:27 UTC (permalink / raw)
  To: Stephen Smalley; +Cc: x86, linux-kernel, keescook


* Stephen Smalley <sds@tycho.nsa.gov> wrote:

> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index 30564e2..f8b1573 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -1150,6 +1150,8 @@ void mark_rodata_ro(void)
>  	free_init_pages("unused kernel",
>  			(unsigned long) __va(__pa_symbol(rodata_end)),
>  			(unsigned long) __va(__pa_symbol(_sdata)));
> +
> +	debug_checkwx();

Any reason to not do this on NX capable 32-bit kernels as well?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-03 11:27 ` Ingo Molnar
@ 2015-10-05 19:13   ` Stephen Smalley
  2015-10-06  7:32     ` Ingo Molnar
  0 siblings, 1 reply; 36+ messages in thread
From: Stephen Smalley @ 2015-10-05 19:13 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: x86, linux-kernel, keescook

On 10/03/2015 07:27 AM, Ingo Molnar wrote:
> 
> * Stephen Smalley <sds@tycho.nsa.gov> wrote:
> 
>> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
>> index 30564e2..f8b1573 100644
>> --- a/arch/x86/mm/init_64.c
>> +++ b/arch/x86/mm/init_64.c
>> @@ -1150,6 +1150,8 @@ void mark_rodata_ro(void)
>>  	free_init_pages("unused kernel",
>>  			(unsigned long) __va(__pa_symbol(rodata_end)),
>>  			(unsigned long) __va(__pa_symbol(_sdata)));
>> +
>> +	debug_checkwx();
> 
> Any reason to not do this on NX capable 32-bit kernels as well?

Done in v3.  However, I do see lots of W+X mappings there.

[    1.012796] WARNING: CPU: 1 PID: 1 at arch/x86/mm/dump_pagetables.c:225 note_page+0x65d/0x840()
[    1.012803] x86/mm: Found insecure W+X mapping at address f4a00000/0xf4a00000
[    1.012805] Modules linked in:
[    1.012833] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 4.3.0-rc4+ #2
[    1.012837] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140709_153950- 04/01/2014
[    1.012844]  c0d32967 173b7da7 00000000 f7105e7c c0713490 f7105ebc f7105eac c045d077
[    1.012848]  c0c47ef8 f7105edc 00000001 c0c4de42 000000e1 c04551fd c04551fd f7105f3c
[    1.012851]  00000002 00000000 f7105ec8 c045d0ee 00000009 f7105ebc c0c47ef8 f7105edc
[    1.012855] Call Trace:
[    1.012868]  [<c0713490>] dump_stack+0x41/0x61
[    1.012871]  [<c045d077>] warn_slowpath_common+0x87/0xc0
[    1.012873]  [<c04551fd>] ? note_page+0x65d/0x840
[    1.012875]  [<c04551fd>] ? note_page+0x65d/0x840
[    1.012877]  [<c045d0ee>] warn_slowpath_fmt+0x3e/0x60
[    1.012878]  [<c04551fd>] note_page+0x65d/0x840
[    1.012880]  [<c04555b6>] ptdump_walk_pgd_level_core+0x1d6/0x2d0
[    1.012883]  [<c04557a6>] ptdump_walk_pgd_level_checkwx+0x16/0x20
[    1.012886]  [<c044ba15>] mark_rodata_ro+0x135/0x160
[    1.012898]  [<c0a9b33f>] kernel_init+0x1f/0xe0
[    1.012906]  [<c0484b41>] ? schedule_tail+0x11/0x50
[    1.012909]  [<c0aa0bc1>] ret_from_kernel_thread+0x21/0x30
[    1.012910]  [<c0a9b320>] ? rest_init+0x70/0x70
[    1.012912] ---[ end trace 40a4f3d5e8fb70ac ]---
[    1.012954] x86/mm: Checked W+X mappings: FAILED, 6556 W+X pages found.


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-05 19:13   ` Stephen Smalley
@ 2015-10-06  7:32     ` Ingo Molnar
  2015-10-06 15:37       ` Stephen Smalley
  0 siblings, 1 reply; 36+ messages in thread
From: Ingo Molnar @ 2015-10-06  7:32 UTC (permalink / raw)
  To: Stephen Smalley
  Cc: x86, linux-kernel, keescook, Thomas Gleixner, H. Peter Anvin,
	Peter Zijlstra, Andy Lutomirski, Borislav Petkov, Denys Vlasenko,
	Brian Gerst


* Stephen Smalley <sds@tycho.nsa.gov> wrote:

> On 10/03/2015 07:27 AM, Ingo Molnar wrote:
> > 
> > * Stephen Smalley <sds@tycho.nsa.gov> wrote:
> > 
> >> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> >> index 30564e2..f8b1573 100644
> >> --- a/arch/x86/mm/init_64.c
> >> +++ b/arch/x86/mm/init_64.c
> >> @@ -1150,6 +1150,8 @@ void mark_rodata_ro(void)
> >>  	free_init_pages("unused kernel",
> >>  			(unsigned long) __va(__pa_symbol(rodata_end)),
> >>  			(unsigned long) __va(__pa_symbol(_sdata)));
> >> +
> >> +	debug_checkwx();
> > 
> > Any reason to not do this on NX capable 32-bit kernels as well?
> 
> Done in v3.  However, I do see lots of W+X mappings there.

Ha! That's a debug check plan gone very well! :)

> [    1.012796] WARNING: CPU: 1 PID: 1 at arch/x86/mm/dump_pagetables.c:225 note_page+0x65d/0x840()
> [    1.012803] x86/mm: Found insecure W+X mapping at address f4a00000/0xf4a00000

What does this range correspond to on your kernel?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-06  7:32     ` Ingo Molnar
@ 2015-10-06 15:37       ` Stephen Smalley
  2015-10-12 11:36         ` Borislav Petkov
  0 siblings, 1 reply; 36+ messages in thread
From: Stephen Smalley @ 2015-10-06 15:37 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: x86, linux-kernel, keescook, Thomas Gleixner, H. Peter Anvin,
	Peter Zijlstra, Andy Lutomirski, Borislav Petkov, Denys Vlasenko,
	Brian Gerst

On 10/06/2015 03:32 AM, Ingo Molnar wrote:
> 
> * Stephen Smalley <sds@tycho.nsa.gov> wrote:
> 
>> On 10/03/2015 07:27 AM, Ingo Molnar wrote:
>>>
>>> * Stephen Smalley <sds@tycho.nsa.gov> wrote:
>>>
>>>> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
>>>> index 30564e2..f8b1573 100644
>>>> --- a/arch/x86/mm/init_64.c
>>>> +++ b/arch/x86/mm/init_64.c
>>>> @@ -1150,6 +1150,8 @@ void mark_rodata_ro(void)
>>>>  	free_init_pages("unused kernel",
>>>>  			(unsigned long) __va(__pa_symbol(rodata_end)),
>>>>  			(unsigned long) __va(__pa_symbol(_sdata)));
>>>> +
>>>> +	debug_checkwx();
>>>
>>> Any reason to not do this on NX capable 32-bit kernels as well?
>>
>> Done in v3.  However, I do see lots of W+X mappings there.
> 
> Ha! That's a debug check plan gone very well! :)
> 
>> [    1.012796] WARNING: CPU: 1 PID: 1 at arch/x86/mm/dump_pagetables.c:225 note_page+0x65d/0x840()
>> [    1.012803] x86/mm: Found insecure W+X mapping at address f4a00000/0xf4a00000
> 
> What does this range correspond to on your kernel?

>From dmesg:
[    0.000000] virtual kernel memory layout:
                   fixmap  : 0xffa96000 - 0xfffff000   (5540 kB)
                   pkmap   : 0xff800000 - 0xffa00000   (2048 kB)
                   vmalloc : 0xf7ffe000 - 0xff7fe000   ( 120 MB)
                   lowmem  : 0xc0000000 - 0xf77fe000   ( 887 MB)
                     .init : 0xc0dde000 - 0xc0e9d000   ( 764 kB)
                     .data : 0xc0aa2ba0 - 0xc0ddca00   (3303 kB)
                     .text : 0xc0400000 - 0xc0aa2ba0   (6794 kB)

/sys/kernel/debug/kernel_page_tables seems to have many such mappings,
even before the reported one under Kernel Mapping, plus one in the vmalloc() area:

---[ Kernel Mapping ]---
0xc0000000-0xc009b000         620K     RW                 GLB NX pte
0xc009b000-0xc009c000           4K     ro                 GLB NX pte
0xc009c000-0xc009d000           4K     ro                 GLB x  pte
0xc009d000-0xc0200000        1420K     RW                 GLB NX pte
0xc0200000-0xc0400000           2M     RW         PSE     GLB NX pmd
0xc0400000-0xc0a00000           6M     ro         PSE     GLB x  pmd
0xc0a00000-0xc0aa3000         652K     ro                 GLB x  pte
0xc0aa3000-0xc0d2a000        2588K     ro                 GLB NX pte
0xc0d2a000-0xc1000000        2904K     RW                 GLB NX pte
0xc1000000-0xe7000000         608M     RW         PSE     GLB NX pmd
0xe7000000-0xe7027000         156K     RW                 GLB x  pte
0xe7027000-0xe7028000           4K     ro                 GLB x  pte
0xe7028000-0xe709b000         460K     RW                 GLB x  pte
0xe709b000-0xe709c000           4K     ro                 GLB x  pte
0xe709c000-0xe70b8000         112K     RW                 GLB x  pte
0xe70b8000-0xe70b9000           4K     ro                 GLB x  pte
0xe70b9000-0xe7108000         316K     RW                 GLB x  pte
0xe7108000-0xe710a000           8K     ro                 GLB x  pte
0xe710a000-0xe7127000         116K     RW                 GLB x  pte
0xe7127000-0xe712a000          12K     ro                 GLB x  pte
<many additional ones elided>
0xf2c5c000-0xf2c5d000           4K     ro                 GLB x  pte
0xf2c5d000-0xf2e00000        1676K     RW                 GLB x  pte
0xf2e00000-0xf4a00000          28M     RW         PSE     GLB NX pmd
0xf4a00000-0xf4b28000        1184K     RW                 GLB x  pte
0xf4b28000-0xf4c00000         864K     RW                 GLB NX pte
0xf4c00000-0xf5200000           6M     RW         PSE     GLB x  pmd
0xf5200000-0xf525d000         372K     RW                 GLB x  pte
0xf525d000-0xf525e000           4K     ro                 GLB x  pte
0xf525e000-0xf525f000           4K     RW                 GLB x  pte
0xf525f000-0xf5260000           4K     ro                 GLB x  pte
0xf5260000-0xf526a000          40K     RW                 GLB x  pte
0xf6400000-0xf658c000        1584K     RW                 GLB NX pte
0xf658c000-0xf6600000         464K     RW                 GLB x  pte
0xf6600000-0xf7600000          16M     RW         PSE     GLB NX pmd
0xf7600000-0xf77fe000        2040K     RW                 GLB NX pte
0xf77fe000-0xf7800000           8K                               pte
0xf7800000-0xf7e00000           6M                               pmd
0xf7e00000-0xf7ffe000        2040K                               pte
---[ vmalloc() Area ]---
0xf7ffe000-0xf7fff000           4K     RW                 GLB NX pte
0xf7fff000-0xf8000000           4K                               pte
0xf8000000-0xf8002000           8K     RW                 GLB NX pte
...
0xf86f3000-0xf8800000        1076K                               pte
0xf8800000-0xf8a00000           2M     RW PWT     PSE     GLB x  pmd
0xf8a00000-0xf8b00000           1M     RW PWT             GLB NX pte

$ grep -c 'RW.*x' kernel_page_tables 
114

There was also an earlier W+X mapping originally that I squelched via pci=nobios.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-06 15:37       ` Stephen Smalley
@ 2015-10-12 11:36         ` Borislav Petkov
  2015-10-12 12:41           ` Matt Fleming
  0 siblings, 1 reply; 36+ messages in thread
From: Borislav Petkov @ 2015-10-12 11:36 UTC (permalink / raw)
  To: Stephen Smalley, Ingo Molnar, Matt Fleming
  Cc: x86, linux-kernel, keescook, Thomas Gleixner, H. Peter Anvin,
	Peter Zijlstra, Andy Lutomirski, Denys Vlasenko, Brian Gerst

On Tue, Oct 06, 2015 at 11:37:57AM -0400, Stephen Smalley wrote:
> > What does this range correspond to on your kernel?

Got a W+X splat here too, on the UEFI box with rc5+tip/master:

[    6.792949] rtc_cmos 00:02: setting system clock to 2015-10-12 11:17:03 UTC (1444648623)
[    6.807863] Freeing unused kernel memory: 1312K (ffffffff81f5f000 - ffffffff820a7000)
[    6.815831] usb 3-1: new high-speed USB device number 2 using ehci-pci
[    6.823261] Write protecting the kernel read-only data: 14336k
[    6.832196] Freeing unused kernel memory: 1796K (ffff88000383f000 - ffff880003a00000)
[    6.842210] Freeing unused kernel memory: 284K (ffff880003db9000 - ffff880003e00000)
[    6.850524] ------------[ cut here ]------------
[    6.855682] WARNING: CPU: 5 PID: 1 at arch/x86/mm/dump_pagetables.c:225 note_page+0x61e/0x7e0()
[    6.864944] x86/mm: Found insecure W+X mapping at address ffff88000005e000/0xffff88000005e000
[    6.874022] Modules linked in:
[    6.877643] CPU: 5 PID: 1 Comm: swapper/0 Not tainted 4.3.0-rc5+ #1
[    6.884462] Hardware name: Dell Inc. Precision T3600/0PTTT9, BIOS A13 05/11/2014
[    6.892416]  ffffffff81caf1f7 ffff88043bdffd60 ffffffff813aab2c ffff88043bdffda8
[    6.900460]  ffff88043bdffd98 ffffffff81066776 ffff880004e55308 0000000000000004
[    6.907816] usb 4-1: new high-speed USB device number 2 using ehci-pci
[    6.915499]  8000000000000163 ffff88043bdffe98 0000000000000000 ffff88043bdffdf8
[    6.923520] Call Trace:
[    6.926512]  [<ffffffff813aab2c>] dump_stack+0x4e/0x82
[    6.931551] usb 3-1: New USB device found, idVendor=8087, idProduct=0024
[    6.931552] usb 3-1: New USB device strings: Mfr=0, Product=0, SerialNumber=0
[    6.933120] hub 3-1:1.0: USB hub found
[    6.933369] hub 3-1:1.0: 6 ports detected
[    6.955784]  [<ffffffff81066776>] warn_slowpath_common+0x86/0xc0
[    6.962341]  [<ffffffff810667fc>] warn_slowpath_fmt+0x4c/0x50
[    6.968631]  [<ffffffff8105bb7e>] note_page+0x61e/0x7e0
[    6.974404]  [<ffffffff8105c09f>] ptdump_walk_pgd_level_core+0x35f/0x3f0
[    6.981651]  [<ffffffff8105c1d7>] ptdump_walk_pgd_level_checkwx+0x17/0x20
[    6.988996]  [<ffffffff81051b0e>] mark_rodata_ro+0xee/0x100
[    6.995124]  [<ffffffff81828610>] ? rest_init+0x140/0x140
[    7.001064]  [<ffffffff8182862d>] kernel_init+0x1d/0xe0
[    7.006841]  [<ffffffff81836f6f>] ret_from_fork+0x3f/0x70
[    7.012774]  [<ffffffff81828610>] ? rest_init+0x140/0x140
[    7.018706] ---[ end trace 920055014e07ef1e ]---
[    7.024302] x86/mm: Checked W+X mappings: FAILED, 69568 W+X pages found.

And yes, there are a bunch of those mappings here too:

$ grep -c 'RW.*x' /sys/kernel/debug/kernel_page_tables
75

Some of them are the UEFI runtime regions. I guess we can try to map
them as RO maybe, they need to be X. Matt, any reasons against that?

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-12 11:36         ` Borislav Petkov
@ 2015-10-12 12:41           ` Matt Fleming
  2015-10-12 12:49             ` Ingo Molnar
  0 siblings, 1 reply; 36+ messages in thread
From: Matt Fleming @ 2015-10-12 12:41 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Stephen Smalley, Ingo Molnar, x86, linux-kernel, keescook,
	Thomas Gleixner, H. Peter Anvin, Peter Zijlstra, Andy Lutomirski,
	Denys Vlasenko, Brian Gerst, linux-efi, Ard Biesheuvel

On Mon, 12 Oct, at 01:36:05PM, Borislav Petkov wrote:
> On Tue, Oct 06, 2015 at 11:37:57AM -0400, Stephen Smalley wrote:
> > > What does this range correspond to on your kernel?
> 
> Got a W+X splat here too, on the UEFI box with rc5+tip/master:
> 
> [    6.792949] rtc_cmos 00:02: setting system clock to 2015-10-12 11:17:03 UTC (1444648623)
> [    6.807863] Freeing unused kernel memory: 1312K (ffffffff81f5f000 - ffffffff820a7000)
> [    6.815831] usb 3-1: new high-speed USB device number 2 using ehci-pci
> [    6.823261] Write protecting the kernel read-only data: 14336k
> [    6.832196] Freeing unused kernel memory: 1796K (ffff88000383f000 - ffff880003a00000)
> [    6.842210] Freeing unused kernel memory: 284K (ffff880003db9000 - ffff880003e00000)
> [    6.850524] ------------[ cut here ]------------
> [    6.855682] WARNING: CPU: 5 PID: 1 at arch/x86/mm/dump_pagetables.c:225 note_page+0x61e/0x7e0()
> [    6.864944] x86/mm: Found insecure W+X mapping at address ffff88000005e000/0xffff88000005e000
> [    6.874022] Modules linked in:
> [    6.877643] CPU: 5 PID: 1 Comm: swapper/0 Not tainted 4.3.0-rc5+ #1
> [    6.884462] Hardware name: Dell Inc. Precision T3600/0PTTT9, BIOS A13 05/11/2014
> [    6.892416]  ffffffff81caf1f7 ffff88043bdffd60 ffffffff813aab2c ffff88043bdffda8
> [    6.900460]  ffff88043bdffd98 ffffffff81066776 ffff880004e55308 0000000000000004
> [    6.907816] usb 4-1: new high-speed USB device number 2 using ehci-pci
> [    6.915499]  8000000000000163 ffff88043bdffe98 0000000000000000 ffff88043bdffdf8
> [    6.923520] Call Trace:
> [    6.926512]  [<ffffffff813aab2c>] dump_stack+0x4e/0x82
> [    6.931551] usb 3-1: New USB device found, idVendor=8087, idProduct=0024
> [    6.931552] usb 3-1: New USB device strings: Mfr=0, Product=0, SerialNumber=0
> [    6.933120] hub 3-1:1.0: USB hub found
> [    6.933369] hub 3-1:1.0: 6 ports detected
> [    6.955784]  [<ffffffff81066776>] warn_slowpath_common+0x86/0xc0
> [    6.962341]  [<ffffffff810667fc>] warn_slowpath_fmt+0x4c/0x50
> [    6.968631]  [<ffffffff8105bb7e>] note_page+0x61e/0x7e0
> [    6.974404]  [<ffffffff8105c09f>] ptdump_walk_pgd_level_core+0x35f/0x3f0
> [    6.981651]  [<ffffffff8105c1d7>] ptdump_walk_pgd_level_checkwx+0x17/0x20
> [    6.988996]  [<ffffffff81051b0e>] mark_rodata_ro+0xee/0x100
> [    6.995124]  [<ffffffff81828610>] ? rest_init+0x140/0x140
> [    7.001064]  [<ffffffff8182862d>] kernel_init+0x1d/0xe0
> [    7.006841]  [<ffffffff81836f6f>] ret_from_fork+0x3f/0x70
> [    7.012774]  [<ffffffff81828610>] ? rest_init+0x140/0x140
> [    7.018706] ---[ end trace 920055014e07ef1e ]---
> [    7.024302] x86/mm: Checked W+X mappings: FAILED, 69568 W+X pages found.
> 
> And yes, there are a bunch of those mappings here too:
> 
> $ grep -c 'RW.*x' /sys/kernel/debug/kernel_page_tables
> 75
> 
> Some of them are the UEFI runtime regions. I guess we can try to map
> them as RO maybe, they need to be X. Matt, any reasons against that?

I'm glad you asked (but you won't be)!

Basically, it's guaranteed that there exist some machines that contain
data in EfiRuntimeCode regions (and so require write permission) and
code in EfiRuntimeData regions (and therefore require eXecute),
because the whole point of the new EFI_PROPERTIES_TABLE feature in
UEFI v2.5 was to make it explicit when the firmware does not include
such regions.

I do have patches sitting in a git branch that begin to implement
support for mapping EFI runtime data regions as NX, and code regions
as RO when EFI_PROPERTIES_TABLE is enabled,

  https://git.kernel.org/cgit/linux/kernel/git/mfleming/efi.git/log/?h=memmap

Things got stalled when we realised that Linux didn't even boot with
the feature enabled, see commit a5caa209ba9c ("x86/efi: Fix boot crash
by mapping EFI memmap entries bottom-up at runtime, instead of
top-down").

For more information on what the current limitations are for mapping
EFI regions, check out this whitepaper,

  https://firmware.intel.com/sites/default/files/resources/A_Tour_Beyond_BIOS_Memory_Practices_with_UEFI.pdf

-- 
Matt Fleming, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-12 12:41           ` Matt Fleming
@ 2015-10-12 12:49             ` Ingo Molnar
  2015-10-12 12:55               ` Matt Fleming
  0 siblings, 1 reply; 36+ messages in thread
From: Ingo Molnar @ 2015-10-12 12:49 UTC (permalink / raw)
  To: Matt Fleming
  Cc: Borislav Petkov, Stephen Smalley, x86, linux-kernel, keescook,
	Thomas Gleixner, H. Peter Anvin, Peter Zijlstra, Andy Lutomirski,
	Denys Vlasenko, Brian Gerst, linux-efi, Ard Biesheuvel


* Matt Fleming <matt@codeblueprint.co.uk> wrote:

> On Mon, 12 Oct, at 01:36:05PM, Borislav Petkov wrote:
> > On Tue, Oct 06, 2015 at 11:37:57AM -0400, Stephen Smalley wrote:
> > > > What does this range correspond to on your kernel?
> > 
> > Got a W+X splat here too, on the UEFI box with rc5+tip/master:
> > 
> > [    6.792949] rtc_cmos 00:02: setting system clock to 2015-10-12 11:17:03 UTC (1444648623)
> > [    6.807863] Freeing unused kernel memory: 1312K (ffffffff81f5f000 - ffffffff820a7000)
> > [    6.815831] usb 3-1: new high-speed USB device number 2 using ehci-pci
> > [    6.823261] Write protecting the kernel read-only data: 14336k
> > [    6.832196] Freeing unused kernel memory: 1796K (ffff88000383f000 - ffff880003a00000)
> > [    6.842210] Freeing unused kernel memory: 284K (ffff880003db9000 - ffff880003e00000)
> > [    6.850524] ------------[ cut here ]------------
> > [    6.855682] WARNING: CPU: 5 PID: 1 at arch/x86/mm/dump_pagetables.c:225 note_page+0x61e/0x7e0()
> > [    6.864944] x86/mm: Found insecure W+X mapping at address ffff88000005e000/0xffff88000005e000
> > [    6.874022] Modules linked in:
> > [    6.877643] CPU: 5 PID: 1 Comm: swapper/0 Not tainted 4.3.0-rc5+ #1
> > [    6.884462] Hardware name: Dell Inc. Precision T3600/0PTTT9, BIOS A13 05/11/2014
> > [    6.892416]  ffffffff81caf1f7 ffff88043bdffd60 ffffffff813aab2c ffff88043bdffda8
> > [    6.900460]  ffff88043bdffd98 ffffffff81066776 ffff880004e55308 0000000000000004
> > [    6.907816] usb 4-1: new high-speed USB device number 2 using ehci-pci
> > [    6.915499]  8000000000000163 ffff88043bdffe98 0000000000000000 ffff88043bdffdf8
> > [    6.923520] Call Trace:
> > [    6.926512]  [<ffffffff813aab2c>] dump_stack+0x4e/0x82
> > [    6.931551] usb 3-1: New USB device found, idVendor=8087, idProduct=0024
> > [    6.931552] usb 3-1: New USB device strings: Mfr=0, Product=0, SerialNumber=0
> > [    6.933120] hub 3-1:1.0: USB hub found
> > [    6.933369] hub 3-1:1.0: 6 ports detected
> > [    6.955784]  [<ffffffff81066776>] warn_slowpath_common+0x86/0xc0
> > [    6.962341]  [<ffffffff810667fc>] warn_slowpath_fmt+0x4c/0x50
> > [    6.968631]  [<ffffffff8105bb7e>] note_page+0x61e/0x7e0
> > [    6.974404]  [<ffffffff8105c09f>] ptdump_walk_pgd_level_core+0x35f/0x3f0
> > [    6.981651]  [<ffffffff8105c1d7>] ptdump_walk_pgd_level_checkwx+0x17/0x20
> > [    6.988996]  [<ffffffff81051b0e>] mark_rodata_ro+0xee/0x100
> > [    6.995124]  [<ffffffff81828610>] ? rest_init+0x140/0x140
> > [    7.001064]  [<ffffffff8182862d>] kernel_init+0x1d/0xe0
> > [    7.006841]  [<ffffffff81836f6f>] ret_from_fork+0x3f/0x70
> > [    7.012774]  [<ffffffff81828610>] ? rest_init+0x140/0x140
> > [    7.018706] ---[ end trace 920055014e07ef1e ]---
> > [    7.024302] x86/mm: Checked W+X mappings: FAILED, 69568 W+X pages found.
> > 
> > And yes, there are a bunch of those mappings here too:
> > 
> > $ grep -c 'RW.*x' /sys/kernel/debug/kernel_page_tables
> > 75
> > 
> > Some of them are the UEFI runtime regions. I guess we can try to map
> > them as RO maybe, they need to be X. Matt, any reasons against that?
> 
> I'm glad you asked (but you won't be)!
> 
> Basically, it's guaranteed that there exist some machines that contain
> data in EfiRuntimeCode regions (and so require write permission) and
> code in EfiRuntimeData regions (and therefore require eXecute),
> because the whole point of the new EFI_PROPERTIES_TABLE feature in
> UEFI v2.5 was to make it explicit when the firmware does not include
> such regions.

So why not unmap them after bootup? Is there any reason to call into EFI code 
while the system is up and running?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-12 12:49             ` Ingo Molnar
@ 2015-10-12 12:55               ` Matt Fleming
  2015-10-12 14:17                 ` Ingo Molnar
  0 siblings, 1 reply; 36+ messages in thread
From: Matt Fleming @ 2015-10-12 12:55 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Stephen Smalley, x86, linux-kernel, keescook,
	Thomas Gleixner, H. Peter Anvin, Peter Zijlstra, Andy Lutomirski,
	Denys Vlasenko, Brian Gerst, linux-efi, Ard Biesheuvel

On Mon, 12 Oct, at 02:49:36PM, Ingo Molnar wrote:
> 
> 
> So why not unmap them after bootup? Is there any reason to call into EFI code 
> while the system is up and running?

That's where the runtime services code lives. So if you want things
like EFI variables (used by the distro installer, among other things)
you need to map the runtime regions.

You can of course disable that by using the "noefi" kernel parameter,
which should unmap everything for you once you've finished booting.

-- 
Matt Fleming, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-12 12:55               ` Matt Fleming
@ 2015-10-12 14:17                 ` Ingo Molnar
  2015-10-12 14:49                   ` Matt Fleming
  2015-10-12 14:56                   ` Josh Triplett
  0 siblings, 2 replies; 36+ messages in thread
From: Ingo Molnar @ 2015-10-12 14:17 UTC (permalink / raw)
  To: Matt Fleming
  Cc: Borislav Petkov, Stephen Smalley, x86, linux-kernel, keescook,
	Thomas Gleixner, H. Peter Anvin, Peter Zijlstra, Andy Lutomirski,
	Denys Vlasenko, Brian Gerst, linux-efi, Ard Biesheuvel


* Matt Fleming <matt@codeblueprint.co.uk> wrote:

> On Mon, 12 Oct, at 02:49:36PM, Ingo Molnar wrote:
> > 
> > 
> > So why not unmap them after bootup? Is there any reason to call into EFI code 
> > while the system is up and running?
> 
> That's where the runtime services code lives. So if you want things like EFI 
> variables (used by the distro installer, among other things) you need to map the 
> runtime regions.

So EFI variables could be queried during bootup and saved on the Linux side.

Calling into firmware after the kernel has booted up is fragile in general - 
beyond W+X the security considerations.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-12 14:17                 ` Ingo Molnar
@ 2015-10-12 14:49                   ` Matt Fleming
  2015-10-12 15:34                     ` Ard Biesheuvel
  2015-10-14 15:18                     ` Ingo Molnar
  2015-10-12 14:56                   ` Josh Triplett
  1 sibling, 2 replies; 36+ messages in thread
From: Matt Fleming @ 2015-10-12 14:49 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Stephen Smalley, x86, linux-kernel, keescook,
	Thomas Gleixner, H. Peter Anvin, Peter Zijlstra, Andy Lutomirski,
	Denys Vlasenko, Brian Gerst, linux-efi, Ard Biesheuvel

On Mon, 12 Oct, at 04:17:54PM, Ingo Molnar wrote:
> 
> * Matt Fleming <matt@codeblueprint.co.uk> wrote:
> 
> > On Mon, 12 Oct, at 02:49:36PM, Ingo Molnar wrote:
> > > 
> > > 
> > > So why not unmap them after bootup? Is there any reason to call into EFI code 
> > > while the system is up and running?
> > 
> > That's where the runtime services code lives. So if you want things like EFI 
> > variables (used by the distro installer, among other things) you need to map the 
> > runtime regions.
> 
> So EFI variables could be queried during bootup and saved on the Linux side.
 
Right, we could do that, but then we wouldn't be able to support
creation/updating variables at runtime, such as when you install a
distribution for the first time, or want to boot a new kernel filename
directly from the firmware without a boot loader (and need to modify
the BootXXXX variables).

And it's not just EFI variables that need runtime support either, for
some platforms the only way to reboot/poweroff is with EFI, such as on
the ASUS T100TA (Intel Baytrail-T).

That's not to say your suggestion doesn't make sense for some cases, I
can definitely see how turning off runtime support but providing a
cache of EFI variables would be useful for some scenarios. But I don't
think it's ever going to be workable as a default option.

> Calling into firmware after the kernel has booted up is fragile in general - 
> beyond W+X the security considerations.

It isn't intended to be fragile, and effort has gone into defining the
context under which the EFI runtime services can operate (though there
are obviously gaps in that specification).

The entire point of the EFI runtime services is that they can be
invoked from the OS, and because hardware/firmware developers rely on
that when designing platforms, it's going to be something that Linux
is going to have to be able to do. Of course, that doesn't we
shouldn't be able to turn it off if the user is happy to sacrifice
some platform functionality.

Additionally, if we've got suggestions for the firmware developers on
what we want the runtime context to look like, let's propose it to
them. They're pretty receptive in my experience.

-- 
Matt Fleming, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-12 14:17                 ` Ingo Molnar
  2015-10-12 14:49                   ` Matt Fleming
@ 2015-10-12 14:56                   ` Josh Triplett
  2015-10-14 15:19                     ` Ingo Molnar
  1 sibling, 1 reply; 36+ messages in thread
From: Josh Triplett @ 2015-10-12 14:56 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Matt Fleming, Borislav Petkov, Stephen Smalley, x86,
	linux-kernel, keescook, Thomas Gleixner, H. Peter Anvin,
	Peter Zijlstra

On Mon, Oct 12, 2015 at 04:17:54PM +0200, Ingo Molnar wrote:
> * Matt Fleming <matt@codeblueprint.co.uk> wrote:
> > On Mon, 12 Oct, at 02:49:36PM, Ingo Molnar wrote:
> > > So why not unmap them after bootup? Is there any reason to call into EFI code 
> > > while the system is up and running?
> > 
> > That's where the runtime services code lives. So if you want things like EFI 
> > variables (used by the distro installer, among other things) you need to map the 
> > runtime regions.
> 
> So EFI variables could be queried during bootup and saved on the Linux side.

That wouldn't support writing to EFI variables.  Or using the EFI
capsule update system to update firmware.

- Josh Triplett

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-12 14:49                   ` Matt Fleming
@ 2015-10-12 15:34                     ` Ard Biesheuvel
  2015-10-12 15:50                       ` Matt Fleming
  2015-10-14 15:18                     ` Ingo Molnar
  1 sibling, 1 reply; 36+ messages in thread
From: Ard Biesheuvel @ 2015-10-12 15:34 UTC (permalink / raw)
  To: Matt Fleming
  Cc: Ingo Molnar, Borislav Petkov, Stephen Smalley, x86, linux-kernel,
	Kees Cook, Thomas Gleixner, H. Peter Anvin, Peter Zijlstra,
	Andy Lutomirski, Denys Vlasenko, Brian Gerst, linux-efi

On 12 October 2015 at 16:49, Matt Fleming <matt@codeblueprint.co.uk> wrote:
> On Mon, 12 Oct, at 04:17:54PM, Ingo Molnar wrote:
>>
>> * Matt Fleming <matt@codeblueprint.co.uk> wrote:
>>
>> > On Mon, 12 Oct, at 02:49:36PM, Ingo Molnar wrote:
>> > >
>> > >
>> > > So why not unmap them after bootup? Is there any reason to call into EFI code
>> > > while the system is up and running?
>> >
>> > That's where the runtime services code lives. So if you want things like EFI
>> > variables (used by the distro installer, among other things) you need to map the
>> > runtime regions.
>>
>> So EFI variables could be queried during bootup and saved on the Linux side.
>
> Right, we could do that, but then we wouldn't be able to support
> creation/updating variables at runtime, such as when you install a
> distribution for the first time, or want to boot a new kernel filename
> directly from the firmware without a boot loader (and need to modify
> the BootXXXX variables).
>
> And it's not just EFI variables that need runtime support either, for
> some platforms the only way to reboot/poweroff is with EFI, such as on
> the ASUS T100TA (Intel Baytrail-T).
>
> That's not to say your suggestion doesn't make sense for some cases, I
> can definitely see how turning off runtime support but providing a
> cache of EFI variables would be useful for some scenarios. But I don't
> think it's ever going to be workable as a default option.
>
>> Calling into firmware after the kernel has booted up is fragile in general -
>> beyond W+X the security considerations.
>
> It isn't intended to be fragile, and effort has gone into defining the
> context under which the EFI runtime services can operate (though there
> are obviously gaps in that specification).
>
> The entire point of the EFI runtime services is that they can be
> invoked from the OS, and because hardware/firmware developers rely on
> that when designing platforms, it's going to be something that Linux
> is going to have to be able to do. Of course, that doesn't we
> shouldn't be able to turn it off if the user is happy to sacrifice
> some platform functionality.
>
> Additionally, if we've got suggestions for the firmware developers on
> what we want the runtime context to look like, let's propose it to
> them. They're pretty receptive in my experience.
>

On arm64, we only map in all of the UEFI runtime services regions
during the time any of these services are being invoked. I think this
should be mostly feasible on x86 as well, although it would involve
yet another rewrite of the EFI region mapping code, and most likely a
long list of quirks for platforms that are not able to deal with it
correctly for one reason or the other (but that all come down to: 'if
you are not doing it like Windows does it, you must be doing it
wrong'). That would make the 'secure' way of mapping things an opt-in
feature, which is generally not desirable for security features (since
it will rarely be used in the real world then).

So enabling the Properties Table memprotect feature as soon as the
spec defines it in a meaningful way is probably a better way to go,
and our current involvement is focused on defining it such that it can
be enabled by default by firmwares rather than ending up an obscure
switch in the BIOS screen that only the paranoid ever turn on.

-- 
Ard.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-12 15:34                     ` Ard Biesheuvel
@ 2015-10-12 15:50                       ` Matt Fleming
  2015-10-12 16:43                         ` Ard Biesheuvel
  0 siblings, 1 reply; 36+ messages in thread
From: Matt Fleming @ 2015-10-12 15:50 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Ingo Molnar, Borislav Petkov, Stephen Smalley, x86, linux-kernel,
	Kees Cook, Thomas Gleixner, H. Peter Anvin, Peter Zijlstra,
	Andy Lutomirski, Denys Vlasenko, Brian Gerst, linux-efi

On Mon, 12 Oct, at 05:34:53PM, Ard Biesheuvel wrote:
> 
> On arm64, we only map in all of the UEFI runtime services regions
> during the time any of these services are being invoked. I think this
> should be mostly feasible on x86 as well, although it would involve
> yet another rewrite of the EFI region mapping code, and most likely a
> long list of quirks for platforms that are not able to deal with it
> correctly for one reason or the other (but that all come down to: 'if
> you are not doing it like Windows does it, you must be doing it
> wrong').

Actually, we use separate page tables for mapping the EFI runtime
services on x86 right now. These tables are only used when making
runtime calls, just like on arm64.

So we've got a little bit of isolation right now.

> That would make the 'secure' way of mapping things an opt-in
> feature, which is generally not desirable for security features
> (since it will rarely be used in the real world then).

I'd like to think that we're coming to EFI_PROPERTIES_TABLE early
enough that we can work out all the kinks, get things working out of
the box in upstream tianocore, and have it be the standard way to
expose memory regions.

At least that's my hope.

> So enabling the Properties Table memprotect feature as soon as the
> spec defines it in a meaningful way is probably a better way to go,
> and our current involvement is focused on defining it such that it can
> be enabled by default by firmwares rather than ending up an obscure
> switch in the BIOS screen that only the paranoid ever turn on.

Indeed.

-- 
Matt Fleming, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-12 15:50                       ` Matt Fleming
@ 2015-10-12 16:43                         ` Ard Biesheuvel
  0 siblings, 0 replies; 36+ messages in thread
From: Ard Biesheuvel @ 2015-10-12 16:43 UTC (permalink / raw)
  To: Matt Fleming
  Cc: Ingo Molnar, Borislav Petkov, Stephen Smalley, x86, linux-kernel,
	Kees Cook, Thomas Gleixner, H. Peter Anvin, Peter Zijlstra,
	Andy Lutomirski, Denys Vlasenko, Brian Gerst, linux-efi

On 12 October 2015 at 17:50, Matt Fleming <matt@codeblueprint.co.uk> wrote:
> On Mon, 12 Oct, at 05:34:53PM, Ard Biesheuvel wrote:
>>
>> On arm64, we only map in all of the UEFI runtime services regions
>> during the time any of these services are being invoked. I think this
>> should be mostly feasible on x86 as well, although it would involve
>> yet another rewrite of the EFI region mapping code, and most likely a
>> long list of quirks for platforms that are not able to deal with it
>> correctly for one reason or the other (but that all come down to: 'if
>> you are not doing it like Windows does it, you must be doing it
>> wrong').
>
> Actually, we use separate page tables for mapping the EFI runtime
> services on x86 right now. These tables are only used when making
> runtime calls, just like on arm64.
>
> So we've got a little bit of isolation right now.
>

Ah ok. I thought that only applied to the duplicate 1:1 mapping, not
to the high mapping.

But that does reduce the attack surface considerably. Combined with
strict w^x once the UEFI 2.5 feature is fully supported, I am a lot
less nervous about RWX EFI runtime regions being used to subvert the
system.

-- 
Ard.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-12 14:49                   ` Matt Fleming
  2015-10-12 15:34                     ` Ard Biesheuvel
@ 2015-10-14 15:18                     ` Ingo Molnar
  2015-10-14 15:30                       ` Andy Lutomirski
  2015-10-14 21:02                       ` Matt Fleming
  1 sibling, 2 replies; 36+ messages in thread
From: Ingo Molnar @ 2015-10-14 15:18 UTC (permalink / raw)
  To: Matt Fleming
  Cc: Borislav Petkov, Stephen Smalley, x86, linux-kernel, keescook,
	Thomas Gleixner, H. Peter Anvin, Peter Zijlstra, Andy Lutomirski,
	Denys Vlasenko, Brian Gerst, linux-efi, Ard Biesheuvel


* Matt Fleming <matt@codeblueprint.co.uk> wrote:

> On Mon, 12 Oct, at 04:17:54PM, Ingo Molnar wrote:
> > 
> > * Matt Fleming <matt@codeblueprint.co.uk> wrote:
> > 
> > > On Mon, 12 Oct, at 02:49:36PM, Ingo Molnar wrote:
> > > > 
> > > > 
> > > > So why not unmap them after bootup? Is there any reason to call into EFI code 
> > > > while the system is up and running?
> > > 
> > > That's where the runtime services code lives. So if you want things like EFI 
> > > variables (used by the distro installer, among other things) you need to map the 
> > > runtime regions.
> > 
> > So EFI variables could be queried during bootup and saved on the Linux side.
>  
> Right, we could do that, but then we wouldn't be able to support
> creation/updating variables at runtime, such as when you install a
> distribution for the first time, or want to boot a new kernel filename
> directly from the firmware without a boot loader (and need to modify
> the BootXXXX variables).

Do we know the precise position and address range of these variables?

We could map them writable (but not executable), and the rest executable (but not 
writable).

That raises the question whether the same physical page ever mixes variables and 
actual code - but the hope would be that it's suffiently page granular for this to 
work.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-12 14:56                   ` Josh Triplett
@ 2015-10-14 15:19                     ` Ingo Molnar
  2015-10-14 16:47                       ` Josh Triplett
  0 siblings, 1 reply; 36+ messages in thread
From: Ingo Molnar @ 2015-10-14 15:19 UTC (permalink / raw)
  To: Josh Triplett
  Cc: Matt Fleming, Borislav Petkov, Stephen Smalley, x86,
	linux-kernel, keescook, Thomas Gleixner, H. Peter Anvin,
	Peter Zijlstra


* Josh Triplett <josh@joshtriplett.org> wrote:

> On Mon, Oct 12, 2015 at 04:17:54PM +0200, Ingo Molnar wrote:
> > * Matt Fleming <matt@codeblueprint.co.uk> wrote:
> > > On Mon, 12 Oct, at 02:49:36PM, Ingo Molnar wrote:
> > > > So why not unmap them after bootup? Is there any reason to call into EFI code 
> > > > while the system is up and running?
> > > 
> > > That's where the runtime services code lives. So if you want things like EFI 
> > > variables (used by the distro installer, among other things) you need to map the 
> > > runtime regions.
> > 
> > So EFI variables could be queried during bootup and saved on the Linux side.
> 
> That wouldn't support writing to EFI variables.  Or using the EFI
> capsule update system to update firmware.

Well, if we know the location of those pages then we could map those 'rw-' - while 
the rest would be mapped 'r-x'.

The 'rwx' mappings that are created are problematic from a security POV - they 
basically undo many of our NX protections...

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-14 15:18                     ` Ingo Molnar
@ 2015-10-14 15:30                       ` Andy Lutomirski
  2015-10-14 15:35                         ` Borislav Petkov
  2015-10-14 21:02                       ` Matt Fleming
  1 sibling, 1 reply; 36+ messages in thread
From: Andy Lutomirski @ 2015-10-14 15:30 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Matt Fleming, Borislav Petkov, Stephen Smalley, X86 ML,
	linux-kernel, Kees Cook, Thomas Gleixner, H. Peter Anvin,
	Peter Zijlstra, Andy Lutomirski, Denys Vlasenko, Brian Gerst,
	linux-efi, Ard Biesheuvel

On Wed, Oct 14, 2015 at 8:18 AM, Ingo Molnar <mingo@kernel.org> wrote:
>
> * Matt Fleming <matt@codeblueprint.co.uk> wrote:
>
>> On Mon, 12 Oct, at 04:17:54PM, Ingo Molnar wrote:
>> >
>> > * Matt Fleming <matt@codeblueprint.co.uk> wrote:
>> >
>> > > On Mon, 12 Oct, at 02:49:36PM, Ingo Molnar wrote:
>> > > >
>> > > >
>> > > > So why not unmap them after bootup? Is there any reason to call into EFI code
>> > > > while the system is up and running?
>> > >
>> > > That's where the runtime services code lives. So if you want things like EFI
>> > > variables (used by the distro installer, among other things) you need to map the
>> > > runtime regions.
>> >
>> > So EFI variables could be queried during bootup and saved on the Linux side.
>>
>> Right, we could do that, but then we wouldn't be able to support
>> creation/updating variables at runtime, such as when you install a
>> distribution for the first time, or want to boot a new kernel filename
>> directly from the firmware without a boot loader (and need to modify
>> the BootXXXX variables).
>
> Do we know the precise position and address range of these variables?
>
> We could map them writable (but not executable), and the rest executable (but not
> writable).
>
> That raises the question whether the same physical page ever mixes variables and
> actual code - but the hope would be that it's suffiently page granular for this to
> work.

Can we just unmap these things until someone tries to do an EFI call,
and then unmap them again after the call returns?  We already switch
pgds for EFI IIRC.

--Andy

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-14 15:30                       ` Andy Lutomirski
@ 2015-10-14 15:35                         ` Borislav Petkov
  2015-10-15 10:10                           ` Matt Fleming
  0 siblings, 1 reply; 36+ messages in thread
From: Borislav Petkov @ 2015-10-14 15:35 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Ingo Molnar, Matt Fleming, Stephen Smalley, X86 ML, linux-kernel,
	Kees Cook, Thomas Gleixner, H. Peter Anvin, Peter Zijlstra,
	Andy Lutomirski, Denys Vlasenko, Brian Gerst, linux-efi,
	Ard Biesheuvel

On Wed, Oct 14, 2015 at 08:30:48AM -0700, Andy Lutomirski wrote:
> Can we just unmap these things until someone tries to do an EFI call,
> and then unmap them again after the call returns?  We already switch
> pgds for EFI IIRC.

hpa did mention an EFI-aware page fault handler at the time. I guess we
could do that too...

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-14 15:19                     ` Ingo Molnar
@ 2015-10-14 16:47                       ` Josh Triplett
  2015-10-21  9:43                         ` Ingo Molnar
  0 siblings, 1 reply; 36+ messages in thread
From: Josh Triplett @ 2015-10-14 16:47 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Matt Fleming, Borislav Petkov, Stephen Smalley, x86,
	linux-kernel, keescook, Thomas Gleixner, H. Peter Anvin,
	Peter Zijlstra

On Wed, Oct 14, 2015 at 05:19:40PM +0200, Ingo Molnar wrote:
> 
> * Josh Triplett <josh@joshtriplett.org> wrote:
> 
> > On Mon, Oct 12, 2015 at 04:17:54PM +0200, Ingo Molnar wrote:
> > > * Matt Fleming <matt@codeblueprint.co.uk> wrote:
> > > > On Mon, 12 Oct, at 02:49:36PM, Ingo Molnar wrote:
> > > > > So why not unmap them after bootup? Is there any reason to call into EFI code 
> > > > > while the system is up and running?
> > > > 
> > > > That's where the runtime services code lives. So if you want things like EFI 
> > > > variables (used by the distro installer, among other things) you need to map the 
> > > > runtime regions.
> > > 
> > > So EFI variables could be queried during bootup and saved on the Linux side.
> > 
> > That wouldn't support writing to EFI variables.  Or using the EFI
> > capsule update system to update firmware.
> 
> Well, if we know the location of those pages then we could map those 'rw-' - while 
> the rest would be mapped 'r-x'.

We have no way to do so in the absence of the additional code/data
separation information provided by more recent firmware.

- Josh Triplett

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-14 15:18                     ` Ingo Molnar
  2015-10-14 15:30                       ` Andy Lutomirski
@ 2015-10-14 21:02                       ` Matt Fleming
  2015-10-21  9:42                         ` Ingo Molnar
  1 sibling, 1 reply; 36+ messages in thread
From: Matt Fleming @ 2015-10-14 21:02 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Stephen Smalley, x86, linux-kernel, keescook,
	Thomas Gleixner, H. Peter Anvin, Peter Zijlstra, Andy Lutomirski,
	Denys Vlasenko, Brian Gerst, linux-efi, Ard Biesheuvel

On Wed, 14 Oct, at 05:18:07PM, Ingo Molnar wrote:
> 
> * Matt Fleming <matt@codeblueprint.co.uk> wrote:
> 
> > On Mon, 12 Oct, at 04:17:54PM, Ingo Molnar wrote:
> > > 
> > > * Matt Fleming <matt@codeblueprint.co.uk> wrote:
> > > 
> > > > On Mon, 12 Oct, at 02:49:36PM, Ingo Molnar wrote:
> > > > > 
> > > > > 
> > > > > So why not unmap them after bootup? Is there any reason to call into EFI code 
> > > > > while the system is up and running?
> > > > 
> > > > That's where the runtime services code lives. So if you want things like EFI 
> > > > variables (used by the distro installer, among other things) you need to map the 
> > > > runtime regions.
> > > 
> > > So EFI variables could be queried during bootup and saved on the Linux side.
> >  
> > Right, we could do that, but then we wouldn't be able to support
> > creation/updating variables at runtime, such as when you install a
> > distribution for the first time, or want to boot a new kernel filename
> > directly from the firmware without a boot loader (and need to modify
> > the BootXXXX variables).
> 
> Do we know the precise position and address range of these variables?
> 
> We could map them writable (but not executable), and the rest executable (but not 
> writable).
 
The variables are stored in NVRAM, which we don't map into the kernel
virtual address space. We have to initiate the transaction of writing
to the variables by executing EFI runtime services.

We obviously have buffers that we pass to the BIOS that contain
variable data, but these should be NX anyway because they're regular
kernel allocations.

> That raises the question whether the same physical page ever mixes variables and 
> actual code - but the hope would be that it's suffiently page granular for this to 
> work.

I don't think that would ever happen.

-- 
Matt Fleming, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-14 15:35                         ` Borislav Petkov
@ 2015-10-15 10:10                           ` Matt Fleming
  2015-10-15 10:33                             ` Borislav Petkov
  0 siblings, 1 reply; 36+ messages in thread
From: Matt Fleming @ 2015-10-15 10:10 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Andy Lutomirski, Ingo Molnar, Stephen Smalley, X86 ML,
	linux-kernel, Kees Cook, Thomas Gleixner, H. Peter Anvin,
	Peter Zijlstra, Andy Lutomirski, Denys Vlasenko, Brian Gerst,
	linux-efi, Ard Biesheuvel, Ricardo Neri, luv

On Wed, 14 Oct, at 05:35:22PM, Borislav Petkov wrote:
> On Wed, Oct 14, 2015 at 08:30:48AM -0700, Andy Lutomirski wrote:
> > Can we just unmap these things until someone tries to do an EFI call,
> > and then unmap them again after the call returns?  We already switch
> > pgds for EFI IIRC.
> 
> hpa did mention an EFI-aware page fault handler at the time. I guess we
> could do that too...

We do this for the Linux UEFI Validation project kernel [1]. There, we
do not map EFI Boot Services regions by default, only if the firmware
tries to access them.

This gives us the opporunity to print an error message if Boot
Services regions are accessed after ExitBootServices() (which is the
bug mjg59 describes in commit 916f676f8dc0 ("x86, efi: Retain boot
service code until after switching to virtual mode")).

But for the issue being discussed in this thread, the thing unmapping
the EFI regions buys you is that they're no longer accessible from the
x86 sleep/wakeup code paths, since those also use trampoline_pgd which
is where the EFI page tables are mapped.

And that's probably a good idea.

[1] - https://git.kernel.org/cgit/linux/kernel/git/mfleming/efi.git/commit/?h=stable&id=9b78793058bf93958aa9529400cb2617ec1bc958

-- 
Matt Fleming, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-15 10:10                           ` Matt Fleming
@ 2015-10-15 10:33                             ` Borislav Petkov
  2015-10-16  1:45                               ` Ricardo Neri
  0 siblings, 1 reply; 36+ messages in thread
From: Borislav Petkov @ 2015-10-15 10:33 UTC (permalink / raw)
  To: Matt Fleming
  Cc: Andy Lutomirski, Ingo Molnar, Stephen Smalley, X86 ML,
	linux-kernel, Kees Cook, Thomas Gleixner, H. Peter Anvin,
	Peter Zijlstra, Andy Lutomirski, Denys Vlasenko, Brian Gerst,
	linux-efi, Ard Biesheuvel, Ricardo Neri, luv

On Thu, Oct 15, 2015 at 11:10:16AM +0100, Matt Fleming wrote:
> We do this for the Linux UEFI Validation project kernel [1]. There, we
> do not map EFI Boot Services regions by default, only if the firmware
> tries to access them.
> 
> This gives us the opporunity to print an error message if Boot
> Services regions are accessed after ExitBootServices() (which is the
> bug mjg59 describes in commit 916f676f8dc0 ("x86, efi: Retain boot
> service code until after switching to virtual mode")).

Yeah, that's actually a good idea. Why not upstream it for the wider
audience so that people can actually start reporting b0rked UEFIs? With
a big and nice FW_BUG splat in there...

> But for the issue being discussed in this thread, the thing unmapping
> the EFI regions buys you is that they're no longer accessible from the
> x86 sleep/wakeup code paths, since those also use trampoline_pgd which
> is where the EFI page tables are mapped.
> 
> And that's probably a good idea.
> 
> [1] - https://git.kernel.org/cgit/linux/kernel/git/mfleming/efi.git/commit/?h=stable&id=9b78793058bf93958aa9529400cb2617ec1bc958

In reading that commit message above, the fact that the braindead
decision of allowing SetVirtualAddressMap() to be called only once
reminds me that we can't really have a PF handler for runtime
services as *all* mappings need to be ready before calling
SetVirtualAddressMap().

Or, alternatively, we can prep them, call SetVirtualAddressMap() and
then unmap them all and map them again at the same addresses only in the
PF handler, each time a runtime call happens. When that call finishes,
we unmap them again...

Hmm, perhaps not worth the trouble...


-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-15 10:33                             ` Borislav Petkov
@ 2015-10-16  1:45                               ` Ricardo Neri
  0 siblings, 0 replies; 36+ messages in thread
From: Ricardo Neri @ 2015-10-16  1:45 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Matt Fleming, Andy Lutomirski, Ingo Molnar, Stephen Smalley,
	X86 ML, linux-kernel, Kees Cook, Thomas Gleixner, H. Peter Anvin,
	Peter Zijlstra, Andy Lutomirski, Denys Vlasenko, Brian Gerst,
	linux-efi, Ard Biesheuvel, luv

On Thu, 2015-10-15 at 12:33 +0200, Borislav Petkov wrote:
> Yeah, that's actually a good idea. Why not upstream it for the wider
> audience so that people can actually start reporting b0rked UEFIs?
> With
> a big and nice FW_BUG splat in there...

We attempted to upstream in the past but later I discovered that my
implementation in particular is causing warnings due to SMP. Also, I
need to implement an alternative or extension to the current
efi_map_region, which have the __init qualifier. This is because
mappings might happen after the __inits have been discarded. I have this
work currently in my scope.

Thanks and BR,
Ricardo


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-14 21:02                       ` Matt Fleming
@ 2015-10-21  9:42                         ` Ingo Molnar
  2015-10-21 12:49                           ` Ingo Molnar
  2015-10-21 20:38                           ` Matt Fleming
  0 siblings, 2 replies; 36+ messages in thread
From: Ingo Molnar @ 2015-10-21  9:42 UTC (permalink / raw)
  To: Matt Fleming
  Cc: Borislav Petkov, Stephen Smalley, x86, linux-kernel, keescook,
	Thomas Gleixner, H. Peter Anvin, Peter Zijlstra, Andy Lutomirski,
	Denys Vlasenko, Brian Gerst, linux-efi, Ard Biesheuvel


* Matt Fleming <matt@codeblueprint.co.uk> wrote:

> > > Right, we could do that, but then we wouldn't be able to support 
> > > creation/updating variables at runtime, such as when you install a 
> > > distribution for the first time, or want to boot a new kernel filename 
> > > directly from the firmware without a boot loader (and need to modify the 
> > > BootXXXX variables).
> > 
> > Do we know the precise position and address range of these variables?
> > 
> > We could map them writable (but not executable), and the rest executable (but 
> > not writable).
>  
> The variables are stored in NVRAM, which we don't map into the kernel virtual 
> address space. [...]

Just curious: is there firmware that memory maps those variables privately?

> [...] We have to initiate the transaction of writing to the variables by 
> executing EFI runtime services.
> 
> We obviously have buffers that we pass to the BIOS that contain variable data, 
> but these should be NX anyway because they're regular kernel allocations.
> 
> > That raises the question whether the same physical page ever mixes variables 
> > and actual code - but the hope would be that it's suffiently page granular for 
> > this to work.
> 
> I don't think that would ever happen.

Ok, that's promising, so how about this then to solve the security weakness the 
new warning unearthed: map the whole EFI range as 'r-x (NX)', but detect writes 
from the page fault handler and transparently allow them to flip over the range to 
'rw-'.

Note that for security reasons we don't allow a subsequent flipping back to NX if 
there's an NX fault on the same page, i.e. this new mechanism is a monotonic 
one-way process that should dynamically 'map out' data pages versus executable 
pages.

It should also be pretty robust, assuming we can take page faults while EFI code 
is executing and is trying to modify EFI data: is that the case?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-14 16:47                       ` Josh Triplett
@ 2015-10-21  9:43                         ` Ingo Molnar
  0 siblings, 0 replies; 36+ messages in thread
From: Ingo Molnar @ 2015-10-21  9:43 UTC (permalink / raw)
  To: Josh Triplett
  Cc: Matt Fleming, Borislav Petkov, Stephen Smalley, x86,
	linux-kernel, keescook, Thomas Gleixner, H. Peter Anvin,
	Peter Zijlstra


* Josh Triplett <josh@joshtriplett.org> wrote:

> On Wed, Oct 14, 2015 at 05:19:40PM +0200, Ingo Molnar wrote:
> > 
> > * Josh Triplett <josh@joshtriplett.org> wrote:
> > 
> > > On Mon, Oct 12, 2015 at 04:17:54PM +0200, Ingo Molnar wrote:
> > > > * Matt Fleming <matt@codeblueprint.co.uk> wrote:
> > > > > On Mon, 12 Oct, at 02:49:36PM, Ingo Molnar wrote:
> > > > > > So why not unmap them after bootup? Is there any reason to call into EFI code 
> > > > > > while the system is up and running?
> > > > > 
> > > > > That's where the runtime services code lives. So if you want things like EFI 
> > > > > variables (used by the distro installer, among other things) you need to map the 
> > > > > runtime regions.
> > > > 
> > > > So EFI variables could be queried during bootup and saved on the Linux side.
> > > 
> > > That wouldn't support writing to EFI variables.  Or using the EFI
> > > capsule update system to update firmware.
> > 
> > Well, if we know the location of those pages then we could map those 'rw-' - while 
> > the rest would be mapped 'r-x'.
> 
> We have no way to do so in the absence of the additional code/data
> separation information provided by more recent firmware.

But we could map those out via transparent page faults dynamically, as those 
accesses happen. It should be maximally compatible AFAICS, even without the new 
EFI extensions - and at no time would there be vulnerable 'rwx' mappings in the 
kernel page tables.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-21  9:42                         ` Ingo Molnar
@ 2015-10-21 12:49                           ` Ingo Molnar
  2015-10-21 12:57                             ` Ard Biesheuvel
  2015-10-21 20:38                           ` Matt Fleming
  1 sibling, 1 reply; 36+ messages in thread
From: Ingo Molnar @ 2015-10-21 12:49 UTC (permalink / raw)
  To: Matt Fleming
  Cc: Borislav Petkov, Stephen Smalley, x86, linux-kernel, keescook,
	Thomas Gleixner, H. Peter Anvin, Peter Zijlstra, Andy Lutomirski,
	Denys Vlasenko, Brian Gerst, linux-efi, Ard Biesheuvel


* Ingo Molnar <mingo@kernel.org> wrote:

> 
> * Matt Fleming <matt@codeblueprint.co.uk> wrote:
> 
> > > > Right, we could do that, but then we wouldn't be able to support 
> > > > creation/updating variables at runtime, such as when you install a 
> > > > distribution for the first time, or want to boot a new kernel filename 
> > > > directly from the firmware without a boot loader (and need to modify the 
> > > > BootXXXX variables).
> > > 
> > > Do we know the precise position and address range of these variables?
> > > 
> > > We could map them writable (but not executable), and the rest executable (but 
> > > not writable).
> >  
> > The variables are stored in NVRAM, which we don't map into the kernel virtual 
> > address space. [...]
> 
> Just curious: is there firmware that memory maps those variables privately?
> 
> > [...] We have to initiate the transaction of writing to the variables by 
> > executing EFI runtime services.
> > 
> > We obviously have buffers that we pass to the BIOS that contain variable data, 
> > but these should be NX anyway because they're regular kernel allocations.
> > 
> > > That raises the question whether the same physical page ever mixes variables 
> > > and actual code - but the hope would be that it's suffiently page granular for 
> > > this to work.
> > 
> > I don't think that would ever happen.
> 
> Ok, that's promising, so how about this then to solve the security weakness the 
> new warning unearthed: map the whole EFI range as 'r-x (NX)', but detect writes 
> from the page fault handler and transparently allow them to flip over the range 
> to 'rw-'.

So I meant to say 'page' instead of 'range'.

I.e. this dynamic mechanism would flip pages over to 'rw-', as write faults occur 
from EFI code that writes to them.

We don't need to know which regions are writable data, and which regions are 
executable-code/readonly-data.

The following aspect would guarantee safety:

> Note that for security reasons we don't allow a subsequent flipping back to NX 
> if there's an NX fault on the same page, i.e. this new mechanism is a monotonic 
> one-way process that should dynamically 'map out' data pages versus executable 
> pages.
> 
> It should also be pretty robust, assuming we can take page faults while EFI code 
> is executing and is trying to modify EFI data: is that the case?

and this is why I asked whether boundaries between 'Code' and 'Writable data' 
sections are page granular - which they do appear to be. (i.e. there are no 
singular pages that are both writable data and code at once.)

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-21 12:49                           ` Ingo Molnar
@ 2015-10-21 12:57                             ` Ard Biesheuvel
  2015-10-21 13:24                               ` Borislav Petkov
  0 siblings, 1 reply; 36+ messages in thread
From: Ard Biesheuvel @ 2015-10-21 12:57 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Matt Fleming, Borislav Petkov, Stephen Smalley, x86,
	linux-kernel, Kees Cook, Thomas Gleixner, H. Peter Anvin,
	Peter Zijlstra, Andy Lutomirski, Denys Vlasenko, Brian Gerst,
	linux-efi

On 21 October 2015 at 14:49, Ingo Molnar <mingo@kernel.org> wrote:
>
> * Ingo Molnar <mingo@kernel.org> wrote:
>
>>
>> * Matt Fleming <matt@codeblueprint.co.uk> wrote:
>>
>> > > > Right, we could do that, but then we wouldn't be able to support
>> > > > creation/updating variables at runtime, such as when you install a
>> > > > distribution for the first time, or want to boot a new kernel filename
>> > > > directly from the firmware without a boot loader (and need to modify the
>> > > > BootXXXX variables).
>> > >
>> > > Do we know the precise position and address range of these variables?
>> > >
>> > > We could map them writable (but not executable), and the rest executable (but
>> > > not writable).
>> >
>> > The variables are stored in NVRAM, which we don't map into the kernel virtual
>> > address space. [...]
>>
>> Just curious: is there firmware that memory maps those variables privately?
>>
>> > [...] We have to initiate the transaction of writing to the variables by
>> > executing EFI runtime services.
>> >
>> > We obviously have buffers that we pass to the BIOS that contain variable data,
>> > but these should be NX anyway because they're regular kernel allocations.
>> >
>> > > That raises the question whether the same physical page ever mixes variables
>> > > and actual code - but the hope would be that it's suffiently page granular for
>> > > this to work.
>> >
>> > I don't think that would ever happen.
>>
>> Ok, that's promising, so how about this then to solve the security weakness the
>> new warning unearthed: map the whole EFI range as 'r-x (NX)', but detect writes
>> from the page fault handler and transparently allow them to flip over the range
>> to 'rw-'.
>
> So I meant to say 'page' instead of 'range'.
>
> I.e. this dynamic mechanism would flip pages over to 'rw-', as write faults occur
> from EFI code that writes to them.
>
> We don't need to know which regions are writable data, and which regions are
> executable-code/readonly-data.
>
> The following aspect would guarantee safety:
>
>> Note that for security reasons we don't allow a subsequent flipping back to NX
>> if there's an NX fault on the same page, i.e. this new mechanism is a monotonic
>> one-way process that should dynamically 'map out' data pages versus executable
>> pages.
>>
>> It should also be pretty robust, assuming we can take page faults while EFI code
>> is executing and is trying to modify EFI data: is that the case?
>
> and this is why I asked whether boundaries between 'Code' and 'Writable data'
> sections are page granular - which they do appear to be. (i.e. there are no
> singular pages that are both writable data and code at once.)
>

No, sadly they are not. Only in specific cases (which have to do with
the new UEFIv2.5 memory protection feature that got this discussion
started in the first place) can we assume that UEFI runtime pages are
mappable either RW- or R-X but not RWX, and in those cases, we have
the permissions bits that tell us unambiguously which pages are text
and which are data. For the remaining cases, which is the vast
majority, no such assumptions can be made, and since the UEFI runtime
regions are typically populated with a bunch of PE/COFF images (each
of which consists of text + data), inferring where the boundaries are
between them does not seem tractable (for instance, to only map
'boundary' pages RWX)

-- 
Ard.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-21 12:57                             ` Ard Biesheuvel
@ 2015-10-21 13:24                               ` Borislav Petkov
  2015-10-21 13:28                                 ` Ard Biesheuvel
  0 siblings, 1 reply; 36+ messages in thread
From: Borislav Petkov @ 2015-10-21 13:24 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Ingo Molnar, Matt Fleming, Stephen Smalley, x86, linux-kernel,
	Kees Cook, Thomas Gleixner, H. Peter Anvin, Peter Zijlstra,
	Andy Lutomirski, Denys Vlasenko, Brian Gerst, linux-efi

On Wed, Oct 21, 2015 at 02:57:47PM +0200, Ard Biesheuvel wrote:
> ... For the remaining cases, which is the vast majority, no such
> assumptions can be made, and since the UEFI runtime regions are
> typically populated with a bunch of PE/COFF images (each of which
> consists of text + data), inferring where the boundaries are between
> them does not seem tractable (for instance, to only map 'boundary'
> pages RWX)

How much of a problem would it be if we still do the on-demand page
faulting and map a trailing piece of code together with the data in a
page RWX?

Still better than mapping the *whole* thing RWX, no?

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-21 13:24                               ` Borislav Petkov
@ 2015-10-21 13:28                                 ` Ard Biesheuvel
  2015-10-21 14:36                                   ` Borislav Petkov
  0 siblings, 1 reply; 36+ messages in thread
From: Ard Biesheuvel @ 2015-10-21 13:28 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Ingo Molnar, Matt Fleming, Stephen Smalley, x86, linux-kernel,
	Kees Cook, Thomas Gleixner, H. Peter Anvin, Peter Zijlstra,
	Andy Lutomirski, Denys Vlasenko, Brian Gerst, linux-efi

On 21 October 2015 at 15:24, Borislav Petkov <bp@alien8.de> wrote:
> On Wed, Oct 21, 2015 at 02:57:47PM +0200, Ard Biesheuvel wrote:
>> ... For the remaining cases, which is the vast majority, no such
>> assumptions can be made, and since the UEFI runtime regions are
>> typically populated with a bunch of PE/COFF images (each of which
>> consists of text + data), inferring where the boundaries are between
>> them does not seem tractable (for instance, to only map 'boundary'
>> pages RWX)
>
> How much of a problem would it be if we still do the on-demand page
> faulting and map a trailing piece of code together with the data in a
> page RWX?
>
> Still better than mapping the *whole* thing RWX, no?
>

In theory, yes. In practice, since this is supposed to be a security
enhancement, we need some kind of ground truth to tell us which pages
can be legally modified *and* executed, so that we can detect the
illegal cases. My point was that, since a multitude of PE/COFF images
can be covered by a single EfiRuntimeServicesCode region, the UEFI
memory map does not give us enough information to make the distinction
between a page that sits on the text/data boundary of some PE/COFF
image and a page that sits wholly in either.

-- 
Ard.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-21 13:28                                 ` Ard Biesheuvel
@ 2015-10-21 14:36                                   ` Borislav Petkov
  2015-10-21 18:46                                     ` Andy Lutomirski
  0 siblings, 1 reply; 36+ messages in thread
From: Borislav Petkov @ 2015-10-21 14:36 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Ingo Molnar, Matt Fleming, Stephen Smalley, x86, linux-kernel,
	Kees Cook, Thomas Gleixner, H. Peter Anvin, Peter Zijlstra,
	Andy Lutomirski, Denys Vlasenko, Brian Gerst, linux-efi

On Wed, Oct 21, 2015 at 03:28:56PM +0200, Ard Biesheuvel wrote:
> In theory, yes. In practice, since this is supposed to be a security
> enhancement, we need some kind of ground truth to tell us which pages
> can be legally modified *and* executed, so that we can detect the
> illegal cases. My point was that, since a multitude of PE/COFF images
> can be covered by a single EfiRuntimeServicesCode region, the UEFI
> memory map does not give us enough information to make the distinction
> between a page that sits on the text/data boundary of some PE/COFF
> image and a page that sits wholly in either.

Well, we're going to simply allow the accesses to in-kernel users which
fault on those ranges, assuming that in-kernel modifiers are legit and
DTRT. Which means, we don't really need to know which pages can be
legally modified - we simply trust the in-kernel users.

The moment you're able to load an evil kernel module, guarding against
those writes is the last thing you need to worry about...

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-21 14:36                                   ` Borislav Petkov
@ 2015-10-21 18:46                                     ` Andy Lutomirski
  2015-10-21 20:45                                       ` Matt Fleming
  0 siblings, 1 reply; 36+ messages in thread
From: Andy Lutomirski @ 2015-10-21 18:46 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Ard Biesheuvel, Ingo Molnar, Matt Fleming, Stephen Smalley, x86,
	linux-kernel, Kees Cook, Thomas Gleixner, H. Peter Anvin,
	Peter Zijlstra, Andy Lutomirski, Denys Vlasenko, Brian Gerst,
	linux-efi

On Wed, Oct 21, 2015 at 7:36 AM, Borislav Petkov <bp@alien8.de> wrote:
> On Wed, Oct 21, 2015 at 03:28:56PM +0200, Ard Biesheuvel wrote:
>> In theory, yes. In practice, since this is supposed to be a security
>> enhancement, we need some kind of ground truth to tell us which pages
>> can be legally modified *and* executed, so that we can detect the
>> illegal cases. My point was that, since a multitude of PE/COFF images
>> can be covered by a single EfiRuntimeServicesCode region, the UEFI
>> memory map does not give us enough information to make the distinction
>> between a page that sits on the text/data boundary of some PE/COFF
>> image and a page that sits wholly in either.
>
> Well, we're going to simply allow the accesses to in-kernel users which
> fault on those ranges, assuming that in-kernel modifiers are legit and
> DTRT. Which means, we don't really need to know which pages can be
> legally modified - we simply trust the in-kernel users.
>
> The moment you're able to load an evil kernel module, guarding against
> those writes is the last thing you need to worry about...

I don't think we can do a whole lot to help against broken UEFI code,
but having anything mapped RWX is a nice target for people trying to
exploit kernel bugs.  Hence my suggestion to clear W except when
actually running UEFI code.

If the UEFI stuff is mapped in its own PGD entry, we could just RO
that entire PGD entry everywhere except the UEFI pgd (and make sure to
clear G so that the TLB entries get zapped).

--Andy

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-21  9:42                         ` Ingo Molnar
  2015-10-21 12:49                           ` Ingo Molnar
@ 2015-10-21 20:38                           ` Matt Fleming
  1 sibling, 0 replies; 36+ messages in thread
From: Matt Fleming @ 2015-10-21 20:38 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Borislav Petkov, Stephen Smalley, x86, linux-kernel, keescook,
	Thomas Gleixner, H. Peter Anvin, Peter Zijlstra, Andy Lutomirski,
	Denys Vlasenko, Brian Gerst, linux-efi, Ard Biesheuvel

On Wed, 21 Oct, at 11:42:42AM, Ingo Molnar wrote:
> 
> * Matt Fleming <matt@codeblueprint.co.uk> wrote:
> 
> > > > Right, we could do that, but then we wouldn't be able to support 
> > > > creation/updating variables at runtime, such as when you install a 
> > > > distribution for the first time, or want to boot a new kernel filename 
> > > > directly from the firmware without a boot loader (and need to modify the 
> > > > BootXXXX variables).
> > > 
> > > Do we know the precise position and address range of these variables?
> > > 
> > > We could map them writable (but not executable), and the rest executable (but 
> > > not writable).
> >  
> > The variables are stored in NVRAM, which we don't map into the kernel virtual 
> > address space. [...]
> 
> Just curious: is there firmware that memory maps those variables privately?
 
Good question, not sure. I suspect not because it becomes much harder
to protect those oh-so-precious variables from errant code wanting to
write to them.

Usually things get written on x86 from SMM code.

> > [...] We have to initiate the transaction of writing to the variables by 
> > executing EFI runtime services.
> > 
> > We obviously have buffers that we pass to the BIOS that contain variable data, 
> > but these should be NX anyway because they're regular kernel allocations.
> > 
> > > That raises the question whether the same physical page ever mixes variables 
> > > and actual code - but the hope would be that it's suffiently page granular for 
> > > this to work.
> > 
> > I don't think that would ever happen.
> 
> Ok, that's promising, so how about this then to solve the security weakness the 
> new warning unearthed: map the whole EFI range as 'r-x (NX)', but detect writes 
> from the page fault handler and transparently allow them to flip over the range to 
> 'rw-'.
> 
> Note that for security reasons we don't allow a subsequent flipping back to NX if 
> there's an NX fault on the same page, i.e. this new mechanism is a monotonic 
> one-way process that should dynamically 'map out' data pages versus executable 
> pages.
> 
> It should also be pretty robust, assuming we can take page faults while EFI code 
> is executing and is trying to modify EFI data: is that the case?

Yes, we can do that but I think I misunderstood what you were asking
when you said,

 > That raises the question whether the same physical page ever mixes variables and
 > actual code - but the hope would be that it's suffiently page granular for this to
 > work.
 
I was talking about EFI variables as defined in the UEFI spec, i.e.
backed by some peristent storage mechanism. I wasn't talking about
".data" objects.

It *is* possible for physical pages to contain both EFI code and data,
as Ard mentioned, and we have no way of distinguishing when EFI code
tried to write to a EfiRuntimeServicesCode page/region because there's
also legitimate data there and when an exploit attempt is taking
place.

In which case, I think we'd essentially map everything with execute
permission apart from the heap and other dynamically allocated objects
stored in EfiRuntimeServicesData.

But at that point, can't we just leave all these regions unmapped
unless we're in the EFI code paths? And that includes not leaving the
mappings around duing the suspend/resume code. 

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-21 18:46                                     ` Andy Lutomirski
@ 2015-10-21 20:45                                       ` Matt Fleming
  2015-10-21 20:49                                         ` Andy Lutomirski
  0 siblings, 1 reply; 36+ messages in thread
From: Matt Fleming @ 2015-10-21 20:45 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Borislav Petkov, Ard Biesheuvel, Ingo Molnar, Stephen Smalley,
	x86, linux-kernel, Kees Cook, Thomas Gleixner, H. Peter Anvin,
	Peter Zijlstra, Andy Lutomirski, Denys Vlasenko, Brian Gerst,
	linux-efi

On Wed, 21 Oct, at 11:46:53AM, Andy Lutomirski wrote:
>
> If the UEFI stuff is mapped in its own PGD entry, we could just RO
> that entire PGD entry everywhere except the UEFI pgd (and make sure to
> clear G so that the TLB entries get zapped).

What would be the benefit of making it RO as opposed to not having it
mapped at all? The mappings only exist in the trampoline_pgd right now
for x86 which minimizes the potentially vulnerable code paths to the
EFI runtime calls and the suspend/resume code.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH v2] x86/mm: warn on W+x mappings
  2015-10-21 20:45                                       ` Matt Fleming
@ 2015-10-21 20:49                                         ` Andy Lutomirski
  0 siblings, 0 replies; 36+ messages in thread
From: Andy Lutomirski @ 2015-10-21 20:49 UTC (permalink / raw)
  To: Matt Fleming
  Cc: Borislav Petkov, Ard Biesheuvel, Ingo Molnar, Stephen Smalley,
	x86, linux-kernel, Kees Cook, Thomas Gleixner, H. Peter Anvin,
	Peter Zijlstra, Andy Lutomirski, Denys Vlasenko, Brian Gerst,
	linux-efi

On Wed, Oct 21, 2015 at 1:45 PM, Matt Fleming <matt@codeblueprint.co.uk> wrote:
> On Wed, 21 Oct, at 11:46:53AM, Andy Lutomirski wrote:
>>
>> If the UEFI stuff is mapped in its own PGD entry, we could just RO
>> that entire PGD entry everywhere except the UEFI pgd (and make sure to
>> clear G so that the TLB entries get zapped).
>
> What would be the benefit of making it RO as opposed to not having it
> mapped at all?

Nothing.

> The mappings only exist in the trampoline_pgd right now
> for x86 which minimizes the potentially vulnerable code paths to the
> EFI runtime calls and the suspend/resume code.

Oh, I didn't realize it.

So what's the problem here?  Honestly, while UEFI is full of
questionable things, I don't really see how an unprivileged user
program should be able to cause malicious input to be send to UEFI
code, so it should be quite difficult to exploit a buffer overflow or
other errant write in UEFI to escalate privileges from user to
anything else.  (Kernel -> SMM escalation is a whole different story,
but preventing that is SMM's business, not the kernel's.  I've
actually been a wee bit tempted to write a /dev/smram driver to expose
SMRAM using a portfolio of old known exploits.)

--Andy

-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 36+ messages in thread

end of thread, other threads:[~2015-10-21 20:49 UTC | newest]

Thread overview: 36+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-10-02 19:29 [PATCH v2] x86/mm: warn on W+x mappings Stephen Smalley
2015-10-02 20:44 ` Kees Cook
2015-10-03 11:27 ` Ingo Molnar
2015-10-05 19:13   ` Stephen Smalley
2015-10-06  7:32     ` Ingo Molnar
2015-10-06 15:37       ` Stephen Smalley
2015-10-12 11:36         ` Borislav Petkov
2015-10-12 12:41           ` Matt Fleming
2015-10-12 12:49             ` Ingo Molnar
2015-10-12 12:55               ` Matt Fleming
2015-10-12 14:17                 ` Ingo Molnar
2015-10-12 14:49                   ` Matt Fleming
2015-10-12 15:34                     ` Ard Biesheuvel
2015-10-12 15:50                       ` Matt Fleming
2015-10-12 16:43                         ` Ard Biesheuvel
2015-10-14 15:18                     ` Ingo Molnar
2015-10-14 15:30                       ` Andy Lutomirski
2015-10-14 15:35                         ` Borislav Petkov
2015-10-15 10:10                           ` Matt Fleming
2015-10-15 10:33                             ` Borislav Petkov
2015-10-16  1:45                               ` Ricardo Neri
2015-10-14 21:02                       ` Matt Fleming
2015-10-21  9:42                         ` Ingo Molnar
2015-10-21 12:49                           ` Ingo Molnar
2015-10-21 12:57                             ` Ard Biesheuvel
2015-10-21 13:24                               ` Borislav Petkov
2015-10-21 13:28                                 ` Ard Biesheuvel
2015-10-21 14:36                                   ` Borislav Petkov
2015-10-21 18:46                                     ` Andy Lutomirski
2015-10-21 20:45                                       ` Matt Fleming
2015-10-21 20:49                                         ` Andy Lutomirski
2015-10-21 20:38                           ` Matt Fleming
2015-10-12 14:56                   ` Josh Triplett
2015-10-14 15:19                     ` Ingo Molnar
2015-10-14 16:47                       ` Josh Triplett
2015-10-21  9:43                         ` Ingo Molnar

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).