All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCHv2] arm64: add support to dump the kernel page tables
@ 2014-11-17 22:18 Laura Abbott
  2014-11-19 23:01 ` Kees Cook
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Laura Abbott @ 2014-11-17 22:18 UTC (permalink / raw)
  To: linux-arm-kernel

In a similar manner to arm, it's useful to be able to dump the page
tables to verify permissions and memory types. Add a debugfs file
to check the page tables.

Signed-off-by: Laura Abbott <lauraa@codeaurora.org>
---
v2: Addressed comments from Mark Rutland. Made the logic a bit more
consistent between functions. Now tested on 4K and 64K pages.
---
 arch/arm64/Kconfig.debug |  12 ++
 arch/arm64/mm/Makefile   |   1 +
 arch/arm64/mm/dump.c     | 358 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 371 insertions(+)
 create mode 100644 arch/arm64/mm/dump.c

diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug
index 0a12933..5fdd6dc 100644
--- a/arch/arm64/Kconfig.debug
+++ b/arch/arm64/Kconfig.debug
@@ -6,6 +6,18 @@ config FRAME_POINTER
 	bool
 	default y
 
+config ARM64_PTDUMP
+	bool "Export kernel pagetable layout to userspace via debugfs"
+	depends on DEBUG_KERNEL
+	select DEBUG_FS
+        help
+	  Say Y here if you want to show the kernel pagetable layout in a
+	  debugfs file. This information is only useful for kernel developers
+	  who are working in architecture specific areas of the kernel.
+	  It is probably not a good idea to enable this feature in a production
+	  kernel.
+	  If in doubt, say "N"
+
 config STRICT_DEVMEM
 	bool "Filter access to /dev/mem"
 	depends on MMU
diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile
index c56179e..773d37a 100644
--- a/arch/arm64/mm/Makefile
+++ b/arch/arm64/mm/Makefile
@@ -3,3 +3,4 @@ obj-y				:= dma-mapping.o extable.o fault.o init.o \
 				   ioremap.o mmap.o pgd.o mmu.o \
 				   context.o proc.o pageattr.o
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
+obj-$(CONFIG_ARM64_PTDUMP)	+= dump.o
diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c
new file mode 100644
index 0000000..57dcee4
--- /dev/null
+++ b/arch/arm64/mm/dump.c
@@ -0,0 +1,358 @@
+/*
+ * Copyright (c) 2014, The Linux Foundation. All rights reserved.
+ * Debug helper to dump the current kernel pagetables of the system
+ * so that we can see what the various memory ranges are set to.
+ *
+ * Derived from x86 and arm implementation:
+ * (C) Copyright 2008 Intel Corporation
+ *
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/seq_file.h>
+
+#include <asm/pgtable.h>
+
+#define LOWEST_ADDR	(UL(0xffffffffffffffff) << VA_BITS)
+
+struct addr_marker {
+	unsigned long start_address;
+	const char *name;
+};
+
+static struct addr_marker address_markers[] = {
+	{ 0,			"vmalloc() Area" },
+	{ VMALLOC_END,		"vmalloc() End" },
+	{ MODULES_VADDR,	"Modules" },
+	{ PAGE_OFFSET,		"Kernel Mapping" },
+	{ -1,			NULL },
+};
+
+struct pg_state {
+	struct seq_file *seq;
+	const struct addr_marker *marker;
+	unsigned long start_address;
+	unsigned level;
+	u64 current_prot;
+};
+
+struct prot_bits {
+	u64		mask;
+	u64		val;
+	const char	*set;
+	const char	*clear;
+};
+
+static const struct prot_bits pte_bits[] = {
+	{
+		.mask	= PTE_USER,
+		.val	= PTE_USER,
+		.set	= "USR",
+		.clear	= "   ",
+	}, {
+		.mask	= PTE_RDONLY,
+		.val	= PTE_RDONLY,
+		.set	= "ro",
+		.clear	= "RW",
+	}, {
+		.mask	= PTE_PXN,
+		.val	= PTE_PXN,
+		.set	= "NX",
+		.clear	= "x ",
+	}, {
+		.mask	= PTE_SHARED,
+		.val	= PTE_SHARED,
+		.set	= "SHD",
+		.clear	= "   ",
+	}, {
+		.mask	= PTE_AF,
+		.val	= PTE_AF,
+		.set	= "AF",
+		.clear	= "  ",
+	}, {
+		.mask	= PTE_NG,
+		.val	= PTE_NG,
+		.set	= "NG",
+		.clear	= "  ",
+	}, {
+		.mask	= PTE_UXN,
+		.val	= PTE_UXN,
+		.set	= "UXN",
+	}, {
+		.mask	= PTE_ATTRINDX_MASK,
+		.val	= PTE_ATTRINDX(MT_DEVICE_nGnRnE),
+		.set	= "DEVICE/nGnRnE",
+	}, {
+		.mask	= PTE_ATTRINDX_MASK,
+		.val	= PTE_ATTRINDX(MT_DEVICE_nGnRE),
+		.set	= "DEVICE/nGnRE",
+	}, {
+		.mask	= PTE_ATTRINDX_MASK,
+		.val	= PTE_ATTRINDX(MT_DEVICE_GRE),
+		.set	= "DEVICE/GRE",
+	}, {
+		.mask	= PTE_ATTRINDX_MASK,
+		.val	= PTE_ATTRINDX(MT_NORMAL_NC),
+		.set	= "MEM/BUFFERABLE",
+	}, {
+		.mask	= PTE_ATTRINDX_MASK,
+		.val	= PTE_ATTRINDX(MT_NORMAL),
+		.set	= "MEM/NORMAL",
+	}
+};
+
+static const struct prot_bits section_bits[] = {
+	{
+		.mask	= PMD_SECT_USER,
+		.val	= PMD_SECT_USER,
+		.set	= "USR",
+		.clear	= "   ",
+	}, {
+		.mask	= PMD_SECT_RDONLY,
+		.val	= PMD_SECT_RDONLY,
+		.set	= "ro",
+		.clear	= "RW",
+	}, {
+		.mask	= PMD_SECT_PXN,
+		.val	= PMD_SECT_PXN,
+		.set	= "NX",
+		.clear	= "x ",
+	}, {
+		.mask	= PMD_SECT_S,
+		.val	= PMD_SECT_S,
+		.set	= "SHD",
+		.clear	= "   ",
+	}, {
+		.mask	= PMD_SECT_AF,
+		.val	= PMD_SECT_AF,
+		.set	= "AF",
+		.clear	= "  ",
+	}, {
+		.mask	= PMD_SECT_NG,
+		.val	= PMD_SECT_NG,
+		.set	= "NG",
+		.clear	= "  ",
+	}, {
+		.mask	= PMD_SECT_UXN,
+		.val	= PMD_SECT_UXN,
+		.set	= "UXN",
+	}, {
+		.mask	= PTE_ATTRINDX_MASK,
+		.val	= PTE_ATTRINDX(MT_DEVICE_nGnRnE),
+		.set	= "DEVICE/nGnRnE",
+	}, {
+		.mask	= PTE_ATTRINDX_MASK,
+		.val	= PTE_ATTRINDX(MT_DEVICE_nGnRE),
+		.set	= "DEVICE/nGnRE",
+	}, {
+		.mask	= PTE_ATTRINDX_MASK,
+		.val	= PTE_ATTRINDX(MT_DEVICE_GRE),
+		.set	= "DEVICE/GRE",
+	}, {
+		.mask	= PTE_ATTRINDX_MASK,
+		.val	= PTE_ATTRINDX(MT_NORMAL_NC),
+		.set	= "MEM/BUFFERABLE",
+	}, {
+		.mask	= PTE_ATTRINDX_MASK,
+		.val	= PTE_ATTRINDX(MT_NORMAL),
+		.set	= "MEM/NORMAL",
+	}
+};
+
+struct pg_level {
+	const struct prot_bits *bits;
+	size_t num;
+	u64 mask;
+};
+
+static struct pg_level pg_level[] = {
+	{
+	}, { /* pgd */
+		.bits	= pte_bits,
+		.num	= ARRAY_SIZE(pte_bits),
+	}, { /* pud */
+		.bits	= pte_bits,
+		.num	= ARRAY_SIZE(pte_bits),
+	}, { /* pmd */
+		.bits	= section_bits,
+		.num	= ARRAY_SIZE(section_bits),
+	}, { /* pte */
+		.bits	= pte_bits,
+		.num	= ARRAY_SIZE(pte_bits),
+	},
+};
+
+static void dump_prot(struct pg_state *st, const struct prot_bits *bits,
+			size_t num)
+{
+	unsigned i;
+
+	for (i = 0; i < num; i++, bits++) {
+		const char *s;
+
+		if ((st->current_prot & bits->mask) == bits->val)
+			s = bits->set;
+		else
+			s = bits->clear;
+
+		if (s)
+			seq_printf(st->seq, " %s", s);
+	}
+}
+
+static void note_page(struct pg_state *st, unsigned long addr, unsigned level,
+				u64 val)
+{
+	static const char units[] = "KMGTPE";
+	u64 prot = val & pg_level[level].mask;
+
+	if (addr < LOWEST_ADDR)
+		return;
+
+	if (!st->level) {
+		st->level = level;
+		st->current_prot = prot;
+		st->start_address = addr;
+		seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
+	} else if (prot != st->current_prot || level != st->level ||
+		   addr >= st->marker[1].start_address) {
+		const char *unit = units;
+		unsigned long delta;
+
+		if (st->current_prot) {
+			seq_printf(st->seq, "0x%16lx-0x%16lx   ",
+				   st->start_address, addr);
+
+			delta = (addr - st->start_address) >> 10;
+			while (!(delta & 1023) && unit[1]) {
+				delta >>= 10;
+				unit++;
+			}
+			seq_printf(st->seq, "%9lu%c", delta, *unit);
+			if (pg_level[st->level].bits)
+				dump_prot(st, pg_level[st->level].bits,
+					  pg_level[st->level].num);
+			seq_puts(st->seq, "\n");
+		}
+
+		if (addr >= st->marker[1].start_address) {
+			st->marker++;
+			seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
+		}
+		st->start_address = addr;
+		st->current_prot = prot;
+		st->level = level;
+	}
+}
+
+static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start)
+{
+	pte_t *pte = pte_offset_kernel(pmd, 0);
+	unsigned long addr;
+	unsigned i;
+
+	for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
+		addr = start + i * PAGE_SIZE;
+		note_page(st, addr, 4, pte_val(*pte));
+	}
+}
+
+static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
+{
+	pmd_t *pmd = pmd_offset(pud, 0);
+	unsigned long addr;
+	unsigned i;
+
+	for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
+		addr = start + i * PMD_SIZE;
+		if (pmd_none(*pmd) || pmd_sect(*pmd) || pmd_bad(*pmd))
+			note_page(st, addr, 3, pmd_val(*pmd));
+		else
+			walk_pte(st, pmd, addr);
+	}
+}
+
+static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
+{
+	pud_t *pud = pud_offset(pgd, 0);
+	unsigned long addr;
+	unsigned i;
+
+	for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
+		addr = start + i * PUD_SIZE;
+		if (pud_none(*pud) || pud_sect(*pud) || pud_bad(*pud))
+			note_page(st, addr, 2, pud_val(*pud));
+		else
+			walk_pmd(st, pud, addr);
+	}
+}
+
+static unsigned long normalize_addr(unsigned long u)
+{
+	return u | LOWEST_ADDR;
+}
+
+static void walk_pgd(struct pg_state *st, pgd_t *pgd, unsigned long start)
+{
+	unsigned i;
+	unsigned long addr;
+
+	for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
+		addr = normalize_addr(start + i * PGDIR_SIZE);
+		if (pgd_none(*pgd) || pgd_bad(*pgd))
+			note_page(st, addr, 1, pgd_val(*pgd));
+		else
+			walk_pud(st, pgd, addr);
+	}
+}
+
+static int ptdump_show(struct seq_file *m, void *v)
+{
+	struct pg_state st;
+
+	memset(&st, 0, sizeof(st));
+	st.seq = m;
+	st.marker = address_markers;
+
+	walk_pgd(&st, swapper_pg_dir, 0);
+
+	note_page(&st, 0, 0, 0);
+	return 0;
+}
+
+static int ptdump_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ptdump_show, NULL);
+}
+
+static const struct file_operations ptdump_fops = {
+	.open		= ptdump_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int ptdump_init(void)
+{
+	struct dentry *pe;
+	unsigned i, j;
+
+	for (i = 0; i < ARRAY_SIZE(pg_level); i++)
+		if (pg_level[i].bits)
+			for (j = 0; j < pg_level[i].num; j++)
+				pg_level[i].mask |= pg_level[i].bits[j].mask;
+
+	address_markers[0].start_address = VMALLOC_START;
+
+	pe = debugfs_create_file("kernel_page_tables", 0400, NULL, NULL,
+				 &ptdump_fops);
+	return pe ? 0 : -ENOMEM;
+}
+device_initcall(ptdump_init);
-- 
Qualcomm Innovation Center, Inc.
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCHv2] arm64: add support to dump the kernel page tables
  2014-11-17 22:18 [PATCHv2] arm64: add support to dump the kernel page tables Laura Abbott
@ 2014-11-19 23:01 ` Kees Cook
  2014-11-20 23:20   ` CONFIG_DEBUG_SET_MODULE_RONX broken (was Re: [PATCHv2] arm64: add support to dump the kernel page tables) Laura Abbott
  2014-11-24 16:05 ` [PATCHv2] arm64: add support to dump the kernel page tables Steve Capper
  2014-11-24 19:28 ` Mark Rutland
  2 siblings, 1 reply; 8+ messages in thread
From: Kees Cook @ 2014-11-19 23:01 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Nov 17, 2014 at 2:18 PM, Laura Abbott <lauraa@codeaurora.org> wrote:
> In a similar manner to arm, it's useful to be able to dump the page
> tables to verify permissions and memory types. Add a debugfs file
> to check the page tables.
>
> Signed-off-by: Laura Abbott <lauraa@codeaurora.org>

This seems to behave well for me. Thanks!

Tested-by: Kees Cook <keescook@chromium.org>

In my configuration, though, with CONFIG_DEBUG_SET_MODULE_RONX=y, I'm
only seeing RO, and no NX changes. I haven't constructed a test for
the module memory behavior directly (to see if this is a page table
issue or a PTDUMP reporting issue). This series I'm testing has gone
through some backporting on my end, so I wanted to just double-check
and see if you saw this too, or if it's specific to my environment:

---[ Modules ]---
0xffffffbffc000000-0xffffffbffc005000          20K     ro x  SHD AF
UXN MEM/NORMAL
0xffffffbffc005000-0xffffffbffc007000           8K     RW x  SHD AF
UXN MEM/NORMAL
0xffffffbffc00c000-0xffffffbffc00e000           8K     ro x  SHD AF
UXN MEM/NORMAL
0xffffffbffc00e000-0xffffffbffc010000           8K     RW x  SHD AF
UXN MEM/NORMAL
...

Thanks,

-Kees

> ---
> v2: Addressed comments from Mark Rutland. Made the logic a bit more
> consistent between functions. Now tested on 4K and 64K pages.
> ---
>  arch/arm64/Kconfig.debug |  12 ++
>  arch/arm64/mm/Makefile   |   1 +
>  arch/arm64/mm/dump.c     | 358 +++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 371 insertions(+)
>  create mode 100644 arch/arm64/mm/dump.c
>
> diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug
> index 0a12933..5fdd6dc 100644
> --- a/arch/arm64/Kconfig.debug
> +++ b/arch/arm64/Kconfig.debug
> @@ -6,6 +6,18 @@ config FRAME_POINTER
>         bool
>         default y
>
> +config ARM64_PTDUMP
> +       bool "Export kernel pagetable layout to userspace via debugfs"
> +       depends on DEBUG_KERNEL
> +       select DEBUG_FS
> +        help
> +         Say Y here if you want to show the kernel pagetable layout in a
> +         debugfs file. This information is only useful for kernel developers
> +         who are working in architecture specific areas of the kernel.
> +         It is probably not a good idea to enable this feature in a production
> +         kernel.
> +         If in doubt, say "N"
> +
>  config STRICT_DEVMEM
>         bool "Filter access to /dev/mem"
>         depends on MMU
> diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile
> index c56179e..773d37a 100644
> --- a/arch/arm64/mm/Makefile
> +++ b/arch/arm64/mm/Makefile
> @@ -3,3 +3,4 @@ obj-y                           := dma-mapping.o extable.o fault.o init.o \
>                                    ioremap.o mmap.o pgd.o mmu.o \
>                                    context.o proc.o pageattr.o
>  obj-$(CONFIG_HUGETLB_PAGE)     += hugetlbpage.o
> +obj-$(CONFIG_ARM64_PTDUMP)     += dump.o
> diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c
> new file mode 100644
> index 0000000..57dcee4
> --- /dev/null
> +++ b/arch/arm64/mm/dump.c
> @@ -0,0 +1,358 @@
> +/*
> + * Copyright (c) 2014, The Linux Foundation. All rights reserved.
> + * Debug helper to dump the current kernel pagetables of the system
> + * so that we can see what the various memory ranges are set to.
> + *
> + * Derived from x86 and arm implementation:
> + * (C) Copyright 2008 Intel Corporation
> + *
> + * Author: Arjan van de Ven <arjan@linux.intel.com>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; version 2
> + * of the License.
> + */
> +#include <linux/debugfs.h>
> +#include <linux/fs.h>
> +#include <linux/mm.h>
> +#include <linux/seq_file.h>
> +
> +#include <asm/pgtable.h>
> +
> +#define LOWEST_ADDR    (UL(0xffffffffffffffff) << VA_BITS)
> +
> +struct addr_marker {
> +       unsigned long start_address;
> +       const char *name;
> +};
> +
> +static struct addr_marker address_markers[] = {
> +       { 0,                    "vmalloc() Area" },
> +       { VMALLOC_END,          "vmalloc() End" },
> +       { MODULES_VADDR,        "Modules" },
> +       { PAGE_OFFSET,          "Kernel Mapping" },
> +       { -1,                   NULL },
> +};
> +
> +struct pg_state {
> +       struct seq_file *seq;
> +       const struct addr_marker *marker;
> +       unsigned long start_address;
> +       unsigned level;
> +       u64 current_prot;
> +};
> +
> +struct prot_bits {
> +       u64             mask;
> +       u64             val;
> +       const char      *set;
> +       const char      *clear;
> +};
> +
> +static const struct prot_bits pte_bits[] = {
> +       {
> +               .mask   = PTE_USER,
> +               .val    = PTE_USER,
> +               .set    = "USR",
> +               .clear  = "   ",
> +       }, {
> +               .mask   = PTE_RDONLY,
> +               .val    = PTE_RDONLY,
> +               .set    = "ro",
> +               .clear  = "RW",
> +       }, {
> +               .mask   = PTE_PXN,
> +               .val    = PTE_PXN,
> +               .set    = "NX",
> +               .clear  = "x ",
> +       }, {
> +               .mask   = PTE_SHARED,
> +               .val    = PTE_SHARED,
> +               .set    = "SHD",
> +               .clear  = "   ",
> +       }, {
> +               .mask   = PTE_AF,
> +               .val    = PTE_AF,
> +               .set    = "AF",
> +               .clear  = "  ",
> +       }, {
> +               .mask   = PTE_NG,
> +               .val    = PTE_NG,
> +               .set    = "NG",
> +               .clear  = "  ",
> +       }, {
> +               .mask   = PTE_UXN,
> +               .val    = PTE_UXN,
> +               .set    = "UXN",
> +       }, {
> +               .mask   = PTE_ATTRINDX_MASK,
> +               .val    = PTE_ATTRINDX(MT_DEVICE_nGnRnE),
> +               .set    = "DEVICE/nGnRnE",
> +       }, {
> +               .mask   = PTE_ATTRINDX_MASK,
> +               .val    = PTE_ATTRINDX(MT_DEVICE_nGnRE),
> +               .set    = "DEVICE/nGnRE",
> +       }, {
> +               .mask   = PTE_ATTRINDX_MASK,
> +               .val    = PTE_ATTRINDX(MT_DEVICE_GRE),
> +               .set    = "DEVICE/GRE",
> +       }, {
> +               .mask   = PTE_ATTRINDX_MASK,
> +               .val    = PTE_ATTRINDX(MT_NORMAL_NC),
> +               .set    = "MEM/BUFFERABLE",
> +       }, {
> +               .mask   = PTE_ATTRINDX_MASK,
> +               .val    = PTE_ATTRINDX(MT_NORMAL),
> +               .set    = "MEM/NORMAL",
> +       }
> +};
> +
> +static const struct prot_bits section_bits[] = {
> +       {
> +               .mask   = PMD_SECT_USER,
> +               .val    = PMD_SECT_USER,
> +               .set    = "USR",
> +               .clear  = "   ",
> +       }, {
> +               .mask   = PMD_SECT_RDONLY,
> +               .val    = PMD_SECT_RDONLY,
> +               .set    = "ro",
> +               .clear  = "RW",
> +       }, {
> +               .mask   = PMD_SECT_PXN,
> +               .val    = PMD_SECT_PXN,
> +               .set    = "NX",
> +               .clear  = "x ",
> +       }, {
> +               .mask   = PMD_SECT_S,
> +               .val    = PMD_SECT_S,
> +               .set    = "SHD",
> +               .clear  = "   ",
> +       }, {
> +               .mask   = PMD_SECT_AF,
> +               .val    = PMD_SECT_AF,
> +               .set    = "AF",
> +               .clear  = "  ",
> +       }, {
> +               .mask   = PMD_SECT_NG,
> +               .val    = PMD_SECT_NG,
> +               .set    = "NG",
> +               .clear  = "  ",
> +       }, {
> +               .mask   = PMD_SECT_UXN,
> +               .val    = PMD_SECT_UXN,
> +               .set    = "UXN",
> +       }, {
> +               .mask   = PTE_ATTRINDX_MASK,
> +               .val    = PTE_ATTRINDX(MT_DEVICE_nGnRnE),
> +               .set    = "DEVICE/nGnRnE",
> +       }, {
> +               .mask   = PTE_ATTRINDX_MASK,
> +               .val    = PTE_ATTRINDX(MT_DEVICE_nGnRE),
> +               .set    = "DEVICE/nGnRE",
> +       }, {
> +               .mask   = PTE_ATTRINDX_MASK,
> +               .val    = PTE_ATTRINDX(MT_DEVICE_GRE),
> +               .set    = "DEVICE/GRE",
> +       }, {
> +               .mask   = PTE_ATTRINDX_MASK,
> +               .val    = PTE_ATTRINDX(MT_NORMAL_NC),
> +               .set    = "MEM/BUFFERABLE",
> +       }, {
> +               .mask   = PTE_ATTRINDX_MASK,
> +               .val    = PTE_ATTRINDX(MT_NORMAL),
> +               .set    = "MEM/NORMAL",
> +       }
> +};
> +
> +struct pg_level {
> +       const struct prot_bits *bits;
> +       size_t num;
> +       u64 mask;
> +};
> +
> +static struct pg_level pg_level[] = {
> +       {
> +       }, { /* pgd */
> +               .bits   = pte_bits,
> +               .num    = ARRAY_SIZE(pte_bits),
> +       }, { /* pud */
> +               .bits   = pte_bits,
> +               .num    = ARRAY_SIZE(pte_bits),
> +       }, { /* pmd */
> +               .bits   = section_bits,
> +               .num    = ARRAY_SIZE(section_bits),
> +       }, { /* pte */
> +               .bits   = pte_bits,
> +               .num    = ARRAY_SIZE(pte_bits),
> +       },
> +};
> +
> +static void dump_prot(struct pg_state *st, const struct prot_bits *bits,
> +                       size_t num)
> +{
> +       unsigned i;
> +
> +       for (i = 0; i < num; i++, bits++) {
> +               const char *s;
> +
> +               if ((st->current_prot & bits->mask) == bits->val)
> +                       s = bits->set;
> +               else
> +                       s = bits->clear;
> +
> +               if (s)
> +                       seq_printf(st->seq, " %s", s);
> +       }
> +}
> +
> +static void note_page(struct pg_state *st, unsigned long addr, unsigned level,
> +                               u64 val)
> +{
> +       static const char units[] = "KMGTPE";
> +       u64 prot = val & pg_level[level].mask;
> +
> +       if (addr < LOWEST_ADDR)
> +               return;
> +
> +       if (!st->level) {
> +               st->level = level;
> +               st->current_prot = prot;
> +               st->start_address = addr;
> +               seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
> +       } else if (prot != st->current_prot || level != st->level ||
> +                  addr >= st->marker[1].start_address) {
> +               const char *unit = units;
> +               unsigned long delta;
> +
> +               if (st->current_prot) {
> +                       seq_printf(st->seq, "0x%16lx-0x%16lx   ",
> +                                  st->start_address, addr);
> +
> +                       delta = (addr - st->start_address) >> 10;
> +                       while (!(delta & 1023) && unit[1]) {
> +                               delta >>= 10;
> +                               unit++;
> +                       }
> +                       seq_printf(st->seq, "%9lu%c", delta, *unit);
> +                       if (pg_level[st->level].bits)
> +                               dump_prot(st, pg_level[st->level].bits,
> +                                         pg_level[st->level].num);
> +                       seq_puts(st->seq, "\n");
> +               }
> +
> +               if (addr >= st->marker[1].start_address) {
> +                       st->marker++;
> +                       seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
> +               }
> +               st->start_address = addr;
> +               st->current_prot = prot;
> +               st->level = level;
> +       }
> +}
> +
> +static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start)
> +{
> +       pte_t *pte = pte_offset_kernel(pmd, 0);
> +       unsigned long addr;
> +       unsigned i;
> +
> +       for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
> +               addr = start + i * PAGE_SIZE;
> +               note_page(st, addr, 4, pte_val(*pte));
> +       }
> +}
> +
> +static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
> +{
> +       pmd_t *pmd = pmd_offset(pud, 0);
> +       unsigned long addr;
> +       unsigned i;
> +
> +       for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
> +               addr = start + i * PMD_SIZE;
> +               if (pmd_none(*pmd) || pmd_sect(*pmd) || pmd_bad(*pmd))
> +                       note_page(st, addr, 3, pmd_val(*pmd));
> +               else
> +                       walk_pte(st, pmd, addr);
> +       }
> +}
> +
> +static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
> +{
> +       pud_t *pud = pud_offset(pgd, 0);
> +       unsigned long addr;
> +       unsigned i;
> +
> +       for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
> +               addr = start + i * PUD_SIZE;
> +               if (pud_none(*pud) || pud_sect(*pud) || pud_bad(*pud))
> +                       note_page(st, addr, 2, pud_val(*pud));
> +               else
> +                       walk_pmd(st, pud, addr);
> +       }
> +}
> +
> +static unsigned long normalize_addr(unsigned long u)
> +{
> +       return u | LOWEST_ADDR;
> +}
> +
> +static void walk_pgd(struct pg_state *st, pgd_t *pgd, unsigned long start)
> +{
> +       unsigned i;
> +       unsigned long addr;
> +
> +       for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
> +               addr = normalize_addr(start + i * PGDIR_SIZE);
> +               if (pgd_none(*pgd) || pgd_bad(*pgd))
> +                       note_page(st, addr, 1, pgd_val(*pgd));
> +               else
> +                       walk_pud(st, pgd, addr);
> +       }
> +}
> +
> +static int ptdump_show(struct seq_file *m, void *v)
> +{
> +       struct pg_state st;
> +
> +       memset(&st, 0, sizeof(st));
> +       st.seq = m;
> +       st.marker = address_markers;
> +
> +       walk_pgd(&st, swapper_pg_dir, 0);
> +
> +       note_page(&st, 0, 0, 0);
> +       return 0;
> +}
> +
> +static int ptdump_open(struct inode *inode, struct file *file)
> +{
> +       return single_open(file, ptdump_show, NULL);
> +}
> +
> +static const struct file_operations ptdump_fops = {
> +       .open           = ptdump_open,
> +       .read           = seq_read,
> +       .llseek         = seq_lseek,
> +       .release        = single_release,
> +};
> +
> +static int ptdump_init(void)
> +{
> +       struct dentry *pe;
> +       unsigned i, j;
> +
> +       for (i = 0; i < ARRAY_SIZE(pg_level); i++)
> +               if (pg_level[i].bits)
> +                       for (j = 0; j < pg_level[i].num; j++)
> +                               pg_level[i].mask |= pg_level[i].bits[j].mask;
> +
> +       address_markers[0].start_address = VMALLOC_START;
> +
> +       pe = debugfs_create_file("kernel_page_tables", 0400, NULL, NULL,
> +                                &ptdump_fops);
> +       return pe ? 0 : -ENOMEM;
> +}
> +device_initcall(ptdump_init);
> --
> Qualcomm Innovation Center, Inc.
> Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project
>
>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel



-- 
Kees Cook
Chrome OS Security

^ permalink raw reply	[flat|nested] 8+ messages in thread

* CONFIG_DEBUG_SET_MODULE_RONX broken (was Re: [PATCHv2] arm64: add support to dump the kernel page tables)
  2014-11-19 23:01 ` Kees Cook
@ 2014-11-20 23:20   ` Laura Abbott
  2014-11-26  6:05     ` Rusty Russell
  0 siblings, 1 reply; 8+ messages in thread
From: Laura Abbott @ 2014-11-20 23:20 UTC (permalink / raw)
  To: linux-arm-kernel

(cc Rusty Russell for reference)

On 11/19/2014 3:01 PM, Kees Cook wrote:
> On Mon, Nov 17, 2014 at 2:18 PM, Laura Abbott <lauraa@codeaurora.org> wrote:
>> In a similar manner to arm, it's useful to be able to dump the page
>> tables to verify permissions and memory types. Add a debugfs file
>> to check the page tables.
>>
>> Signed-off-by: Laura Abbott <lauraa@codeaurora.org>
>
> This seems to behave well for me. Thanks!
>
> Tested-by: Kees Cook <keescook@chromium.org>
>
> In my configuration, though, with CONFIG_DEBUG_SET_MODULE_RONX=y, I'm
> only seeing RO, and no NX changes. I haven't constructed a test for
> the module memory behavior directly (to see if this is a page table
> issue or a PTDUMP reporting issue). This series I'm testing has gone
> through some backporting on my end, so I wanted to just double-check
> and see if you saw this too, or if it's specific to my environment:
>
> ---[ Modules ]---
> 0xffffffbffc000000-0xffffffbffc005000          20K     ro x  SHD AF
> UXN MEM/NORMAL
> 0xffffffbffc005000-0xffffffbffc007000           8K     RW x  SHD AF
> UXN MEM/NORMAL
> 0xffffffbffc00c000-0xffffffbffc00e000           8K     ro x  SHD AF
> UXN MEM/NORMAL
> 0xffffffbffc00e000-0xffffffbffc010000           8K     RW x  SHD AF
> UXN MEM/NORMAL
> ...
>
> Thanks,
>
> -Kees
>

Yep, I'm seeing the same thing. We're failing the bounds check:

if (!is_module_address(start) || !is_module_address(end - 1))
                 return -EINVAL;

There are now two problems with this check

1) 4982223e51e8 module: set nx before marking module MODULE_STATE_COMING
moved around the order of when nx was set. Now we hit the mod->state ==
MODULE_STATE_UNFORMED in __module_address so module_address on anything
always returns false. I think my previous testing must have been done
on a branch without that patch.

2) It's possible for the end of the region we are trying to set as nx
not to be fully page-size aligned. This seems to be caused by things
getting aligned in layout_sections but becoming unaligned in layout_symtab
diff --git a/kernel/module.c b/kernel/module.c
index 972151b..3791330 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2316,10 +2316,14 @@ static void layout_symtab(struct module *mod, struct load_info *info)
         info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
         mod->core_size += strtab_size;
  
+       mod->core_size = debug_align(mod->core_size);
+
         /* Put string table section at end of init part of module. */
         strsect->sh_flags |= SHF_ALLOC;
         strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
                                          info->index.str) | INIT_OFFSET_MASK;
+
+       mod->init_size = debug_align(mod->init_size);
         pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
  }
  
I haven't tried a bisect to see if this is new.

I'm kind of tempted to switch the bounds check back to
(addr >= MODULES_VADDR && addr < MODULES_END) unless there is a clean way to
fix up module.c

Thanks,
Laura

-- 
Qualcomm Innovation Center, Inc.
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCHv2] arm64: add support to dump the kernel page tables
  2014-11-17 22:18 [PATCHv2] arm64: add support to dump the kernel page tables Laura Abbott
  2014-11-19 23:01 ` Kees Cook
@ 2014-11-24 16:05 ` Steve Capper
  2014-11-24 19:28 ` Mark Rutland
  2 siblings, 0 replies; 8+ messages in thread
From: Steve Capper @ 2014-11-24 16:05 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Nov 17, 2014 at 02:18:30PM -0800, Laura Abbott wrote:
> In a similar manner to arm, it's useful to be able to dump the page
> tables to verify permissions and memory types. Add a debugfs file
> to check the page tables.

Hi Laura,
Some comments below:

The output looked sensible otherwise and worked well on a Juno running
4/64KB pages and 2,3,4-levels of page table.

> 
> Signed-off-by: Laura Abbott <lauraa@codeaurora.org>
> ---
> v2: Addressed comments from Mark Rutland. Made the logic a bit more
> consistent between functions. Now tested on 4K and 64K pages.
> ---
>  arch/arm64/Kconfig.debug |  12 ++
>  arch/arm64/mm/Makefile   |   1 +
>  arch/arm64/mm/dump.c     | 358 +++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 371 insertions(+)
>  create mode 100644 arch/arm64/mm/dump.c
> 
> diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug
> index 0a12933..5fdd6dc 100644
> --- a/arch/arm64/Kconfig.debug
> +++ b/arch/arm64/Kconfig.debug
> @@ -6,6 +6,18 @@ config FRAME_POINTER
>  	bool
>  	default y
>  
> +config ARM64_PTDUMP
> +	bool "Export kernel pagetable layout to userspace via debugfs"
> +	depends on DEBUG_KERNEL
> +	select DEBUG_FS
> +        help
> +	  Say Y here if you want to show the kernel pagetable layout in a
> +	  debugfs file. This information is only useful for kernel developers
> +	  who are working in architecture specific areas of the kernel.
> +	  It is probably not a good idea to enable this feature in a production
> +	  kernel.
> +	  If in doubt, say "N"
> +
>  config STRICT_DEVMEM
>  	bool "Filter access to /dev/mem"
>  	depends on MMU
> diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile
> index c56179e..773d37a 100644
> --- a/arch/arm64/mm/Makefile
> +++ b/arch/arm64/mm/Makefile
> @@ -3,3 +3,4 @@ obj-y				:= dma-mapping.o extable.o fault.o init.o \
>  				   ioremap.o mmap.o pgd.o mmu.o \
>  				   context.o proc.o pageattr.o
>  obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
> +obj-$(CONFIG_ARM64_PTDUMP)	+= dump.o
> diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c
> new file mode 100644
> index 0000000..57dcee4
> --- /dev/null
> +++ b/arch/arm64/mm/dump.c
> @@ -0,0 +1,358 @@
> +/*
> + * Copyright (c) 2014, The Linux Foundation. All rights reserved.
> + * Debug helper to dump the current kernel pagetables of the system
> + * so that we can see what the various memory ranges are set to.
> + *
> + * Derived from x86 and arm implementation:
> + * (C) Copyright 2008 Intel Corporation
> + *
> + * Author: Arjan van de Ven <arjan@linux.intel.com>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; version 2
> + * of the License.
> + */
> +#include <linux/debugfs.h>
> +#include <linux/fs.h>
> +#include <linux/mm.h>
> +#include <linux/seq_file.h>
> +
> +#include <asm/pgtable.h>
> +
> +#define LOWEST_ADDR	(UL(0xffffffffffffffff) << VA_BITS)
> +
> +struct addr_marker {
> +	unsigned long start_address;
> +	const char *name;
> +};
> +
> +static struct addr_marker address_markers[] = {
> +	{ 0,			"vmalloc() Area" },

I don't think we need a 0 placeholder as VMALLOC_START is a compile
time constant?

> +	{ VMALLOC_END,		"vmalloc() End" },

The vmemmap region could be represented here, otherwise on my Juno I
get stragglers appearing. (I had SPARSEMEM_VMEMMAP=y).

> +	{ MODULES_VADDR,	"Modules" },

It would be nice to have an "End Modules" here.
(That's a matter of taste though!)

> +	{ PAGE_OFFSET,		"Kernel Mapping" },
> +	{ -1,			NULL },
> +};
> +
> +struct pg_state {
> +	struct seq_file *seq;
> +	const struct addr_marker *marker;
> +	unsigned long start_address;
> +	unsigned level;
> +	u64 current_prot;
> +};
> +
> +struct prot_bits {
> +	u64		mask;
> +	u64		val;
> +	const char	*set;
> +	const char	*clear;
> +};
> +
> +static const struct prot_bits pte_bits[] = {
> +	{
> +		.mask	= PTE_USER,
> +		.val	= PTE_USER,
> +		.set	= "USR",
> +		.clear	= "   ",
> +	}, {
> +		.mask	= PTE_RDONLY,
> +		.val	= PTE_RDONLY,
> +		.set	= "ro",
> +		.clear	= "RW",
> +	}, {
> +		.mask	= PTE_PXN,
> +		.val	= PTE_PXN,
> +		.set	= "NX",
> +		.clear	= "x ",
> +	}, {
> +		.mask	= PTE_SHARED,
> +		.val	= PTE_SHARED,
> +		.set	= "SHD",
> +		.clear	= "   ",
> +	}, {
> +		.mask	= PTE_AF,
> +		.val	= PTE_AF,
> +		.set	= "AF",
> +		.clear	= "  ",
> +	}, {
> +		.mask	= PTE_NG,
> +		.val	= PTE_NG,
> +		.set	= "NG",
> +		.clear	= "  ",
> +	}, {
> +		.mask	= PTE_UXN,
> +		.val	= PTE_UXN,
> +		.set	= "UXN",
> +	}, {
> +		.mask	= PTE_ATTRINDX_MASK,
> +		.val	= PTE_ATTRINDX(MT_DEVICE_nGnRnE),
> +		.set	= "DEVICE/nGnRnE",
> +	}, {
> +		.mask	= PTE_ATTRINDX_MASK,
> +		.val	= PTE_ATTRINDX(MT_DEVICE_nGnRE),
> +		.set	= "DEVICE/nGnRE",
> +	}, {
> +		.mask	= PTE_ATTRINDX_MASK,
> +		.val	= PTE_ATTRINDX(MT_DEVICE_GRE),
> +		.set	= "DEVICE/GRE",
> +	}, {
> +		.mask	= PTE_ATTRINDX_MASK,
> +		.val	= PTE_ATTRINDX(MT_NORMAL_NC),
> +		.set	= "MEM/BUFFERABLE",
> +	}, {
> +		.mask	= PTE_ATTRINDX_MASK,
> +		.val	= PTE_ATTRINDX(MT_NORMAL),
> +		.set	= "MEM/NORMAL",
> +	}
> +};
> +
> +static const struct prot_bits section_bits[] = {
> +	{
> +		.mask	= PMD_SECT_USER,
> +		.val	= PMD_SECT_USER,
> +		.set	= "USR",
> +		.clear	= "   ",
> +	}, {
> +		.mask	= PMD_SECT_RDONLY,
> +		.val	= PMD_SECT_RDONLY,
> +		.set	= "ro",
> +		.clear	= "RW",
> +	}, {
> +		.mask	= PMD_SECT_PXN,
> +		.val	= PMD_SECT_PXN,
> +		.set	= "NX",
> +		.clear	= "x ",
> +	}, {
> +		.mask	= PMD_SECT_S,
> +		.val	= PMD_SECT_S,
> +		.set	= "SHD",
> +		.clear	= "   ",
> +	}, {
> +		.mask	= PMD_SECT_AF,
> +		.val	= PMD_SECT_AF,
> +		.set	= "AF",
> +		.clear	= "  ",
> +	}, {
> +		.mask	= PMD_SECT_NG,
> +		.val	= PMD_SECT_NG,
> +		.set	= "NG",
> +		.clear	= "  ",
> +	}, {
> +		.mask	= PMD_SECT_UXN,
> +		.val	= PMD_SECT_UXN,
> +		.set	= "UXN",
> +	}, {
> +		.mask	= PTE_ATTRINDX_MASK,
> +		.val	= PTE_ATTRINDX(MT_DEVICE_nGnRnE),
> +		.set	= "DEVICE/nGnRnE",
> +	}, {
> +		.mask	= PTE_ATTRINDX_MASK,
> +		.val	= PTE_ATTRINDX(MT_DEVICE_nGnRE),
> +		.set	= "DEVICE/nGnRE",
> +	}, {
> +		.mask	= PTE_ATTRINDX_MASK,
> +		.val	= PTE_ATTRINDX(MT_DEVICE_GRE),
> +		.set	= "DEVICE/GRE",
> +	}, {
> +		.mask	= PTE_ATTRINDX_MASK,
> +		.val	= PTE_ATTRINDX(MT_NORMAL_NC),
> +		.set	= "MEM/BUFFERABLE",
> +	}, {
> +		.mask	= PTE_ATTRINDX_MASK,
> +		.val	= PTE_ATTRINDX(MT_NORMAL),
> +		.set	= "MEM/NORMAL",
> +	}
> +};
> +
> +struct pg_level {
> +	const struct prot_bits *bits;
> +	size_t num;
> +	u64 mask;
> +};
> +
> +static struct pg_level pg_level[] = {
> +	{
> +	}, { /* pgd */
> +		.bits	= pte_bits,
> +		.num	= ARRAY_SIZE(pte_bits),
> +	}, { /* pud */
> +		.bits	= pte_bits,
> +		.num	= ARRAY_SIZE(pte_bits),
> +	}, { /* pmd */
> +		.bits	= section_bits,
> +		.num	= ARRAY_SIZE(section_bits),
> +	}, { /* pte */
> +		.bits	= pte_bits,
> +		.num	= ARRAY_SIZE(pte_bits),
> +	},
> +};

When I had a look, pte_bits matched section_bits perfectly. Could we
drop section_bits entirely and switch over to pte_bits?

> +
> +static void dump_prot(struct pg_state *st, const struct prot_bits *bits,
> +			size_t num)
> +{
> +	unsigned i;
> +
> +	for (i = 0; i < num; i++, bits++) {
> +		const char *s;
> +
> +		if ((st->current_prot & bits->mask) == bits->val)
> +			s = bits->set;
> +		else
> +			s = bits->clear;
> +
> +		if (s)
> +			seq_printf(st->seq, " %s", s);
> +	}
> +}
> +
> +static void note_page(struct pg_state *st, unsigned long addr, unsigned level,
> +				u64 val)
> +{
> +	static const char units[] = "KMGTPE";
> +	u64 prot = val & pg_level[level].mask;
> +
> +	if (addr < LOWEST_ADDR)
> +		return;
> +
> +	if (!st->level) {
> +		st->level = level;
> +		st->current_prot = prot;
> +		st->start_address = addr;
> +		seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
> +	} else if (prot != st->current_prot || level != st->level ||
> +		   addr >= st->marker[1].start_address) {
> +		const char *unit = units;
> +		unsigned long delta;
> +
> +		if (st->current_prot) {
> +			seq_printf(st->seq, "0x%16lx-0x%16lx   ",
> +				   st->start_address, addr);
> +
> +			delta = (addr - st->start_address) >> 10;
> +			while (!(delta & 1023) && unit[1]) {
> +				delta >>= 10;
> +				unit++;
> +			}
> +			seq_printf(st->seq, "%9lu%c", delta, *unit);
> +			if (pg_level[st->level].bits)
> +				dump_prot(st, pg_level[st->level].bits,
> +					  pg_level[st->level].num);
> +			seq_puts(st->seq, "\n");
> +		}
> +
> +		if (addr >= st->marker[1].start_address) {
> +			st->marker++;
> +			seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
> +		}
> +		st->start_address = addr;
> +		st->current_prot = prot;
> +		st->level = level;
> +	}

When I added "End Modules", I needed to add this here:
	if (addr >= st->marker[1].start_address) {
        	st->marker++;
		seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
	}

(because it shared the same VA as "kernel mapping").

> +}
> +
> +static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start)
> +{
> +	pte_t *pte = pte_offset_kernel(pmd, 0);
> +	unsigned long addr;
> +	unsigned i;
> +
> +	for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
> +		addr = start + i * PAGE_SIZE;
> +		note_page(st, addr, 4, pte_val(*pte));
> +	}
> +}
> +
> +static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
> +{
> +	pmd_t *pmd = pmd_offset(pud, 0);
> +	unsigned long addr;
> +	unsigned i;
> +
> +	for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
> +		addr = start + i * PMD_SIZE;
> +		if (pmd_none(*pmd) || pmd_sect(*pmd) || pmd_bad(*pmd))
> +			note_page(st, addr, 3, pmd_val(*pmd));
> +		else
> +			walk_pte(st, pmd, addr);
> +	}
> +}
> +
> +static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
> +{
> +	pud_t *pud = pud_offset(pgd, 0);
> +	unsigned long addr;
> +	unsigned i;
> +
> +	for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
> +		addr = start + i * PUD_SIZE;
> +		if (pud_none(*pud) || pud_sect(*pud) || pud_bad(*pud))
> +			note_page(st, addr, 2, pud_val(*pud));
> +		else
> +			walk_pmd(st, pud, addr);
> +	}
> +}
> +
> +static unsigned long normalize_addr(unsigned long u)
> +{
> +	return u | LOWEST_ADDR;
> +}
> +
> +static void walk_pgd(struct pg_state *st, pgd_t *pgd, unsigned long start)
> +{
> +	unsigned i;
> +	unsigned long addr;
> +
> +	for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
> +		addr = normalize_addr(start + i * PGDIR_SIZE);
> +		if (pgd_none(*pgd) || pgd_bad(*pgd))
> +			note_page(st, addr, 1, pgd_val(*pgd));
> +		else
> +			walk_pud(st, pgd, addr);
> +	}
> +}
> +
> +static int ptdump_show(struct seq_file *m, void *v)
> +{
> +	struct pg_state st;
> +
> +	memset(&st, 0, sizeof(st));
> +	st.seq = m;
> +	st.marker = address_markers;
> +
> +	walk_pgd(&st, swapper_pg_dir, 0);
> +
> +	note_page(&st, 0, 0, 0);
> +	return 0;
> +}
> +
> +static int ptdump_open(struct inode *inode, struct file *file)
> +{
> +	return single_open(file, ptdump_show, NULL);
> +}
> +
> +static const struct file_operations ptdump_fops = {
> +	.open		= ptdump_open,
> +	.read		= seq_read,
> +	.llseek		= seq_lseek,
> +	.release	= single_release,
> +};
> +
> +static int ptdump_init(void)
> +{
> +	struct dentry *pe;
> +	unsigned i, j;
> +
> +	for (i = 0; i < ARRAY_SIZE(pg_level); i++)
> +		if (pg_level[i].bits)
> +			for (j = 0; j < pg_level[i].num; j++)
> +				pg_level[i].mask |= pg_level[i].bits[j].mask;
> +
> +	address_markers[0].start_address = VMALLOC_START;

I don't think we need this.

> +
> +	pe = debugfs_create_file("kernel_page_tables", 0400, NULL, NULL,
> +				 &ptdump_fops);
> +	return pe ? 0 : -ENOMEM;
> +}
> +device_initcall(ptdump_init);
> -- 

Cheers,
-- 
Steve

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCHv2] arm64: add support to dump the kernel page tables
  2014-11-17 22:18 [PATCHv2] arm64: add support to dump the kernel page tables Laura Abbott
  2014-11-19 23:01 ` Kees Cook
  2014-11-24 16:05 ` [PATCHv2] arm64: add support to dump the kernel page tables Steve Capper
@ 2014-11-24 19:28 ` Mark Rutland
  2 siblings, 0 replies; 8+ messages in thread
From: Mark Rutland @ 2014-11-24 19:28 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Laura,

On Mon, Nov 17, 2014 at 10:18:30PM +0000, Laura Abbott wrote:
> In a similar manner to arm, it's useful to be able to dump the page
> tables to verify permissions and memory types. Add a debugfs file
> to check the page tables.
> 
> Signed-off-by: Laura Abbott <lauraa@codeaurora.org>
> ---
> v2: Addressed comments from Mark Rutland. Made the logic a bit more
> consistent between functions. Now tested on 4K and 64K pages.
> ---
>  arch/arm64/Kconfig.debug |  12 ++
>  arch/arm64/mm/Makefile   |   1 +
>  arch/arm64/mm/dump.c     | 358 +++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 371 insertions(+)
>  create mode 100644 arch/arm64/mm/dump.c

[...]

> +static struct pg_level pg_level[] = {
> +	{
> +	}, { /* pgd */
> +		.bits	= pte_bits,
> +		.num	= ARRAY_SIZE(pte_bits),
> +	}, { /* pud */
> +		.bits	= pte_bits,
> +		.num	= ARRAY_SIZE(pte_bits),
> +	}, { /* pmd */
> +		.bits	= section_bits,
> +		.num	= ARRAY_SIZE(section_bits),
> +	}, { /* pte */
> +		.bits	= pte_bits,
> +		.num	= ARRAY_SIZE(pte_bits),
> +	},
> +};

Can't we have section mappings in puds? We have a pud_sect() check in
walk_pud, so I would have expected puds to use section_bits as well
(perhaps pgds too, which I believe the architecture allows for block
mappings in some configurations).

That said, the section and pte bits are the same at the moment, so we
could just use the same bits at all levels for now, and introduce
section_bits if required in future.

[...]

> +static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
> +{
> +	pud_t *pud = pud_offset(pgd, 0);
> +	unsigned long addr;
> +	unsigned i;
> +
> +	for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
> +		addr = start + i * PUD_SIZE;
> +		if (pud_none(*pud) || pud_sect(*pud) || pud_bad(*pud))
> +			note_page(st, addr, 2, pud_val(*pud));
> +		else
> +			walk_pmd(st, pud, addr);
> +	}
> +}
> +
> +static unsigned long normalize_addr(unsigned long u)
> +{
> +	return u | LOWEST_ADDR;
> +}
> +
> +static void walk_pgd(struct pg_state *st, pgd_t *pgd, unsigned long start)
> +{
> +	unsigned i;
> +	unsigned long addr;
> +
> +	for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
> +		addr = normalize_addr(start + i * PGDIR_SIZE);

Could we get rid of normalize_addr, and pass in LOWEST_ADDR from
ptdump_show? Then this could be:

		addr = start + i * PGD_SIZE;

Which would make it look like the other walk_* functions, and it would
mean the that start parameter would be the actual start address.

Similarly we could pass in the init_mm rather than and use pgd_offset on
that, as we do in show_pte (in mm/fault.c).

> +		if (pgd_none(*pgd) || pgd_bad(*pgd))
> +			note_page(st, addr, 1, pgd_val(*pgd));
> +		else
> +			walk_pud(st, pgd, addr);
> +	}
> +}
> +
> +static int ptdump_show(struct seq_file *m, void *v)
> +{
> +	struct pg_state st;
> +
> +	memset(&st, 0, sizeof(st));
> +	st.seq = m;
> +	st.marker = address_markers;

If you change this to:

struct pg_state st = {
	.seq = m,
	.marker = address_markers,
};

The other fields will be initialized to zero without the need for an
explicit memset.

Otherwise this looks good to me. Thanks for slogging through this so
far!

With those changes:

Reviewed-by: Mark Rutland <mark.rutland@arm.com>

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* CONFIG_DEBUG_SET_MODULE_RONX broken (was Re: [PATCHv2] arm64: add support to dump the kernel page tables)
  2014-11-20 23:20   ` CONFIG_DEBUG_SET_MODULE_RONX broken (was Re: [PATCHv2] arm64: add support to dump the kernel page tables) Laura Abbott
@ 2014-11-26  6:05     ` Rusty Russell
  2014-12-01 21:53       ` Laura Abbott
  0 siblings, 1 reply; 8+ messages in thread
From: Rusty Russell @ 2014-11-26  6:05 UTC (permalink / raw)
  To: linux-arm-kernel

Laura Abbott <lauraa@codeaurora.org> writes:
> (cc Rusty Russell for reference)
>
> On 11/19/2014 3:01 PM, Kees Cook wrote:
>> On Mon, Nov 17, 2014 at 2:18 PM, Laura Abbott <lauraa@codeaurora.org> wrote:
>>> In a similar manner to arm, it's useful to be able to dump the page
>>> tables to verify permissions and memory types. Add a debugfs file
>>> to check the page tables.
>>>
>>> Signed-off-by: Laura Abbott <lauraa@codeaurora.org>
>>
>> This seems to behave well for me. Thanks!
>>
>> Tested-by: Kees Cook <keescook@chromium.org>
>>
>> In my configuration, though, with CONFIG_DEBUG_SET_MODULE_RONX=y, I'm
>> only seeing RO, and no NX changes. I haven't constructed a test for
>> the module memory behavior directly (to see if this is a page table
>> issue or a PTDUMP reporting issue). This series I'm testing has gone
>> through some backporting on my end, so I wanted to just double-check
>> and see if you saw this too, or if it's specific to my environment:
>>
>> ---[ Modules ]---
>> 0xffffffbffc000000-0xffffffbffc005000          20K     ro x  SHD AF
>> UXN MEM/NORMAL
>> 0xffffffbffc005000-0xffffffbffc007000           8K     RW x  SHD AF
>> UXN MEM/NORMAL
>> 0xffffffbffc00c000-0xffffffbffc00e000           8K     ro x  SHD AF
>> UXN MEM/NORMAL
>> 0xffffffbffc00e000-0xffffffbffc010000           8K     RW x  SHD AF
>> UXN MEM/NORMAL
>> ...
>>
>> Thanks,
>>
>> -Kees
>>
>
> Yep, I'm seeing the same thing. We're failing the bounds check:
>
> if (!is_module_address(start) || !is_module_address(end - 1))
>                  return -EINVAL;

That's a weird test, but I can agree that it's now broken.  What's it for?

> There are now two problems with this check
>
> 1) 4982223e51e8 module: set nx before marking module MODULE_STATE_COMING
> moved around the order of when nx was set. Now we hit the mod->state ==
> MODULE_STATE_UNFORMED in __module_address so module_address on anything
> always returns false. I think my previous testing must have been done
> on a branch without that patch.
>
> 2) It's possible for the end of the region we are trying to set as nx
> to not be fully page size aligned. This seems to be caused by things
> getting aligned in layout_section but becoming unaligned in layout_symtab

Yes, but you only need the first change in your patch: mod->init_size
should already be aligned (and unlike mod->core_size, we haven't
modified it).

> diff --git a/kernel/module.c b/kernel/module.c
> index 972151b..3791330 100644
> --- a/kernel/module.c
> +++ b/kernel/module.c
> @@ -2316,10 +2316,14 @@ static void layout_symtab(struct module *mod, struct load_info *info)
>          info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
>          mod->core_size += strtab_size;
>   
> +       mod->core_size = debug_align(mod->core_size);
> +
>          /* Put string table section at end of init part of module. */
>          strsect->sh_flags |= SHF_ALLOC;
>          strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
>                                           info->index.str) | INIT_OFFSET_MASK;
> +
> +       mod->init_size = debug_align(mod->init_size);
>          pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
>   }
>   
> I haven't tried a bisect to see if this is new.
>
> I'm kind of tempted to switch the bounds check back to
> (addr >= MODULES_VADDR && addr < MODULES_END) unless there is a clean way to
> fixup module.c

Thanks,
Rusty.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* CONFIG_DEBUG_SET_MODULE_RONX broken (was Re: [PATCHv2] arm64: add support to dump the kernel page tables)
  2014-11-26  6:05     ` Rusty Russell
@ 2014-12-01 21:53       ` Laura Abbott
  2014-12-15  3:46         ` Rusty Russell
  0 siblings, 1 reply; 8+ messages in thread
From: Laura Abbott @ 2014-12-01 21:53 UTC (permalink / raw)
  To: linux-arm-kernel

On 11/25/2014 10:05 PM, Rusty Russell wrote:
>>
>> Yep, I'm seeing the same thing. We're failing the bounds check:
>>
>> if (!is_module_address(start) || !is_module_address(end - 1))
>>                   return -EINVAL;
>
> That's a weird test, but I can agree that it's now broken.  What's it for?
>

Both arm and arm64 map underlying memory with page table mappings that may
be greater than PAGE_SIZE. Rather than deal with the hassle of trying to
tear down the larger mappings and call stop_machine to flush the TLBs, we
just disallow changing attributes of arbitrary memory. Module memory is
always mapped with 4K pages so it's safe to change the attributes, hence
the bounds check above. On arm we just bounds check via
MODULE_START <= addr < MODULE_END so that wasn't affected.

>> There are now two problems with this check
>>
>> 1) 4982223e51e8 module: set nx before marking module MODULE_STATE_COMING
>> moved around the order of when nx was set. Now we hit the mod->state ==
>> MODULE_STATE_UNFORMED in __module_address so module_address on anything
>> always returns false. I think my previous testing must have been done
>> on a branch without that patch.
>>
>> 2) It's possible for the end of the region we are trying to set as nx
>> to not be fully page size aligned. This seems to be caused by things
>> getting aligned in layout_section but becoming unaligned in layout_symtab
>
> Yes, but you only need the first change in your patch: mod->init_size
> should already be aligned (and unlike mod->core_size, we haven't
> modified it).
>

The init size can be modified via get_offset, though. In my testing I needed
to align up both mod->init_size and mod->core_size for is_module_address to
pass on all set_memory_* calls made by the module layer.

>> diff --git a/kernel/module.c b/kernel/module.c
>> index 972151b..3791330 100644
>> --- a/kernel/module.c
>> +++ b/kernel/module.c
>> @@ -2316,10 +2316,14 @@ static void layout_symtab(struct module *mod, struct load_info *info)
>>           info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
>>           mod->core_size += strtab_size;
>>
>> +       mod->core_size = debug_align(mod->core_size);
>> +
>>           /* Put string table section at end of init part of module. */
>>           strsect->sh_flags |= SHF_ALLOC;
>>           strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
>>                                            info->index.str) | INIT_OFFSET_MASK;
>> +
>> +       mod->init_size = debug_align(mod->init_size);
>>           pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
>>    }
>>
>> I haven't tried a bisect to see if this is new.
>>
>> I'm kind of tempted to switch the bounds check back to
>> (addr >= MODULES_VADDR && addr < MODULES_END) unless there is a clean way to
>> fixup module.c
>
> Thanks,
> Rusty.
>

Thanks,
Laura

-- 
Qualcomm Innovation Center, Inc.
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project

^ permalink raw reply	[flat|nested] 8+ messages in thread

* CONFIG_DEBUG_SET_MODULE_RONX broken (was Re: [PATCHv2] arm64: add support to dump the kernel page tables)
  2014-12-01 21:53       ` Laura Abbott
@ 2014-12-15  3:46         ` Rusty Russell
  0 siblings, 0 replies; 8+ messages in thread
From: Rusty Russell @ 2014-12-15  3:46 UTC (permalink / raw)
  To: linux-arm-kernel

Laura Abbott <lauraa@codeaurora.org> writes:
> On 11/25/2014 10:05 PM, Rusty Russell wrote:
>>>
>>> Yep, I'm seeing the same thing. We're failing the bounds check:
>>>
>>> if (!is_module_address(start) || !is_module_address(end - 1))
>>>                   return -EINVAL;
>>
>> That's a weird test, but I can agree that it's now broken.  What's it for?
>>
>
> Both arm and arm64 map underlying memory with page table mappings that may
> be greater than PAGE_SIZE. Rather than deal with the hassle of trying to
> tear down the larger mappings and call stop_machine to flush the TLBs, we
> just disallow changing attributes of arbitrary memory. Module memory is
> always mapped with 4K pages so it's safe to change the attributes, hence
> the bounds check above. On arm we just bounds check via
> MODULE_START <= addr < MODULE_END so that wasn't affected.

OK, perhaps as you say, we should do this.

>> Yes, but you only need the first change in your patch: mod->init_size
>> should already be aligned (and unlike mod->core_size, we haven't
>> modified it).
>>
>
> the init size can be modified via get_offset though. In my testing I needed
> to align up both mod->init_size and mod->core_size for is_module_address to
> pass on all set_memory_* calls made by the module layer.

You're right.  It doesn't seem to hurt any other code path, but if you
want this, please send me a proper explanation w/ signed-off-by.

Thanks!
Rusty.

>>> diff --git a/kernel/module.c b/kernel/module.c
>>> index 972151b..3791330 100644
>>> --- a/kernel/module.c
>>> +++ b/kernel/module.c
>>> @@ -2316,10 +2316,14 @@ static void layout_symtab(struct module *mod, struct load_info *info)
>>>           info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
>>>           mod->core_size += strtab_size;
>>>
>>> +       mod->core_size = debug_align(mod->core_size);
>>> +
>>>           /* Put string table section at end of init part of module. */
>>>           strsect->sh_flags |= SHF_ALLOC;
>>>           strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
>>>                                            info->index.str) | INIT_OFFSET_MASK;
>>> +
>>> +       mod->init_size = debug_align(mod->init_size);
>>>           pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
>>>    }
>>>
>>> I haven't tried a bisect to see if this is new.
>>>
>>> I'm kind of tempted to switch the bounds check back to
>>> (addr >= MODULES_VADDR && addr < MODULES_END) unless there is a clean way to
>>> fixup module.c
>>
>> Thanks,
>> Rusty.
>>
>
> Thanks,
> Laura
>
> -- 
> Qualcomm Innovation Center, Inc.
> Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
> a Linux Foundation Collaborative Project

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2014-12-15  3:46 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-11-17 22:18 [PATCHv2] arm64: add support to dump the kernel page tables Laura Abbott
2014-11-19 23:01 ` Kees Cook
2014-11-20 23:20   ` CONFIG_DEBUG_SET_MODULE_RONX broken (was Re: [PATCHv2] arm64: add support to dump the kernel page tables) Laura Abbott
2014-11-26  6:05     ` Rusty Russell
2014-12-01 21:53       ` Laura Abbott
2014-12-15  3:46         ` Rusty Russell
2014-11-24 16:05 ` [PATCHv2] arm64: add support to dump the kernel page tables Steve Capper
2014-11-24 19:28 ` Mark Rutland

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.