LinuxPPC-Dev Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH v1 0/5] Convert powerpc to GENERIC_PTDUMP
@ 2021-04-15 17:18 Christophe Leroy
  2021-04-15 17:18 ` [PATCH v1 1/5] mm: pagewalk: Fix walk for hugepage tables Christophe Leroy
                   ` (4 more replies)
  0 siblings, 5 replies; 24+ messages in thread
From: Christophe Leroy @ 2021-04-15 17:18 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	Steven Price, akpm
  Cc: linux-arch, linux-s390, x86, linux-kernel, linux-mm, linux-riscv,
	linuxppc-dev, linux-arm-kernel

This series converts powerpc to generic PTDUMP.

For that, we first need to add missing hugepd support
to pagewalk and ptdump.

Christophe Leroy (5):
  mm: pagewalk: Fix walk for hugepage tables
  mm: ptdump: Fix build failure
  mm: ptdump: Provide page size to notepage()
  mm: ptdump: Support hugepd table entries
  powerpc/mm: Convert powerpc to GENERIC_PTDUMP

 arch/arm64/mm/ptdump.c            |   2 +-
 arch/powerpc/Kconfig              |   2 +
 arch/powerpc/Kconfig.debug        |  30 ------
 arch/powerpc/mm/Makefile          |   2 +-
 arch/powerpc/mm/mmu_decl.h        |   2 +-
 arch/powerpc/mm/ptdump/8xx.c      |   6 +-
 arch/powerpc/mm/ptdump/Makefile   |   9 +-
 arch/powerpc/mm/ptdump/book3s64.c |   6 +-
 arch/powerpc/mm/ptdump/ptdump.c   | 161 +++++++++---------------------
 arch/powerpc/mm/ptdump/shared.c   |   6 +-
 arch/riscv/mm/ptdump.c            |   2 +-
 arch/s390/mm/dump_pagetables.c    |   3 +-
 arch/x86/mm/dump_pagetables.c     |   2 +-
 include/linux/ptdump.h            |   2 +-
 mm/pagewalk.c                     |  54 ++++++++--
 mm/ptdump.c                       |  33 ++++--
 16 files changed, 145 insertions(+), 177 deletions(-)

-- 
2.25.0


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH v1 1/5] mm: pagewalk: Fix walk for hugepage tables
  2021-04-15 17:18 [PATCH v1 0/5] Convert powerpc to GENERIC_PTDUMP Christophe Leroy
@ 2021-04-15 17:18 ` Christophe Leroy
  2021-04-15 22:43   ` Daniel Axtens
  2021-04-15 17:18 ` [PATCH v1 2/5] mm: ptdump: Fix build failure Christophe Leroy
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 24+ messages in thread
From: Christophe Leroy @ 2021-04-15 17:18 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	Steven Price, akpm
  Cc: linux-arch, linux-s390, x86, linux-kernel, linux-mm, linux-riscv,
	linuxppc-dev, linux-arm-kernel

Pagewalk ignores hugepd entries and walk down the tables
as if it was traditionnal entries, leading to crazy result.

Add walk_hugepd_range() and use it to walk hugepage tables.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 mm/pagewalk.c | 54 +++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 48 insertions(+), 6 deletions(-)

diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index e81640d9f177..410a9d8f7572 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,6 +58,32 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	return err;
 }
 
+static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
+			     unsigned long end, struct mm_walk *walk, int pdshift)
+{
+	int err = 0;
+#ifdef CONFIG_ARCH_HAS_HUGEPD
+	const struct mm_walk_ops *ops = walk->ops;
+	int shift = hugepd_shift(*phpd);
+	int page_size = 1 << shift;
+
+	if (addr & (page_size - 1))
+		return 0;
+
+	for (;;) {
+		pte_t *pte = hugepte_offset(*phpd, addr, pdshift);
+
+		err = ops->pte_entry(pte, addr, addr + page_size, walk);
+		if (err)
+			break;
+		if (addr >= end - page_size)
+			break;
+		addr += page_size;
+	}
+#endif
+	return err;
+}
+
 static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 			  struct mm_walk *walk)
 {
@@ -108,7 +134,10 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 				goto again;
 		}
 
-		err = walk_pte_range(pmd, addr, next, walk);
+		if (is_hugepd(__hugepd(pmd_val(*pmd))))
+			err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);
+		else
+			err = walk_pte_range(pmd, addr, next, walk);
 		if (err)
 			break;
 	} while (pmd++, addr = next, addr != end);
@@ -157,7 +186,10 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
 		if (pud_none(*pud))
 			goto again;
 
-		err = walk_pmd_range(pud, addr, next, walk);
+		if (is_hugepd(__hugepd(pud_val(*pud))))
+			err = walk_hugepd_range((hugepd_t *)pud, addr, next, walk, PUD_SHIFT);
+		else
+			err = walk_pmd_range(pud, addr, next, walk);
 		if (err)
 			break;
 	} while (pud++, addr = next, addr != end);
@@ -189,8 +221,13 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
 			if (err)
 				break;
 		}
-		if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
-			err = walk_pud_range(p4d, addr, next, walk);
+		if (ops->pud_entry || ops->pmd_entry || ops->pte_entry) {
+			if (is_hugepd(__hugepd(p4d_val(*p4d))))
+				err = walk_hugepd_range((hugepd_t *)p4d, addr, next, walk,
+							P4D_SHIFT);
+			else
+				err = walk_pud_range(p4d, addr, next, walk);
+		}
 		if (err)
 			break;
 	} while (p4d++, addr = next, addr != end);
@@ -225,8 +262,13 @@ static int walk_pgd_range(unsigned long addr, unsigned long end,
 				break;
 		}
 		if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry ||
-		    ops->pte_entry)
-			err = walk_p4d_range(pgd, addr, next, walk);
+		    ops->pte_entry) {
+			if (is_hugepd(__hugepd(pgd_val(*pgd))))
+				err = walk_hugepd_range((hugepd_t *)pgd, addr, next, walk,
+							PGDIR_SHIFT);
+			else
+				err = walk_p4d_range(pgd, addr, next, walk);
+		}
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
-- 
2.25.0


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH v1 2/5] mm: ptdump: Fix build failure
  2021-04-15 17:18 [PATCH v1 0/5] Convert powerpc to GENERIC_PTDUMP Christophe Leroy
  2021-04-15 17:18 ` [PATCH v1 1/5] mm: pagewalk: Fix walk for hugepage tables Christophe Leroy
@ 2021-04-15 17:18 ` Christophe Leroy
  2021-04-15 17:18 ` [PATCH v1 3/5] mm: ptdump: Provide page size to notepage() Christophe Leroy
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 24+ messages in thread
From: Christophe Leroy @ 2021-04-15 17:18 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	Steven Price, akpm
  Cc: linux-arch, linux-s390, x86, linux-kernel, linux-mm, linux-riscv,
	linuxppc-dev, linux-arm-kernel

	  CC      mm/ptdump.o
	In file included from <command-line>:
	mm/ptdump.c: In function 'ptdump_pte_entry':
	././include/linux/compiler_types.h:320:38: error: call to '__compiletime_assert_207' declared with attribute error: Unsupported access size for {READ,WRITE}_ONCE().
	  320 |  _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
	      |                                      ^
	././include/linux/compiler_types.h:301:4: note: in definition of macro '__compiletime_assert'
	  301 |    prefix ## suffix();    \
	      |    ^~~~~~
	././include/linux/compiler_types.h:320:2: note: in expansion of macro '_compiletime_assert'
	  320 |  _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
	      |  ^~~~~~~~~~~~~~~~~~~
	./include/asm-generic/rwonce.h:36:2: note: in expansion of macro 'compiletime_assert'
	   36 |  compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \
	      |  ^~~~~~~~~~~~~~~~~~
	./include/asm-generic/rwonce.h:49:2: note: in expansion of macro 'compiletime_assert_rwonce_type'
	   49 |  compiletime_assert_rwonce_type(x);    \
	      |  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	mm/ptdump.c:114:14: note: in expansion of macro 'READ_ONCE'
	  114 |  pte_t val = READ_ONCE(*pte);
	      |              ^~~~~~~~~
	make[2]: *** [mm/ptdump.o] Error 1

READ_ONCE() cannot be used for reading PTEs. Use ptep_get()
instead. See commit 481e980a7c19 ("mm: Allow arches to provide ptep_get()")
and commit c0e1c8c22beb ("powerpc/8xx: Provide ptep_get() with 16k pages")
for details.

Fixes: 30d621f6723b ("mm: add generic ptdump")
Cc: Steven Price <steven.price@arm.com>
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 mm/ptdump.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/ptdump.c b/mm/ptdump.c
index 4354c1422d57..da751448d0e4 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -111,7 +111,7 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
 			    unsigned long next, struct mm_walk *walk)
 {
 	struct ptdump_state *st = walk->private;
-	pte_t val = READ_ONCE(*pte);
+	pte_t val = ptep_get(pte);
 
 	if (st->effective_prot)
 		st->effective_prot(st, 4, pte_val(val));
-- 
2.25.0


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH v1 3/5] mm: ptdump: Provide page size to notepage()
  2021-04-15 17:18 [PATCH v1 0/5] Convert powerpc to GENERIC_PTDUMP Christophe Leroy
  2021-04-15 17:18 ` [PATCH v1 1/5] mm: pagewalk: Fix walk for hugepage tables Christophe Leroy
  2021-04-15 17:18 ` [PATCH v1 2/5] mm: ptdump: Fix build failure Christophe Leroy
@ 2021-04-15 17:18 ` Christophe Leroy
  2021-04-15 23:12   ` Daniel Axtens
  2021-04-16  9:28   ` Steven Price
  2021-04-15 17:18 ` [PATCH v1 4/5] mm: ptdump: Support hugepd table entries Christophe Leroy
  2021-04-15 17:18 ` [PATCH v1 5/5] powerpc/mm: Convert powerpc to GENERIC_PTDUMP Christophe Leroy
  4 siblings, 2 replies; 24+ messages in thread
From: Christophe Leroy @ 2021-04-15 17:18 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	Steven Price, akpm
  Cc: linux-arch, linux-s390, x86, linux-kernel, linux-mm, linux-riscv,
	linuxppc-dev, linux-arm-kernel

In order to support large pages on powerpc, notepage()
needs to know the page size of the page.

Add a page_size argument to notepage().

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/arm64/mm/ptdump.c         |  2 +-
 arch/riscv/mm/ptdump.c         |  2 +-
 arch/s390/mm/dump_pagetables.c |  3 ++-
 arch/x86/mm/dump_pagetables.c  |  2 +-
 include/linux/ptdump.h         |  2 +-
 mm/ptdump.c                    | 16 ++++++++--------
 6 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c
index 0e050d76b83a..ea1a1c3a3ea0 100644
--- a/arch/arm64/mm/ptdump.c
+++ b/arch/arm64/mm/ptdump.c
@@ -257,7 +257,7 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr)
 }
 
 static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
-		      u64 val)
+		      u64 val, unsigned long page_size)
 {
 	struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
 	static const char units[] = "KMGTPE";
diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c
index ace74dec7492..0a7f276ba799 100644
--- a/arch/riscv/mm/ptdump.c
+++ b/arch/riscv/mm/ptdump.c
@@ -235,7 +235,7 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr)
 }
 
 static void note_page(struct ptdump_state *pt_st, unsigned long addr,
-		      int level, u64 val)
+		      int level, u64 val, unsigned long page_size)
 {
 	struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
 	u64 pa = PFN_PHYS(pte_pfn(__pte(val)));
diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c
index e40a30647d99..29673c38e773 100644
--- a/arch/s390/mm/dump_pagetables.c
+++ b/arch/s390/mm/dump_pagetables.c
@@ -116,7 +116,8 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr)
 #endif /* CONFIG_DEBUG_WX */
 }
 
-static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val)
+static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
+		      u64 val, unsigned long page_size)
 {
 	int width = sizeof(unsigned long) * 2;
 	static const char units[] = "KMGTPE";
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index e1b599ecbbc2..2ec76737c1f1 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -272,7 +272,7 @@ static void effective_prot(struct ptdump_state *pt_st, int level, u64 val)
  * print what we collected so far.
  */
 static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
-		      u64 val)
+		      u64 val, unsigned long page_size)
 {
 	struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
 	pgprotval_t new_prot, new_eff;
diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h
index 2a3a95586425..3a971fadc95e 100644
--- a/include/linux/ptdump.h
+++ b/include/linux/ptdump.h
@@ -13,7 +13,7 @@ struct ptdump_range {
 struct ptdump_state {
 	/* level is 0:PGD to 4:PTE, or -1 if unknown */
 	void (*note_page)(struct ptdump_state *st, unsigned long addr,
-			  int level, u64 val);
+			  int level, u64 val, unsigned long page_size);
 	void (*effective_prot)(struct ptdump_state *st, int level, u64 val);
 	const struct ptdump_range *range;
 };
diff --git a/mm/ptdump.c b/mm/ptdump.c
index da751448d0e4..61cd16afb1c8 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -17,7 +17,7 @@ static inline int note_kasan_page_table(struct mm_walk *walk,
 {
 	struct ptdump_state *st = walk->private;
 
-	st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0]));
+	st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0]), PAGE_SIZE);
 
 	walk->action = ACTION_CONTINUE;
 
@@ -41,7 +41,7 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr,
 		st->effective_prot(st, 0, pgd_val(val));
 
 	if (pgd_leaf(val))
-		st->note_page(st, addr, 0, pgd_val(val));
+		st->note_page(st, addr, 0, pgd_val(val), PGDIR_SIZE);
 
 	return 0;
 }
@@ -62,7 +62,7 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
 		st->effective_prot(st, 1, p4d_val(val));
 
 	if (p4d_leaf(val))
-		st->note_page(st, addr, 1, p4d_val(val));
+		st->note_page(st, addr, 1, p4d_val(val), P4D_SIZE);
 
 	return 0;
 }
@@ -83,7 +83,7 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
 		st->effective_prot(st, 2, pud_val(val));
 
 	if (pud_leaf(val))
-		st->note_page(st, addr, 2, pud_val(val));
+		st->note_page(st, addr, 2, pud_val(val), PUD_SIZE);
 
 	return 0;
 }
@@ -102,7 +102,7 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
 	if (st->effective_prot)
 		st->effective_prot(st, 3, pmd_val(val));
 	if (pmd_leaf(val))
-		st->note_page(st, addr, 3, pmd_val(val));
+		st->note_page(st, addr, 3, pmd_val(val), PMD_SIZE);
 
 	return 0;
 }
@@ -116,7 +116,7 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
 	if (st->effective_prot)
 		st->effective_prot(st, 4, pte_val(val));
 
-	st->note_page(st, addr, 4, pte_val(val));
+	st->note_page(st, addr, 4, pte_val(val), PAGE_SIZE);
 
 	return 0;
 }
@@ -126,7 +126,7 @@ static int ptdump_hole(unsigned long addr, unsigned long next,
 {
 	struct ptdump_state *st = walk->private;
 
-	st->note_page(st, addr, depth, 0);
+	st->note_page(st, addr, depth, 0, 0);
 
 	return 0;
 }
@@ -153,5 +153,5 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
 	mmap_read_unlock(mm);
 
 	/* Flush out the last page */
-	st->note_page(st, 0, -1, 0);
+	st->note_page(st, 0, -1, 0, 0);
 }
-- 
2.25.0


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH v1 4/5] mm: ptdump: Support hugepd table entries
  2021-04-15 17:18 [PATCH v1 0/5] Convert powerpc to GENERIC_PTDUMP Christophe Leroy
                   ` (2 preceding siblings ...)
  2021-04-15 17:18 ` [PATCH v1 3/5] mm: ptdump: Provide page size to notepage() Christophe Leroy
@ 2021-04-15 17:18 ` Christophe Leroy
  2021-04-15 23:29   ` Daniel Axtens
  2021-04-15 17:18 ` [PATCH v1 5/5] powerpc/mm: Convert powerpc to GENERIC_PTDUMP Christophe Leroy
  4 siblings, 1 reply; 24+ messages in thread
From: Christophe Leroy @ 2021-04-15 17:18 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	Steven Price, akpm
  Cc: linux-arch, linux-s390, x86, linux-kernel, linux-mm, linux-riscv,
	linuxppc-dev, linux-arm-kernel

Which hugepd, page table entries can be at any level
and can be of any size.

Add support for them.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 mm/ptdump.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/mm/ptdump.c b/mm/ptdump.c
index 61cd16afb1c8..6efdb8c15a7d 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -112,11 +112,24 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
 {
 	struct ptdump_state *st = walk->private;
 	pte_t val = ptep_get(pte);
+	unsigned long page_size = next - addr;
+	int level;
+
+	if (page_size >= PGDIR_SIZE)
+		level = 0;
+	else if (page_size >= P4D_SIZE)
+		level = 1;
+	else if (page_size >= PUD_SIZE)
+		level = 2;
+	else if (page_size >= PMD_SIZE)
+		level = 3;
+	else
+		level = 4;
 
 	if (st->effective_prot)
-		st->effective_prot(st, 4, pte_val(val));
+		st->effective_prot(st, level, pte_val(val));
 
-	st->note_page(st, addr, 4, pte_val(val), PAGE_SIZE);
+	st->note_page(st, addr, level, pte_val(val), page_size);
 
 	return 0;
 }
-- 
2.25.0


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH v1 5/5] powerpc/mm: Convert powerpc to GENERIC_PTDUMP
  2021-04-15 17:18 [PATCH v1 0/5] Convert powerpc to GENERIC_PTDUMP Christophe Leroy
                   ` (3 preceding siblings ...)
  2021-04-15 17:18 ` [PATCH v1 4/5] mm: ptdump: Support hugepd table entries Christophe Leroy
@ 2021-04-15 17:18 ` Christophe Leroy
  4 siblings, 0 replies; 24+ messages in thread
From: Christophe Leroy @ 2021-04-15 17:18 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	Steven Price, akpm
  Cc: linux-arch, linux-s390, x86, linux-kernel, linux-mm, linux-riscv,
	linuxppc-dev, linux-arm-kernel

This patch converts powerpc to the generic PTDUMP implementation.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/Kconfig              |   2 +
 arch/powerpc/Kconfig.debug        |  30 ------
 arch/powerpc/mm/Makefile          |   2 +-
 arch/powerpc/mm/mmu_decl.h        |   2 +-
 arch/powerpc/mm/ptdump/8xx.c      |   6 +-
 arch/powerpc/mm/ptdump/Makefile   |   9 +-
 arch/powerpc/mm/ptdump/book3s64.c |   6 +-
 arch/powerpc/mm/ptdump/ptdump.c   | 161 +++++++++---------------------
 arch/powerpc/mm/ptdump/shared.c   |   6 +-
 9 files changed, 68 insertions(+), 156 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 475d77a6ebbe..40259437a28f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -120,6 +120,7 @@ config PPC
 	select ARCH_32BIT_OFF_T if PPC32
 	select ARCH_HAS_DEBUG_VIRTUAL
 	select ARCH_HAS_DEBUG_VM_PGTABLE
+	select ARCH_HAS_DEBUG_WX		if STRICT_KERNEL_RWX
 	select ARCH_HAS_DEVMEM_IS_ALLOWED
 	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_FORTIFY_SOURCE
@@ -177,6 +178,7 @@ config PPC
 	select GENERIC_IRQ_SHOW
 	select GENERIC_IRQ_SHOW_LEVEL
 	select GENERIC_PCI_IOMAP		if PCI
+	select GENERIC_PTDUMP
 	select GENERIC_SMP_IDLE_THREAD
 	select GENERIC_STRNCPY_FROM_USER
 	select GENERIC_STRNLEN_USER
diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index 6342f9da4545..05b1180ea502 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -360,36 +360,6 @@ config FAIL_IOMMU
 
 	  If you are unsure, say N.
 
-config PPC_PTDUMP
-	bool "Export kernel pagetable layout to userspace via debugfs"
-	depends on DEBUG_KERNEL && DEBUG_FS
-	help
-	  This option exports the state of the kernel pagetables to a
-	  debugfs file. This is only useful for kernel developers who are
-	  working in architecture specific areas of the kernel - probably
-	  not a good idea to enable this feature in a production kernel.
-
-	  If you are unsure, say N.
-
-config PPC_DEBUG_WX
-	bool "Warn on W+X mappings at boot"
-	depends on PPC_PTDUMP && STRICT_KERNEL_RWX
-	help
-	  Generate a warning if any W+X mappings are found at boot.
-
-	  This is useful for discovering cases where the kernel is leaving
-	  W+X mappings after applying NX, as such mappings are a security risk.
-
-	  Note that even if the check fails, your kernel is possibly
-	  still fine, as W+X mappings are not a security hole in
-	  themselves, what they do is that they make the exploitation
-	  of other unfixed kernel bugs easier.
-
-	  There is no runtime or memory usage effect of this option
-	  once the kernel has booted up - it's a one time check.
-
-	  If in doubt, say "Y".
-
 config PPC_FAST_ENDIAN_SWITCH
 	bool "Deprecated fast endian-switch syscall"
 	depends on DEBUG_KERNEL && PPC_BOOK3S_64
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index c3df3a8501d4..c90d58aaebe2 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -18,5 +18,5 @@ obj-$(CONFIG_PPC_MM_SLICES)	+= slice.o
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
 obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
 obj-$(CONFIG_PPC_COPRO_BASE)	+= copro_fault.o
-obj-$(CONFIG_PPC_PTDUMP)	+= ptdump/
+obj-$(CONFIG_PTDUMP_CORE)	+= ptdump/
 obj-$(CONFIG_KASAN)		+= kasan/
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 7dac910c0b21..dd1cabc2ea0f 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -180,7 +180,7 @@ static inline void mmu_mark_rodata_ro(void) { }
 void __init mmu_mapin_immr(void);
 #endif
 
-#ifdef CONFIG_PPC_DEBUG_WX
+#ifdef CONFIG_DEBUG_WX
 void ptdump_check_wx(void);
 #else
 static inline void ptdump_check_wx(void) { }
diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c
index 86da2a669680..fac932eb8f9a 100644
--- a/arch/powerpc/mm/ptdump/8xx.c
+++ b/arch/powerpc/mm/ptdump/8xx.c
@@ -75,8 +75,10 @@ static const struct flag_info flag_array[] = {
 };
 
 struct pgtable_level pg_level[5] = {
-	{
-	}, { /* pgd */
+	{ /* pgd */
+		.flag	= flag_array,
+		.num	= ARRAY_SIZE(flag_array),
+	}, { /* p4d */
 		.flag	= flag_array,
 		.num	= ARRAY_SIZE(flag_array),
 	}, { /* pud */
diff --git a/arch/powerpc/mm/ptdump/Makefile b/arch/powerpc/mm/ptdump/Makefile
index 712762be3cb1..4050cbb55acf 100644
--- a/arch/powerpc/mm/ptdump/Makefile
+++ b/arch/powerpc/mm/ptdump/Makefile
@@ -5,5 +5,10 @@ obj-y	+= ptdump.o
 obj-$(CONFIG_4xx)		+= shared.o
 obj-$(CONFIG_PPC_8xx)		+= 8xx.o
 obj-$(CONFIG_PPC_BOOK3E_MMU)	+= shared.o
-obj-$(CONFIG_PPC_BOOK3S_32)	+= shared.o bats.o segment_regs.o
-obj-$(CONFIG_PPC_BOOK3S_64)	+= book3s64.o hashpagetable.o
+obj-$(CONFIG_PPC_BOOK3S_32)	+= shared.o
+obj-$(CONFIG_PPC_BOOK3S_64)	+= book3s64.o
+
+ifdef CONFIG_PTDUMP_DEBUGFS
+obj-$(CONFIG_PPC_BOOK3S_32)	+= bats.o segment_regs.o
+obj-$(CONFIG_PPC_BOOK3S_64)	+= hashpagetable.o
+endif
diff --git a/arch/powerpc/mm/ptdump/book3s64.c b/arch/powerpc/mm/ptdump/book3s64.c
index 14f73868db66..5ad92d9dc5d1 100644
--- a/arch/powerpc/mm/ptdump/book3s64.c
+++ b/arch/powerpc/mm/ptdump/book3s64.c
@@ -103,8 +103,10 @@ static const struct flag_info flag_array[] = {
 };
 
 struct pgtable_level pg_level[5] = {
-	{
-	}, { /* pgd */
+	{ /* pgd */
+		.flag	= flag_array,
+		.num	= ARRAY_SIZE(flag_array),
+	}, { /* p4d */
 		.flag	= flag_array,
 		.num	= ARRAY_SIZE(flag_array),
 	}, { /* pud */
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index aca354fb670b..9fb1f4fd8af4 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -16,6 +16,7 @@
 #include <linux/io.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
+#include <linux/ptdump.h>
 #include <linux/sched.h>
 #include <linux/seq_file.h>
 #include <asm/fixmap.h>
@@ -54,13 +55,14 @@
  *
  */
 struct pg_state {
+	struct ptdump_state ptdump;
 	struct seq_file *seq;
 	const struct addr_marker *marker;
 	unsigned long start_address;
 	unsigned long start_pa;
 	unsigned long last_pa;
 	unsigned long page_size;
-	unsigned int level;
+	int level;
 	u64 current_flags;
 	bool check_wx;
 	unsigned long wx_pages;
@@ -216,14 +218,18 @@ static void note_page_update_state(struct pg_state *st, unsigned long addr,
 	}
 }
 
-static void note_page(struct pg_state *st, unsigned long addr,
-	       unsigned int level, u64 val, unsigned long page_size)
+static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
+		      u64 val, unsigned long page_size)
 {
-	u64 flag = val & pg_level[level].mask;
+	struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
+	u64 flag = 0;
 	u64 pa = val & PTE_RPN_MASK;
 
+	if (level >= 0)
+		flag = val & pg_level[level].mask;
+
 	/* At first no level is set */
-	if (!st->level) {
+	if (st->level == -1) {
 		pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
 		note_page_update_state(st, addr, level, val, page_size);
 	/*
@@ -262,94 +268,6 @@ static void note_page(struct pg_state *st, unsigned long addr,
 	st->last_pa = pa;
 }
 
-static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start)
-{
-	pte_t *pte = pte_offset_kernel(pmd, 0);
-	unsigned long addr;
-	unsigned int i;
-
-	for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
-		addr = start + i * PAGE_SIZE;
-		note_page(st, addr, 4, pte_val(*pte), PAGE_SIZE);
-
-	}
-}
-
-static void walk_hugepd(struct pg_state *st, hugepd_t *phpd, unsigned long start,
-			int pdshift, int level)
-{
-#ifdef CONFIG_ARCH_HAS_HUGEPD
-	unsigned int i;
-	int shift = hugepd_shift(*phpd);
-	int ptrs_per_hpd = pdshift - shift > 0 ? 1 << (pdshift - shift) : 1;
-
-	if (start & ((1 << shift) - 1))
-		return;
-
-	for (i = 0; i < ptrs_per_hpd; i++) {
-		unsigned long addr = start + (i << shift);
-		pte_t *pte = hugepte_offset(*phpd, addr, pdshift);
-
-		note_page(st, addr, level + 1, pte_val(*pte), 1 << shift);
-	}
-#endif
-}
-
-static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
-{
-	pmd_t *pmd = pmd_offset(pud, 0);
-	unsigned long addr;
-	unsigned int i;
-
-	for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
-		addr = start + i * PMD_SIZE;
-		if (!pmd_none(*pmd) && !pmd_is_leaf(*pmd))
-			/* pmd exists */
-			walk_pte(st, pmd, addr);
-		else
-			note_page(st, addr, 3, pmd_val(*pmd), PMD_SIZE);
-	}
-}
-
-static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start)
-{
-	pud_t *pud = pud_offset(p4d, 0);
-	unsigned long addr;
-	unsigned int i;
-
-	for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
-		addr = start + i * PUD_SIZE;
-		if (!pud_none(*pud) && !pud_is_leaf(*pud))
-			/* pud exists */
-			walk_pmd(st, pud, addr);
-		else
-			note_page(st, addr, 2, pud_val(*pud), PUD_SIZE);
-	}
-}
-
-static void walk_pagetables(struct pg_state *st)
-{
-	unsigned int i;
-	unsigned long addr = st->start_address & PGDIR_MASK;
-	pgd_t *pgd = pgd_offset_k(addr);
-
-	/*
-	 * Traverse the linux pagetable structure and dump pages that are in
-	 * the hash pagetable.
-	 */
-	for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) {
-		p4d_t *p4d = p4d_offset(pgd, 0);
-
-		if (p4d_none(*p4d) || p4d_is_leaf(*p4d))
-			note_page(st, addr, 1, p4d_val(*p4d), PGDIR_SIZE);
-		else if (is_hugepd(__hugepd(p4d_val(*p4d))))
-			walk_hugepd(st, (hugepd_t *)p4d, addr, PGDIR_SHIFT, 1);
-		else
-			/* p4d exists */
-			walk_pud(st, p4d, addr);
-	}
-}
-
 static void populate_markers(void)
 {
 	int i = 0;
@@ -399,32 +317,29 @@ static int ptdump_show(struct seq_file *m, void *v)
 	struct pg_state st = {
 		.seq = m,
 		.marker = address_markers,
-		.start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : TASK_SIZE,
+		.level = -1,
+		.ptdump = {
+			.note_page = note_page,
+			.range = (struct ptdump_range[]){
+				{TASK_SIZE, ~0UL},
+				{0, 0}
+			}
+		}
 	};
 
 #ifdef CONFIG_PPC64
 	if (!radix_enabled())
-		st.start_address = KERN_VIRT_START;
+		st.ptdump.range.start = KERN_VIRT_START;
+	else
+		st.ptdump.range.start = PAGE_OFFSET;
 #endif
 
 	/* Traverse kernel page tables */
-	walk_pagetables(&st);
-	note_page(&st, 0, 0, 0, 0);
+	ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
 	return 0;
 }
 
-
-static int ptdump_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, ptdump_show, NULL);
-}
-
-static const struct file_operations ptdump_fops = {
-	.open		= ptdump_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(ptdump);
 
 static void build_pgtable_complete_mask(void)
 {
@@ -436,22 +351,34 @@ static void build_pgtable_complete_mask(void)
 				pg_level[i].mask |= pg_level[i].flag[j].mask;
 }
 
-#ifdef CONFIG_PPC_DEBUG_WX
+#ifdef CONFIG_DEBUG_WX
 void ptdump_check_wx(void)
 {
 	struct pg_state st = {
 		.seq = NULL,
-		.marker = address_markers,
+		.marker = (struct addr_marker[]) {
+			{ 0, NULL},
+			{ -1, NULL},
+		},
+		.level = -1,
 		.check_wx = true,
-		.start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : TASK_SIZE,
+		.ptdump = {
+			.note_page = note_page,
+			.range = (struct ptdump_range[]){
+				{TASK_SIZE, ~0UL},
+				{0, 0}
+			}
+		}
 	};
 
 #ifdef CONFIG_PPC64
 	if (!radix_enabled())
-		st.start_address = KERN_VIRT_START;
+		st.ptdump.range.start = KERN_VIRT_START;
+	else
+		st.ptdump.range.start = PAGE_OFFSET;
 #endif
 
-	walk_pagetables(&st);
+	ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
 
 	if (st.wx_pages)
 		pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n",
@@ -465,8 +392,10 @@ static int ptdump_init(void)
 {
 	populate_markers();
 	build_pgtable_complete_mask();
-	debugfs_create_file("kernel_page_tables", 0400, NULL, NULL,
-			    &ptdump_fops);
+
+	if (IS_ENABLED(CONFIG_PTDUMP_DEBUGFS))
+		debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops);
+
 	return 0;
 }
 device_initcall(ptdump_init);
diff --git a/arch/powerpc/mm/ptdump/shared.c b/arch/powerpc/mm/ptdump/shared.c
index c005fe041c18..03607ab90c66 100644
--- a/arch/powerpc/mm/ptdump/shared.c
+++ b/arch/powerpc/mm/ptdump/shared.c
@@ -68,8 +68,10 @@ static const struct flag_info flag_array[] = {
 };
 
 struct pgtable_level pg_level[5] = {
-	{
-	}, { /* pgd */
+	{ /* pgd */
+		.flag	= flag_array,
+		.num	= ARRAY_SIZE(flag_array),
+	}, { /* p4d */
 		.flag	= flag_array,
 		.num	= ARRAY_SIZE(flag_array),
 	}, { /* pud */
-- 
2.25.0


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v1 1/5] mm: pagewalk: Fix walk for hugepage tables
  2021-04-15 17:18 ` [PATCH v1 1/5] mm: pagewalk: Fix walk for hugepage tables Christophe Leroy
@ 2021-04-15 22:43   ` Daniel Axtens
  2021-04-16  5:48     ` Christophe Leroy
  0 siblings, 1 reply; 24+ messages in thread
From: Daniel Axtens @ 2021-04-15 22:43 UTC (permalink / raw)
  To: Christophe Leroy, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, Steven Price, akpm
  Cc: linux-arch, linux-s390, x86, linux-kernel, linux-mm, linux-riscv,
	linuxppc-dev, linux-arm-kernel

Hi Christophe,

> Pagewalk ignores hugepd entries and walk down the tables
> as if it was traditionnal entries, leading to crazy result.
>
> Add walk_hugepd_range() and use it to walk hugepage tables.
>
> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
> ---
>  mm/pagewalk.c | 54 +++++++++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 48 insertions(+), 6 deletions(-)
>
> diff --git a/mm/pagewalk.c b/mm/pagewalk.c
> index e81640d9f177..410a9d8f7572 100644
> --- a/mm/pagewalk.c
> +++ b/mm/pagewalk.c
> @@ -58,6 +58,32 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
>  	return err;
>  }
>  
> +static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
> +			     unsigned long end, struct mm_walk *walk, int pdshift)
> +{
> +	int err = 0;
> +#ifdef CONFIG_ARCH_HAS_HUGEPD
> +	const struct mm_walk_ops *ops = walk->ops;
> +	int shift = hugepd_shift(*phpd);
> +	int page_size = 1 << shift;
> +
> +	if (addr & (page_size - 1))
> +		return 0;
> +
> +	for (;;) {
> +		pte_t *pte = hugepte_offset(*phpd, addr, pdshift);
> +
> +		err = ops->pte_entry(pte, addr, addr + page_size, walk);
> +		if (err)
> +			break;
> +		if (addr >= end - page_size)
> +			break;
> +		addr += page_size;
> +	}

Initially I thought this was a somewhat unintuitive way to structure
this loop, but I see it parallels the structure of walk_pte_range_inner,
so I think the consistency is worth it.

I notice the pte walking code potentially takes some locks: does this
code need to do that?

arch/powerpc/mm/hugetlbpage.c says that hugepds are protected by the
mm->page_table_lock, but I don't think we're taking it in this code.

> +#endif
> +	return err;
> +}
> +
>  static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
>  			  struct mm_walk *walk)
>  {
> @@ -108,7 +134,10 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
>  				goto again;
>  		}
>  
> -		err = walk_pte_range(pmd, addr, next, walk);
> +		if (is_hugepd(__hugepd(pmd_val(*pmd))))
> +			err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);
> +		else
> +			err = walk_pte_range(pmd, addr, next, walk);
>  		if (err)
>  			break;
>  	} while (pmd++, addr = next, addr != end);
> @@ -157,7 +186,10 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
>  		if (pud_none(*pud))
>  			goto again;
>  
> -		err = walk_pmd_range(pud, addr, next, walk);
> +		if (is_hugepd(__hugepd(pud_val(*pud))))
> +			err = walk_hugepd_range((hugepd_t *)pud, addr, next, walk, PUD_SHIFT);
> +		else
> +			err = walk_pmd_range(pud, addr, next, walk);

I'm a bit worried you might end up calling into walk_hugepd_range with
ops->pte_entry == NULL, and then jumping to 0.

static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
...
        pud = pud_offset(p4d, addr);
	do {
                ...
                if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
		    walk->action == ACTION_CONTINUE ||
		    !(ops->pmd_entry || ops->pte_entry)) <<< THIS CHECK
			continue;
                ...
		if (is_hugepd(__hugepd(pud_val(*pud))))
			err = walk_hugepd_range((hugepd_t *)pud, addr, next, walk, PUD_SHIFT);
		else
			err = walk_pmd_range(pud, addr, next, walk);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);

walk_pud_range will proceed if there is _either_ an ops->pmd_entry _or_
an ops->pte_entry, but walk_hugepd_range will call ops->pte_entry
unconditionally.

The same issue applies to walk_{p4d,pgd}_range...

Kind regards,
Daniel

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v1 3/5] mm: ptdump: Provide page size to notepage()
  2021-04-15 17:18 ` [PATCH v1 3/5] mm: ptdump: Provide page size to notepage() Christophe Leroy
@ 2021-04-15 23:12   ` Daniel Axtens
  2021-04-16  5:19     ` Christophe Leroy
  2021-04-16  9:28   ` Steven Price
  1 sibling, 1 reply; 24+ messages in thread
From: Daniel Axtens @ 2021-04-15 23:12 UTC (permalink / raw)
  To: Christophe Leroy, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, Steven Price, akpm
  Cc: linux-arch, linux-s390, x86, linux-kernel, linux-mm, linux-riscv,
	linuxppc-dev, linux-arm-kernel

Hi Christophe,

>  static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
> -		      u64 val)
> +		      u64 val, unsigned long page_size)

Compilers can warn about unused parameters at -Wextra level.  However,
reading scripts/Makefile.extrawarn it looks like the warning is
explicitly _disabled_ in the kernel at W=1 and not reenabled at W=2 or
W=3. So I guess this is fine...

> @@ -126,7 +126,7 @@ static int ptdump_hole(unsigned long addr, unsigned long next,
>  {
>  	struct ptdump_state *st = walk->private;
>  
> -	st->note_page(st, addr, depth, 0);
> +	st->note_page(st, addr, depth, 0, 0);

I know it doesn't matter at this point, but I'm not really thrilled by
the idea of passing 0 as the size here. Doesn't the hole have a known
page size?

>  
>  	return 0;
>  }
> @@ -153,5 +153,5 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
>  	mmap_read_unlock(mm);
>  
>  	/* Flush out the last page */
> -	st->note_page(st, 0, -1, 0);
> +	st->note_page(st, 0, -1, 0, 0);

I'm more OK with the idea of passing 0 as the size when the depth is -1
(don't know): if we don't know the depth we conceptually can't know the
page size.

Regards,
Daniel


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v1 4/5] mm: ptdump: Support hugepd table entries
  2021-04-15 17:18 ` [PATCH v1 4/5] mm: ptdump: Support hugepd table entries Christophe Leroy
@ 2021-04-15 23:29   ` Daniel Axtens
  2021-04-16  5:25     ` Christophe Leroy
  0 siblings, 1 reply; 24+ messages in thread
From: Daniel Axtens @ 2021-04-15 23:29 UTC (permalink / raw)
  To: Christophe Leroy, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, Steven Price, akpm
  Cc: linux-arch, linux-s390, x86, linux-kernel, linux-mm, linux-riscv,
	linuxppc-dev, linux-arm-kernel

Hi Christophe,

> Which hugepd, page table entries can be at any level
> and can be of any size.
>
> Add support for them.
>
> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
> ---
>  mm/ptdump.c | 17 +++++++++++++++--
>  1 file changed, 15 insertions(+), 2 deletions(-)
>
> diff --git a/mm/ptdump.c b/mm/ptdump.c
> index 61cd16afb1c8..6efdb8c15a7d 100644
> --- a/mm/ptdump.c
> +++ b/mm/ptdump.c
> @@ -112,11 +112,24 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
>  {
>  	struct ptdump_state *st = walk->private;
>  	pte_t val = ptep_get(pte);
> +	unsigned long page_size = next - addr;
> +	int level;
> +
> +	if (page_size >= PGDIR_SIZE)
> +		level = 0;
> +	else if (page_size >= P4D_SIZE)
> +		level = 1;
> +	else if (page_size >= PUD_SIZE)
> +		level = 2;
> +	else if (page_size >= PMD_SIZE)
> +		level = 3;
> +	else
> +		level = 4;
>  
>  	if (st->effective_prot)
> -		st->effective_prot(st, 4, pte_val(val));
> +		st->effective_prot(st, level, pte_val(val));
>  
> -	st->note_page(st, addr, 4, pte_val(val), PAGE_SIZE);
> +	st->note_page(st, addr, level, pte_val(val), page_size);

It seems to me that passing both level and page_size is a bit redundant,
but I guess it does reduce the impact on each arch's code?

Kind regards,
Daniel

>  
>  	return 0;
>  }
> -- 
> 2.25.0

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v1 3/5] mm: ptdump: Provide page size to notepage()
  2021-04-15 23:12   ` Daniel Axtens
@ 2021-04-16  5:19     ` Christophe Leroy
  0 siblings, 0 replies; 24+ messages in thread
From: Christophe Leroy @ 2021-04-16  5:19 UTC (permalink / raw)
  To: Daniel Axtens, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, Steven Price, akpm
  Cc: linux-arch, linux-s390, x86, linux-kernel, linux-mm, linux-riscv,
	linuxppc-dev, linux-arm-kernel



Le 16/04/2021 à 01:12, Daniel Axtens a écrit :
> Hi Christophe,
> 
>>   static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
>> -		      u64 val)
>> +		      u64 val, unsigned long page_size)
> 
> Compilers can warn about unused parameters at -Wextra level.  However,
> reading scripts/Makefile.extrawarn it looks like the warning is
> explicitly _disabled_ in the kernel at W=1 and not reenabled at W=2 or
> W=3. So I guess this is fine...

There are a lot lot lot functions having unused parameters in the kernel , especially the ones that 
are re-implemented by each architecture.

> 
>> @@ -126,7 +126,7 @@ static int ptdump_hole(unsigned long addr, unsigned long next,
>>   {
>>   	struct ptdump_state *st = walk->private;
>>   
>> -	st->note_page(st, addr, depth, 0);
>> +	st->note_page(st, addr, depth, 0, 0);
> 
> I know it doesn't matter at this point, but I'm not really thrilled by
> the idea of passing 0 as the size here. Doesn't the hole have a known
> page size?

The hole has a size for sure, I don't think we can call it a page size:

On powerpc 8xx, we have 4 page sizes: 8M, 512k, 16k and 4k.
A page table will cover 4M areas and will contain pages of size 512k, 16k and 4k.
A PGD table contains either entries which points to a page table (covering 4M), or two identical 
consecutive entries pointing to the same hugepd which contains a single PTE for an 8M page.

So, if a PGD entry is empty, the hole is 4M, it corresponds to none of the page sizes the 
architecture supports.


But looking at what is done with that size, it can make sense to pass it to notepage() anyway. Let's 
do that.

> 
>>   
>>   	return 0;
>>   }
>> @@ -153,5 +153,5 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
>>   	mmap_read_unlock(mm);
>>   
>>   	/* Flush out the last page */
>> -	st->note_page(st, 0, -1, 0);
>> +	st->note_page(st, 0, -1, 0, 0);
> 
> I'm more OK with the idea of passing 0 as the size when the depth is -1
> (don't know): if we don't know the depth we conceptually can't know the
> page size.
> 
> Regards,
> Daniel
> 

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v1 4/5] mm: ptdump: Support hugepd table entries
  2021-04-15 23:29   ` Daniel Axtens
@ 2021-04-16  5:25     ` Christophe Leroy
  0 siblings, 0 replies; 24+ messages in thread
From: Christophe Leroy @ 2021-04-16  5:25 UTC (permalink / raw)
  To: Daniel Axtens, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, Steven Price, akpm
  Cc: linux-arch, linux-s390, x86, linux-kernel, linux-mm, linux-riscv,
	linuxppc-dev, linux-arm-kernel

Hi Daniel,

Le 16/04/2021 à 01:29, Daniel Axtens a écrit :
> Hi Christophe,
> 
>> Which hugepd, page table entries can be at any level
>> and can be of any size.
>>
>> Add support for them.
>>
>> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
>> ---
>>   mm/ptdump.c | 17 +++++++++++++++--
>>   1 file changed, 15 insertions(+), 2 deletions(-)
>>
>> diff --git a/mm/ptdump.c b/mm/ptdump.c
>> index 61cd16afb1c8..6efdb8c15a7d 100644
>> --- a/mm/ptdump.c
>> +++ b/mm/ptdump.c
>> @@ -112,11 +112,24 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
>>   {
>>   	struct ptdump_state *st = walk->private;
>>   	pte_t val = ptep_get(pte);
>> +	unsigned long page_size = next - addr;
>> +	int level;
>> +
>> +	if (page_size >= PGDIR_SIZE)
>> +		level = 0;
>> +	else if (page_size >= P4D_SIZE)
>> +		level = 1;
>> +	else if (page_size >= PUD_SIZE)
>> +		level = 2;
>> +	else if (page_size >= PMD_SIZE)
>> +		level = 3;
>> +	else
>> +		level = 4;
>>   
>>   	if (st->effective_prot)
>> -		st->effective_prot(st, 4, pte_val(val));
>> +		st->effective_prot(st, level, pte_val(val));
>>   
>> -	st->note_page(st, addr, 4, pte_val(val), PAGE_SIZE);
>> +	st->note_page(st, addr, level, pte_val(val), page_size);
> 
> It seems to me that passing both level and page_size is a bit redundant,
> but I guess it does reduce the impact on each arch's code?

Exactly, as shown above, the level can be re-calculated based on the page size, but it would be a 
unnecessary impact on all architectures and would duplicate the re-calculation of the level whereas 
in most cases we get it for free from the caller.

> 
> Kind regards,
> Daniel
> 
>>   
>>   	return 0;
>>   }
>> -- 
>> 2.25.0

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v1 1/5] mm: pagewalk: Fix walk for hugepage tables
  2021-04-15 22:43   ` Daniel Axtens
@ 2021-04-16  5:48     ` Christophe Leroy
  0 siblings, 0 replies; 24+ messages in thread
From: Christophe Leroy @ 2021-04-16  5:48 UTC (permalink / raw)
  To: Daniel Axtens, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, Steven Price, akpm
  Cc: linux-arch, linux-s390, x86, linux-kernel, linux-mm, linux-riscv,
	linuxppc-dev, linux-arm-kernel



Le 16/04/2021 à 00:43, Daniel Axtens a écrit :
> Hi Christophe,
> 
>> Pagewalk ignores hugepd entries and walk down the tables
>> as if it was traditionnal entries, leading to crazy result.
>>
>> Add walk_hugepd_range() and use it to walk hugepage tables.
>>
>> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
>> ---
>>   mm/pagewalk.c | 54 +++++++++++++++++++++++++++++++++++++++++++++------
>>   1 file changed, 48 insertions(+), 6 deletions(-)
>>
>> diff --git a/mm/pagewalk.c b/mm/pagewalk.c
>> index e81640d9f177..410a9d8f7572 100644
>> --- a/mm/pagewalk.c
>> +++ b/mm/pagewalk.c
>> @@ -58,6 +58,32 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
>>   	return err;
>>   }
>>   
>> +static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
>> +			     unsigned long end, struct mm_walk *walk, int pdshift)
>> +{
>> +	int err = 0;
>> +#ifdef CONFIG_ARCH_HAS_HUGEPD
>> +	const struct mm_walk_ops *ops = walk->ops;
>> +	int shift = hugepd_shift(*phpd);
>> +	int page_size = 1 << shift;
>> +
>> +	if (addr & (page_size - 1))
>> +		return 0;
>> +
>> +	for (;;) {
>> +		pte_t *pte = hugepte_offset(*phpd, addr, pdshift);
>> +
>> +		err = ops->pte_entry(pte, addr, addr + page_size, walk);
>> +		if (err)
>> +			break;
>> +		if (addr >= end - page_size)
>> +			break;
>> +		addr += page_size;
>> +	}
> 
> Initially I thought this was a somewhat unintuitive way to structure
> this loop, but I see it parallels the structure of walk_pte_range_inner,
> so I think the consistency is worth it.
> 
> I notice the pte walking code potentially takes some locks: does this
> code need to do that?
> 
> arch/powerpc/mm/hugetlbpage.c says that hugepds are protected by the
> mm->page_table_lock, but I don't think we're taking it in this code.

I'll add it, thanks.

> 
>> +#endif
>> +	return err;
>> +}
>> +
>>   static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
>>   			  struct mm_walk *walk)
>>   {
>> @@ -108,7 +134,10 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
>>   				goto again;
>>   		}
>>   
>> -		err = walk_pte_range(pmd, addr, next, walk);
>> +		if (is_hugepd(__hugepd(pmd_val(*pmd))))
>> +			err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);
>> +		else
>> +			err = walk_pte_range(pmd, addr, next, walk);
>>   		if (err)
>>   			break;
>>   	} while (pmd++, addr = next, addr != end);
>> @@ -157,7 +186,10 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
>>   		if (pud_none(*pud))
>>   			goto again;
>>   
>> -		err = walk_pmd_range(pud, addr, next, walk);
>> +		if (is_hugepd(__hugepd(pud_val(*pud))))
>> +			err = walk_hugepd_range((hugepd_t *)pud, addr, next, walk, PUD_SHIFT);
>> +		else
>> +			err = walk_pmd_range(pud, addr, next, walk);
> 
> I'm a bit worried you might end up calling into walk_hugepd_range with
> ops->pte_entry == NULL, and then jumping to 0.

You are right, I missed it.
I'll bail out of walk_hugepd_range() when ops->pte_entry is NULL.


> 
> static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
> 			  struct mm_walk *walk)
> {
> ...
>          pud = pud_offset(p4d, addr);
> 	do {
>                  ...
>                  if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
> 		    walk->action == ACTION_CONTINUE ||
> 		    !(ops->pmd_entry || ops->pte_entry)) <<< THIS CHECK
> 			continue;
>                  ...
> 		if (is_hugepd(__hugepd(pud_val(*pud))))
> 			err = walk_hugepd_range((hugepd_t *)pud, addr, next, walk, PUD_SHIFT);
> 		else
> 			err = walk_pmd_range(pud, addr, next, walk);
> 		if (err)
> 			break;
> 	} while (pud++, addr = next, addr != end);
> 
> walk_pud_range will proceed if there is _either_ an ops->pmd_entry _or_
> an ops->pte_entry, but walk_hugepd_range will call ops->pte_entry
> unconditionally.
> 
> The same issue applies to walk_{p4d,pgd}_range...
> 
> Kind regards,
> Daniel
> 

Thanks
Christophe

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v1 3/5] mm: ptdump: Provide page size to notepage()
  2021-04-15 17:18 ` [PATCH v1 3/5] mm: ptdump: Provide page size to notepage() Christophe Leroy
  2021-04-15 23:12   ` Daniel Axtens
@ 2021-04-16  9:28   ` Steven Price
  2021-04-16 10:38     ` Christophe Leroy
  1 sibling, 1 reply; 24+ messages in thread
From: Steven Price @ 2021-04-16  9:28 UTC (permalink / raw)
  To: Christophe Leroy, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, akpm
  Cc: linux-arch, linux-s390, x86, linux-kernel, linux-mm, linux-riscv,
	linuxppc-dev, linux-arm-kernel

On 15/04/2021 18:18, Christophe Leroy wrote:
> In order to support large pages on powerpc, notepage()
> needs to know the page size of the page.
> 
> Add a page_size argument to notepage().
> 
> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
> ---
>   arch/arm64/mm/ptdump.c         |  2 +-
>   arch/riscv/mm/ptdump.c         |  2 +-
>   arch/s390/mm/dump_pagetables.c |  3 ++-
>   arch/x86/mm/dump_pagetables.c  |  2 +-
>   include/linux/ptdump.h         |  2 +-
>   mm/ptdump.c                    | 16 ++++++++--------
>   6 files changed, 14 insertions(+), 13 deletions(-)
>
[...]
> diff --git a/mm/ptdump.c b/mm/ptdump.c
> index da751448d0e4..61cd16afb1c8 100644
> --- a/mm/ptdump.c
> +++ b/mm/ptdump.c
> @@ -17,7 +17,7 @@ static inline int note_kasan_page_table(struct mm_walk *walk,
>   {
>   	struct ptdump_state *st = walk->private;
>   
> -	st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0]));
> +	st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0]), PAGE_SIZE);

I'm not completely sure what the page_size is going to be used for, but 
note that KASAN presents an interesting case here. We short-cut by 
detecting it's a KASAN region at a high level (PGD/P4D/PUD/PMD) and 
instead of walking the tree down just call note_page() *once* but with 
level==4 because we know KASAN sets up the page table like that.

However the one call actually covers a much larger region - so while 
PAGE_SIZE matches the level it doesn't match the region covered. AFAICT 
this will lead to odd results if you enable KASAN on powerpc.

To be honest I don't fully understand why powerpc requires the page_size 
- it appears to be using it purely to find "holes" in the calls to 
note_page(), but I haven't worked out why such holes would occur.

Steve

>   
>   	walk->action = ACTION_CONTINUE;
>   
> @@ -41,7 +41,7 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr,
>   		st->effective_prot(st, 0, pgd_val(val));
>   
>   	if (pgd_leaf(val))
> -		st->note_page(st, addr, 0, pgd_val(val));
> +		st->note_page(st, addr, 0, pgd_val(val), PGDIR_SIZE);
>   
>   	return 0;
>   }
> @@ -62,7 +62,7 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
>   		st->effective_prot(st, 1, p4d_val(val));
>   
>   	if (p4d_leaf(val))
> -		st->note_page(st, addr, 1, p4d_val(val));
> +		st->note_page(st, addr, 1, p4d_val(val), P4D_SIZE);
>   
>   	return 0;
>   }
> @@ -83,7 +83,7 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
>   		st->effective_prot(st, 2, pud_val(val));
>   
>   	if (pud_leaf(val))
> -		st->note_page(st, addr, 2, pud_val(val));
> +		st->note_page(st, addr, 2, pud_val(val), PUD_SIZE);
>   
>   	return 0;
>   }
> @@ -102,7 +102,7 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
>   	if (st->effective_prot)
>   		st->effective_prot(st, 3, pmd_val(val));
>   	if (pmd_leaf(val))
> -		st->note_page(st, addr, 3, pmd_val(val));
> +		st->note_page(st, addr, 3, pmd_val(val), PMD_SIZE);
>   
>   	return 0;
>   }
> @@ -116,7 +116,7 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
>   	if (st->effective_prot)
>   		st->effective_prot(st, 4, pte_val(val));
>   
> -	st->note_page(st, addr, 4, pte_val(val));
> +	st->note_page(st, addr, 4, pte_val(val), PAGE_SIZE);
>   
>   	return 0;
>   }
> @@ -126,7 +126,7 @@ static int ptdump_hole(unsigned long addr, unsigned long next,
>   {
>   	struct ptdump_state *st = walk->private;
>   
> -	st->note_page(st, addr, depth, 0);
> +	st->note_page(st, addr, depth, 0, 0);
>   
>   	return 0;
>   }
> @@ -153,5 +153,5 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
>   	mmap_read_unlock(mm);
>   
>   	/* Flush out the last page */
> -	st->note_page(st, 0, -1, 0);
> +	st->note_page(st, 0, -1, 0, 0);
>   }
> 


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v1 3/5] mm: ptdump: Provide page size to notepage()
  2021-04-16  9:28   ` Steven Price
@ 2021-04-16 10:38     ` Christophe Leroy
  2021-04-16 10:51       ` Steven Price
  0 siblings, 1 reply; 24+ messages in thread
From: Christophe Leroy @ 2021-04-16 10:38 UTC (permalink / raw)
  To: Steven Price, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, akpm
  Cc: linux-arch, linux-s390, x86, linux-kernel, linux-mm, linux-riscv,
	linuxppc-dev, linux-arm-kernel



Le 16/04/2021 à 11:28, Steven Price a écrit :
> On 15/04/2021 18:18, Christophe Leroy wrote:
>> In order to support large pages on powerpc, notepage()
>> needs to know the page size of the page.
>>
>> Add a page_size argument to notepage().
>>
>> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
>> ---
>>   arch/arm64/mm/ptdump.c         |  2 +-
>>   arch/riscv/mm/ptdump.c         |  2 +-
>>   arch/s390/mm/dump_pagetables.c |  3 ++-
>>   arch/x86/mm/dump_pagetables.c  |  2 +-
>>   include/linux/ptdump.h         |  2 +-
>>   mm/ptdump.c                    | 16 ++++++++--------
>>   6 files changed, 14 insertions(+), 13 deletions(-)
>>
> [...]
>> diff --git a/mm/ptdump.c b/mm/ptdump.c
>> index da751448d0e4..61cd16afb1c8 100644
>> --- a/mm/ptdump.c
>> +++ b/mm/ptdump.c
>> @@ -17,7 +17,7 @@ static inline int note_kasan_page_table(struct mm_walk *walk,
>>   {
>>       struct ptdump_state *st = walk->private;
>> -    st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0]));
>> +    st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0]), PAGE_SIZE);
> 
> I'm not completely sure what the page_size is going to be used for, but note that KASAN presents an 
> interesting case here. We short-cut by detecting it's a KASAN region at a high level 
> (PGD/P4D/PUD/PMD) and instead of walking the tree down just call note_page() *once* but with 
> level==4 because we know KASAN sets up the page table like that.
> 
> However the one call actually covers a much larger region - so while PAGE_SIZE matches the level it 
> doesn't match the region covered. AFAICT this will lead to odd results if you enable KASAN on powerpc.

Hum .... I successfully tested it with KASAN, I now realise that I tested it with 
CONFIG_KASAN_VMALLOC selected. In this situation, since 
https://github.com/torvalds/linux/commit/af3d0a686 we don't have any common shadow page table anymore.

I'll test again without CONFIG_KASAN_VMALLOC.

> 
> To be honest I don't fully understand why powerpc requires the page_size - it appears to be using it 
> purely to find "holes" in the calls to note_page(), but I haven't worked out why such holes would 
> occur.

I was indeed introduced for KASAN. We have a first commit 
https://github.com/torvalds/linux/commit/cabe8138 which uses page size to detect whether it is a 
KASAN like stuff.

Then came https://github.com/torvalds/linux/commit/b00ff6d8c as a fix. I can't remember what the 
problem was exactly, something around the use of hugepages for kernel memory, came as part of the 
series 
https://patchwork.ozlabs.org/project/linuxppc-dev/cover/cover.1589866984.git.christophe.leroy@csgroup.eu/


Christophe

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v1 3/5] mm: ptdump: Provide page size to notepage()
  2021-04-16 10:38     ` Christophe Leroy
@ 2021-04-16 10:51       ` Steven Price
  2021-04-16 11:08         ` Christophe Leroy
  2021-04-19 13:14         ` Christophe Leroy
  0 siblings, 2 replies; 24+ messages in thread
From: Steven Price @ 2021-04-16 10:51 UTC (permalink / raw)
  To: Christophe Leroy, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, akpm
  Cc: linux-arch, linux-s390, x86, linux-kernel, linux-mm, linux-riscv,
	linuxppc-dev, linux-arm-kernel

On 16/04/2021 11:38, Christophe Leroy wrote:
> 
> 
> Le 16/04/2021 à 11:28, Steven Price a écrit :
>> On 15/04/2021 18:18, Christophe Leroy wrote:
>>> In order to support large pages on powerpc, notepage()
>>> needs to know the page size of the page.
>>>
>>> Add a page_size argument to notepage().
>>>
>>> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
>>> ---
>>>   arch/arm64/mm/ptdump.c         |  2 +-
>>>   arch/riscv/mm/ptdump.c         |  2 +-
>>>   arch/s390/mm/dump_pagetables.c |  3 ++-
>>>   arch/x86/mm/dump_pagetables.c  |  2 +-
>>>   include/linux/ptdump.h         |  2 +-
>>>   mm/ptdump.c                    | 16 ++++++++--------
>>>   6 files changed, 14 insertions(+), 13 deletions(-)
>>>
>> [...]
>>> diff --git a/mm/ptdump.c b/mm/ptdump.c
>>> index da751448d0e4..61cd16afb1c8 100644
>>> --- a/mm/ptdump.c
>>> +++ b/mm/ptdump.c
>>> @@ -17,7 +17,7 @@ static inline int note_kasan_page_table(struct 
>>> mm_walk *walk,
>>>   {
>>>       struct ptdump_state *st = walk->private;
>>> -    st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0]));
>>> +    st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0]), 
>>> PAGE_SIZE);
>>
>> I'm not completely sure what the page_size is going to be used for, 
>> but note that KASAN presents an interesting case here. We short-cut by 
>> detecting it's a KASAN region at a high level (PGD/P4D/PUD/PMD) and 
>> instead of walking the tree down just call note_page() *once* but with 
>> level==4 because we know KASAN sets up the page table like that.
>>
>> However the one call actually covers a much larger region - so while 
>> PAGE_SIZE matches the level it doesn't match the region covered. 
>> AFAICT this will lead to odd results if you enable KASAN on powerpc.
> 
> Hum .... I successfully tested it with KASAN, I now realise that I 
> tested it with CONFIG_KASAN_VMALLOC selected. In this situation, since 
> https://github.com/torvalds/linux/commit/af3d0a686 we don't have any 
> common shadow page table anymore.
> 
> I'll test again without CONFIG_KASAN_VMALLOC.
> 
>>
>> To be honest I don't fully understand why powerpc requires the 
>> page_size - it appears to be using it purely to find "holes" in the 
>> calls to note_page(), but I haven't worked out why such holes would 
>> occur.
> 
> I was indeed introduced for KASAN. We have a first commit 
> https://github.com/torvalds/linux/commit/cabe8138 which uses page size 
> to detect whether it is a KASAN like stuff.
> 
> Then came https://github.com/torvalds/linux/commit/b00ff6d8c as a fix. I 
> can't remember what the problem was exactly, something around the use of 
> hugepages for kernel memory, came as part of the series 
> https://patchwork.ozlabs.org/project/linuxppc-dev/cover/cover.1589866984.git.christophe.leroy@csgroup.eu/ 

Ah, that's useful context. So it looks like powerpc took a different 
route to reducing the KASAN output to x86.

Given the generic ptdump code has handling for KASAN already it should 
be possible to drop that from the powerpc arch code, which I think means 
we don't actually need to provide page size to notepage(). Hopefully 
that means more code to delete ;)

Steve

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v1 3/5] mm: ptdump: Provide page size to notepage()
  2021-04-16 10:51       ` Steven Price
@ 2021-04-16 11:08         ` Christophe Leroy
  2021-04-16 13:00           ` Steven Price
  2021-04-19 13:14         ` Christophe Leroy
  1 sibling, 1 reply; 24+ messages in thread
From: Christophe Leroy @ 2021-04-16 11:08 UTC (permalink / raw)
  To: Steven Price, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, akpm
  Cc: linux-arch, linux-s390, x86, linux-kernel, linux-mm, linux-riscv,
	linuxppc-dev, linux-arm-kernel



Le 16/04/2021 à 12:51, Steven Price a écrit :
> On 16/04/2021 11:38, Christophe Leroy wrote:
>>
>>
>> Le 16/04/2021 à 11:28, Steven Price a écrit :
>>> On 15/04/2021 18:18, Christophe Leroy wrote:
>>>> In order to support large pages on powerpc, notepage()
>>>> needs to know the page size of the page.
>>>>
>>>> Add a page_size argument to notepage().
>>>>
>>>> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
>>>> ---
>>>>   arch/arm64/mm/ptdump.c         |  2 +-
>>>>   arch/riscv/mm/ptdump.c         |  2 +-
>>>>   arch/s390/mm/dump_pagetables.c |  3 ++-
>>>>   arch/x86/mm/dump_pagetables.c  |  2 +-
>>>>   include/linux/ptdump.h         |  2 +-
>>>>   mm/ptdump.c                    | 16 ++++++++--------
>>>>   6 files changed, 14 insertions(+), 13 deletions(-)
>>>>
>>> [...]
>>>> diff --git a/mm/ptdump.c b/mm/ptdump.c
>>>> index da751448d0e4..61cd16afb1c8 100644
>>>> --- a/mm/ptdump.c
>>>> +++ b/mm/ptdump.c
>>>> @@ -17,7 +17,7 @@ static inline int note_kasan_page_table(struct mm_walk *walk,
>>>>   {
>>>>       struct ptdump_state *st = walk->private;
>>>> -    st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0]));
>>>> +    st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0]), PAGE_SIZE);
>>>
>>> I'm not completely sure what the page_size is going to be used for, but note that KASAN presents 
>>> an interesting case here. We short-cut by detecting it's a KASAN region at a high level 
>>> (PGD/P4D/PUD/PMD) and instead of walking the tree down just call note_page() *once* but with 
>>> level==4 because we know KASAN sets up the page table like that.
>>>
>>> However the one call actually covers a much larger region - so while PAGE_SIZE matches the level 
>>> it doesn't match the region covered. AFAICT this will lead to odd results if you enable KASAN on 
>>> powerpc.
>>
>> Hum .... I successfully tested it with KASAN, I now realise that I tested it with 
>> CONFIG_KASAN_VMALLOC selected. In this situation, since 
>> https://github.com/torvalds/linux/commit/af3d0a686 we don't have any common shadow page table 
>> anymore.
>>
>> I'll test again without CONFIG_KASAN_VMALLOC.
>>
>>>
>>> To be honest I don't fully understand why powerpc requires the page_size - it appears to be using 
>>> it purely to find "holes" in the calls to note_page(), but I haven't worked out why such holes 
>>> would occur.
>>
>> I was indeed introduced for KASAN. We have a first commit 
>> https://github.com/torvalds/linux/commit/cabe8138 which uses page size to detect whether it is a 
>> KASAN like stuff.
>>
>> Then came https://github.com/torvalds/linux/commit/b00ff6d8c as a fix. I can't remember what the 
>> problem was exactly, something around the use of hugepages for kernel memory, came as part of the 
>> series 
>> https://patchwork.ozlabs.org/project/linuxppc-dev/cover/cover.1589866984.git.christophe.leroy@csgroup.eu/ 
> 
> 
> Ah, that's useful context. So it looks like powerpc took a different route to reducing the KASAN 
> output to x86.
> 
> Given the generic ptdump code has handling for KASAN already it should be possible to drop that from 
> the powerpc arch code, which I think means we don't actually need to provide page size to 
> notepage(). Hopefully that means more code to delete ;)
> 

Yes ... and no.

It looks like the generic ptdump handles the case when several pgdir entries points to the same 
kasan_early_shadow_pte. But it doesn't take into account the powerpc case where we have regular page 
tables where several (if not all) PTEs are pointing to the kasan_early_shadow_page .

Christophe

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v1 3/5] mm: ptdump: Provide page size to notepage()
  2021-04-16 11:08         ` Christophe Leroy
@ 2021-04-16 13:00           ` Steven Price
  2021-04-16 14:40             ` Christophe Leroy
  0 siblings, 1 reply; 24+ messages in thread
From: Steven Price @ 2021-04-16 13:00 UTC (permalink / raw)
  To: Christophe Leroy, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, akpm
  Cc: linux-arch, linux-s390, x86, linux-kernel, linux-mm, linux-riscv,
	linuxppc-dev, linux-arm-kernel

On 16/04/2021 12:08, Christophe Leroy wrote:
> 
> 
> Le 16/04/2021 à 12:51, Steven Price a écrit :
>> On 16/04/2021 11:38, Christophe Leroy wrote:
>>>
>>>
>>> Le 16/04/2021 à 11:28, Steven Price a écrit :
>>>> On 15/04/2021 18:18, Christophe Leroy wrote:
>>>>> In order to support large pages on powerpc, notepage()
>>>>> needs to know the page size of the page.
>>>>>
>>>>> Add a page_size argument to notepage().
>>>>>
>>>>> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
>>>>> ---
>>>>>   arch/arm64/mm/ptdump.c         |  2 +-
>>>>>   arch/riscv/mm/ptdump.c         |  2 +-
>>>>>   arch/s390/mm/dump_pagetables.c |  3 ++-
>>>>>   arch/x86/mm/dump_pagetables.c  |  2 +-
>>>>>   include/linux/ptdump.h         |  2 +-
>>>>>   mm/ptdump.c                    | 16 ++++++++--------
>>>>>   6 files changed, 14 insertions(+), 13 deletions(-)
>>>>>
>>>> [...]
>>>>> diff --git a/mm/ptdump.c b/mm/ptdump.c
>>>>> index da751448d0e4..61cd16afb1c8 100644
>>>>> --- a/mm/ptdump.c
>>>>> +++ b/mm/ptdump.c
>>>>> @@ -17,7 +17,7 @@ static inline int note_kasan_page_table(struct 
>>>>> mm_walk *walk,
>>>>>   {
>>>>>       struct ptdump_state *st = walk->private;
>>>>> -    st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0]));
>>>>> +    st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0]), 
>>>>> PAGE_SIZE);
>>>>
>>>> I'm not completely sure what the page_size is going to be used for, 
>>>> but note that KASAN presents an interesting case here. We short-cut 
>>>> by detecting it's a KASAN region at a high level (PGD/P4D/PUD/PMD) 
>>>> and instead of walking the tree down just call note_page() *once* 
>>>> but with level==4 because we know KASAN sets up the page table like 
>>>> that.
>>>>
>>>> However the one call actually covers a much larger region - so while 
>>>> PAGE_SIZE matches the level it doesn't match the region covered. 
>>>> AFAICT this will lead to odd results if you enable KASAN on powerpc.
>>>
>>> Hum .... I successfully tested it with KASAN, I now realise that I 
>>> tested it with CONFIG_KASAN_VMALLOC selected. In this situation, 
>>> since https://github.com/torvalds/linux/commit/af3d0a686 we don't 
>>> have any common shadow page table anymore.
>>>
>>> I'll test again without CONFIG_KASAN_VMALLOC.
>>>
>>>>
>>>> To be honest I don't fully understand why powerpc requires the 
>>>> page_size - it appears to be using it purely to find "holes" in the 
>>>> calls to note_page(), but I haven't worked out why such holes would 
>>>> occur.
>>>
>>> I was indeed introduced for KASAN. We have a first commit 
>>> https://github.com/torvalds/linux/commit/cabe8138 which uses page 
>>> size to detect whether it is a KASAN like stuff.
>>>
>>> Then came https://github.com/torvalds/linux/commit/b00ff6d8c as a 
>>> fix. I can't remember what the problem was exactly, something around 
>>> the use of hugepages for kernel memory, came as part of the series 
>>> https://patchwork.ozlabs.org/project/linuxppc-dev/cover/cover.1589866984.git.christophe.leroy@csgroup.eu/ 
>>
>>
>>
>> Ah, that's useful context. So it looks like powerpc took a different 
>> route to reducing the KASAN output to x86.
>>
>> Given the generic ptdump code has handling for KASAN already it should 
>> be possible to drop that from the powerpc arch code, which I think 
>> means we don't actually need to provide page size to notepage(). 
>> Hopefully that means more code to delete ;)
>>
> 
> Yes ... and no.
> 
> It looks like the generic ptdump handles the case when several pgdir 
> entries points to the same kasan_early_shadow_pte. But it doesn't take 
> into account the powerpc case where we have regular page tables where 
> several (if not all) PTEs are pointing to the kasan_early_shadow_page .

I'm not sure I follow quite how powerpc is different here. But could you 
have a similar check for PTEs against kasan_early_shadow_pte as the 
other levels already have?

I'm just worried that page_size isn't well defined in this interface and 
it's going to cause problems in the future.

Steve

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v1 3/5] mm: ptdump: Provide page size to notepage()
  2021-04-16 13:00           ` Steven Price
@ 2021-04-16 14:40             ` Christophe Leroy
  2021-04-16 15:04               ` Christophe Leroy
  0 siblings, 1 reply; 24+ messages in thread
From: Christophe Leroy @ 2021-04-16 14:40 UTC (permalink / raw)
  To: Steven Price, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, akpm
  Cc: linux-arch, linux-s390, x86, linux-kernel, linux-mm, linux-riscv,
	linuxppc-dev, linux-arm-kernel



Le 16/04/2021 à 15:00, Steven Price a écrit :
> On 16/04/2021 12:08, Christophe Leroy wrote:
>>
>>
>> Le 16/04/2021 à 12:51, Steven Price a écrit :
>>> On 16/04/2021 11:38, Christophe Leroy wrote:
>>>>
>>>>
>>>> Le 16/04/2021 à 11:28, Steven Price a écrit :
>>>>> On 15/04/2021 18:18, Christophe Leroy wrote:
>>>>>> In order to support large pages on powerpc, notepage()
>>>>>> needs to know the page size of the page.
>>>>>>
>>>>>> Add a page_size argument to notepage().
>>>>>>
>>>>>> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
>>>>>> ---
>>>>>>   arch/arm64/mm/ptdump.c         |  2 +-
>>>>>>   arch/riscv/mm/ptdump.c         |  2 +-
>>>>>>   arch/s390/mm/dump_pagetables.c |  3 ++-
>>>>>>   arch/x86/mm/dump_pagetables.c  |  2 +-
>>>>>>   include/linux/ptdump.h         |  2 +-
>>>>>>   mm/ptdump.c                    | 16 ++++++++--------
>>>>>>   6 files changed, 14 insertions(+), 13 deletions(-)
>>>>>>
>>>>> [...]
>>>>>> diff --git a/mm/ptdump.c b/mm/ptdump.c
>>>>>> index da751448d0e4..61cd16afb1c8 100644
>>>>>> --- a/mm/ptdump.c
>>>>>> +++ b/mm/ptdump.c
>>>>>> @@ -17,7 +17,7 @@ static inline int note_kasan_page_table(struct mm_walk *walk,
>>>>>>   {
>>>>>>       struct ptdump_state *st = walk->private;
>>>>>> -    st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0]));
>>>>>> +    st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0]), PAGE_SIZE);
>>>>>
>>>>> I'm not completely sure what the page_size is going to be used for, but note that KASAN 
>>>>> presents an interesting case here. We short-cut by detecting it's a KASAN region at a high 
>>>>> level (PGD/P4D/PUD/PMD) and instead of walking the tree down just call note_page() *once* but 
>>>>> with level==4 because we know KASAN sets up the page table like that.
>>>>>
>>>>> However the one call actually covers a much larger region - so while PAGE_SIZE matches the 
>>>>> level it doesn't match the region covered. AFAICT this will lead to odd results if you enable 
>>>>> KASAN on powerpc.
>>>>
>>>> Hum .... I successfully tested it with KASAN, I now realise that I tested it with 
>>>> CONFIG_KASAN_VMALLOC selected. In this situation, since 
>>>> https://github.com/torvalds/linux/commit/af3d0a686 we don't have any common shadow page table 
>>>> anymore.
>>>>
>>>> I'll test again without CONFIG_KASAN_VMALLOC.
>>>>
>>>>>
>>>>> To be honest I don't fully understand why powerpc requires the page_size - it appears to be 
>>>>> using it purely to find "holes" in the calls to note_page(), but I haven't worked out why such 
>>>>> holes would occur.
>>>>
>>>> I was indeed introduced for KASAN. We have a first commit 
>>>> https://github.com/torvalds/linux/commit/cabe8138 which uses page size to detect whether it is a 
>>>> KASAN like stuff.
>>>>
>>>> Then came https://github.com/torvalds/linux/commit/b00ff6d8c as a fix. I can't remember what the 
>>>> problem was exactly, something around the use of hugepages for kernel memory, came as part of 
>>>> the series 
>>>> https://patchwork.ozlabs.org/project/linuxppc-dev/cover/cover.1589866984.git.christophe.leroy@csgroup.eu/ 
>>>
>>>
>>>
>>>
>>> Ah, that's useful context. So it looks like powerpc took a different route to reducing the KASAN 
>>> output to x86.
>>>
>>> Given the generic ptdump code has handling for KASAN already it should be possible to drop that 
>>> from the powerpc arch code, which I think means we don't actually need to provide page size to 
>>> notepage(). Hopefully that means more code to delete ;)
>>>
>>
>> Yes ... and no.
>>
>> It looks like the generic ptdump handles the case when several pgdir entries points to the same 
>> kasan_early_shadow_pte. But it doesn't take into account the powerpc case where we have regular 
>> page tables where several (if not all) PTEs are pointing to the kasan_early_shadow_page .
> 
> I'm not sure I follow quite how powerpc is different here. But could you have a similar check for 
> PTEs against kasan_early_shadow_pte as the other levels already have?
> 
> I'm just worried that page_size isn't well defined in this interface and it's going to cause 
> problems in the future.
> 

I'm trying. I reverted the two commits b00ff6d8c and cabe8138.

At the moment, I don't get exactly what I expect: For linear memory I get one line for each 8M page 
whereas before reverting the patches I got one 16M line and one 112M line.

And for KASAN shadow area I get two lines for the 2x 8M pages shadowing linear mem then I get one 4M 
line for each PGDIR entry pointing to kasan_early_shadow_pte.

0xf8000000-0xf87fffff 0x07000000         8M   huge        rw       present
0xf8800000-0xf8ffffff 0x07800000         8M   huge        rw       present
0xf9000000-0xf93fffff 0x01430000         4M               r        present
0xf9400000-0xf97fffff 0x01430000         4M               r        present
0xf9800000-0xf9bfffff 0x01430000         4M               r        present
0xf9c00000-0xf9ffffff 0x01430000         4M               r        present
0xfa000000-0xfa3fffff 0x01430000         4M               r        present
0xfa400000-0xfa7fffff 0x01430000         4M               r        present
0xfa800000-0xfabfffff 0x01430000         4M               r        present
0xfac00000-0xfaffffff 0x01430000         4M               r        present
0xfb000000-0xfb3fffff 0x01430000         4M               r        present
0xfb400000-0xfb7fffff 0x01430000         4M               r        present
0xfb800000-0xfbbfffff 0x01430000         4M               r        present
0xfbc00000-0xfbffffff 0x01430000         4M               r        present
0xfc000000-0xfc3fffff 0x01430000         4M               r        present
0xfc400000-0xfc7fffff 0x01430000         4M               r        present
0xfc800000-0xfcbfffff 0x01430000         4M               r        present
0xfcc00000-0xfcffffff 0x01430000         4M               r        present
0xfd000000-0xfd3fffff 0x01430000         4M               r        present
0xfd400000-0xfd7fffff 0x01430000         4M               r        present
0xfd800000-0xfdbfffff 0x01430000         4M               r        present
0xfdc00000-0xfdffffff 0x01430000         4M               r        present
0xfe000000-0xfe3fffff 0x01430000         4M               r        present
0xfe400000-0xfe7fffff 0x01430000         4M               r        present
0xfe800000-0xfebfffff 0x01430000         4M               r        present
0xfec00000-0xfeffffff 0x01430000         4M               r        present

Any idea ?

Christophe

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v1 3/5] mm: ptdump: Provide page size to notepage()
  2021-04-16 14:40             ` Christophe Leroy
@ 2021-04-16 15:04               ` Christophe Leroy
  2021-04-16 15:15                 ` Christophe Leroy
  0 siblings, 1 reply; 24+ messages in thread
From: Christophe Leroy @ 2021-04-16 15:04 UTC (permalink / raw)
  To: Steven Price, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, akpm
  Cc: linux-arch, linux-s390, x86, linux-kernel, linux-mm, linux-riscv,
	linuxppc-dev, linux-arm-kernel



Le 16/04/2021 à 16:40, Christophe Leroy a écrit :
> 
> 
> Le 16/04/2021 à 15:00, Steven Price a écrit :
>> On 16/04/2021 12:08, Christophe Leroy wrote:
>>>
>>>
>>> Le 16/04/2021 à 12:51, Steven Price a écrit :
>>>> On 16/04/2021 11:38, Christophe Leroy wrote:
>>>>>
>>>>>
>>>>> Le 16/04/2021 à 11:28, Steven Price a écrit :
>>>>>> To be honest I don't fully understand why powerpc requires the page_size - it appears to be 
>>>>>> using it purely to find "holes" in the calls to note_page(), but I haven't worked out why such 
>>>>>> holes would occur.
>>>>>
>>>>> I was indeed introduced for KASAN. We have a first commit 
>>>>> https://github.com/torvalds/linux/commit/cabe8138 which uses page size to detect whether it is 
>>>>> a KASAN like stuff.
>>>>>
>>>>> Then came https://github.com/torvalds/linux/commit/b00ff6d8c as a fix. I can't remember what 
>>>>> the problem was exactly, something around the use of hugepages for kernel memory, came as part 
>>>>> of the series 
>>>>> https://patchwork.ozlabs.org/project/linuxppc-dev/cover/cover.1589866984.git.christophe.leroy@csgroup.eu/ 
>>>>
>>>>
>>>>
>>>>
>>>>
>>>> Ah, that's useful context. So it looks like powerpc took a different route to reducing the KASAN 
>>>> output to x86.
>>>>
>>>> Given the generic ptdump code has handling for KASAN already it should be possible to drop that 
>>>> from the powerpc arch code, which I think means we don't actually need to provide page size to 
>>>> notepage(). Hopefully that means more code to delete ;)
>>>>
>>>
>>> Yes ... and no.
>>>
>>> It looks like the generic ptdump handles the case when several pgdir entries points to the same 
>>> kasan_early_shadow_pte. But it doesn't take into account the powerpc case where we have regular 
>>> page tables where several (if not all) PTEs are pointing to the kasan_early_shadow_page .
>>
>> I'm not sure I follow quite how powerpc is different here. But could you have a similar check for 
>> PTEs against kasan_early_shadow_pte as the other levels already have?
>>
>> I'm just worried that page_size isn't well defined in this interface and it's going to cause 
>> problems in the future.
>>
> 
> I'm trying. I reverted the two commits b00ff6d8c and cabe8138.
> 
> At the moment, I don't get exactly what I expect: For linear memory I get one line for each 8M page 
> whereas before reverting the patches I got one 16M line and one 112M line.
> 
> And for KASAN shadow area I get two lines for the 2x 8M pages shadowing linear mem then I get one 4M 
> line for each PGDIR entry pointing to kasan_early_shadow_pte.
> 
> 0xf8000000-0xf87fffff 0x07000000         8M   huge        rw       present
> 0xf8800000-0xf8ffffff 0x07800000         8M   huge        rw       present
> 0xf9000000-0xf93fffff 0x01430000         4M               r        present
...
> 0xfec00000-0xfeffffff 0x01430000         4M               r        present
> 
> Any idea ?
> 


I think the different with other architectures is here:

	} else if (flag != st->current_flags || level != st->level ||
		   addr >= st->marker[1].start_address ||
		   pa != st->last_pa + PAGE_SIZE) {


In addition to the checks everyone do, powerpc also checks "pa != st->last_pa + PAGE_SIZE".
And it is definitely for that test that page_size argument add been added.

I see that other architectures except RISCV don't dump the physical address. But even RISCV doesn't 
include that check.

That physical address dump was added by commit aaa229529244 ("powerpc/mm: Add physical address to 
Linux page table dump") [https://github.com/torvalds/linux/commit/aaa2295]

How do other architectures deal with the problem described by the commit log of that patch ?

Christophe

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v1 3/5] mm: ptdump: Provide page size to notepage()
  2021-04-16 15:04               ` Christophe Leroy
@ 2021-04-16 15:15                 ` Christophe Leroy
  2021-04-16 16:00                   ` Steven Price
  0 siblings, 1 reply; 24+ messages in thread
From: Christophe Leroy @ 2021-04-16 15:15 UTC (permalink / raw)
  To: Steven Price, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, akpm
  Cc: linux-arch, linux-s390, x86, linux-kernel, linux-mm, linux-riscv,
	linuxppc-dev, linux-arm-kernel



Le 16/04/2021 à 17:04, Christophe Leroy a écrit :
> 
> 
> Le 16/04/2021 à 16:40, Christophe Leroy a écrit :
>>
>>
>> Le 16/04/2021 à 15:00, Steven Price a écrit :
>>> On 16/04/2021 12:08, Christophe Leroy wrote:
>>>>
>>>>
>>>> Le 16/04/2021 à 12:51, Steven Price a écrit :
>>>>> On 16/04/2021 11:38, Christophe Leroy wrote:
>>>>>>
>>>>>>
>>>>>> Le 16/04/2021 à 11:28, Steven Price a écrit :
>>>>>>> To be honest I don't fully understand why powerpc requires the page_size - it appears to be 
>>>>>>> using it purely to find "holes" in the calls to note_page(), but I haven't worked out why 
>>>>>>> such holes would occur.
>>>>>>
>>>>>> I was indeed introduced for KASAN. We have a first commit 
>>>>>> https://github.com/torvalds/linux/commit/cabe8138 which uses page size to detect whether it is 
>>>>>> a KASAN like stuff.
>>>>>>
>>>>>> Then came https://github.com/torvalds/linux/commit/b00ff6d8c as a fix. I can't remember what 
>>>>>> the problem was exactly, something around the use of hugepages for kernel memory, came as part 
>>>>>> of the series 
>>>>>> https://patchwork.ozlabs.org/project/linuxppc-dev/cover/cover.1589866984.git.christophe.leroy@csgroup.eu/ 
>>>>>
>>>>>
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> Ah, that's useful context. So it looks like powerpc took a different route to reducing the 
>>>>> KASAN output to x86.
>>>>>
>>>>> Given the generic ptdump code has handling for KASAN already it should be possible to drop that 
>>>>> from the powerpc arch code, which I think means we don't actually need to provide page size to 
>>>>> notepage(). Hopefully that means more code to delete ;)
>>>>>
>>>>
>>>> Yes ... and no.
>>>>
>>>> It looks like the generic ptdump handles the case when several pgdir entries points to the same 
>>>> kasan_early_shadow_pte. But it doesn't take into account the powerpc case where we have regular 
>>>> page tables where several (if not all) PTEs are pointing to the kasan_early_shadow_page .
>>>
>>> I'm not sure I follow quite how powerpc is different here. But could you have a similar check for 
>>> PTEs against kasan_early_shadow_pte as the other levels already have?
>>>
>>> I'm just worried that page_size isn't well defined in this interface and it's going to cause 
>>> problems in the future.
>>>
>>
>> I'm trying. I reverted the two commits b00ff6d8c and cabe8138.
>>
>> At the moment, I don't get exactly what I expect: For linear memory I get one line for each 8M 
>> page whereas before reverting the patches I got one 16M line and one 112M line.
>>
>> And for KASAN shadow area I get two lines for the 2x 8M pages shadowing linear mem then I get one 
>> 4M line for each PGDIR entry pointing to kasan_early_shadow_pte.
>>
>> 0xf8000000-0xf87fffff 0x07000000         8M   huge        rw       present
>> 0xf8800000-0xf8ffffff 0x07800000         8M   huge        rw       present
>> 0xf9000000-0xf93fffff 0x01430000         4M               r        present
> ...
>> 0xfec00000-0xfeffffff 0x01430000         4M               r        present
>>
>> Any idea ?
>>
> 
> 
> I think the different with other architectures is here:
> 
>      } else if (flag != st->current_flags || level != st->level ||
>             addr >= st->marker[1].start_address ||
>             pa != st->last_pa + PAGE_SIZE) {
> 
> 
> In addition to the checks everyone do, powerpc also checks "pa != st->last_pa + PAGE_SIZE".
> And it is definitely for that test that page_size argument add been added.

By replacing that test by (pa - st->start_pa != addr - st->start_address) it works again. So we 
definitely don't need the real page size.


> 
> I see that other architectures except RISCV don't dump the physical address. But even RISCV doesn't 
> include that check.
> 
> That physical address dump was added by commit aaa229529244 ("powerpc/mm: Add physical address to 
> Linux page table dump") [https://github.com/torvalds/linux/commit/aaa2295]
> 
> How do other architectures deal with the problem described by the commit log of that patch ?
> 
> Christophe

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v1 3/5] mm: ptdump: Provide page size to notepage()
  2021-04-16 15:15                 ` Christophe Leroy
@ 2021-04-16 16:00                   ` Steven Price
  0 siblings, 0 replies; 24+ messages in thread
From: Steven Price @ 2021-04-16 16:00 UTC (permalink / raw)
  To: Christophe Leroy, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, akpm
  Cc: linux-arch, linux-s390, x86, linux-kernel, linux-mm, linux-riscv,
	linuxppc-dev, linux-arm-kernel

On 16/04/2021 16:15, Christophe Leroy wrote:
> 
> 
> Le 16/04/2021 à 17:04, Christophe Leroy a écrit :
>>
>>
>> Le 16/04/2021 à 16:40, Christophe Leroy a écrit :
>>>
>>>
>>> Le 16/04/2021 à 15:00, Steven Price a écrit :
>>>> On 16/04/2021 12:08, Christophe Leroy wrote:
>>>>>
>>>>>
>>>>> Le 16/04/2021 à 12:51, Steven Price a écrit :
>>>>>> On 16/04/2021 11:38, Christophe Leroy wrote:
>>>>>>>
>>>>>>>
>>>>>>> Le 16/04/2021 à 11:28, Steven Price a écrit :
>>>>>>>> To be honest I don't fully understand why powerpc requires the 
>>>>>>>> page_size - it appears to be using it purely to find "holes" in 
>>>>>>>> the calls to note_page(), but I haven't worked out why such 
>>>>>>>> holes would occur.
>>>>>>>
>>>>>>> I was indeed introduced for KASAN. We have a first commit 
>>>>>>> https://github.com/torvalds/linux/commit/cabe8138 which uses page 
>>>>>>> size to detect whether it is a KASAN like stuff.
>>>>>>>
>>>>>>> Then came https://github.com/torvalds/linux/commit/b00ff6d8c as a 
>>>>>>> fix. I can't remember what the problem was exactly, something 
>>>>>>> around the use of hugepages for kernel memory, came as part of 
>>>>>>> the series 
>>>>>>> https://patchwork.ozlabs.org/project/linuxppc-dev/cover/cover.1589866984.git.christophe.leroy@csgroup.eu/ 
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> Ah, that's useful context. So it looks like powerpc took a 
>>>>>> different route to reducing the KASAN output to x86.
>>>>>>
>>>>>> Given the generic ptdump code has handling for KASAN already it 
>>>>>> should be possible to drop that from the powerpc arch code, which 
>>>>>> I think means we don't actually need to provide page size to 
>>>>>> notepage(). Hopefully that means more code to delete ;)
>>>>>>
>>>>>
>>>>> Yes ... and no.
>>>>>
>>>>> It looks like the generic ptdump handles the case when several 
>>>>> pgdir entries points to the same kasan_early_shadow_pte. But it 
>>>>> doesn't take into account the powerpc case where we have regular 
>>>>> page tables where several (if not all) PTEs are pointing to the 
>>>>> kasan_early_shadow_page .
>>>>
>>>> I'm not sure I follow quite how powerpc is different here. But could 
>>>> you have a similar check for PTEs against kasan_early_shadow_pte as 
>>>> the other levels already have?
>>>>
>>>> I'm just worried that page_size isn't well defined in this interface 
>>>> and it's going to cause problems in the future.
>>>>
>>>
>>> I'm trying. I reverted the two commits b00ff6d8c and cabe8138.
>>>
>>> At the moment, I don't get exactly what I expect: For linear memory I 
>>> get one line for each 8M page whereas before reverting the patches I 
>>> got one 16M line and one 112M line.
>>>
>>> And for KASAN shadow area I get two lines for the 2x 8M pages 
>>> shadowing linear mem then I get one 4M line for each PGDIR entry 
>>> pointing to kasan_early_shadow_pte.
>>>
>>> 0xf8000000-0xf87fffff 0x07000000         8M   huge        rw       
>>> present
>>> 0xf8800000-0xf8ffffff 0x07800000         8M   huge        rw       
>>> present
>>> 0xf9000000-0xf93fffff 0x01430000         4M               r        
>>> present
>> ...
>>> 0xfec00000-0xfeffffff 0x01430000         4M               r        
>>> present
>>>
>>> Any idea ?
>>>
>>
>>
>> I think the different with other architectures is here:
>>
>>      } else if (flag != st->current_flags || level != st->level ||
>>             addr >= st->marker[1].start_address ||
>>             pa != st->last_pa + PAGE_SIZE) {
>>
>>
>> In addition to the checks everyone do, powerpc also checks "pa != 
>> st->last_pa + PAGE_SIZE".
>> And it is definitely for that test that page_size argument add been 
>> added.
> 
> By replacing that test by (pa - st->start_pa != addr - 
> st->start_address) it works again. So we definitely don't need the real 
> page size.

Yes that should work. Thanks for figuring it out!

> 
>>
>> I see that other architectures except RISCV don't dump the physical 
>> address. But even RISCV doesn't include that check.

Yes not having the physical address certainly simplifies things - 
although I can see why that can be handy to see. The disadvantage is 
that user space or vmalloc()'d memory will produce a lot of output 
because the physical addresses are unlikely to be contiguous. And for 
most uses you don't need the information.

>> That physical address dump was added by commit aaa229529244 
>> ("powerpc/mm: Add physical address to Linux page table dump") 
>> [https://github.com/torvalds/linux/commit/aaa2295]
>>
>> How do other architectures deal with the problem described by the 
>> commit log of that patch ?

AFAIK other architectures are "broken" in this regard. In practice I 
don't think it often causes an issue though.

Steve

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v1 3/5] mm: ptdump: Provide page size to notepage()
  2021-04-16 10:51       ` Steven Price
  2021-04-16 11:08         ` Christophe Leroy
@ 2021-04-19 13:14         ` Christophe Leroy
  2021-04-19 14:00           ` Steven Price
  1 sibling, 1 reply; 24+ messages in thread
From: Christophe Leroy @ 2021-04-19 13:14 UTC (permalink / raw)
  To: Steven Price, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, akpm
  Cc: linux-arch, linux-s390, x86, linux-kernel, linux-mm, linux-riscv,
	linuxppc-dev, linux-arm-kernel



Le 16/04/2021 à 12:51, Steven Price a écrit :
> On 16/04/2021 11:38, Christophe Leroy wrote:
>>
>>
>> Le 16/04/2021 à 11:28, Steven Price a écrit :
>>> On 15/04/2021 18:18, Christophe Leroy wrote:
>>>
>>> To be honest I don't fully understand why powerpc requires the page_size - it appears to be using 
>>> it purely to find "holes" in the calls to note_page(), but I haven't worked out why such holes 
>>> would occur.
>>
>> I was indeed introduced for KASAN. We have a first commit 
>> https://github.com/torvalds/linux/commit/cabe8138 which uses page size to detect whether it is a 
>> KASAN like stuff.
>>
>> Then came https://github.com/torvalds/linux/commit/b00ff6d8c as a fix. I can't remember what the 
>> problem was exactly, something around the use of hugepages for kernel memory, came as part of the 
>> series 
>> https://patchwork.ozlabs.org/project/linuxppc-dev/cover/cover.1589866984.git.christophe.leroy@csgroup.eu/ 
> 
> 
> Ah, that's useful context. So it looks like powerpc took a different route to reducing the KASAN 
> output to x86.
> 
> Given the generic ptdump code has handling for KASAN already it should be possible to drop that from 
> the powerpc arch code, which I think means we don't actually need to provide page size to 
> notepage(). Hopefully that means more code to delete ;)
> 

Looking at how the generic ptdump code handles KASAN, I'm a bit sceptic.

IIUC, it is checking that kasan_early_shadow_pte is in the same page as the pgtable referred by the 
PMD entry. But what happens if that PMD entry is referring another pgtable which is inside the same 
page as kasan_early_shadow_pte ?

Shouldn't the test be

	if (pmd_page_vaddr(val) == lm_alias(kasan_early_shadow_pte))
		return note_kasan_page_table(walk, addr);


Christophe

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v1 3/5] mm: ptdump: Provide page size to notepage()
  2021-04-19 13:14         ` Christophe Leroy
@ 2021-04-19 14:00           ` Steven Price
  2021-04-19 16:41             ` Christophe Leroy
  0 siblings, 1 reply; 24+ messages in thread
From: Steven Price @ 2021-04-19 14:00 UTC (permalink / raw)
  To: Christophe Leroy, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, akpm
  Cc: linux-arch, linux-s390, x86, linux-kernel, linux-mm, linux-riscv,
	linuxppc-dev, linux-arm-kernel

On 19/04/2021 14:14, Christophe Leroy wrote:
> 
> 
> Le 16/04/2021 à 12:51, Steven Price a écrit :
>> On 16/04/2021 11:38, Christophe Leroy wrote:
>>>
>>>
>>> Le 16/04/2021 à 11:28, Steven Price a écrit :
>>>> On 15/04/2021 18:18, Christophe Leroy wrote:
>>>>
>>>> To be honest I don't fully understand why powerpc requires the 
>>>> page_size - it appears to be using it purely to find "holes" in the 
>>>> calls to note_page(), but I haven't worked out why such holes would 
>>>> occur.
>>>
>>> I was indeed introduced for KASAN. We have a first commit 
>>> https://github.com/torvalds/linux/commit/cabe8138 which uses page 
>>> size to detect whether it is a KASAN like stuff.
>>>
>>> Then came https://github.com/torvalds/linux/commit/b00ff6d8c as a 
>>> fix. I can't remember what the problem was exactly, something around 
>>> the use of hugepages for kernel memory, came as part of the series 
>>> https://patchwork.ozlabs.org/project/linuxppc-dev/cover/cover.1589866984.git.christophe.leroy@csgroup.eu/ 
>>
>>
>>
>> Ah, that's useful context. So it looks like powerpc took a different 
>> route to reducing the KASAN output to x86.
>>
>> Given the generic ptdump code has handling for KASAN already it should 
>> be possible to drop that from the powerpc arch code, which I think 
>> means we don't actually need to provide page size to notepage(). 
>> Hopefully that means more code to delete ;)
>>
> 
> Looking at how the generic ptdump code handles KASAN, I'm a bit sceptic.
> 
> IIUC, it is checking that kasan_early_shadow_pte is in the same page as 
> the pgtable referred by the PMD entry. But what happens if that PMD 
> entry is referring another pgtable which is inside the same page as 
> kasan_early_shadow_pte ?
> 
> Shouldn't the test be
> 
>      if (pmd_page_vaddr(val) == lm_alias(kasan_early_shadow_pte))
>          return note_kasan_page_table(walk, addr);

Now I come to look at this code again, I think you're right. On arm64 
this doesn't cause a problem - page tables are page sized and page 
aligned, so there couldn't be any non-KASAN pgtables sharing the page. 
Obviously that's not necessarily true of other architectures.

Feel free to add a patch to your series ;)

Steve

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v1 3/5] mm: ptdump: Provide page size to notepage()
  2021-04-19 14:00           ` Steven Price
@ 2021-04-19 16:41             ` Christophe Leroy
  0 siblings, 0 replies; 24+ messages in thread
From: Christophe Leroy @ 2021-04-19 16:41 UTC (permalink / raw)
  To: Steven Price, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman, akpm
  Cc: linux-arch, linux-s390, x86, linux-kernel, linux-mm, linux-riscv,
	linuxppc-dev, linux-arm-kernel



Le 19/04/2021 à 16:00, Steven Price a écrit :
> On 19/04/2021 14:14, Christophe Leroy wrote:
>>
>>
>> Le 16/04/2021 à 12:51, Steven Price a écrit :
>>> On 16/04/2021 11:38, Christophe Leroy wrote:
>>>>
>>>>
>>>> Le 16/04/2021 à 11:28, Steven Price a écrit :
>>>>> On 15/04/2021 18:18, Christophe Leroy wrote:
>>>>>
>>>>> To be honest I don't fully understand why powerpc requires the page_size - it appears to be 
>>>>> using it purely to find "holes" in the calls to note_page(), but I haven't worked out why such 
>>>>> holes would occur.
>>>>
>>>> I was indeed introduced for KASAN. We have a first commit 
>>>> https://github.com/torvalds/linux/commit/cabe8138 which uses page size to detect whether it is a 
>>>> KASAN like stuff.
>>>>
>>>> Then came https://github.com/torvalds/linux/commit/b00ff6d8c as a fix. I can't remember what the 
>>>> problem was exactly, something around the use of hugepages for kernel memory, came as part of 
>>>> the series 
>>>> https://patchwork.ozlabs.org/project/linuxppc-dev/cover/cover.1589866984.git.christophe.leroy@csgroup.eu/ 
>>>
>>>
>>>
>>>
>>> Ah, that's useful context. So it looks like powerpc took a different route to reducing the KASAN 
>>> output to x86.
>>>
>>> Given the generic ptdump code has handling for KASAN already it should be possible to drop that 
>>> from the powerpc arch code, which I think means we don't actually need to provide page size to 
>>> notepage(). Hopefully that means more code to delete ;)
>>>
>>
>> Looking at how the generic ptdump code handles KASAN, I'm a bit sceptic.
>>
>> IIUC, it is checking that kasan_early_shadow_pte is in the same page as the pgtable referred by 
>> the PMD entry. But what happens if that PMD entry is referring another pgtable which is inside the 
>> same page as kasan_early_shadow_pte ?
>>
>> Shouldn't the test be
>>
>>      if (pmd_page_vaddr(val) == lm_alias(kasan_early_shadow_pte))
>>          return note_kasan_page_table(walk, addr);
> 
> Now I come to look at this code again, I think you're right. On arm64 this doesn't cause a problem - 
> page tables are page sized and page aligned, so there couldn't be any non-KASAN pgtables sharing the 
> page. Obviously that's not necessarily true of other architectures.
> 
> Feel free to add a patch to your series ;)
> 

Ok.

I'll leave that outside of the series, it is not a show stopper because early shadow page 
directories are all tagged __bss_page_aligned so we can't have two of them in the same page and it 
is really unlikely that we'll have any other statically defined page directory in the same pages either.

And for the special case of powerpc 8xx which is the only one for which we have both KASAN and 
HUGEPD at the time being, there are only two levels of page directories so no issue.

Christophe

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, back to index

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-15 17:18 [PATCH v1 0/5] Convert powerpc to GENERIC_PTDUMP Christophe Leroy
2021-04-15 17:18 ` [PATCH v1 1/5] mm: pagewalk: Fix walk for hugepage tables Christophe Leroy
2021-04-15 22:43   ` Daniel Axtens
2021-04-16  5:48     ` Christophe Leroy
2021-04-15 17:18 ` [PATCH v1 2/5] mm: ptdump: Fix build failure Christophe Leroy
2021-04-15 17:18 ` [PATCH v1 3/5] mm: ptdump: Provide page size to notepage() Christophe Leroy
2021-04-15 23:12   ` Daniel Axtens
2021-04-16  5:19     ` Christophe Leroy
2021-04-16  9:28   ` Steven Price
2021-04-16 10:38     ` Christophe Leroy
2021-04-16 10:51       ` Steven Price
2021-04-16 11:08         ` Christophe Leroy
2021-04-16 13:00           ` Steven Price
2021-04-16 14:40             ` Christophe Leroy
2021-04-16 15:04               ` Christophe Leroy
2021-04-16 15:15                 ` Christophe Leroy
2021-04-16 16:00                   ` Steven Price
2021-04-19 13:14         ` Christophe Leroy
2021-04-19 14:00           ` Steven Price
2021-04-19 16:41             ` Christophe Leroy
2021-04-15 17:18 ` [PATCH v1 4/5] mm: ptdump: Support hugepd table entries Christophe Leroy
2021-04-15 23:29   ` Daniel Axtens
2021-04-16  5:25     ` Christophe Leroy
2021-04-15 17:18 ` [PATCH v1 5/5] powerpc/mm: Convert powerpc to GENERIC_PTDUMP Christophe Leroy

LinuxPPC-Dev Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linuxppc-dev/0 linuxppc-dev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linuxppc-dev linuxppc-dev/ https://lore.kernel.org/linuxppc-dev \
		linuxppc-dev@lists.ozlabs.org linuxppc-dev@ozlabs.org
	public-inbox-index linuxppc-dev

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.ozlabs.lists.linuxppc-dev


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git