* [PATCH v3 0/2] arm64: apply G-to-nG conversion for KPTI with MMU enabled
@ 2022-04-21 14:03 Ard Biesheuvel
  2022-04-21 14:03 ` [PATCH v3 1/2] arm64: kpti-ng: simplify page table traversal logic Ard Biesheuvel
  2022-04-21 14:03 ` [PATCH v3 2/2] arm64: mm: install KPTI nG mappings with MMU enabled Ard Biesheuvel
  0 siblings, 2 replies; 7+ messages in thread
From: Ard Biesheuvel @ 2022-04-21 14:03 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: Ard Biesheuvel, will, maz, mark.rutland, catalin.marinas, keescook

This v3 is now a series, after I split off some preparatory refactoring
into a separate patch. Rationale for the series is in patch #2.

Changes since v2 [0]:
- reinstate optimization that avoids descending into the same page
  tables repeatedly when KASAN is enabled;
- use broadcast TLB maintenance when doing BBM for the temporary
  mappings, to prevent potential TLB conflicts on the secondary cores
  that are running from the same set of temporary page tables;
- remove a wait loop in the C code, by using 'num_cpus + 1' as the
  signal value rather than 0x0 (see the sketch below);
- declutter and simplify the asm helper code, to make it more
  maintainable.
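
For reference, the synchronization scheme in patch #2 condenses to the
following C sketch (simplified from the cpufeature.c hunk; the actual
table construction is elided):

  static atomic_t flag = ATOMIC_INIT(0);
  int num_cpus = num_online_cpus();

  /* Each CPU arrives here from stop_machine() context */
  if (atomic_inc_return(&flag) == 1) {
          /* first CPU to arrive: build the temporary page tables */
          /* ... elided ... */
          /* extra increment lets the counter reach num_cpus + 1 */
          atomic_inc_return_release(&flag);
  } else {
          /* secondaries: wait until the tables are ready */
          while (atomic_read(&flag) <= num_cpus)
                  cpu_relax();
  }

The same atomic thus serves both as an arrival counter and as the ready
signal: it can only exceed num_cpus once every CPU has arrived and the
primary has performed its extra increment.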

Cc: will@kernel.org
Cc: maz@kernel.org
Cc: mark.rutland@arm.com
Cc: catalin.marinas@arm.com
Cc: keescook@chromium.org

[0] https://lore.kernel.org/linux-arm-kernel/20220413121848.787565-1-ardb@kernel.org/

Ard Biesheuvel (2):
  arm64: kpti-ng: simplify page table traversal logic
  arm64: mm: install KPTI nG mappings with MMU enabled

 arch/arm64/include/asm/mmu.h   |   4 +
 arch/arm64/kernel/cpufeature.c |  65 ++++++-
 arch/arm64/mm/mmu.c            |   8 +-
 arch/arm64/mm/proc.S           | 190 ++++++++++----------
 4 files changed, 161 insertions(+), 106 deletions(-)

-- 
2.30.2



* [PATCH v3 1/2] arm64: kpti-ng: simplify page table traversal logic
  2022-04-21 14:03 [PATCH v3 0/2] arm64: apply G-to-nG conversion for KPTI with MMU enabled Ard Biesheuvel
@ 2022-04-21 14:03 ` Ard Biesheuvel
  2022-05-19 17:09   ` Mark Rutland
  2022-04-21 14:03 ` [PATCH v3 2/2] arm64: mm: install KPTI nG mappings with MMU enabled Ard Biesheuvel
  1 sibling, 1 reply; 7+ messages in thread
From: Ard Biesheuvel @ 2022-04-21 14:03 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: Ard Biesheuvel, will, maz, mark.rutland, catalin.marinas, keescook

Simplify the KPTI G-to-nG asm helper code by:
- pulling the 'table bit' test into the get/put macros so we can combine
  them and incorporate the entire loop;
- moving the 'table bit' test after the update of bit #11 so we no
  longer need separate next_xxx and skip_xxx labels;
- redefining the pmd/pud register aliases and the next_pmd/next_pud
  labels instead of branching to them if the number of configured page
  table levels is less than 3 or 4, respectively;
- folding the descriptor pointer increment into the LDR instructions.

No functional change intended, except for the fact that we now descend
into a next level table after setting bit #11 on its descriptor, but this
should make no difference in practice.

While at it, switch to .L prefixed local labels so they don't clutter up
the symbol tables, kallsyms, etc, and clean up the indentation for
legibility.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm64/mm/proc.S | 97 +++++++-------------
 1 file changed, 34 insertions(+), 63 deletions(-)

diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 50bbed947bec..5619c00f8cd4 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -202,19 +202,24 @@ SYM_FUNC_END(idmap_cpu_replace_ttbr1)
 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
 	.pushsection ".idmap.text", "awx"
 
-	.macro	__idmap_kpti_get_pgtable_ent, type
+	.macro	kpti_mk_tbl_ng, type, num_entries
+	add	end_\type\()p, cur_\type\()p, #\num_entries * 8
+.Ldo_\type:
 	dc	cvac, cur_\()\type\()p		// Ensure any existing dirty
 	dmb	sy				// lines are written back before
-	ldr	\type, [cur_\()\type\()p]	// loading the entry
-	tbz	\type, #0, skip_\()\type	// Skip invalid and
-	tbnz	\type, #11, skip_\()\type	// non-global entries
-	.endm
-
-	.macro __idmap_kpti_put_pgtable_ent_ng, type
+	ldr	\type, [cur_\type\()p], #8	// loading the entry
+	tbz	\type, #0, .Lnext_\type		// Skip invalid and
+	tbnz	\type, #11, .Lnext_\type	// non-global entries
 	orr	\type, \type, #PTE_NG		// Same bit for blocks and pages
-	str	\type, [cur_\()\type\()p]	// Update the entry and ensure
+	str	\type, [cur_\type\()p, #-8]	// Update the entry and ensure
 	dmb	sy				// that it is visible to all
 	dc	civac, cur_\()\type\()p		// CPUs.
+	.ifnc	\type, pte
+	tbnz	\type, #1, .Lderef_\type
+	.endif
+.Lnext_\type:
+	cmp	cur_\type\()p, end_\type\()p
+	b.ne	.Ldo_\type
 	.endm
 
 /*
@@ -235,10 +240,8 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
 	pgd		.req	x7
 	cur_pudp	.req	x8
 	end_pudp	.req	x9
-	pud		.req	x10
 	cur_pmdp	.req	x11
 	end_pmdp	.req	x12
-	pmd		.req	x13
 	cur_ptep	.req	x14
 	end_ptep	.req	x15
 	pte		.req	x16
@@ -265,16 +268,8 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
 
 	/* Everybody is enjoying the idmap, so we can rewrite swapper. */
 	/* PGD */
-	mov	cur_pgdp, swapper_pa
-	add	end_pgdp, cur_pgdp, #(PTRS_PER_PGD * 8)
-do_pgd:	__idmap_kpti_get_pgtable_ent	pgd
-	tbnz	pgd, #1, walk_puds
-next_pgd:
-	__idmap_kpti_put_pgtable_ent_ng	pgd
-skip_pgd:
-	add	cur_pgdp, cur_pgdp, #8
-	cmp	cur_pgdp, end_pgdp
-	b.ne	do_pgd
+	mov		cur_pgdp, swapper_pa
+	kpti_mk_tbl_ng	pgd, PTRS_PER_PGD
 
 	/* Publish the updated tables and nuke all the TLBs */
 	dsb	sy
@@ -291,59 +286,35 @@ skip_pgd:
 	str	wzr, [flag_ptr]
 	ret
 
+.Lderef_pgd:
 	/* PUD */
-walk_puds:
-	.if CONFIG_PGTABLE_LEVELS > 3
+	.if		CONFIG_PGTABLE_LEVELS > 3
+	pud		.req	x10
 	pte_to_phys	cur_pudp, pgd
-	add	end_pudp, cur_pudp, #(PTRS_PER_PUD * 8)
-do_pud:	__idmap_kpti_get_pgtable_ent	pud
-	tbnz	pud, #1, walk_pmds
-next_pud:
-	__idmap_kpti_put_pgtable_ent_ng	pud
-skip_pud:
-	add	cur_pudp, cur_pudp, 8
-	cmp	cur_pudp, end_pudp
-	b.ne	do_pud
-	b	next_pgd
-	.else /* CONFIG_PGTABLE_LEVELS <= 3 */
-	mov	pud, pgd
-	b	walk_pmds
-next_pud:
-	b	next_pgd
+	kpti_mk_tbl_ng	pud, PTRS_PER_PUD
+	b		.Lnext_pgd
+	.else		/* CONFIG_PGTABLE_LEVELS <= 3 */
+	pud		.req	pgd
+	.set		.Lnext_pud, .Lnext_pgd
 	.endif
 
+.Lderef_pud:
 	/* PMD */
-walk_pmds:
-	.if CONFIG_PGTABLE_LEVELS > 2
+	.if		CONFIG_PGTABLE_LEVELS > 2
+	pmd		.req	x13
 	pte_to_phys	cur_pmdp, pud
-	add	end_pmdp, cur_pmdp, #(PTRS_PER_PMD * 8)
-do_pmd:	__idmap_kpti_get_pgtable_ent	pmd
-	tbnz	pmd, #1, walk_ptes
-next_pmd:
-	__idmap_kpti_put_pgtable_ent_ng	pmd
-skip_pmd:
-	add	cur_pmdp, cur_pmdp, #8
-	cmp	cur_pmdp, end_pmdp
-	b.ne	do_pmd
-	b	next_pud
-	.else /* CONFIG_PGTABLE_LEVELS <= 2 */
-	mov	pmd, pud
-	b	walk_ptes
-next_pmd:
-	b	next_pud
+	kpti_mk_tbl_ng	pmd, PTRS_PER_PMD
+	b		.Lnext_pud
+	.else		/* CONFIG_PGTABLE_LEVELS <= 2 */
+	pmd		.req	pgd
+	.set		.Lnext_pmd, .Lnext_pgd
 	.endif
 
+.Lderef_pmd:
 	/* PTE */
-walk_ptes:
 	pte_to_phys	cur_ptep, pmd
-	add	end_ptep, cur_ptep, #(PTRS_PER_PTE * 8)
-do_pte:	__idmap_kpti_get_pgtable_ent	pte
-	__idmap_kpti_put_pgtable_ent_ng	pte
-skip_pte:
-	add	cur_ptep, cur_ptep, #8
-	cmp	cur_ptep, end_ptep
-	b.ne	do_pte
-	b	next_pmd
+	kpti_mk_tbl_ng	pte, PTRS_PER_PTE
+	b		.Lnext_pmd
 
 	.unreq	cpu
 	.unreq	num_cpus
-- 
2.30.2



* [PATCH v3 2/2] arm64: mm: install KPTI nG mappings with MMU enabled
  2022-04-21 14:03 [PATCH v3 0/2] arm64: apply G-to-nG conversion for KPTI with MMU enabled Ard Biesheuvel
  2022-04-21 14:03 ` [PATCH v3 1/2] arm64: kpti-ng: simplify page table traversal logic Ard Biesheuvel
@ 2022-04-21 14:03 ` Ard Biesheuvel
  2022-05-20 13:00   ` Mark Rutland
  1 sibling, 1 reply; 7+ messages in thread
From: Ard Biesheuvel @ 2022-04-21 14:03 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: Ard Biesheuvel, will, maz, mark.rutland, catalin.marinas, keescook

In cases where we unmap the kernel while running in user space, we rely
on ASIDs to distinguish the minimal trampoline from the full kernel
mapping, and this means we must use non-global attributes for those
mappings, to ensure they are scoped by ASID and will not hit in the TLB
inadvertently.

We only do this when needed, as this is generally more costly in terms
of TLB pressure, and so we boot without these non-global attributes, and
apply them to all existing kernel mappings once all CPUs are up and we
know whether or not the non-global attributes are needed. At this point,
we cannot simply unmap and remap the entire address space, so we have to
update all existing block and page descriptors in place.

Currently, we go through a lot of trouble to perform these updates with
the MMU and caches off, to avoid violating break before make (BBM) rules
imposed by the architecture. Since we make changes to page tables that
are not covered by the ID map, we gain access to those descriptors by
disabling translations altogether. This means that the stores to memory
are issued with device attributes, and require extra care in terms of
coherency, which is costly. We also rely on the ID map to access a
shared flag, which requires the ID map to be executable and writable at
the same time, which is another thing we'd prefer to avoid.

So let's switch to an approach where we replace the kernel mapping with
a minimal mapping of a few pages that can be used for the shared flag,
as well as a minimal, ad-hoc fixmap that we can use to map each page
table in turn as we traverse the hierarchy. This requires one PTE per
level, and an associated page worth of VA space in the temporary
mapping.
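
As a C sketch, remapping each page table into its per-level fixmap slot
amounts to the below, using the usual barrier/TLB helpers (this mirrors
the kpti_map_pgtbl asm macro added in this patch; the helper name is made
up for illustration, and the real code uses KPTI_NG_PTE_FLAGS rather than
PAGE_KERNEL):

  static void kpti_ng_map_table(pte_t *fixmap_ptep, unsigned long va,
                                phys_addr_t table_pa)
  {
          /* break: clear the old PTE and invalidate its TLB entry */
          WRITE_ONCE(*fixmap_ptep, __pte(0));
          dsb(ishst);
          __tlbi(vaae1is, va >> 12);
          dsb(ish);
          isb();

          /* make: map the next page table at the same virtual address */
          WRITE_ONCE(*fixmap_ptep, pfn_pte(PHYS_PFN(table_pa), PAGE_KERNEL));
          dsb(ishst);
  }

This respects BBM for the temporary mapping itself while keeping the MMU
enabled throughout.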

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm64/include/asm/mmu.h   |   4 +
 arch/arm64/kernel/cpufeature.c |  65 +++++++++++-
 arch/arm64/mm/mmu.c            |   8 +-
 arch/arm64/mm/proc.S           | 107 ++++++++++++--------
 4 files changed, 134 insertions(+), 50 deletions(-)

diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 48f8466a4be9..b896f0ac4985 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -68,6 +68,10 @@ extern void init_mem_pgprot(void);
 extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
 			       unsigned long virt, phys_addr_t size,
 			       pgprot_t prot, bool page_mappings_only);
+extern void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
+				 unsigned long virt, phys_addr_t size,
+				 pgprot_t prot,
+				 phys_addr_t (*pgtable_alloc)(int), int flags);
 extern void *fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot);
 extern void mark_linear_text_alias_ro(void);
 extern bool kaslr_requires_kpti(void);
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index d72c4b4d389c..f0688e812e19 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1596,14 +1596,31 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry,
 }
 
 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+#define KPTI_NG_TEMP_VA		(-(1UL << PMD_SHIFT))
+
+static phys_addr_t kpti_ng_temp_alloc;
+
+static phys_addr_t kpti_ng_pgd_alloc(int shift)
+{
+	kpti_ng_temp_alloc -= PAGE_SIZE;
+	return kpti_ng_temp_alloc;
+}
+
 static void __nocfi
 kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused)
 {
-	typedef void (kpti_remap_fn)(int, int, phys_addr_t);
+	static atomic_t flag = ATOMIC_INIT(0);
+	static pgd_t *kpti_ng_temp_pgd;
+	static u64 alloc;
+
+	typedef void (kpti_remap_fn)(int, int, phys_addr_t, unsigned long);
 	extern kpti_remap_fn idmap_kpti_install_ng_mappings;
 	kpti_remap_fn *remap_fn;
 
-	int cpu = smp_processor_id();
+	int levels = CONFIG_PGTABLE_LEVELS;
+	int order = order_base_2(levels + 1);
+	int num_cpus = num_online_cpus();
+	int primary = 0;
 
 	if (__this_cpu_read(this_cpu_vector) == vectors) {
 		const char *v = arm64_get_bp_hardening_vector(EL1_VECTOR_KPTI);
@@ -1619,14 +1636,54 @@ kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused)
 	if (arm64_use_ng_mappings)
 		return;
 
+	// First CPU to arrive here gets the job
+	if (atomic_inc_return(&flag) == 1) {
+		alloc =  __get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
+		kpti_ng_temp_pgd = (pgd_t *)(alloc + levels * PAGE_SIZE);
+		kpti_ng_temp_alloc = __pa(kpti_ng_temp_pgd);
+		primary = 1;
+
+		//
+		// Create a minimal page table hierarchy that permits us to
+		// store a shared variable that secondaries will poll, and to
+		// map the swapper page tables temporarily as we traverse them.
+		//
+		// The physical pages are laid out as follows:
+		//
+		// +---------+--------+-/-------+-/------ +-\\--------+
+		// :  data   :  PTE[] : | PMD[] : | PUD[] : || PGD[]  :
+		// +---------+--------+-\-------+-\------ +-//--------+
+		//      ^         ^
+		// The first two pages are mapped consecutively into this
+		// hierarchy at a PMD_SHIFT aligned virtual address, so that we
+		// have a place to store the shared variable, and so that we
+		// can manipulate the PTE level entries while the mapping is
+		// active.  The first two entries cover the data page and the
+		// PTE[] page itself, the remaining entries are free to be used
+		// as a ad-hoc fixmap.
+		//
+		__create_pgd_mapping(kpti_ng_temp_pgd, __pa(alloc),
+				     KPTI_NG_TEMP_VA, 2 * PAGE_SIZE,
+				     PAGE_KERNEL, kpti_ng_pgd_alloc, 0);
+
+		// Increment flag again to signal other CPUs to proceed as well
+		atomic_inc_return_release(&flag);
+	} else {
+		// Wait for the primary CPU to set up the temporary page tables
+		while (atomic_read(&flag) <= num_cpus)
+			cpu_relax();
+	}
 	remap_fn = (void *)__pa_symbol(function_nocfi(idmap_kpti_install_ng_mappings));
 
 	cpu_install_idmap();
-	remap_fn(cpu, num_online_cpus(), __pa_symbol(swapper_pg_dir));
+	remap_fn(!primary, num_cpus - 1, __pa(kpti_ng_temp_pgd), KPTI_NG_TEMP_VA);
 	cpu_uninstall_idmap();
 
-	if (!cpu)
+	// Last CPU to leave frees the pages
+	if (atomic_dec_return(&flag) == 1) {
+		free_pages(alloc, order);
 		arm64_use_ng_mappings = true;
+	}
 }
 #else
 static void
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 626ec32873c6..1c7299dfaa84 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -360,11 +360,9 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
 		mutex_unlock(&fixmap_lock);
 }
 
-static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
-				 unsigned long virt, phys_addr_t size,
-				 pgprot_t prot,
-				 phys_addr_t (*pgtable_alloc)(int),
-				 int flags)
+void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
+			  phys_addr_t size, pgprot_t prot,
+			  phys_addr_t (*pgtable_alloc)(int), int flags)
 {
 	unsigned long addr, end, next;
 	pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 5619c00f8cd4..20d726207db5 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -14,6 +14,7 @@
 #include <asm/asm-offsets.h>
 #include <asm/asm_pointer_auth.h>
 #include <asm/hwcap.h>
+#include <asm/kernel-pgtable.h>
 #include <asm/pgtable-hwdef.h>
 #include <asm/cpufeature.h>
 #include <asm/alternative.h>
@@ -200,20 +201,21 @@ SYM_FUNC_END(idmap_cpu_replace_ttbr1)
 	.popsection
 
 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+
+#define KPTI_NG_PTE_FLAGS	(PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS)
+
 	.pushsection ".idmap.text", "awx"
 
 	.macro	kpti_mk_tbl_ng, type, num_entries
 	add	end_\type\()p, cur_\type\()p, #\num_entries * 8
 .Ldo_\type:
-	dc	cvac, cur_\()\type\()p		// Ensure any existing dirty
-	dmb	sy				// lines are written back before
-	ldr	\type, [cur_\type\()p], #8	// loading the entry
-	tbz	\type, #0, .Lnext_\type		// Skip invalid and
-	tbnz	\type, #11, .Lnext_\type	// non-global entries
-	orr	\type, \type, #PTE_NG		// Same bit for blocks and pages
-	str	\type, [cur_\type\()p, #-8]	// Update the entry and ensure
-	dmb	sy				// that it is visible to all
-	dc	civac, cur_\()\type\()p		// CPUs.
+	ldr	\type, [cur_\type\()p], #8	// Load the entry
+	.ifnc	\type, pte
+	tbnz	\type, #11, .Lnext_\type	// Skip visited entries
+	.endif
+	and	valid, \type, #1
+	orr	\type, \type, valid, lsl #11	// nG |= valid
+	str	\type, [cur_\type\()p, #-8]	// Update the entry
 	.ifnc	\type, pte
 	tbnz	\type, #1, .Lderef_\type
 	.endif
@@ -222,19 +224,42 @@ SYM_FUNC_END(idmap_cpu_replace_ttbr1)
 	b.ne	.Ldo_\type
 	.endm
 
+	/*
+	 * Dereference the current table entry and map it into the temporary
+	 * fixmap slot associated with the current level. The ad-hoc fixmap
+	 * is a set of PTEs that are located above the PTEs that cover the
+	 * level 3 page table and the scratch page that precedes it.
+	 */
+	.macro	kpti_map_pgtbl, type, level
+	str	xzr, [temp_pte, #8 * (\level + 2)]	// break before make
+	dsb	ishst
+	add	pte, flag_ptr, #PAGE_SIZE * (\level + 2)
+	lsr	pte, pte, #12
+	tlbi	vaae1is, pte
+	dsb	ish
+	isb
+
+	phys_to_pte pte, cur_\type\()p
+	add	cur_\type\()p, flag_ptr, #PAGE_SIZE * (\level + 2)
+	orr	pte, pte, pte_flags
+	str	pte, [temp_pte, #8 * (\level + 2)]
+	dsb	ishst
+	.endm
+
 /*
- * void __kpti_install_ng_mappings(int cpu, int num_cpus, phys_addr_t swapper)
+ * void __kpti_install_ng_mappings(int cpu, int num_secondaries, phys_addr_t temp_pgd,
+ *				   unsigned long kpti_ng_temp_va)
  *
  * Called exactly once from stop_machine context by each CPU found during boot.
  */
-__idmap_kpti_flag:
-	.long	1
 SYM_FUNC_START(idmap_kpti_install_ng_mappings)
-	cpu		.req	w0
+	cpu		.req	w0	// at entry
+	pte_flags	.req	x0
 	num_cpus	.req	w1
-	swapper_pa	.req	x2
-	swapper_ttb	.req	x3
-	flag_ptr	.req	x4
+	temp_pgd_phys	.req	x2	// at entry
+	temp_pte	.req	x2
+	flag_ptr	.req	x3
+	swapper_ttb	.req	x4
 	cur_pgdp	.req	x5
 	end_pgdp	.req	x6
 	pgd		.req	x7
@@ -245,10 +270,15 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
 	cur_ptep	.req	x14
 	end_ptep	.req	x15
 	pte		.req	x16
+	valid		.req	x17
 
 	mrs	swapper_ttb, ttbr1_el1
-	restore_ttbr1	swapper_ttb
-	adr	flag_ptr, __idmap_kpti_flag
+
+	/* Uninstall swapper before surgery begins */
+	__idmap_cpu_set_reserved_ttbr1 x8, x9
+	offset_ttbr1 temp_pgd_phys, x8
+	msr	ttbr1_el1, temp_pgd_phys
+	isb
 
 	cbnz	cpu, __idmap_kpti_secondary
 
@@ -259,31 +289,24 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
 	eor	w17, w17, num_cpus
 	cbnz	w17, 1b
 
-	/* We need to walk swapper, so turn off the MMU. */
-	pre_disable_mmu_workaround
-	mrs	x17, sctlr_el1
-	bic	x17, x17, #SCTLR_ELx_M
-	msr	sctlr_el1, x17
-	isb
+	mov	pte_flags, #KPTI_NG_PTE_FLAGS
+
+	/* Advance temp_pte to the fixmap page */
+	add	temp_pte, flag_ptr, #PAGE_SIZE
 
 	/* Everybody is enjoying the idmap, so we can rewrite swapper. */
 	/* PGD */
-	mov		cur_pgdp, swapper_pa
+	adrp		cur_pgdp, swapper_pg_dir
+	kpti_map_pgtbl	pgd, 0
 	kpti_mk_tbl_ng	pgd, PTRS_PER_PGD
 
-	/* Publish the updated tables and nuke all the TLBs */
-	dsb	sy
-	tlbi	vmalle1is
-	dsb	ish
-	isb
-
-	/* We're done: fire up the MMU again */
-	mrs	x17, sctlr_el1
-	orr	x17, x17, #SCTLR_ELx_M
-	set_sctlr_el1	x17
-
 	/* Set the flag to zero to indicate that we're all done */
 	str	wzr, [flag_ptr]
+
+	/* We're done: fire up swapper again */
+	__idmap_cpu_set_reserved_ttbr1 x8, x9
+	msr	ttbr1_el1, swapper_ttb
+	isb
 	ret
 
 .Lderef_pgd:
@@ -291,6 +314,7 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
 	.if		CONFIG_PGTABLE_LEVELS > 3
 	pud		.req	x10
 	pte_to_phys	cur_pudp, pgd
+	kpti_map_pgtbl	pud, 1
 	kpti_mk_tbl_ng	pud, PTRS_PER_PUD
 	b		.Lnext_pgd
 	.else		/* CONFIG_PGTABLE_LEVELS <= 3 */
@@ -303,6 +327,7 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
 	.if		CONFIG_PGTABLE_LEVELS > 2
 	pmd		.req	x13
 	pte_to_phys	cur_pmdp, pud
+	kpti_map_pgtbl	pmd, 2
 	kpti_mk_tbl_ng	pmd, PTRS_PER_PMD
 	b		.Lnext_pud
 	.else		/* CONFIG_PGTABLE_LEVELS <= 2 */
@@ -313,12 +338,14 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
 .Lderef_pmd:
 	/* PTE */
 	pte_to_phys	cur_ptep, pmd
+	kpti_map_pgtbl	pte, 3
 	kpti_mk_tbl_ng	pte, PTRS_PER_PTE
 	b		.Lnext_pmd
 
 	.unreq	cpu
+	.unreq	pte_flags
 	.unreq	num_cpus
-	.unreq	swapper_pa
+	.unreq	temp_pgd_phys
 	.unreq	cur_pgdp
 	.unreq	end_pgdp
 	.unreq	pgd
@@ -331,12 +358,10 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
 	.unreq	cur_ptep
 	.unreq	end_ptep
 	.unreq	pte
+	.unreq	valid
 
 	/* Secondary CPUs end up here */
 __idmap_kpti_secondary:
-	/* Uninstall swapper before surgery begins */
-	__idmap_cpu_set_reserved_ttbr1 x16, x17
-
 	/* Increment the flag to let the boot CPU we're ready */
 1:	ldxr	w16, [flag_ptr]
 	add	w16, w16, #1
@@ -350,7 +375,7 @@ __idmap_kpti_secondary:
 	cbnz	w16, 1b
 
 	/* All done, act like nothing happened */
-	offset_ttbr1 swapper_ttb, x16
+	__idmap_cpu_set_reserved_ttbr1 x8, x9
 	msr	ttbr1_el1, swapper_ttb
 	isb
 	ret
-- 
2.30.2



* Re: [PATCH v3 1/2] arm64: kpti-ng: simplify page table traversal logic
  2022-04-21 14:03 ` [PATCH v3 1/2] arm64: kpti-ng: simplify page table traversal logic Ard Biesheuvel
@ 2022-05-19 17:09   ` Mark Rutland
  2022-05-19 18:08     ` Ard Biesheuvel
  0 siblings, 1 reply; 7+ messages in thread
From: Mark Rutland @ 2022-05-19 17:09 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: linux-arm-kernel, will, maz, catalin.marinas, keescook

Hi Ard,

This is a really nice cleanup!

On Thu, Apr 21, 2022 at 04:03:38PM +0200, Ard Biesheuvel wrote:
> Simplify the KPTI G-to-nG asm helper code by:
> - pulling the 'table bit' test into the get/put macros so we can combine
>   them and incorporate the entire loop;
> - moving the 'table bit' test after the update of bit #11 so we no
>   longer need separate next_xxx and skip_xxx labels;
> - redefining the pmd/pud register aliases and the next_pmd/next_pud
>   labels instead of branching to them if the number of configured page
>   table levels is less than 3 or 4, respectively;

All of this looks good to me.

> - folding the descriptor pointer increment into the LDR instructions.

I think this leads to issuing a CMO to the wrong address; explained below with
a proposed fixup.

> No functional change intended, except for the fact that we now descend
> into a next level table after setting bit #11 on its descriptor, but this
> should make no difference in practice.

Since we don't play games with recursive tables, I agree that should be fine.

> While at it, switch to .L prefixed local labels so they don't clutter up
> the symbol tables, kallsyms, etc, and clean up the indentation for
> legibility.

This also sounds good to me.

> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>

If you're happy with my proposed fixup below:

Reviewed-by: Mark Rutland <mark.rutland@arm.com>

With that applied, I've tested this in VMs on ThunderX2 (with KPTI forced on and
kaslr_requires_kpti() hacked to false) in the following configurations:

* 4K  / 39-bit VAs / 3 levels
* 4K  / 48-bit VAs / 4 levels
* 4K  / 48-bit VAs / 4 levels + KASAN (to test shadow skipping)
* 64K / 42-bit VAs / 2 levels
* 64K / 48-bit VAs / 3 levels
* 64K / 52-bit VAs / 3 levels

... in each case checking the resulting kernel tables with ptdump.

In all cases that looked good, so (with the fixup):

Tested-by: Mark Rutland <mark.rutland@arm.com>

Beware when looking at the ptdump output that we have 3 KPTI trampoline pages
in the fixmap region which are (legitimately) executable and global; I got very
confused when I first spotted that!

> ---
>  arch/arm64/mm/proc.S | 97 +++++++-------------
>  1 file changed, 34 insertions(+), 63 deletions(-)
> 
> diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
> index 50bbed947bec..5619c00f8cd4 100644
> --- a/arch/arm64/mm/proc.S
> +++ b/arch/arm64/mm/proc.S
> @@ -202,19 +202,24 @@ SYM_FUNC_END(idmap_cpu_replace_ttbr1)
>  #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
>  	.pushsection ".idmap.text", "awx"
>  
> -	.macro	__idmap_kpti_get_pgtable_ent, type
> +	.macro	kpti_mk_tbl_ng, type, num_entries
> +	add	end_\type\()p, cur_\type\()p, #\num_entries * 8
> +.Ldo_\type:
>  	dc	cvac, cur_\()\type\()p		// Ensure any existing dirty

Could clean up the (existing) extraneous `\()` here too? That'd make it easier
to spot the `cur_\type\()p` pattern consistently.

>  	dmb	sy				// lines are written back before
> -	ldr	\type, [cur_\()\type\()p]	// loading the entry
> -	tbz	\type, #0, skip_\()\type	// Skip invalid and
> -	tbnz	\type, #11, skip_\()\type	// non-global entries
> -	.endm
> -
> -	.macro __idmap_kpti_put_pgtable_ent_ng, type
> +	ldr	\type, [cur_\type\()p], #8	// loading the entry

So now `cur_\type\()p` points at the *next* entry, which is a bit confusing ...

> +	tbz	\type, #0, .Lnext_\type		// Skip invalid and
> +	tbnz	\type, #11, .Lnext_\type	// non-global entries
>  	orr	\type, \type, #PTE_NG		// Same bit for blocks and pages
> -	str	\type, [cur_\()\type\()p]	// Update the entry and ensure
> +	str	\type, [cur_\type\()p, #-8]	// Update the entry and ensure

... though we handle the offset correctly here ...

>  	dmb	sy				// that it is visible to all
>  	dc	civac, cur_\()\type\()p		// CPUs.

... but here we don't, and perform the CMO for the *next* entry. So for the
last entry in each cacheline we're missing an invalidate.

> +	.ifnc	\type, pte
> +	tbnz	\type, #1, .Lderef_\type
> +	.endif
> +.Lnext_\type:
> +	cmp	cur_\type\()p, end_\type\()p
> +	b.ne	.Ldo_\type
>  	.endm

I reckon it'd be slightly clearer and safer to have an explicit `ADD` at the
start of `.Lnext_\type`, i.e.

|         .macro  kpti_mk_tbl_ng, type, num_entries
|         add     end_\type\()p, cur_\type\()p, #\num_entries * 8
| .Ldo_\type:
|         dc      cvac, cur_\type\()p             // Ensure any existing dirty
|         dmb     sy                              // lines are written back before
|         ldr     \type, [cur_\type\()p]          // loading the entry
|         tbz     \type, #0, .Lnext_\type         // Skip invalid and
|         tbnz    \type, #11, .Lnext_\type        // non-global entries
|         orr     \type, \type, #PTE_NG           // Same bit for blocks and pages
|         str     \type, [cur_\type\()p]          // Update the entry and ensure
|         dmb     sy                              // that it is visible to all
|         dc      civac, cur_\()\type\()p         // CPUs.
|         .ifnc   \type, pte 
|         tbnz    \type, #1, .Lderef_\type
|         .endif
| .Lnext_\type:
|         add     cur_\type\()p, cur_\type\()p, #8
|         cmp     cur_\type\()p, end_\type\()p
|         b.ne    .Ldo_\type
|         .endm

Other than that, this looks good to me.

Thanks,
Mark.

>  /*
> @@ -235,10 +240,8 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
>  	pgd		.req	x7
>  	cur_pudp	.req	x8
>  	end_pudp	.req	x9
> -	pud		.req	x10
>  	cur_pmdp	.req	x11
>  	end_pmdp	.req	x12
> -	pmd		.req	x13
>  	cur_ptep	.req	x14
>  	end_ptep	.req	x15
>  	pte		.req	x16
> @@ -265,16 +268,8 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
>  
>  	/* Everybody is enjoying the idmap, so we can rewrite swapper. */
>  	/* PGD */
> -	mov	cur_pgdp, swapper_pa
> -	add	end_pgdp, cur_pgdp, #(PTRS_PER_PGD * 8)
> -do_pgd:	__idmap_kpti_get_pgtable_ent	pgd
> -	tbnz	pgd, #1, walk_puds
> -next_pgd:
> -	__idmap_kpti_put_pgtable_ent_ng	pgd
> -skip_pgd:
> -	add	cur_pgdp, cur_pgdp, #8
> -	cmp	cur_pgdp, end_pgdp
> -	b.ne	do_pgd
> +	mov		cur_pgdp, swapper_pa
> +	kpti_mk_tbl_ng	pgd, PTRS_PER_PGD
>  
>  	/* Publish the updated tables and nuke all the TLBs */
>  	dsb	sy
> @@ -291,59 +286,35 @@ skip_pgd:
>  	str	wzr, [flag_ptr]
>  	ret
>  
> +.Lderef_pgd:
>  	/* PUD */
> -walk_puds:
> -	.if CONFIG_PGTABLE_LEVELS > 3
> +	.if		CONFIG_PGTABLE_LEVELS > 3
> +	pud		.req	x10
>  	pte_to_phys	cur_pudp, pgd
> -	add	end_pudp, cur_pudp, #(PTRS_PER_PUD * 8)
> -do_pud:	__idmap_kpti_get_pgtable_ent	pud
> -	tbnz	pud, #1, walk_pmds
> -next_pud:
> -	__idmap_kpti_put_pgtable_ent_ng	pud
> -skip_pud:
> -	add	cur_pudp, cur_pudp, 8
> -	cmp	cur_pudp, end_pudp
> -	b.ne	do_pud
> -	b	next_pgd
> -	.else /* CONFIG_PGTABLE_LEVELS <= 3 */
> -	mov	pud, pgd
> -	b	walk_pmds
> -next_pud:
> -	b	next_pgd
> +	kpti_mk_tbl_ng	pud, PTRS_PER_PUD
> +	b		.Lnext_pgd
> +	.else		/* CONFIG_PGTABLE_LEVELS <= 3 */
> +	pud		.req	pgd
> +	.set		.Lnext_pud, .Lnext_pgd
>  	.endif
>  
> +.Lderef_pud:
>  	/* PMD */
> -walk_pmds:
> -	.if CONFIG_PGTABLE_LEVELS > 2
> +	.if		CONFIG_PGTABLE_LEVELS > 2
> +	pmd		.req	x13
>  	pte_to_phys	cur_pmdp, pud
> -	add	end_pmdp, cur_pmdp, #(PTRS_PER_PMD * 8)
> -do_pmd:	__idmap_kpti_get_pgtable_ent	pmd
> -	tbnz	pmd, #1, walk_ptes
> -next_pmd:
> -	__idmap_kpti_put_pgtable_ent_ng	pmd
> -skip_pmd:
> -	add	cur_pmdp, cur_pmdp, #8
> -	cmp	cur_pmdp, end_pmdp
> -	b.ne	do_pmd
> -	b	next_pud
> -	.else /* CONFIG_PGTABLE_LEVELS <= 2 */
> -	mov	pmd, pud
> -	b	walk_ptes
> -next_pmd:
> -	b	next_pud
> +	kpti_mk_tbl_ng	pmd, PTRS_PER_PMD
> +	b		.Lnext_pud
> +	.else		/* CONFIG_PGTABLE_LEVELS <= 2 */
> +	pmd		.req	pgd
> +	.set		.Lnext_pmd, .Lnext_pgd
>  	.endif
>  
> +.Lderef_pmd:
>  	/* PTE */
> -walk_ptes:
>  	pte_to_phys	cur_ptep, pmd
> -	add	end_ptep, cur_ptep, #(PTRS_PER_PTE * 8)
> -do_pte:	__idmap_kpti_get_pgtable_ent	pte
> -	__idmap_kpti_put_pgtable_ent_ng	pte
> -skip_pte:
> -	add	cur_ptep, cur_ptep, #8
> -	cmp	cur_ptep, end_ptep
> -	b.ne	do_pte
> -	b	next_pmd
> +	kpti_mk_tbl_ng	pte, PTRS_PER_PTE
> +	b		.Lnext_pmd
>  
>  	.unreq	cpu
>  	.unreq	num_cpus
> -- 
> 2.30.2
> 


* Re: [PATCH v3 1/2] arm64: kpti-ng: simplify page table traversal logic
  2022-05-19 17:09   ` Mark Rutland
@ 2022-05-19 18:08     ` Ard Biesheuvel
  0 siblings, 0 replies; 7+ messages in thread
From: Ard Biesheuvel @ 2022-05-19 18:08 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Linux ARM, Will Deacon, Marc Zyngier, Catalin Marinas, Kees Cook

On Thu, 19 May 2022 at 19:09, Mark Rutland <mark.rutland@arm.com> wrote:
>
> Hi Ard,
>
> This is a really nice cleanup!
>

Thanks!

> On Thu, Apr 21, 2022 at 04:03:38PM +0200, Ard Biesheuvel wrote:
> > Simplify the KPTI G-to-nG asm helper code by:
> > - pulling the 'table bit' test into the get/put macros so we can combine
> >   them and incorporate the entire loop;
> > - moving the 'table bit' test after the update of bit #11 so we no
> >   longer need separate next_xxx and skip_xxx labels;
> > - redefining the pmd/pud register aliases and the next_pmd/next_pud
> >   labels instead of branching to them if the number of configured page
> >   table levels is less than 3 or 4, respectively;
>
> All of this looks good to me.
>
> > - folding the descriptor pointer increment into the LDR instructions.
>
> I think this leads to issuing a CMO to the wrong address; explained below with
> a proposed fixup.
>
> > No functional change intended, except for the fact that we now descend
> > into a next level table after setting bit #11 on its descriptor, but this
> > should make no difference in practice.
>
> Since we don't play games with recursive tables, I agree that should be fine.
>
> > While at it, switch to .L prefixed local labels so they don't clutter up
> > the symbol tables, kallsyms, etc, and clean up the indentation for
> > legibility.
>
> This also sounds good to me.
>
> > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
>
> If you're happy with my proposed fixup below:
>
> Reviewed-by: Mark Rutland <mark.rutland@arm.com>
>
> With that applied, I've tested this in VMs on ThunderX2 (with KPTI forced on and
> kaslr_requires_kpti() hacked to false) in the following configurations:
>
> * 4K  / 39-bit VAs / 3 levels
> * 4K  / 48-bit VAs / 4 levels
> * 4K  / 48-bit VAs / 4 levels + KASAN (to test shadow skipping)
> * 64K / 42-bit VAs / 2 levels
> * 64K / 48-bit VAs / 3 levels
> * 64K / 52-bit VAs / 3 levels
>
> ... in each case checking the resulting kernel tables with ptdump.
>
> In all cases that looked good, so (with the fixup):
>
> Tested-by: Mark Rutland <mark.rutland@arm.com>
>
> Beware when looking at the ptdump output that we have 3 KPTI trampoline pages
> in the fixmap region which are (legitimately) executable and global; I got very
> confused when I first spotted that!
>
> > ---
> >  arch/arm64/mm/proc.S | 97 +++++++-------------
> >  1 file changed, 34 insertions(+), 63 deletions(-)
> >
> > diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
> > index 50bbed947bec..5619c00f8cd4 100644
> > --- a/arch/arm64/mm/proc.S
> > +++ b/arch/arm64/mm/proc.S
> > @@ -202,19 +202,24 @@ SYM_FUNC_END(idmap_cpu_replace_ttbr1)
> >  #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
> >       .pushsection ".idmap.text", "awx"
> >
> > -     .macro  __idmap_kpti_get_pgtable_ent, type
> > +     .macro  kpti_mk_tbl_ng, type, num_entries
> > +     add     end_\type\()p, cur_\type\()p, #\num_entries * 8
> > +.Ldo_\type:
> >       dc      cvac, cur_\()\type\()p          // Ensure any existing dirty
>
> Could clean up the (existing) extraneous `\()` here too? That'd make it easier
> to spot the `cur_\type\()p` pattern consistently.
>
> >       dmb     sy                              // lines are written back before
> > -     ldr     \type, [cur_\()\type\()p]       // loading the entry
> > -     tbz     \type, #0, skip_\()\type        // Skip invalid and
> > -     tbnz    \type, #11, skip_\()\type       // non-global entries
> > -     .endm
> > -
> > -     .macro __idmap_kpti_put_pgtable_ent_ng, type
> > +     ldr     \type, [cur_\type\()p], #8      // loading the entry
>
> So now `cur_\type\()p` points at the *next* entry, which is a bit confusing ...
>
> > +     tbz     \type, #0, .Lnext_\type         // Skip invalid and
> > +     tbnz    \type, #11, .Lnext_\type        // non-global entries
> >       orr     \type, \type, #PTE_NG           // Same bit for blocks and pages
> > -     str     \type, [cur_\()\type\()p]       // Update the entry and ensure
> > +     str     \type, [cur_\type\()p, #-8]     // Update the entry and ensure
>
> ... though we handle the offset correctly here ...
>
> >       dmb     sy                              // that it is visible to all
> >       dc      civac, cur_\()\type\()p         // CPUs.
>
> ... but here we don't, and perform the CMO for the *next* entry. So for the
> last entry in each cacheline we're missing an invalidate.
>

Yeah, this is a consequence of splitting off these changes: the next
patch gets rid of the CMOs and so it removes the problem as well.

So either we merge the patches again, or apply your fixup. I don't
have a strong preference either way.

> > +     .ifnc   \type, pte
> > +     tbnz    \type, #1, .Lderef_\type
> > +     .endif
> > +.Lnext_\type:
> > +     cmp     cur_\type\()p, end_\type\()p
> > +     b.ne    .Ldo_\type
> >       .endm
>
> I reckon it'd be slightly clearer and safer to have an explicit `ADD` at the
> start of `.Lnext_\type`, i.e.
>
> |         .macro  kpti_mk_tbl_ng, type, num_entries
> |         add     end_\type\()p, cur_\type\()p, #\num_entries * 8
> | .Ldo_\type:
> |         dc      cvac, cur_\type\()p             // Ensure any existing dirty
> |         dmb     sy                              // lines are written back before
> |         ldr     \type, [cur_\type\()p]          // loading the entry
> |         tbz     \type, #0, .Lnext_\type         // Skip invalid and
> |         tbnz    \type, #11, .Lnext_\type        // non-global entries
> |         orr     \type, \type, #PTE_NG           // Same bit for blocks and pages
> |         str     \type, [cur_\type\()p]          // Update the entry and ensure
> |         dmb     sy                              // that it is visible to all
> |         dc      civac, cur_\()\type\()p         // CPUs.
> |         .ifnc   \type, pte
> |         tbnz    \type, #1, .Lderef_\type
> |         .endif
> | .Lnext_\type:
> |         add     cur_\type\()p, cur_\type\()p, #8
> |         cmp     cur_\type\()p, end_\type\()p
> |         b.ne    .Ldo_\type
> |         .endm
>
> Other than that, this looks good to me.
>
> Thanks,
> Mark.
>
> >  /*
> > @@ -235,10 +240,8 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
> >       pgd             .req    x7
> >       cur_pudp        .req    x8
> >       end_pudp        .req    x9
> > -     pud             .req    x10
> >       cur_pmdp        .req    x11
> >       end_pmdp        .req    x12
> > -     pmd             .req    x13
> >       cur_ptep        .req    x14
> >       end_ptep        .req    x15
> >       pte             .req    x16
> > @@ -265,16 +268,8 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
> >
> >       /* Everybody is enjoying the idmap, so we can rewrite swapper. */
> >       /* PGD */
> > -     mov     cur_pgdp, swapper_pa
> > -     add     end_pgdp, cur_pgdp, #(PTRS_PER_PGD * 8)
> > -do_pgd:      __idmap_kpti_get_pgtable_ent    pgd
> > -     tbnz    pgd, #1, walk_puds
> > -next_pgd:
> > -     __idmap_kpti_put_pgtable_ent_ng pgd
> > -skip_pgd:
> > -     add     cur_pgdp, cur_pgdp, #8
> > -     cmp     cur_pgdp, end_pgdp
> > -     b.ne    do_pgd
> > +     mov             cur_pgdp, swapper_pa
> > +     kpti_mk_tbl_ng  pgd, PTRS_PER_PGD
> >
> >       /* Publish the updated tables and nuke all the TLBs */
> >       dsb     sy
> > @@ -291,59 +286,35 @@ skip_pgd:
> >       str     wzr, [flag_ptr]
> >       ret
> >
> > +.Lderef_pgd:
> >       /* PUD */
> > -walk_puds:
> > -     .if CONFIG_PGTABLE_LEVELS > 3
> > +     .if             CONFIG_PGTABLE_LEVELS > 3
> > +     pud             .req    x10
> >       pte_to_phys     cur_pudp, pgd
> > -     add     end_pudp, cur_pudp, #(PTRS_PER_PUD * 8)
> > -do_pud:      __idmap_kpti_get_pgtable_ent    pud
> > -     tbnz    pud, #1, walk_pmds
> > -next_pud:
> > -     __idmap_kpti_put_pgtable_ent_ng pud
> > -skip_pud:
> > -     add     cur_pudp, cur_pudp, 8
> > -     cmp     cur_pudp, end_pudp
> > -     b.ne    do_pud
> > -     b       next_pgd
> > -     .else /* CONFIG_PGTABLE_LEVELS <= 3 */
> > -     mov     pud, pgd
> > -     b       walk_pmds
> > -next_pud:
> > -     b       next_pgd
> > +     kpti_mk_tbl_ng  pud, PTRS_PER_PUD
> > +     b               .Lnext_pgd
> > +     .else           /* CONFIG_PGTABLE_LEVELS <= 3 */
> > +     pud             .req    pgd
> > +     .set            .Lnext_pud, .Lnext_pgd
> >       .endif
> >
> > +.Lderef_pud:
> >       /* PMD */
> > -walk_pmds:
> > -     .if CONFIG_PGTABLE_LEVELS > 2
> > +     .if             CONFIG_PGTABLE_LEVELS > 2
> > +     pmd             .req    x13
> >       pte_to_phys     cur_pmdp, pud
> > -     add     end_pmdp, cur_pmdp, #(PTRS_PER_PMD * 8)
> > -do_pmd:      __idmap_kpti_get_pgtable_ent    pmd
> > -     tbnz    pmd, #1, walk_ptes
> > -next_pmd:
> > -     __idmap_kpti_put_pgtable_ent_ng pmd
> > -skip_pmd:
> > -     add     cur_pmdp, cur_pmdp, #8
> > -     cmp     cur_pmdp, end_pmdp
> > -     b.ne    do_pmd
> > -     b       next_pud
> > -     .else /* CONFIG_PGTABLE_LEVELS <= 2 */
> > -     mov     pmd, pud
> > -     b       walk_ptes
> > -next_pmd:
> > -     b       next_pud
> > +     kpti_mk_tbl_ng  pmd, PTRS_PER_PMD
> > +     b               .Lnext_pud
> > +     .else           /* CONFIG_PGTABLE_LEVELS <= 2 */
> > +     pmd             .req    pgd
> > +     .set            .Lnext_pmd, .Lnext_pgd
> >       .endif
> >
> > +.Lderef_pmd:
> >       /* PTE */
> > -walk_ptes:
> >       pte_to_phys     cur_ptep, pmd
> > -     add     end_ptep, cur_ptep, #(PTRS_PER_PTE * 8)
> > -do_pte:      __idmap_kpti_get_pgtable_ent    pte
> > -     __idmap_kpti_put_pgtable_ent_ng pte
> > -skip_pte:
> > -     add     cur_ptep, cur_ptep, #8
> > -     cmp     cur_ptep, end_ptep
> > -     b.ne    do_pte
> > -     b       next_pmd
> > +     kpti_mk_tbl_ng  pte, PTRS_PER_PTE
> > +     b               .Lnext_pmd
> >
> >       .unreq  cpu
> >       .unreq  num_cpus
> > --
> > 2.30.2
> >


* Re: [PATCH v3 2/2] arm64: mm: install KPTI nG mappings with MMU enabled
  2022-04-21 14:03 ` [PATCH v3 2/2] arm64: mm: install KPTI nG mappings with MMU enabled Ard Biesheuvel
@ 2022-05-20 13:00   ` Mark Rutland
  2022-05-20 14:15     ` Ard Biesheuvel
  0 siblings, 1 reply; 7+ messages in thread
From: Mark Rutland @ 2022-05-20 13:00 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: linux-arm-kernel, will, maz, catalin.marinas, keescook

Hi Ard,

On Thu, Apr 21, 2022 at 04:03:39PM +0200, Ard Biesheuvel wrote:
> In cases where we unmap the kernel while running in user space, we rely
> on ASIDs to distinguish the minimal trampoline from the full kernel
> mapping, and this means we must use non-global attributes for those
> mappings, to ensure they are scoped by ASID and will not hit in the TLB
> inadvertently.
> 
> We only do this when needed, as this is generally more costly in terms
> of TLB pressure, and so we boot without these non-global attributes, and
> apply them to all existing kernel mappings once all CPUs are up and we
> know whether or not the non-global attributes are needed. At this point,
> we cannot simply unmap and remap the entire address space, so we have to
> update all existing block and page descriptors in place.
> 
> Currently, we go through a lot of trouble to perform these updates with
> the MMU and caches off, to avoid violating break before make (BBM) rules
> imposed by the architecture. Since we make changes to page tables that
> are not covered by the ID map, we gain access to those descriptors by
> disabling translations altogether. This means that the stores to memory
> are issued with device attributes, and require extra care in terms of
> coherency, which is costly. We also rely on the ID map to access a
> shared flag, which requires the ID map to be executable and writable at
> the same time, which is another thing we'd prefer to avoid.
> 
> So let's switch to an approach where we replace the kernel mapping with
> a minimal mapping of a few pages that can be used for the shared flag,
> as well as a minimal, ad-hoc fixmap that we can use to map each page
> table in turn as we traverse the hierarchy. This requires one PTE per
> level, and an associated page worth of VA space in the temporary
> mapping.

I really like the high-level idea!

Overall this looks good. I'm not fond of the way we synchronize between
primary/secondary CPUs, and would prefer if we could split that differently --
I've given more detail on that below (and a branch with a proposal).

> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> ---
>  arch/arm64/include/asm/mmu.h   |   4 +
>  arch/arm64/kernel/cpufeature.c |  65 +++++++++++-
>  arch/arm64/mm/mmu.c            |   8 +-
>  arch/arm64/mm/proc.S           | 107 ++++++++++++--------
>  4 files changed, 134 insertions(+), 50 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
> index 48f8466a4be9..b896f0ac4985 100644
> --- a/arch/arm64/include/asm/mmu.h
> +++ b/arch/arm64/include/asm/mmu.h
> @@ -68,6 +68,10 @@ extern void init_mem_pgprot(void);
>  extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
>  			       unsigned long virt, phys_addr_t size,
>  			       pgprot_t prot, bool page_mappings_only);
> +extern void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
> +				 unsigned long virt, phys_addr_t size,
> +				 pgprot_t prot,
> +				 phys_addr_t (*pgtable_alloc)(int), int flags);
>  extern void *fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot);
>  extern void mark_linear_text_alias_ro(void);
>  extern bool kaslr_requires_kpti(void);
> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
> index d72c4b4d389c..f0688e812e19 100644
> --- a/arch/arm64/kernel/cpufeature.c
> +++ b/arch/arm64/kernel/cpufeature.c
> @@ -1596,14 +1596,31 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry,
>  }
>  
>  #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
> +#define KPTI_NG_TEMP_VA		(-(1UL << PMD_SHIFT))
> +
> +static phys_addr_t kpti_ng_temp_alloc;
> +
> +static phys_addr_t kpti_ng_pgd_alloc(int shift)
> +{
> +	kpti_ng_temp_alloc -= PAGE_SIZE;
> +	return kpti_ng_temp_alloc;
> +}
> +
>  static void __nocfi
>  kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused)
>  {
> -	typedef void (kpti_remap_fn)(int, int, phys_addr_t);
> +	static atomic_t flag = ATOMIC_INIT(0);
> +	static pgd_t *kpti_ng_temp_pgd;
> +	static u64 alloc;
> +
> +	typedef void (kpti_remap_fn)(int, int, phys_addr_t, unsigned long);
>  	extern kpti_remap_fn idmap_kpti_install_ng_mappings;
>  	kpti_remap_fn *remap_fn;
>  
> -	int cpu = smp_processor_id();
> +	int levels = CONFIG_PGTABLE_LEVELS;
> +	int order = order_base_2(levels + 1);
> +	int num_cpus = num_online_cpus();
> +	int primary = 0;
>  
>  	if (__this_cpu_read(this_cpu_vector) == vectors) {
>  		const char *v = arm64_get_bp_hardening_vector(EL1_VECTOR_KPTI);
> @@ -1619,14 +1636,54 @@ kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused)
>  	if (arm64_use_ng_mappings)
>  		return;
>  
> +	// First CPU to arrive here gets the job
> +	if (atomic_inc_return(&flag) == 1) {

I'm not too keen on the way this is split because, aside from the flag, the
secondaries have no need to have anything mapped in TTBR1. If we leave the flag
in the idmap, and have the secondaries all use the reserved TTBR1 page, we can
consistently have the boot CPU do all the hard work (and we could avoid
broadcast maintenance/barriers during the bulk of the rewrite, which could make
it faster).

I had a go at refactoring things that way (with this rebased atop), at:

  https://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git/log/?h=arm64/kpti/rework



The only downside is that we have to keep the idmap RWX, but that was already
the case (and we could rework the idmap to have separate ROX/RW code/data pages
in future).

I can post that to the list if you don't have major objections to that
approach.

> +		alloc =  __get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
> +		kpti_ng_temp_pgd = (pgd_t *)(alloc + levels * PAGE_SIZE);
> +		kpti_ng_temp_alloc = __pa(kpti_ng_temp_pgd);
> +		primary = 1;
> +
> +		//
> +		// Create a minimal page table hierarchy that permits us to
> +		// store a shared variable that secondaries will poll, and to
> +		// map the swapper page tables temporarily as we traverse them.
> +		//
> +		// The physical pages are laid out as follows:
> +		//
> +		// +---------+--------+-/-------+-/------ +-\\--------+
> +		// :  data   :  PTE[] : | PMD[] : | PUD[] : || PGD[]  :
> +		// +---------+--------+-\-------+-\------ +-//--------+
> +		//      ^         ^
> +		// The first two pages are mapped consecutively into this
> +		// hierarchy at a PMD_SHIFT aligned virtual address, so that we
> +		// have a place to store the shared variable, and so that we
> +		// can manipulate the PTE level entries while the mapping is
> +		// active.  The first two entries cover the data page and the
> +		// PTE[] page itself, the remaining entries are free to be used
> +		// as a ad-hoc fixmap.
> +		//
> +		__create_pgd_mapping(kpti_ng_temp_pgd, __pa(alloc),
> +				     KPTI_NG_TEMP_VA, 2 * PAGE_SIZE,
> +				     PAGE_KERNEL, kpti_ng_pgd_alloc, 0);

It took me a while to realise this was relying on the order in which
__create_pgd_mapping() calls kpti_ng_pgd_alloc(). I had a look at building the
tables explicitly to make this clearer, but that didn't seem all that much
clearer, so this is fine as-is.
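
Spelled out for CONFIG_PGTABLE_LEVELS == 4, the dependency is that
__create_pgd_mapping() descends the hierarchy top-down, so successive calls
to kpti_ng_pgd_alloc() hand out pages at fixed offsets below the PGD page:

|   call 1 (PUD table) -> __pa(kpti_ng_temp_pgd) - 1 * PAGE_SIZE
|   call 2 (PMD table) -> __pa(kpti_ng_temp_pgd) - 2 * PAGE_SIZE
|   call 3 (PTE table) -> __pa(kpti_ng_temp_pgd) - 3 * PAGE_SIZE
|                         (i.e. adjacent to the data page at alloc)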

> +
> +		// Increment flag again to signal other CPUs to proceed as well
> +		atomic_inc_return_release(&flag);
> +	} else {
> +		// Wait for the primary CPU to set up the temporary page tables
> +		while (atomic_read(&flag) <= num_cpus)
> +			cpu_relax();
> +	}
>  	remap_fn = (void *)__pa_symbol(function_nocfi(idmap_kpti_install_ng_mappings));
>  
>  	cpu_install_idmap();
> -	remap_fn(cpu, num_online_cpus(), __pa_symbol(swapper_pg_dir));
> +	remap_fn(!primary, num_cpus - 1, __pa(kpti_ng_temp_pgd), KPTI_NG_TEMP_VA);
>  	cpu_uninstall_idmap();
>  
> -	if (!cpu)
> +	// Last CPU to leave frees the pages
> +	if (atomic_dec_return(&flag) == 1) {
> +		free_pages(alloc, order);
>  		arm64_use_ng_mappings = true;
> +	}
>  }
>  #else
>  static void
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 626ec32873c6..1c7299dfaa84 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -360,11 +360,9 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
>  		mutex_unlock(&fixmap_lock);
>  }
>  
> -static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
> -				 unsigned long virt, phys_addr_t size,
> -				 pgprot_t prot,
> -				 phys_addr_t (*pgtable_alloc)(int),
> -				 int flags)
> +void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
> +			  phys_addr_t size, pgprot_t prot,
> +			  phys_addr_t (*pgtable_alloc)(int), int flags)
>  {
>  	unsigned long addr, end, next;
>  	pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
> diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
> index 5619c00f8cd4..20d726207db5 100644
> --- a/arch/arm64/mm/proc.S
> +++ b/arch/arm64/mm/proc.S
> @@ -14,6 +14,7 @@
>  #include <asm/asm-offsets.h>
>  #include <asm/asm_pointer_auth.h>
>  #include <asm/hwcap.h>
> +#include <asm/kernel-pgtable.h>
>  #include <asm/pgtable-hwdef.h>
>  #include <asm/cpufeature.h>
>  #include <asm/alternative.h>
> @@ -200,20 +201,21 @@ SYM_FUNC_END(idmap_cpu_replace_ttbr1)
>  	.popsection
>  
>  #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
> +
> +#define KPTI_NG_PTE_FLAGS	(PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS)
> +
>  	.pushsection ".idmap.text", "awx"
>  
>  	.macro	kpti_mk_tbl_ng, type, num_entries
>  	add	end_\type\()p, cur_\type\()p, #\num_entries * 8
>  .Ldo_\type:
> -	dc	cvac, cur_\()\type\()p		// Ensure any existing dirty
> -	dmb	sy				// lines are written back before
> -	ldr	\type, [cur_\type\()p], #8	// loading the entry
> -	tbz	\type, #0, .Lnext_\type		// Skip invalid and
> -	tbnz	\type, #11, .Lnext_\type	// non-global entries
> -	orr	\type, \type, #PTE_NG		// Same bit for blocks and pages
> -	str	\type, [cur_\type\()p, #-8]	// Update the entry and ensure
> -	dmb	sy				// that it is visible to all
> -	dc	civac, cur_\()\type\()p		// CPUs.
> +	ldr	\type, [cur_\type\()p], #8	// Load the entry
> +	.ifnc	\type, pte
> +	tbnz	\type, #11, .Lnext_\type	// Skip visited entries
> +	.endif
> +	and	valid, \type, #1
> +	orr	\type, \type, valid, lsl #11	// nG |= valid
> +	str	\type, [cur_\type\()p, #-8]	// Update the entry

IMO it was clearer to skip invalid entries than to extract the valid bit value
and ORR that back into the nG position. Was that change intended as an
optimization?
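
(In C terms, the AND/ORR pair amounts to the branchless

|	desc |= (desc & 1) << 11;	/* propagate the valid bit into nG */

so invalid entries are rewritten unchanged rather than skipped.)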

>  	.ifnc	\type, pte
>  	tbnz	\type, #1, .Lderef_\type
>  	.endif

As with the last patch, I'd prefer if we had an explicit ADD for the increment
in `.Lnext_\type`. Even with the CMOs removed, I reckon that's clearer.

> @@ -222,19 +224,42 @@ SYM_FUNC_END(idmap_cpu_replace_ttbr1)
>  	b.ne	.Ldo_\type
>  	.endm
>  
> +	/*
> +	 * Dereference the current table entry and map it into the temporary
> +	 * fixmap slot associated with the current level. The ad-hoc fixmap
> +	 * is a set of PTEs that are located above the PTEs that cover the
> +	 * level 3 page table and the scratch page that precedes it.
> +	 */
> +	.macro	kpti_map_pgtbl, type, level
> +	str	xzr, [temp_pte, #8 * (\level + 2)]	// break before make
> +	dsb	ishst
> +	add	pte, flag_ptr, #PAGE_SIZE * (\level + 2)
> +	lsr	pte, pte, #12
> +	tlbi	vaae1is, pte
> +	dsb	ish
> +	isb
> +
> +	phys_to_pte pte, cur_\type\()p
> +	add	cur_\type\()p, flag_ptr, #PAGE_SIZE * (\level + 2)
> +	orr	pte, pte, pte_flags
> +	str	pte, [temp_pte, #8 * (\level + 2)]
> +	dsb	ishst
> +	.endm

If we only had the boot CPU map the fixmap TTBR, we could reduce the scope of
barriers and TLB maintenance here to be nSH, and removing the broadcast
maintenance could be a big win. We'd just need a final DSB ISHST prior to
waking secondaries to ensure they see all the updates.

Maybe that's minuscule compared to keeping the MMU on, though.

Thanks,
Mark.

> +
>  /*
> - * void __kpti_install_ng_mappings(int cpu, int num_cpus, phys_addr_t swapper)
> + * void __kpti_install_ng_mappings(int cpu, int num_secondaries, phys_addr_t temp_pgd,
> + *				   unsigned long kpti_ng_temp_va)
>   *
>   * Called exactly once from stop_machine context by each CPU found during boot.
>   */
> -__idmap_kpti_flag:
> -	.long	1
>  SYM_FUNC_START(idmap_kpti_install_ng_mappings)
> -	cpu		.req	w0
> +	cpu		.req	w0	// at entry
> +	pte_flags	.req	x0
>  	num_cpus	.req	w1
> -	swapper_pa	.req	x2
> -	swapper_ttb	.req	x3
> -	flag_ptr	.req	x4
> +	temp_pgd_phys	.req	x2	// at entry
> +	temp_pte	.req	x2
> +	flag_ptr	.req	x3
> +	swapper_ttb	.req	x4
>  	cur_pgdp	.req	x5
>  	end_pgdp	.req	x6
>  	pgd		.req	x7
> @@ -245,10 +270,15 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
>  	cur_ptep	.req	x14
>  	end_ptep	.req	x15
>  	pte		.req	x16
> +	valid		.req	x17
>  
>  	mrs	swapper_ttb, ttbr1_el1
> -	restore_ttbr1	swapper_ttb
> -	adr	flag_ptr, __idmap_kpti_flag
> +
> +	/* Uninstall swapper before surgery begins */
> +	__idmap_cpu_set_reserved_ttbr1 x8, x9
> +	offset_ttbr1 temp_pgd_phys, x8
> +	msr	ttbr1_el1, temp_pgd_phys
> +	isb
>  
>  	cbnz	cpu, __idmap_kpti_secondary
>  
> @@ -259,31 +289,24 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
>  	eor	w17, w17, num_cpus
>  	cbnz	w17, 1b
>  
> -	/* We need to walk swapper, so turn off the MMU. */
> -	pre_disable_mmu_workaround
> -	mrs	x17, sctlr_el1
> -	bic	x17, x17, #SCTLR_ELx_M
> -	msr	sctlr_el1, x17
> -	isb
> +	mov	pte_flags, #KPTI_NG_PTE_FLAGS
> +
> +	/* Advance temp_pte to the fixmap page */
> +	add	temp_pte, flag_ptr, #PAGE_SIZE
>  
>  	/* Everybody is enjoying the idmap, so we can rewrite swapper. */
>  	/* PGD */
> -	mov		cur_pgdp, swapper_pa
> +	adrp		cur_pgdp, swapper_pg_dir
> +	kpti_map_pgtbl	pgd, 0
>  	kpti_mk_tbl_ng	pgd, PTRS_PER_PGD
>  
> -	/* Publish the updated tables and nuke all the TLBs */
> -	dsb	sy
> -	tlbi	vmalle1is
> -	dsb	ish
> -	isb
> -
> -	/* We're done: fire up the MMU again */
> -	mrs	x17, sctlr_el1
> -	orr	x17, x17, #SCTLR_ELx_M
> -	set_sctlr_el1	x17
> -
>  	/* Set the flag to zero to indicate that we're all done */
>  	str	wzr, [flag_ptr]
> +
> +	/* We're done: fire up swapper again */
> +	__idmap_cpu_set_reserved_ttbr1 x8, x9
> +	msr	ttbr1_el1, swapper_ttb
> +	isb
>  	ret
>  
>  .Lderef_pgd:
> @@ -291,6 +314,7 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
>  	.if		CONFIG_PGTABLE_LEVELS > 3
>  	pud		.req	x10
>  	pte_to_phys	cur_pudp, pgd
> +	kpti_map_pgtbl	pud, 1
>  	kpti_mk_tbl_ng	pud, PTRS_PER_PUD
>  	b		.Lnext_pgd
>  	.else		/* CONFIG_PGTABLE_LEVELS <= 3 */
> @@ -303,6 +327,7 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
>  	.if		CONFIG_PGTABLE_LEVELS > 2
>  	pmd		.req	x13
>  	pte_to_phys	cur_pmdp, pud
> +	kpti_map_pgtbl	pmd, 2
>  	kpti_mk_tbl_ng	pmd, PTRS_PER_PMD
>  	b		.Lnext_pud
>  	.else		/* CONFIG_PGTABLE_LEVELS <= 2 */
> @@ -313,12 +338,14 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
>  .Lderef_pmd:
>  	/* PTE */
>  	pte_to_phys	cur_ptep, pmd
> +	kpti_map_pgtbl	pte, 3
>  	kpti_mk_tbl_ng	pte, PTRS_PER_PTE
>  	b		.Lnext_pmd
>  
>  	.unreq	cpu
> +	.unreq	pte_flags
>  	.unreq	num_cpus
> -	.unreq	swapper_pa
> +	.unreq	temp_pgd_phys
>  	.unreq	cur_pgdp
>  	.unreq	end_pgdp
>  	.unreq	pgd
> @@ -331,12 +358,10 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
>  	.unreq	cur_ptep
>  	.unreq	end_ptep
>  	.unreq	pte
> +	.unreq	valid
>  
>  	/* Secondary CPUs end up here */
>  __idmap_kpti_secondary:
> -	/* Uninstall swapper before surgery begins */
> -	__idmap_cpu_set_reserved_ttbr1 x16, x17
> -
>  	/* Increment the flag to let the boot CPU we're ready */
>  1:	ldxr	w16, [flag_ptr]
>  	add	w16, w16, #1
> @@ -350,7 +375,7 @@ __idmap_kpti_secondary:
>  	cbnz	w16, 1b
>  
>  	/* All done, act like nothing happened */
> -	offset_ttbr1 swapper_ttb, x16
> +	__idmap_cpu_set_reserved_ttbr1 x8, x9
>  	msr	ttbr1_el1, swapper_ttb
>  	isb
>  	ret
> -- 
> 2.30.2
> 


* Re: [PATCH v3 2/2] arm64: mm: install KPTI nG mappings with MMU enabled
  2022-05-20 13:00   ` Mark Rutland
@ 2022-05-20 14:15     ` Ard Biesheuvel
  0 siblings, 0 replies; 7+ messages in thread
From: Ard Biesheuvel @ 2022-05-20 14:15 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Linux ARM, Will Deacon, Marc Zyngier, Catalin Marinas, Kees Cook

On Fri, 20 May 2022 at 15:00, Mark Rutland <mark.rutland@arm.com> wrote:
>
> Hi Ard,
>
> On Thu, Apr 21, 2022 at 04:03:39PM +0200, Ard Biesheuvel wrote:
> > In cases where we unmap the kernel while running in user space, we rely
> > on ASIDs to distinguish the minimal trampoline from the full kernel
> > mapping, and this means we must use non-global attributes for those
> > mappings, to ensure they are scoped by ASID and will not hit in the TLB
> > inadvertently.
> >
> > We only do this when needed, as this is generally more costly in terms
> > of TLB pressure, and so we boot without these non-global attributes, and
> > apply them to all existing kernel mappings once all CPUs are up and we
> > know whether or not the non-global attributes are needed. At this point,
> > we cannot simply unmap and remap the entire address space, so we have to
> > update all existing block and page descriptors in place.
> >
> > Currently, we go through a lot of trouble to perform these updates with
> > the MMU and caches off, to avoid violating break before make (BBM) rules
> > imposed by the architecture. Since we make changes to page tables that
> > are not covered by the ID map, we gain access to those descriptors by
> > disabling translations altogether. This means that the stores to memory
> > are issued with device attributes, and require extra care in terms of
> > coherency, which is costly. We also rely on the ID map to access a
> > shared flag, which requires the ID map to be executable and writable at
> > the same time, which is another thing we'd prefer to avoid.
> >
> > So let's switch to an approach where we replace the kernel mapping with
> > a minimal mapping of a few pages that can be used for the shared flag,
> > as well as a minimal, ad-hoc fixmap that we can use to map each page
> > table in turn as we traverse the hierarchy. This requires one PTE per
> > level, and an associated page worth of VA space in the temporary
> > mapping.
>
> I really like the high-level idea!
>

Thanks!

> Overall this looks good. I'm not fond of the way we synchronize between
> primary/secondary CPUs, and would prefer it if we could split that
> differently -- I've given more detail on that below (and a branch with a
> proposal).
>
> > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> > ---
> >  arch/arm64/include/asm/mmu.h   |   4 +
> >  arch/arm64/kernel/cpufeature.c |  65 +++++++++++-
> >  arch/arm64/mm/mmu.c            |   8 +-
> >  arch/arm64/mm/proc.S           | 107 ++++++++++++--------
> >  4 files changed, 134 insertions(+), 50 deletions(-)
> >
> > diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
> > index 48f8466a4be9..b896f0ac4985 100644
> > --- a/arch/arm64/include/asm/mmu.h
> > +++ b/arch/arm64/include/asm/mmu.h
> > @@ -68,6 +68,10 @@ extern void init_mem_pgprot(void);
> >  extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
> >                              unsigned long virt, phys_addr_t size,
> >                              pgprot_t prot, bool page_mappings_only);
> > +extern void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
> > +                              unsigned long virt, phys_addr_t size,
> > +                              pgprot_t prot,
> > +                              phys_addr_t (*pgtable_alloc)(int), int flags);
> >  extern void *fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot);
> >  extern void mark_linear_text_alias_ro(void);
> >  extern bool kaslr_requires_kpti(void);
> > diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
> > index d72c4b4d389c..f0688e812e19 100644
> > --- a/arch/arm64/kernel/cpufeature.c
> > +++ b/arch/arm64/kernel/cpufeature.c
> > @@ -1596,14 +1596,31 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry,
> >  }
> >
> >  #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
> > +#define KPTI_NG_TEMP_VA              (-(1UL << PMD_SHIFT))
> > +
> > +static phys_addr_t kpti_ng_temp_alloc;
> > +
> > +static phys_addr_t kpti_ng_pgd_alloc(int shift)
> > +{
> > +     kpti_ng_temp_alloc -= PAGE_SIZE;
> > +     return kpti_ng_temp_alloc;
> > +}
> > +
> >  static void __nocfi
> >  kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused)
> >  {
> > -     typedef void (kpti_remap_fn)(int, int, phys_addr_t);
> > +     static atomic_t flag = ATOMIC_INIT(0);
> > +     static pgd_t *kpti_ng_temp_pgd;
> > +     static u64 alloc;
> > +
> > +     typedef void (kpti_remap_fn)(int, int, phys_addr_t, unsigned long);
> >       extern kpti_remap_fn idmap_kpti_install_ng_mappings;
> >       kpti_remap_fn *remap_fn;
> >
> > -     int cpu = smp_processor_id();
> > +     int levels = CONFIG_PGTABLE_LEVELS;
> > +     int order = order_base_2(levels + 1);
> > +     int num_cpus = num_online_cpus();
> > +     int primary = 0;
> >
> >       if (__this_cpu_read(this_cpu_vector) == vectors) {
> >               const char *v = arm64_get_bp_hardening_vector(EL1_VECTOR_KPTI);
> > @@ -1619,14 +1636,54 @@ kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused)
> >       if (arm64_use_ng_mappings)
> >               return;
> >
> > +     // First CPU to arrive here gets the job
> > +     if (atomic_inc_return(&flag) == 1) {
>
> I'm not too keen on the way this is split, because (aside from the flag), the
> secondaries have no need to have anything mapped in TTBR1. If we leave the flag
> in the idmap, and have the secondaries all use the reserved TTBR1 page, we can
> consistently have the boot CPU do all the hard work (and we could avoid
> broadcast maintenance/barriers during bulk of the rewrite, which could make it
> faster).
>
> I had a go at refactoring things that way (with this rebased atop), at:
>
>   https://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git/log/?h=arm64/kpti/rework
>
>
>
> The only downside is that we have to keep the idmap RWX, but that was already
> the case (and we could rework the idmap to have separate ROX/RW code/data pages
> in future).
>

This was kind of the point - I stumbled onto this while working on my
WXN changes, which broke this code.

> I can post that to the list if you don't have major objections to that
> approach.
>

Not at all. We can revisit the ID map RWX issue later, but I'm eager
to get that fixed at some point.
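
Coming back to the synchronization: roughly, the handshake in this patch
works like this (a sketch of my reading, not the literal code; N is
num_online_cpus(), and the hypothetical setup_temp_pgtables() stands in
for the __create_pgd_mapping() call):

	/* C side, executed by each of the N CPUs under stop_machine() */
	if (atomic_inc_return(&flag) == 1) {
		setup_temp_pgtables();		  /* first CPU does the setup */
		atomic_inc_return_release(&flag); /* extra increment */
	} else {
		/* flag == N + 1 only once setup is done and all N arrived */
		while (atomic_read(&flag) <= num_cpus)
			cpu_relax();
	}

	/*
	 * asm side, using the zero-initialized word in the temporary
	 * mapping: each secondary increments it and then spins until it
	 * reads zero again; the boot CPU waits for it to reach N - 1,
	 * rewrites swapper, and stores zero to release the secondaries.
	 */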

> > +             alloc =  __get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
> > +             kpti_ng_temp_pgd = (pgd_t *)(alloc + levels * PAGE_SIZE);
> > +             kpti_ng_temp_alloc = __pa(kpti_ng_temp_pgd);
> > +             primary = 1;
> > +
> > +             //
> > +             // Create a minimal page table hierarchy that permits us to
> > +             // store a shared variable that secondaries will poll, and to
> > +             // map the swapper page tables temporarily as we traverse them.
> > +             //
> > +             // The physical pages are laid out as follows:
> > +             //
> > +             // +---------+--------+-/-------+-/------ +-\\--------+
> > +             // :  data   :  PTE[] : | PMD[] : | PUD[] : || PGD[]  :
> > +             // +---------+--------+-\-------+-\------ +-//--------+
> > +             //      ^         ^
> > +             // The first two pages are mapped consecutively into this
> > +             // hierarchy at a PMD_SHIFT aligned virtual address, so that we
> > +             // have a place to store the shared variable, and so that we
> > +             // can manipulate the PTE level entries while the mapping is
> > +             // active.  The first two entries cover the data page and the
> > +             // PTE[] page itself; the remaining entries are free to be used
> > +             // as an ad-hoc fixmap.
> > +             //
> > +             __create_pgd_mapping(kpti_ng_temp_pgd, __pa(alloc),
> > +                                  KPTI_NG_TEMP_VA, 2 * PAGE_SIZE,
> > +                                  PAGE_KERNEL, kpti_ng_pgd_alloc, 0);
>
> It took me a while to realise this was relying on the order in which
> __create_pgd_mapping() calls kpti_ng_pgd_alloc(). I had a look at building the
> tables explicitly to make this clearer, but that didn't seem all that much clearer,
> so this is fine as-is.
>

Yeah. On the one hand, it is kind of dodgy to rely on this, but on the
other hand, it is hard to imagine how changes to the page table
routines would result in these pages being allocated in a different
order.

We could add a comment, I suppose.
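
Something along these lines, perhaps (a sketch, assuming
CONFIG_PGTABLE_LEVELS == 4, so order_base_2(5) == 3 and the allocation
is eight pages, of which five are used):

	/*
	 * alloc + 4 * PAGE_SIZE : PGD[]  (kpti_ng_temp_pgd, chosen up front)
	 * alloc + 3 * PAGE_SIZE : PUD[]  (1st kpti_ng_pgd_alloc() call)
	 * alloc + 2 * PAGE_SIZE : PMD[]  (2nd call)
	 * alloc + 1 * PAGE_SIZE : PTE[]  (3rd call)
	 * alloc + 0 * PAGE_SIZE : data page holding the shared flag
	 *
	 * The bump allocator hands out pages downwards from __pa(PGD), so
	 * as long as __create_pgd_mapping() allocates one table per level,
	 * top down, the PTE[] page lands at alloc + PAGE_SIZE, i.e. inside
	 * the 2-page range being mapped at KPTI_NG_TEMP_VA.
	 */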

> > +
> > +             // Increment flag again to signal other CPUs to proceed as well
> > +             atomic_inc_return_release(&flag);
> > +     } else {
> > +             // Wait for the primary CPU to set up the temporary page tables
> > +             while (atomic_read(&flag) <= num_cpus)
> > +                     cpu_relax();
> > +     }
> >       remap_fn = (void *)__pa_symbol(function_nocfi(idmap_kpti_install_ng_mappings));
> >
> >       cpu_install_idmap();
> > -     remap_fn(cpu, num_online_cpus(), __pa_symbol(swapper_pg_dir));
> > +     remap_fn(!primary, num_cpus - 1, __pa(kpti_ng_temp_pgd), KPTI_NG_TEMP_VA);
> >       cpu_uninstall_idmap();
> >
> > -     if (!cpu)
> > +     // Last CPU to leave frees the pages
> > +     if (atomic_dec_return(&flag) == 1) {
> > +             free_pages(alloc, order);
> >               arm64_use_ng_mappings = true;
> > +     }
> >  }
> >  #else
> >  static void
> > diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> > index 626ec32873c6..1c7299dfaa84 100644
> > --- a/arch/arm64/mm/mmu.c
> > +++ b/arch/arm64/mm/mmu.c
> > @@ -360,11 +360,9 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
> >               mutex_unlock(&fixmap_lock);
> >  }
> >
> > -static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
> > -                              unsigned long virt, phys_addr_t size,
> > -                              pgprot_t prot,
> > -                              phys_addr_t (*pgtable_alloc)(int),
> > -                              int flags)
> > +void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
> > +                       phys_addr_t size, pgprot_t prot,
> > +                       phys_addr_t (*pgtable_alloc)(int), int flags)
> >  {
> >       unsigned long addr, end, next;
> >       pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
> > diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
> > index 5619c00f8cd4..20d726207db5 100644
> > --- a/arch/arm64/mm/proc.S
> > +++ b/arch/arm64/mm/proc.S
> > @@ -14,6 +14,7 @@
> >  #include <asm/asm-offsets.h>
> >  #include <asm/asm_pointer_auth.h>
> >  #include <asm/hwcap.h>
> > +#include <asm/kernel-pgtable.h>
> >  #include <asm/pgtable-hwdef.h>
> >  #include <asm/cpufeature.h>
> >  #include <asm/alternative.h>
> > @@ -200,20 +201,21 @@ SYM_FUNC_END(idmap_cpu_replace_ttbr1)
> >       .popsection
> >
> >  #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
> > +
> > +#define KPTI_NG_PTE_FLAGS    (PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS)
> > +
> >       .pushsection ".idmap.text", "awx"
> >
> >       .macro  kpti_mk_tbl_ng, type, num_entries
> >       add     end_\type\()p, cur_\type\()p, #\num_entries * 8
> >  .Ldo_\type:
> > -     dc      cvac, cur_\()\type\()p          // Ensure any existing dirty
> > -     dmb     sy                              // lines are written back before
> > -     ldr     \type, [cur_\type\()p], #8      // loading the entry
> > -     tbz     \type, #0, .Lnext_\type         // Skip invalid and
> > -     tbnz    \type, #11, .Lnext_\type        // non-global entries
> > -     orr     \type, \type, #PTE_NG           // Same bit for blocks and pages
> > -     str     \type, [cur_\type\()p, #-8]     // Update the entry and ensure
> > -     dmb     sy                              // that it is visible to all
> > -     dc      civac, cur_\()\type\()p         // CPUs.
> > +     ldr     \type, [cur_\type\()p], #8      // Load the entry
> > +     .ifnc   \type, pte
> > +     tbnz    \type, #11, .Lnext_\type        // Skip visited entries
> > +     .endif
> > +     and     valid, \type, #1
> > +     orr     \type, \type, valid, lsl #11    // nG |= valid
> > +     str     \type, [cur_\type\()p, #-8]     // Update the entry
>
> IMO it was clearer to skip invalid entries than to extract the valid bit value
> and ORR that back into the nG position. Was that change intended as an
> optimization?
>

More or less. Due to some misinterpreted performance numbers, I ended
up during development with a SIMD version for level 3 that updated 8
entries at a time, and so ORRing the valid bits into position #11 was
necessary.

I kept it because the bulk of the work (and therefore the reason for
the huge delay) is the linear map being mapped down to pages, and so
virtually all entries are valid. Omitting the conditional branch is
unlikely to make a measurable difference in performance, though, so we
can reinstate it if you prefer.
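
For reference, the AND/ORR pair boils down to this C helper
(illustrative only):

	static inline u64 pte_set_ng_if_valid(u64 desc)
	{
		/* bit 0 is the valid bit; bit 11 is nG for blocks and pages */
		return desc | ((desc & 1) << 11);
	}

Invalid descriptors have bit 0 clear and are written back unchanged,
which is what makes dropping the conditional branch safe.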

> >       .ifnc   \type, pte
> >       tbnz    \type, #1, .Lderef_\type
> >       .endif
>
> As with the last patch, I'd prefer it if we had an explicit ADD for the
> increment in `.Lnext_\type`. Even with the CMOs removed, I reckon that's clearer.
>

Fine with me.

> > @@ -222,19 +224,42 @@ SYM_FUNC_END(idmap_cpu_replace_ttbr1)
> >       b.ne    .Ldo_\type
> >       .endm
> >
> > +     /*
> > +      * Dereference the current table entry and map it into the temporary
> > +      * fixmap slot associated with the current level. The ad-hoc fixmap
> > +      * is a set of PTEs that are located above the PTEs that cover the
> > +      * level 3 page table and the scratch page that precedes it.
> > +      */
> > +     .macro  kpti_map_pgtbl, type, level
> > +     str     xzr, [temp_pte, #8 * (\level + 2)]      // break before make
> > +     dsb     ishst
> > +     add     pte, flag_ptr, #PAGE_SIZE * (\level + 2)
> > +     lsr     pte, pte, #12
> > +     tlbi    vaae1is, pte
> > +     dsb     ish
> > +     isb
> > +
> > +     phys_to_pte pte, cur_\type\()p
> > +     add     cur_\type\()p, flag_ptr, #PAGE_SIZE * (\level + 2)
> > +     orr     pte, pte, pte_flags
> > +     str     pte, [temp_pte, #8 * (\level + 2)]
> > +     dsb     ishst
> > +     .endm
>
> If we only had the boot CPU map the fixmap TTBR, we could reduce the scope of
> barriers and TLB maintenance here to be nSH, and removing the broadcast
> maintenance could be a big win. We'd just need a final DSB ISHST prior to
> waking secondaries to ensure they see all the updates.
>
> Maybe that's minuscule compared to keeping the MMU on, though.
>

To be honest, the only observable or measurable difference on TX2 is
keeping the MMU enabled. Everything else was in the noise.
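
For anyone reading along, what the macro does per fixmap slot, rendered
as C (a paraphrase using kernel-style barrier/TLBI helpers, not actual
kernel code; temp_va is KPTI_NG_TEMP_VA, and slots 0/1 cover the flag
and PTE[] pages):

	static void kpti_remap_slot(pte_t *temp_pte, u64 temp_va, int level,
				    phys_addr_t table_pa, pteval_t pte_flags)
	{
		int slot = level + 2;
		u64 va = temp_va + slot * PAGE_SIZE;

		/* break: clear the old entry and zap its TLB entry by VA */
		WRITE_ONCE(temp_pte[slot], __pte(0));
		dsb(ishst);
		__tlbi(vaae1is, va >> 12);	/* operand carries VA[55:12] */
		dsb(ish);
		isb();

		/* make: map the next page table to walk through this slot */
		WRITE_ONCE(temp_pte[slot], __pte(table_pa | pte_flags));
		dsb(ishst);
	}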


> > +
> >  /*
> > - * void __kpti_install_ng_mappings(int cpu, int num_cpus, phys_addr_t swapper)
> > + * void __kpti_install_ng_mappings(int cpu, int num_secondaries, phys_addr_t temp_pgd,
> > + *                              unsigned long kpti_ng_temp_va)
> >   *
> >   * Called exactly once from stop_machine context by each CPU found during boot.
> >   */
> > -__idmap_kpti_flag:
> > -     .long   1
> >  SYM_FUNC_START(idmap_kpti_install_ng_mappings)
> > -     cpu             .req    w0
> > +     cpu             .req    w0      // at entry
> > +     pte_flags       .req    x0
> >       num_cpus        .req    w1
> > -     swapper_pa      .req    x2
> > -     swapper_ttb     .req    x3
> > -     flag_ptr        .req    x4
> > +     temp_pgd_phys   .req    x2      // at entry
> > +     temp_pte        .req    x2
> > +     flag_ptr        .req    x3
> > +     swapper_ttb     .req    x4
> >       cur_pgdp        .req    x5
> >       end_pgdp        .req    x6
> >       pgd             .req    x7
> > @@ -245,10 +270,15 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
> >       cur_ptep        .req    x14
> >       end_ptep        .req    x15
> >       pte             .req    x16
> > +     valid           .req    x17
> >
> >       mrs     swapper_ttb, ttbr1_el1
> > -     restore_ttbr1   swapper_ttb
> > -     adr     flag_ptr, __idmap_kpti_flag
> > +
> > +     /* Uninstall swapper before surgery begins */
> > +     __idmap_cpu_set_reserved_ttbr1 x8, x9
> > +     offset_ttbr1 temp_pgd_phys, x8
> > +     msr     ttbr1_el1, temp_pgd_phys
> > +     isb
> >
> >       cbnz    cpu, __idmap_kpti_secondary
> >
> > @@ -259,31 +289,24 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
> >       eor     w17, w17, num_cpus
> >       cbnz    w17, 1b
> >
> > -     /* We need to walk swapper, so turn off the MMU. */
> > -     pre_disable_mmu_workaround
> > -     mrs     x17, sctlr_el1
> > -     bic     x17, x17, #SCTLR_ELx_M
> > -     msr     sctlr_el1, x17
> > -     isb
> > +     mov     pte_flags, #KPTI_NG_PTE_FLAGS
> > +
> > +     /* Advance temp_pte to the fixmap page */
> > +     add     temp_pte, flag_ptr, #PAGE_SIZE
> >
> >       /* Everybody is enjoying the idmap, so we can rewrite swapper. */
> >       /* PGD */
> > -     mov             cur_pgdp, swapper_pa
> > +     adrp            cur_pgdp, swapper_pg_dir
> > +     kpti_map_pgtbl  pgd, 0
> >       kpti_mk_tbl_ng  pgd, PTRS_PER_PGD
> >
> > -     /* Publish the updated tables and nuke all the TLBs */
> > -     dsb     sy
> > -     tlbi    vmalle1is
> > -     dsb     ish
> > -     isb
> > -
> > -     /* We're done: fire up the MMU again */
> > -     mrs     x17, sctlr_el1
> > -     orr     x17, x17, #SCTLR_ELx_M
> > -     set_sctlr_el1   x17
> > -
> >       /* Set the flag to zero to indicate that we're all done */
> >       str     wzr, [flag_ptr]
> > +
> > +     /* We're done: fire up swapper again */
> > +     __idmap_cpu_set_reserved_ttbr1 x8, x9
> > +     msr     ttbr1_el1, swapper_ttb
> > +     isb
> >       ret
> >
> >  .Lderef_pgd:
> > @@ -291,6 +314,7 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
> >       .if             CONFIG_PGTABLE_LEVELS > 3
> >       pud             .req    x10
> >       pte_to_phys     cur_pudp, pgd
> > +     kpti_map_pgtbl  pud, 1
> >       kpti_mk_tbl_ng  pud, PTRS_PER_PUD
> >       b               .Lnext_pgd
> >       .else           /* CONFIG_PGTABLE_LEVELS <= 3 */
> > @@ -303,6 +327,7 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
> >       .if             CONFIG_PGTABLE_LEVELS > 2
> >       pmd             .req    x13
> >       pte_to_phys     cur_pmdp, pud
> > +     kpti_map_pgtbl  pmd, 2
> >       kpti_mk_tbl_ng  pmd, PTRS_PER_PMD
> >       b               .Lnext_pud
> >       .else           /* CONFIG_PGTABLE_LEVELS <= 2 */
> > @@ -313,12 +338,14 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
> >  .Lderef_pmd:
> >       /* PTE */
> >       pte_to_phys     cur_ptep, pmd
> > +     kpti_map_pgtbl  pte, 3
> >       kpti_mk_tbl_ng  pte, PTRS_PER_PTE
> >       b               .Lnext_pmd
> >
> >       .unreq  cpu
> > +     .unreq  pte_flags
> >       .unreq  num_cpus
> > -     .unreq  swapper_pa
> > +     .unreq  temp_pgd_phys
> >       .unreq  cur_pgdp
> >       .unreq  end_pgdp
> >       .unreq  pgd
> > @@ -331,12 +358,10 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
> >       .unreq  cur_ptep
> >       .unreq  end_ptep
> >       .unreq  pte
> > +     .unreq  valid
> >
> >       /* Secondary CPUs end up here */
> >  __idmap_kpti_secondary:
> > -     /* Uninstall swapper before surgery begins */
> > -     __idmap_cpu_set_reserved_ttbr1 x16, x17
> > -
> >  	/* Increment the flag to let the boot CPU know we're ready */
> >  1:   ldxr    w16, [flag_ptr]
> >       add     w16, w16, #1
> > @@ -350,7 +375,7 @@ __idmap_kpti_secondary:
> >       cbnz    w16, 1b
> >
> >       /* All done, act like nothing happened */
> > -     offset_ttbr1 swapper_ttb, x16
> > +     __idmap_cpu_set_reserved_ttbr1 x8, x9
> >       msr     ttbr1_el1, swapper_ttb
> >       isb
> >       ret
> > --
> > 2.30.2
> >


Thread overview: 7 messages
2022-04-21 14:03 [PATCH v3 0/2] arm64: apply G-to-nG conversion for KPTI with MMU enabled Ard Biesheuvel
2022-04-21 14:03 ` [PATCH v3 1/2] arm64: kpti-ng: simplify page table traversal logic Ard Biesheuvel
2022-05-19 17:09   ` Mark Rutland
2022-05-19 18:08     ` Ard Biesheuvel
2022-04-21 14:03 ` [PATCH v3 2/2] arm64: mm: install KPTI nG mappings with MMU enabled Ard Biesheuvel
2022-05-20 13:00   ` Mark Rutland
2022-05-20 14:15     ` Ard Biesheuvel
