* [PATCH 0/2] ARM: optimize some variable accesses
@ 2022-03-01 12:04 Ard Biesheuvel
  2022-03-01 12:04 ` [PATCH 1/2] ARM: assembler: simplify ldr_this_cpu for !SMP builds Ard Biesheuvel
  2022-03-01 12:04 ` [PATCH 2/2] ARM: entry: avoid explicit literal loads Ard Biesheuvel
  0 siblings, 2 replies; 5+ messages in thread
From: Ard Biesheuvel @ 2022-03-01 12:04 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: Ard Biesheuvel, Russell King, Arnd Bergmann, Linus Walleij

Now that we have an efficient way to load variables from asm code on any
arch revision, get rid of the explicit literal loads that are no longer
needed (patch #2). Patch #1 tweaks the !SMP case of ldr_this_cpu, which
uses the same macros.

Cc: Russell King <linux@armlinux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Linus Walleij <linus.walleij@linaro.org>

Ard Biesheuvel (2):
  ARM: assembler: simplify ldr_this_cpu for !SMP builds
  ARM: entry: avoid explicit literal loads

 arch/arm/include/asm/assembler.h | 28 +++++++++------
 arch/arm/kernel/entry-armv.S     | 37 ++++----------------
 arch/arm/kernel/entry-common.S   | 10 +-----
 arch/arm/kernel/entry-header.S   |  3 +-
 4 files changed, 26 insertions(+), 52 deletions(-)

-- 
2.30.2



* [PATCH 1/2] ARM: assembler: simplify ldr_this_cpu for !SMP builds
  2022-03-01 12:04 [PATCH 0/2] ARM: optimize some variable accesses Ard Biesheuvel
@ 2022-03-01 12:04 ` Ard Biesheuvel
  2022-03-02 11:33   ` Linus Walleij
  2022-03-01 12:04 ` [PATCH 2/2] ARM: entry: avoid explicit literal loads Ard Biesheuvel
  1 sibling, 1 reply; 5+ messages in thread
From: Ard Biesheuvel @ 2022-03-01 12:04 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: Ard Biesheuvel, Russell King, Arnd Bergmann, Linus Walleij

When CONFIG_SMP is not defined, the CPU offset is always zero, and so
we can simplify the sequence to load a per-CPU variable.
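
As a rough sketch of the difference (not an exact disassembly, and
assuming the ARMv7 expansions of this_cpu_offset and mov_l), loading a
hypothetical per-CPU variable 'foo' into r0 with r1/r2 as temporaries
currently becomes

	mov	r1, #0			@ this_cpu_offset is always 0 when !SMP
	movw	r2, #:lower16:foo
	movt	r2, #:upper16:foo
	ldr	r0, [r1, r2]

whereas after this change it collapses to a plain ldr_va, reusing the
first temporary as the scratch register:

	movw	r1, #:lower16:foo
	movt	r1, #:upper16:foo
	ldr	r0, [r1]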

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm/include/asm/assembler.h | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index 30c1f8c8b178..96f4028f7423 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -682,8 +682,12 @@ THUMB(	orr	\reg , \reg , #PSR_T_BIT	)
 	/*
 	 * ldr_va - load a 32-bit word from the virtual address of \sym
 	 */
-	.macro		ldr_va, rd:req, sym:req, cond
+	.macro		ldr_va, rd:req, sym:req, cond, tmp
+	.ifnb		\tmp
+	__ldst_va	ldr, \rd, \tmp, \sym, \cond
+	.else
 	__ldst_va	ldr, \rd, \rd, \sym, \cond
+	.endif
 	.endm
 
 	/*
@@ -717,9 +721,11 @@ THUMB(	orr	\reg , \reg , #PSR_T_BIT	)
 	 *		  are permitted to overlap with 'rd' if != sp
 	 */
 	.macro		ldr_this_cpu, rd:req, sym:req, t1:req, t2:req
-#if __LINUX_ARM_ARCH__ >= 7 || \
-    !defined(CONFIG_ARM_HAS_GROUP_RELOCS) || \
-    (defined(MODULE) && defined(CONFIG_ARM_MODULE_PLTS))
+#ifndef CONFIG_SMP
+	ldr_va		\rd, \sym, tmp=\t1
+#elif __LINUX_ARM_ARCH__ >= 7 || \
+      !defined(CONFIG_ARM_HAS_GROUP_RELOCS) || \
+      (defined(MODULE) && defined(CONFIG_ARM_MODULE_PLTS))
 	this_cpu_offset	\t1
 	mov_l		\t2, \sym
 	ldr		\rd, [\t1, \t2]
-- 
2.30.2



* [PATCH 2/2] ARM: entry: avoid explicit literal loads
  2022-03-01 12:04 [PATCH 0/2] ARM: optimize some variable accesses Ard Biesheuvel
  2022-03-01 12:04 ` [PATCH 1/2] ARM: assembler: simplify ldr_this_cpu for !SMP builds Ard Biesheuvel
@ 2022-03-01 12:04 ` Ard Biesheuvel
  2022-03-02 11:42   ` Linus Walleij
  1 sibling, 1 reply; 5+ messages in thread
From: Ard Biesheuvel @ 2022-03-01 12:04 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: Ard Biesheuvel, Russell King, Arnd Bergmann, Linus Walleij

ARMv7 has MOVW/MOVT instruction pairs to load symbol addresses into
registers without having to rely on literal loads that go via the
D-cache.  For older cores, we now support a similar arrangement, based
on PC-relative group relocations.

This means we can elide most literal loads entirely from the entry path,
by switching to the ldr_va macro to emit the appropriate sequence
depending on the target architecture revision.

While at it, switch to the bl_r macro for invoking the right PABT/DABT
helpers instead of setting the LR register explicitly, which does not
play well with cores that speculate across function returns.
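
For illustration, a use such as the one in pabt_helper,

	ldr_va	ip, processor, offset=PROCESSOR_PABT_FUNC

expands to roughly one of the following, depending on the configuration
(a sketch only; in the group relocation case the immediates are fixed up
at link time):

  @ ARMv7 and later: MOVW/MOVT pair via mov_l, no literal pool entry
	movw	ip, #:lower16:processor
	movt	ip, #:upper16:processor
	ldr	ip, [ip, #PROCESSOR_PABT_FUNC]

  @ pre-v7 with group relocation support: PC-relative SUBs + LDR
	sub	ip, pc, #8 - PROCESSOR_PABT_FUNC	@ R_ARM_ALU_PC_G0_NC
	sub	ip, ip, #4 - PROCESSOR_PABT_FUNC	@ R_ARM_ALU_PC_G1_NC
	ldr	ip, [ip, #PROCESSOR_PABT_FUNC]		@ R_ARM_LDR_PC_G2

The entry-armv.S hunks below show the corresponding call site change,
where the 'mov lr, pc; ldr pc, [ip, #...]' pattern is replaced by
'ldr_va ip, processor, offset=...' followed by 'bl_r ip'.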

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/arm/include/asm/assembler.h | 18 +++++-----
 arch/arm/kernel/entry-armv.S     | 37 ++++----------------
 arch/arm/kernel/entry-common.S   | 10 +-----
 arch/arm/kernel/entry-header.S   |  3 +-
 4 files changed, 18 insertions(+), 50 deletions(-)

diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index 96f4028f7423..3a76241d880f 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -656,12 +656,11 @@ THUMB(	orr	\reg , \reg , #PSR_T_BIT	)
 	__adldst_l	str, \src, \sym, \tmp, \cond
 	.endm
 
-	.macro		__ldst_va, op, reg, tmp, sym, cond
+	.macro		__ldst_va, op, reg, tmp, sym, cond, offset
 #if __LINUX_ARM_ARCH__ >= 7 || \
     !defined(CONFIG_ARM_HAS_GROUP_RELOCS) || \
     (defined(MODULE) && defined(CONFIG_ARM_MODULE_PLTS))
 	mov_l		\tmp, \sym, \cond
-	\op\cond	\reg, [\tmp]
 #else
 	/*
 	 * Avoid a literal load, by emitting a sequence of ADD/LDR instructions
@@ -673,20 +672,21 @@ THUMB(	orr	\reg , \reg , #PSR_T_BIT	)
 	.reloc		.L0_\@, R_ARM_ALU_PC_G0_NC, \sym
 	.reloc		.L1_\@, R_ARM_ALU_PC_G1_NC, \sym
 	.reloc		.L2_\@, R_ARM_LDR_PC_G2, \sym
-.L0_\@: sub\cond	\tmp, pc, #8
-.L1_\@: sub\cond	\tmp, \tmp, #4
-.L2_\@: \op\cond	\reg, [\tmp, #0]
+.L0_\@: sub\cond	\tmp, pc, #8 - \offset
+.L1_\@: sub\cond	\tmp, \tmp, #4 - \offset
+.L2_\@:
 #endif
+	\op\cond	\reg, [\tmp, #\offset]
 	.endm
 
 	/*
 	 * ldr_va - load a 32-bit word from the virtual address of \sym
 	 */
-	.macro		ldr_va, rd:req, sym:req, cond, tmp
+	.macro		ldr_va, rd:req, sym:req, cond, tmp, offset=0
 	.ifnb		\tmp
-	__ldst_va	ldr, \rd, \tmp, \sym, \cond
+	__ldst_va	ldr, \rd, \tmp, \sym, \cond, \offset
 	.else
-	__ldst_va	ldr, \rd, \rd, \sym, \cond
+	__ldst_va	ldr, \rd, \rd, \sym, \cond, \offset
 	.endif
 	.endm
 
@@ -694,7 +694,7 @@ THUMB(	orr	\reg , \reg , #PSR_T_BIT	)
 	 * str_va - store a 32-bit word to the virtual address of \sym
 	 */
 	.macro		str_va, rn:req, sym:req, tmp:req, cond
-	__ldst_va	str, \rn, \tmp, \sym, \cond
+	__ldst_va	str, \rn, \tmp, \sym, \cond, 0
 	.endm
 
 	/*
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index 5609ca8ae46a..c88a1b5c0ca5 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -90,9 +90,8 @@ UNWIND(	.setfp	fpreg, sp		)
 	.macro	pabt_helper
 	@ PABORT handler takes pt_regs in r2, fault address in r4 and psr in r5
 #ifdef MULTI_PABORT
-	ldr	ip, .LCprocfns
-	mov	lr, pc
-	ldr	pc, [ip, #PROCESSOR_PABT_FUNC]
+	ldr_va	ip, processor, offset=PROCESSOR_PABT_FUNC
+	bl_r	ip
 #else
 	bl	CPU_PABORT_HANDLER
 #endif
@@ -111,9 +110,8 @@ UNWIND(	.setfp	fpreg, sp		)
 	@ the fault status register in r1.  r9 must be preserved.
 	@
 #ifdef MULTI_DABORT
-	ldr	ip, .LCprocfns
-	mov	lr, pc
-	ldr	pc, [ip, #PROCESSOR_DABT_FUNC]
+	ldr_va	ip, processor, offset=PROCESSOR_DABT_FUNC
+	bl_r	ip
 #else
 	bl	CPU_DABORT_HANDLER
 #endif
@@ -331,16 +329,6 @@ __fiq_svc:
  UNWIND(.fnend		)
 ENDPROC(__fiq_svc)
 
-	.align	5
-.LCcralign:
-	.word	cr_alignment
-#ifdef MULTI_DABORT
-.LCprocfns:
-	.word	processor
-#endif
-.LCfp:
-	.word	fp_enter
-
 /*
  * Abort mode handlers
  */
@@ -399,7 +387,7 @@ ENDPROC(__fiq_abt)
  THUMB(	stmia	sp, {r0 - r12}	)
 
  ATRAP(	mrc	p15, 0, r7, c1, c0, 0)
- ATRAP(	ldr	r8, .LCcralign)
+ ATRAP(	ldr_va	r8, cr_alignment)
 
 	ldmia	r0, {r3 - r5}
 	add	r0, sp, #S_PC		@ here for interlock avoidance
@@ -408,8 +396,6 @@ ENDPROC(__fiq_abt)
 	str	r3, [sp]		@ save the "real" r0 copied
 					@ from the exception stack
 
- ATRAP(	ldr	r8, [r8, #0])
-
 	@
 	@ We are now ready to fill in the remaining blanks on the stack:
 	@
@@ -534,9 +520,7 @@ __und_usr_thumb:
  */
 #if __LINUX_ARM_ARCH__ < 7
 /* If the target CPU may not be Thumb-2-capable, a run-time check is needed: */
-#define NEED_CPU_ARCHITECTURE
-	ldr	r5, .LCcpu_architecture
-	ldr	r5, [r5]
+	ldr_va	r5, cpu_architecture
 	cmp	r5, #CPU_ARCH_ARMv7
 	blo	__und_usr_fault_16		@ 16bit undefined instruction
 /*
@@ -683,12 +667,6 @@ call_fpe:
 	ret.w	lr				@ CP#14 (Debug)
 	ret.w	lr				@ CP#15 (Control)
 
-#ifdef NEED_CPU_ARCHITECTURE
-	.align	2
-.LCcpu_architecture:
-	.word	__cpu_architecture
-#endif
-
 #ifdef CONFIG_NEON
 	.align	6
 
@@ -714,9 +692,8 @@ call_fpe:
 #endif
 
 do_fpe:
-	ldr	r4, .LCfp
 	add	r10, r10, #TI_FPSTATE		@ r10 = workspace
-	ldr	pc, [r4]			@ Call FP module USR entry point
+	ldr_va	pc, fp_enter, tmp=r4		@ Call FP module USR entry point
 
 /*
  * The FP module is called with these registers set:
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index c928d6b04cce..f48ef2378d9b 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -174,7 +174,7 @@ ENTRY(vector_swi)
 #endif
 	reload_current r10, ip
 	zero_fp
-	alignment_trap r10, ip, __cr_alignment
+	alignment_trap r10, ip, cr_alignment
 	asm_trace_hardirqs_on save=0
 	enable_irq_notrace
 	ct_user_exit save=0
@@ -304,14 +304,6 @@ __sys_trace_return:
 	bl	syscall_trace_exit
 	b	ret_slow_syscall
 
-	.align	5
-#ifdef CONFIG_ALIGNMENT_TRAP
-	.type	__cr_alignment, #object
-__cr_alignment:
-	.word	cr_alignment
-#endif
-	.ltorg
-
 	.macro	syscall_table_start, sym
 	.equ	__sys_nr, 0
 	.type	\sym, #object
diff --git a/arch/arm/kernel/entry-header.S b/arch/arm/kernel/entry-header.S
index 9a1dc142f782..5865621bf691 100644
--- a/arch/arm/kernel/entry-header.S
+++ b/arch/arm/kernel/entry-header.S
@@ -48,8 +48,7 @@
 	.macro	alignment_trap, rtmp1, rtmp2, label
 #ifdef CONFIG_ALIGNMENT_TRAP
 	mrc	p15, 0, \rtmp2, c1, c0, 0
-	ldr	\rtmp1, \label
-	ldr	\rtmp1, [\rtmp1]
+	ldr_va	\rtmp1, \label
 	teq	\rtmp1, \rtmp2
 	mcrne	p15, 0, \rtmp1, c1, c0, 0
 #endif
-- 
2.30.2



* Re: [PATCH 1/2] ARM: assembler: simplify ldr_this_cpu for !SMP builds
  2022-03-01 12:04 ` [PATCH 1/2] ARM: assembler: simplify ldr_this_cpu for !SMP builds Ard Biesheuvel
@ 2022-03-02 11:33   ` Linus Walleij
  0 siblings, 0 replies; 5+ messages in thread
From: Linus Walleij @ 2022-03-02 11:33 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: linux-arm-kernel, Russell King, Arnd Bergmann

On Tue, Mar 1, 2022 at 1:04 PM Ard Biesheuvel <ardb@kernel.org> wrote:

> When CONFIG_SMP is not defined, the CPU offset is always zero, and so
> we can simplify the sequence to load a per-CPU variable.
>
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>

Maybe mention that you add an optional tmp parameter to
ldr_va as well?

>         /*
>          * ldr_va - load a 32-bit word from the virtual address of \sym
>          */
> -       .macro          ldr_va, rd:req, sym:req, cond
> +       .macro          ldr_va, rd:req, sym:req, cond, tmp

The text above this macro could explain what cond and tmp are for; it
took me some time to figure it out.
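
Something like this (just a sketch of what that comment could say, using
the parameter names from the patch) would already go a long way:

	/*
	 * ldr_va - load a 32-bit word from the virtual address of \sym
	 *
	 * @rd:   destination register
	 * @sym:  symbol whose value is loaded
	 * @cond: optional condition code suffix for the emitted instructions
	 * @tmp:  optional scratch register used to hold the address; if
	 *        omitted, \rd doubles as the temporary
	 */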

Anyway, it's not like this code was simple to read in the first place, so:
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>

Yours,
Linus Walleij


* Re: [PATCH 2/2] ARM: entry: avoid explicit literal loads
  2022-03-01 12:04 ` [PATCH 2/2] ARM: entry: avoid explicit literal loads Ard Biesheuvel
@ 2022-03-02 11:42   ` Linus Walleij
  0 siblings, 0 replies; 5+ messages in thread
From: Linus Walleij @ 2022-03-02 11:42 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: linux-arm-kernel, Russell King, Arnd Bergmann

On Tue, Mar 1, 2022 at 1:04 PM Ard Biesheuvel <ardb@kernel.org> wrote:

> ARMv7 has MOVW/MOVT instruction pairs to load symbol addresses into
> registers without having to rely on literal loads that go via the
> D-cache.  For older cores, we now support a similar arrangement, based
> on PC-relative group relocations.
>
> This means we can elide most literal loads entirely from the entry path,
> by switching to the ldr_va macro to emit the appropriate sequence
> depending on the target architecture revision.
>
> While at it, switch to the bl_r macro for invoking the right PABT/DABT
> helpers instead of setting the LR register explicitly, which does not
> play well with cores that speculate across function returns.
>
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>

After reading the previous patch and figuring out ldr_va, this was
easier to understand. I guess I'd ultimately like some docs around
ldr_va (I only think the top-level macros warrant docs, not the
__prefixed inner ones), though offset is quite intuitive. Anyway, the
patch makes the kernel a much better place and it's a real beauty, so:
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>

Yours,
Linus Walleij
