All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] arm64/xor: use EOR3 instructions when available
@ 2021-12-13 14:02 Ard Biesheuvel
  2021-12-13 19:19 ` Catalin Marinas
  2021-12-14  2:36   ` Nathan Chancellor
  0 siblings, 2 replies; 14+ messages in thread
From: Ard Biesheuvel @ 2021-12-13 14:02 UTC (permalink / raw)
  To: linux-arm-kernel; +Cc: catalin.marinas, will, mark.rutland, Ard Biesheuvel

Use the EOR3 instruction to implement xor_blocks() if the instruction is
available, which is the case if the CPU implements the SHA-3 extension.
This is about 20% faster on Apple M1 when using the 5-way version.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
v2: decouple from static_call changes
    incorporate assembler support detection changes proposed by Catalin

 arch/arm64/Kconfig        |   6 +
 arch/arm64/Makefile       |   5 +
 arch/arm64/lib/xor-neon.c | 147 +++++++++++++++++++-
 3 files changed, 157 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index c4207cf9bb17..63d41ba4e716 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1545,6 +1545,12 @@ endmenu
 
 menu "ARMv8.2 architectural features"
 
+config AS_HAS_ARMV8_2
+       def_bool $(cc-option,-Wa$(comma)-march=armv8.2-a)
+
+config AS_HAS_SHA3
+       def_bool $(as-instr,.arch armv8.2-a+sha3)
+
 config ARM64_PMEM
 	bool "Enable support for persistent memory"
 	select ARCH_HAS_PMEM_API
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index e8cfc5868aa8..2f1de88651e6 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -58,6 +58,11 @@ stack_protector_prepare: prepare0
 					include/generated/asm-offsets.h))
 endif
 
+ifeq ($(CONFIG_AS_HAS_ARMV8_2), y)
+# make sure to pass the newest target architecture to -march.
+asm-arch := armv8.2-a
+endif
+
 # Ensure that if the compiler supports branch protection we default it
 # off, this will be overridden if we are using branch protection.
 branch-prot-flags-y += $(call cc-option,-mbranch-protection=none)
diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
index 11bf4f8aca68..5c8688700f63 100644
--- a/arch/arm64/lib/xor-neon.c
+++ b/arch/arm64/lib/xor-neon.c
@@ -167,7 +167,136 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
 	} while (--lines > 0);
 }
 
-struct xor_block_template const xor_block_inner_neon = {
+static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
+{
+	uint64x2_t res;
+
+	asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
+	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
+	    : "=w"(res) : "w"(p), "w"(q), "w"(r));
+	return res;
+}
+
+static void xor_arm64_eor3_3(unsigned long bytes, unsigned long *p1,
+			     unsigned long *p2, unsigned long *p3)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+	uint64_t *dp3 = (uint64_t *)p3;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 ^ p3 */
+		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+			  vld1q_u64(dp3 + 0));
+		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+			  vld1q_u64(dp3 + 2));
+		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+			  vld1q_u64(dp3 + 4));
+		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+			  vld1q_u64(dp3 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+		dp3 += 8;
+	} while (--lines > 0);
+}
+
+static void xor_arm64_eor3_4(unsigned long bytes, unsigned long *p1,
+			     unsigned long *p2, unsigned long *p3,
+			     unsigned long *p4)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+	uint64_t *dp3 = (uint64_t *)p3;
+	uint64_t *dp4 = (uint64_t *)p4;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 ^ p3 */
+		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+			  vld1q_u64(dp3 + 0));
+		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+			  vld1q_u64(dp3 + 2));
+		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+			  vld1q_u64(dp3 + 4));
+		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+			  vld1q_u64(dp3 + 6));
+
+		/* p1 ^= p4 */
+		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
+		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
+		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
+		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+		dp3 += 8;
+		dp4 += 8;
+	} while (--lines > 0);
+}
+
+static void xor_arm64_eor3_5(unsigned long bytes, unsigned long *p1,
+			     unsigned long *p2, unsigned long *p3,
+			     unsigned long *p4, unsigned long *p5)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+	uint64_t *dp3 = (uint64_t *)p3;
+	uint64_t *dp4 = (uint64_t *)p4;
+	uint64_t *dp5 = (uint64_t *)p5;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 ^ p3 */
+		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+			  vld1q_u64(dp3 + 0));
+		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+			  vld1q_u64(dp3 + 2));
+		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+			  vld1q_u64(dp3 + 4));
+		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+			  vld1q_u64(dp3 + 6));
+
+		/* p1 ^= p4 ^ p5 */
+		v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
+		v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
+		v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
+		v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+		dp3 += 8;
+		dp4 += 8;
+		dp5 += 8;
+	} while (--lines > 0);
+}
+
+struct xor_block_template xor_block_inner_neon __ro_after_init = {
 	.name	= "__inner_neon__",
 	.do_2	= xor_arm64_neon_2,
 	.do_3	= xor_arm64_neon_3,
@@ -176,6 +305,22 @@ struct xor_block_template const xor_block_inner_neon = {
 };
 EXPORT_SYMBOL(xor_block_inner_neon);
 
+static int __init xor_neon_init(void)
+{
+	if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
+		xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
+		xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
+		xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
+	}
+	return 0;
+}
+module_init(xor_neon_init);
+
+static void __exit xor_neon_exit(void)
+{
+}
+module_exit(xor_neon_exit);
+
 MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
 MODULE_DESCRIPTION("ARMv8 XOR Extensions");
 MODULE_LICENSE("GPL");
-- 
2.30.2


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] arm64/xor: use EOR3 instructions when available
  2021-12-13 14:02 [PATCH v2] arm64/xor: use EOR3 instructions when available Ard Biesheuvel
@ 2021-12-13 19:19 ` Catalin Marinas
  2021-12-14  2:36   ` Nathan Chancellor
  1 sibling, 0 replies; 14+ messages in thread
From: Catalin Marinas @ 2021-12-13 19:19 UTC (permalink / raw)
  To: Ard Biesheuvel, linux-arm-kernel; +Cc: Will Deacon, mark.rutland

On Mon, 13 Dec 2021 15:02:52 +0100, Ard Biesheuvel wrote:
> Use the EOR3 instruction to implement xor_blocks() if the instruction is
> available, which is the case if the CPU implements the SHA-3 extension.
> This is about 20% faster on Apple M1 when using the 5-way version.
> 
> 

Applied to arm64 (for-next/xor-neon), thanks!

[1/1] arm64/xor: use EOR3 instructions when available
      https://git.kernel.org/arm64/c/ce9ba49a2460

-- 
Catalin


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] arm64/xor: use EOR3 instructions when available
  2021-12-13 14:02 [PATCH v2] arm64/xor: use EOR3 instructions when available Ard Biesheuvel
@ 2021-12-14  2:36   ` Nathan Chancellor
  2021-12-14  2:36   ` Nathan Chancellor
  1 sibling, 0 replies; 14+ messages in thread
From: Nathan Chancellor @ 2021-12-14  2:36 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: linux-arm-kernel, catalin.marinas, will, mark.rutland, llvm

Hi Ard,

On Mon, Dec 13, 2021 at 03:02:52PM +0100, Ard Biesheuvel wrote:
> Use the EOR3 instruction to implement xor_blocks() if the instruction is
> available, which is the case if the CPU implements the SHA-3 extension.
> This is about 20% faster on Apple M1 when using the 5-way version.
> 
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>

Our CI reported that this patch as commit ce9ba49a2460 ("arm64/xor: use
EOR3 instructions when available") in the arm64 tree breaks
allyesconfig:

https://github.com/ClangBuiltLinux/continuous-integration2/runs/4514540083?check_suite_focus=true

I also see this when building with GCC 11.2.0:

WARNING: modpost: EXPORT symbol "xor_block_inner_neon" [vmlinux] version ...
Is "xor_block_inner_neon" prototyped in <asm/asm-prototypes.h>?
aarch64-linux-gnu-ld: arch/arm64/lib/xor-neon.o: relocation R_AARCH64_ABS32 against `__crc_xor_block_inner_neon' can not be used when making a shared object
arch/arm64/lib/xor-neon.o:(.data+0x0): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(.data+0x18): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(.data+0x20): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x0): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x8): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x10): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x18): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x20): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x28): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x30): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x38): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x40): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x48): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(.fini_array+0x0): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(.init_array+0x0): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(.data..ro_after_init+0x8): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(.data..ro_after_init+0x18): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(.data..ro_after_init+0x20): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(.data..ro_after_init+0x28): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(.data..ro_after_init+0x30): dangerous relocation: unsupported relocation

Cheers,
Nathan

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] arm64/xor: use EOR3 instructions when available
@ 2021-12-14  2:36   ` Nathan Chancellor
  0 siblings, 0 replies; 14+ messages in thread
From: Nathan Chancellor @ 2021-12-14  2:36 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: linux-arm-kernel, catalin.marinas, will, mark.rutland, llvm

Hi Ard,

On Mon, Dec 13, 2021 at 03:02:52PM +0100, Ard Biesheuvel wrote:
> Use the EOR3 instruction to implement xor_blocks() if the instruction is
> available, which is the case if the CPU implements the SHA-3 extension.
> This is about 20% faster on Apple M1 when using the 5-way version.
> 
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>

Our CI reported that this patch as commit ce9ba49a2460 ("arm64/xor: use
EOR3 instructions when available") in the arm64 tree breaks
allyesconfig:

https://github.com/ClangBuiltLinux/continuous-integration2/runs/4514540083?check_suite_focus=true

I also see this when building with GCC 11.2.0:

WARNING: modpost: EXPORT symbol "xor_block_inner_neon" [vmlinux] version ...
Is "xor_block_inner_neon" prototyped in <asm/asm-prototypes.h>?
aarch64-linux-gnu-ld: arch/arm64/lib/xor-neon.o: relocation R_AARCH64_ABS32 against `__crc_xor_block_inner_neon' can not be used when making a shared object
arch/arm64/lib/xor-neon.o:(.data+0x0): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(.data+0x18): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(.data+0x20): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x0): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x8): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x10): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x18): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x20): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x28): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x30): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x38): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x40): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x48): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(.fini_array+0x0): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(.init_array+0x0): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(.data..ro_after_init+0x8): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(.data..ro_after_init+0x18): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(.data..ro_after_init+0x20): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(.data..ro_after_init+0x28): dangerous relocation: unsupported relocation
arch/arm64/lib/xor-neon.o:(.data..ro_after_init+0x30): dangerous relocation: unsupported relocation

Cheers,
Nathan

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] arm64/xor: use EOR3 instructions when available
  2021-12-14  2:36   ` Nathan Chancellor
@ 2021-12-14  8:19     ` Ard Biesheuvel
  -1 siblings, 0 replies; 14+ messages in thread
From: Ard Biesheuvel @ 2021-12-14  8:19 UTC (permalink / raw)
  To: Nathan Chancellor, Arnd Bergmann
  Cc: Linux ARM, Catalin Marinas, Will Deacon, Mark Rutland, llvm

+ Arnd

On Tue, 14 Dec 2021 at 03:37, Nathan Chancellor <nathan@kernel.org> wrote:
>
> Hi Ard,
>
> On Mon, Dec 13, 2021 at 03:02:52PM +0100, Ard Biesheuvel wrote:
> > Use the EOR3 instruction to implement xor_blocks() if the instruction is
> > available, which is the case if the CPU implements the SHA-3 extension.
> > This is about 20% faster on Apple M1 when using the 5-way version.
> >
> > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
>
> Our CI reported that this patch as commit ce9ba49a2460 ("arm64/xor: use
> EOR3 instructions when available") in the arm64 tree breaks
> allyesconfig:
>
> https://github.com/ClangBuiltLinux/continuous-integration2/runs/4514540083?check_suite_focus=true
>
> I also see this when building with GCC 11.2.0:
>
> WARNING: modpost: EXPORT symbol "xor_block_inner_neon" [vmlinux] version ...
> Is "xor_block_inner_neon" prototyped in <asm/asm-prototypes.h>?
> aarch64-linux-gnu-ld: arch/arm64/lib/xor-neon.o: relocation R_AARCH64_ABS32 against `__crc_xor_block_inner_neon' can not be used when making a shared object

I suspect this is another genksyms crash, preventing the
__crc_xor_block_inner_neon symbol from ever being emitted.

This is a recurring annoyance and I am not sure how to address this
properly. Arnd might have some thoughts on the matter as well.


> arch/arm64/lib/xor-neon.o:(.data+0x0): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(.data+0x18): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(.data+0x20): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x0): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x8): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x10): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x18): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x20): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x28): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x30): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x38): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x40): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x48): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(.fini_array+0x0): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(.init_array+0x0): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(.data..ro_after_init+0x8): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(.data..ro_after_init+0x18): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(.data..ro_after_init+0x20): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(.data..ro_after_init+0x28): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(.data..ro_after_init+0x30): dangerous relocation: unsupported relocation
>
> Cheers,
> Nathan

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] arm64/xor: use EOR3 instructions when available
@ 2021-12-14  8:19     ` Ard Biesheuvel
  0 siblings, 0 replies; 14+ messages in thread
From: Ard Biesheuvel @ 2021-12-14  8:19 UTC (permalink / raw)
  To: Nathan Chancellor, Arnd Bergmann
  Cc: Linux ARM, Catalin Marinas, Will Deacon, Mark Rutland, llvm

+ Arnd

On Tue, 14 Dec 2021 at 03:37, Nathan Chancellor <nathan@kernel.org> wrote:
>
> Hi Ard,
>
> On Mon, Dec 13, 2021 at 03:02:52PM +0100, Ard Biesheuvel wrote:
> > Use the EOR3 instruction to implement xor_blocks() if the instruction is
> > available, which is the case if the CPU implements the SHA-3 extension.
> > This is about 20% faster on Apple M1 when using the 5-way version.
> >
> > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
>
> Our CI reported that this patch as commit ce9ba49a2460 ("arm64/xor: use
> EOR3 instructions when available") in the arm64 tree breaks
> allyesconfig:
>
> https://github.com/ClangBuiltLinux/continuous-integration2/runs/4514540083?check_suite_focus=true
>
> I also see this when building with GCC 11.2.0:
>
> WARNING: modpost: EXPORT symbol "xor_block_inner_neon" [vmlinux] version ...
> Is "xor_block_inner_neon" prototyped in <asm/asm-prototypes.h>?
> aarch64-linux-gnu-ld: arch/arm64/lib/xor-neon.o: relocation R_AARCH64_ABS32 against `__crc_xor_block_inner_neon' can not be used when making a shared object

I suspect this is another genksyms crash, preventing the
__crc_xor_block_inner_neon symbol from ever being emitted.

This is a recurring annoyance and I am not sure how to address this
properly. Arnd might have some thoughts on the matter as well.


> arch/arm64/lib/xor-neon.o:(.data+0x0): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(.data+0x18): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(.data+0x20): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x0): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x8): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x10): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x18): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x20): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x28): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x30): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x38): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x40): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(__patchable_function_entries+0x48): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(.fini_array+0x0): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(.init_array+0x0): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(.data..ro_after_init+0x8): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(.data..ro_after_init+0x18): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(.data..ro_after_init+0x20): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(.data..ro_after_init+0x28): dangerous relocation: unsupported relocation
> arch/arm64/lib/xor-neon.o:(.data..ro_after_init+0x30): dangerous relocation: unsupported relocation
>
> Cheers,
> Nathan

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] arm64/xor: use EOR3 instructions when available
  2021-12-14  8:19     ` Ard Biesheuvel
@ 2021-12-14 11:05       ` Ard Biesheuvel
  -1 siblings, 0 replies; 14+ messages in thread
From: Ard Biesheuvel @ 2021-12-14 11:05 UTC (permalink / raw)
  To: Nathan Chancellor, Arnd Bergmann
  Cc: Linux ARM, Catalin Marinas, Will Deacon, Mark Rutland, llvm

On Tue, 14 Dec 2021 at 09:19, Ard Biesheuvel <ardb@kernel.org> wrote:
>
> + Arnd
>
> On Tue, 14 Dec 2021 at 03:37, Nathan Chancellor <nathan@kernel.org> wrote:
> >
> > Hi Ard,
> >
> > On Mon, Dec 13, 2021 at 03:02:52PM +0100, Ard Biesheuvel wrote:
> > > Use the EOR3 instruction to implement xor_blocks() if the instruction is
> > > available, which is the case if the CPU implements the SHA-3 extension.
> > > This is about 20% faster on Apple M1 when using the 5-way version.
> > >
> > > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> >
> > Our CI reported that this patch as commit ce9ba49a2460 ("arm64/xor: use
> > EOR3 instructions when available") in the arm64 tree breaks
> > allyesconfig:
> >
> > https://github.com/ClangBuiltLinux/continuous-integration2/runs/4514540083?check_suite_focus=true
> >
> > I also see this when building with GCC 11.2.0:
> >
> > WARNING: modpost: EXPORT symbol "xor_block_inner_neon" [vmlinux] version ...
> > Is "xor_block_inner_neon" prototyped in <asm/asm-prototypes.h>?
> > aarch64-linux-gnu-ld: arch/arm64/lib/xor-neon.o: relocation R_AARCH64_ABS32 against `__crc_xor_block_inner_neon' can not be used when making a shared object
>
> I suspect this is another genksyms crash, preventing the
> __crc_xor_block_inner_neon symbol from ever being emitted.
>
> This is a recurring annoyance and I am not sure how to address this
> properly. Arnd might have some thoughts on the matter as well.
>
>

I managed to reproduce this: it's not a crash but definitely a bug in
genksyms, as it simply fails to produce the output containing the
assignment of __crc_xor_block_inner_neon.

Moving the definition of xor_block_inner_neon as below works around the issue.

Catalin: would you like me to spin a v3? Or do you prefer to just
fold this into the existing one?

diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
index 5c8688700f63..d189cf4e70ea 100644
--- a/arch/arm64/lib/xor-neon.c
+++ b/arch/arm64/lib/xor-neon.c
@@ -167,6 +167,15 @@ void xor_arm64_neon_5(unsigned long bytes,
unsigned long *p1,
        } while (--lines > 0);
 }

+struct xor_block_template xor_block_inner_neon __ro_after_init = {
+       .name   = "__inner_neon__",
+       .do_2   = xor_arm64_neon_2,
+       .do_3   = xor_arm64_neon_3,
+       .do_4   = xor_arm64_neon_4,
+       .do_5   = xor_arm64_neon_5,
+};
+EXPORT_SYMBOL(xor_block_inner_neon);
+
 static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
 {
        uint64x2_t res;
@@ -296,15 +305,6 @@ static void xor_arm64_eor3_5(unsigned long bytes,
unsigned long *p1,
        } while (--lines > 0);
 }

-struct xor_block_template xor_block_inner_neon __ro_after_init = {
-       .name   = "__inner_neon__",
-       .do_2   = xor_arm64_neon_2,
-       .do_3   = xor_arm64_neon_3,
-       .do_4   = xor_arm64_neon_4,
-       .do_5   = xor_arm64_neon_5,
-};
-EXPORT_SYMBOL(xor_block_inner_neon);
-
 static int __init xor_neon_init(void)
 {
        if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] arm64/xor: use EOR3 instructions when available
@ 2021-12-14 11:05       ` Ard Biesheuvel
  0 siblings, 0 replies; 14+ messages in thread
From: Ard Biesheuvel @ 2021-12-14 11:05 UTC (permalink / raw)
  To: Nathan Chancellor, Arnd Bergmann
  Cc: Linux ARM, Catalin Marinas, Will Deacon, Mark Rutland, llvm

On Tue, 14 Dec 2021 at 09:19, Ard Biesheuvel <ardb@kernel.org> wrote:
>
> + Arnd
>
> On Tue, 14 Dec 2021 at 03:37, Nathan Chancellor <nathan@kernel.org> wrote:
> >
> > Hi Ard,
> >
> > On Mon, Dec 13, 2021 at 03:02:52PM +0100, Ard Biesheuvel wrote:
> > > Use the EOR3 instruction to implement xor_blocks() if the instruction is
> > > available, which is the case if the CPU implements the SHA-3 extension.
> > > This is about 20% faster on Apple M1 when using the 5-way version.
> > >
> > > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> >
> > Our CI reported that this patch as commit ce9ba49a2460 ("arm64/xor: use
> > EOR3 instructions when available") in the arm64 tree breaks
> > allyesconfig:
> >
> > https://github.com/ClangBuiltLinux/continuous-integration2/runs/4514540083?check_suite_focus=true
> >
> > I also see this when building with GCC 11.2.0:
> >
> > WARNING: modpost: EXPORT symbol "xor_block_inner_neon" [vmlinux] version ...
> > Is "xor_block_inner_neon" prototyped in <asm/asm-prototypes.h>?
> > aarch64-linux-gnu-ld: arch/arm64/lib/xor-neon.o: relocation R_AARCH64_ABS32 against `__crc_xor_block_inner_neon' can not be used when making a shared object
>
> I suspect this is another genksyms crash, preventing the
> __crc_xor_block_inner_neon symbol from ever being emitted.
>
> This is a recurring annoyance and I am not sure how to address this
> properly. Arnd might have some thoughts on the matter as well.
>
>

I managed to reproduce this: it's not a crash but definitely a bug in
genksyms, as it simply fails to produce the output containing the
assignment of __crc_xor_block_inner_neon.

Moving the definition of xor_block_inner_neon as below works around the issue.

Catalin: would you like me to spin a v3? Or do you prefer to just
fold this into the existing one?

diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
index 5c8688700f63..d189cf4e70ea 100644
--- a/arch/arm64/lib/xor-neon.c
+++ b/arch/arm64/lib/xor-neon.c
@@ -167,6 +167,15 @@ void xor_arm64_neon_5(unsigned long bytes,
unsigned long *p1,
        } while (--lines > 0);
 }

+struct xor_block_template xor_block_inner_neon __ro_after_init = {
+       .name   = "__inner_neon__",
+       .do_2   = xor_arm64_neon_2,
+       .do_3   = xor_arm64_neon_3,
+       .do_4   = xor_arm64_neon_4,
+       .do_5   = xor_arm64_neon_5,
+};
+EXPORT_SYMBOL(xor_block_inner_neon);
+
 static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
 {
        uint64x2_t res;
@@ -296,15 +305,6 @@ static void xor_arm64_eor3_5(unsigned long bytes,
unsigned long *p1,
        } while (--lines > 0);
 }

-struct xor_block_template xor_block_inner_neon __ro_after_init = {
-       .name   = "__inner_neon__",
-       .do_2   = xor_arm64_neon_2,
-       .do_3   = xor_arm64_neon_3,
-       .do_4   = xor_arm64_neon_4,
-       .do_5   = xor_arm64_neon_5,
-};
-EXPORT_SYMBOL(xor_block_inner_neon);
-
 static int __init xor_neon_init(void)
 {
        if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] arm64/xor: use EOR3 instructions when available
  2021-12-14 11:05       ` Ard Biesheuvel
@ 2021-12-14 11:36         ` Catalin Marinas
  -1 siblings, 0 replies; 14+ messages in thread
From: Catalin Marinas @ 2021-12-14 11:36 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Nathan Chancellor, Arnd Bergmann, Linux ARM, Will Deacon,
	Mark Rutland, llvm

On Tue, Dec 14, 2021 at 12:05:34PM +0100, Ard Biesheuvel wrote:
> On Tue, 14 Dec 2021 at 09:19, Ard Biesheuvel <ardb@kernel.org> wrote:
> >
> > + Arnd
> >
> > On Tue, 14 Dec 2021 at 03:37, Nathan Chancellor <nathan@kernel.org> wrote:
> > >
> > > Hi Ard,
> > >
> > > On Mon, Dec 13, 2021 at 03:02:52PM +0100, Ard Biesheuvel wrote:
> > > > Use the EOR3 instruction to implement xor_blocks() if the instruction is
> > > > available, which is the case if the CPU implements the SHA-3 extension.
> > > > This is about 20% faster on Apple M1 when using the 5-way version.
> > > >
> > > > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> > >
> > > Our CI reported that this patch as commit ce9ba49a2460 ("arm64/xor: use
> > > EOR3 instructions when available") in the arm64 tree breaks
> > > allyesconfig:
> > >
> > > https://github.com/ClangBuiltLinux/continuous-integration2/runs/4514540083?check_suite_focus=true
> > >
> > > I also see this when building with GCC 11.2.0:
> > >
> > > WARNING: modpost: EXPORT symbol "xor_block_inner_neon" [vmlinux] version ...
> > > Is "xor_block_inner_neon" prototyped in <asm/asm-prototypes.h>?
> > > aarch64-linux-gnu-ld: arch/arm64/lib/xor-neon.o: relocation R_AARCH64_ABS32 against `__crc_xor_block_inner_neon' can not be used when making a shared object
> >
> > I suspect this is another genksyms crash, preventing the
> > __crc_xor_block_inner_neon symbol from ever being emitted.
> >
> > This is a recurring annoyance and I am not sure how to address this
> > properly. Arnd might have some thoughts on the matter as well.
> 
> I managed to reproduce this: it's not a crash but definitely a bug in
> genksyms, as it simply fails to produce the output containing the
> assignment of __crc_xor_block_inner_neon.
> 
> Moving the definition of xor_block_inner_neon as below works around the issue.
> 
> Catalin: would you like me to spin a v3? Or do you prefer to just
> fold this into the existing one?

I'll fold it in. Thanks.

-- 
Catalin

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] arm64/xor: use EOR3 instructions when available
@ 2021-12-14 11:36         ` Catalin Marinas
  0 siblings, 0 replies; 14+ messages in thread
From: Catalin Marinas @ 2021-12-14 11:36 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Nathan Chancellor, Arnd Bergmann, Linux ARM, Will Deacon,
	Mark Rutland, llvm

On Tue, Dec 14, 2021 at 12:05:34PM +0100, Ard Biesheuvel wrote:
> On Tue, 14 Dec 2021 at 09:19, Ard Biesheuvel <ardb@kernel.org> wrote:
> >
> > + Arnd
> >
> > On Tue, 14 Dec 2021 at 03:37, Nathan Chancellor <nathan@kernel.org> wrote:
> > >
> > > Hi Ard,
> > >
> > > On Mon, Dec 13, 2021 at 03:02:52PM +0100, Ard Biesheuvel wrote:
> > > > Use the EOR3 instruction to implement xor_blocks() if the instruction is
> > > > available, which is the case if the CPU implements the SHA-3 extension.
> > > > This is about 20% faster on Apple M1 when using the 5-way version.
> > > >
> > > > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> > >
> > > Our CI reported that this patch as commit ce9ba49a2460 ("arm64/xor: use
> > > EOR3 instructions when available") in the arm64 tree breaks
> > > allyesconfig:
> > >
> > > https://github.com/ClangBuiltLinux/continuous-integration2/runs/4514540083?check_suite_focus=true
> > >
> > > I also see this when building with GCC 11.2.0:
> > >
> > > WARNING: modpost: EXPORT symbol "xor_block_inner_neon" [vmlinux] version ...
> > > Is "xor_block_inner_neon" prototyped in <asm/asm-prototypes.h>?
> > > aarch64-linux-gnu-ld: arch/arm64/lib/xor-neon.o: relocation R_AARCH64_ABS32 against `__crc_xor_block_inner_neon' can not be used when making a shared object
> >
> > I suspect this is another genksyms crash, preventing the
> > __crc_xor_block_inner_neon symbol from ever being emitted.
> >
> > This is a recurring annoyance and I am not sure how to address this
> > properly. Arnd might have some thoughts on the matter as well.
> 
> I managed to reproduce this: it's not a crash but definitely a bug in
> genksyms, as it simply fails to produce the output containing the
> assignment of __crc_xor_block_inner_neon.
> 
> Moving the definition of xor_block_inner_neon as below works around the issue.
> 
> Catalin: would you like me to spin a v3? Or do you prefer to just
> fold this into the existing one?

I'll fold it in. Thanks.

-- 
Catalin

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] arm64/xor: use EOR3 instructions when available
  2021-12-14 11:36         ` Catalin Marinas
@ 2021-12-14 12:57           ` Ard Biesheuvel
  -1 siblings, 0 replies; 14+ messages in thread
From: Ard Biesheuvel @ 2021-12-14 12:57 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: Nathan Chancellor, Arnd Bergmann, Linux ARM, Will Deacon,
	Mark Rutland, llvm

On Tue, 14 Dec 2021 at 12:36, Catalin Marinas <catalin.marinas@arm.com> wrote:
>
> On Tue, Dec 14, 2021 at 12:05:34PM +0100, Ard Biesheuvel wrote:
> > On Tue, 14 Dec 2021 at 09:19, Ard Biesheuvel <ardb@kernel.org> wrote:
> > >
> > > + Arnd
> > >
> > > On Tue, 14 Dec 2021 at 03:37, Nathan Chancellor <nathan@kernel.org> wrote:
> > > >
> > > > Hi Ard,
> > > >
> > > > On Mon, Dec 13, 2021 at 03:02:52PM +0100, Ard Biesheuvel wrote:
> > > > > Use the EOR3 instruction to implement xor_blocks() if the instruction is
> > > > > available, which is the case if the CPU implements the SHA-3 extension.
> > > > > This is about 20% faster on Apple M1 when using the 5-way version.
> > > > >
> > > > > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> > > >
> > > > Our CI reported that this patch as commit ce9ba49a2460 ("arm64/xor: use
> > > > EOR3 instructions when available") in the arm64 tree breaks
> > > > allyesconfig:
> > > >
> > > > https://github.com/ClangBuiltLinux/continuous-integration2/runs/4514540083?check_suite_focus=true
> > > >
> > > > I also see this when building with GCC 11.2.0:
> > > >
> > > > WARNING: modpost: EXPORT symbol "xor_block_inner_neon" [vmlinux] version ...
> > > > Is "xor_block_inner_neon" prototyped in <asm/asm-prototypes.h>?
> > > > aarch64-linux-gnu-ld: arch/arm64/lib/xor-neon.o: relocation R_AARCH64_ABS32 against `__crc_xor_block_inner_neon' can not be used when making a shared object
> > >
> > > I suspect this is another genksyms crash, preventing the
> > > __crc_xor_block_inner_neon symbol from ever being emitted.
> > >
> > > This is a recurring annoyance and I am not sure how to address this
> > > properly. Arnd might have some thoughts on the matter as well.
> >
> > I managed to reproduce this: it's not a crash but definitely a bug in
> > genksyms, as it simply fails to produce the output containing the
> > assignment of __crc_xor_block_inner_neon.
> >
> > Moving the definition of xor_block_inner_neon as below works around the issue.
> >
> > Catalin: would you like me to spin a v3? Or do you prefer to just
> > fold this into the existing one?
>
> I'll fold it in. Thanks.
>

The root cause appears to be that genksyms gives up when it encounters

static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
{

because the types are not defined. This is because our
asm/neon-intrinsics.h header avoids #include'ing arm_neon.h in the
context of genksyms, as doing so does result in a genksyms crash.

I have very little motivation to go and figure out why genksyms
crashes in that case, so I think for now, we can stick with the fix I
proposed. Alternatively, we could typedef uint64x2_t to something
arbitrary if __GENKSYMS__ is defined, or use a macro instead of a
static inline for eor3().

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] arm64/xor: use EOR3 instructions when available
@ 2021-12-14 12:57           ` Ard Biesheuvel
  0 siblings, 0 replies; 14+ messages in thread
From: Ard Biesheuvel @ 2021-12-14 12:57 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: Nathan Chancellor, Arnd Bergmann, Linux ARM, Will Deacon,
	Mark Rutland, llvm

On Tue, 14 Dec 2021 at 12:36, Catalin Marinas <catalin.marinas@arm.com> wrote:
>
> On Tue, Dec 14, 2021 at 12:05:34PM +0100, Ard Biesheuvel wrote:
> > On Tue, 14 Dec 2021 at 09:19, Ard Biesheuvel <ardb@kernel.org> wrote:
> > >
> > > + Arnd
> > >
> > > On Tue, 14 Dec 2021 at 03:37, Nathan Chancellor <nathan@kernel.org> wrote:
> > > >
> > > > Hi Ard,
> > > >
> > > > On Mon, Dec 13, 2021 at 03:02:52PM +0100, Ard Biesheuvel wrote:
> > > > > Use the EOR3 instruction to implement xor_blocks() if the instruction is
> > > > > available, which is the case if the CPU implements the SHA-3 extension.
> > > > > This is about 20% faster on Apple M1 when using the 5-way version.
> > > > >
> > > > > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> > > >
> > > > Our CI reported that this patch as commit ce9ba49a2460 ("arm64/xor: use
> > > > EOR3 instructions when available") in the arm64 tree breaks
> > > > allyesconfig:
> > > >
> > > > https://github.com/ClangBuiltLinux/continuous-integration2/runs/4514540083?check_suite_focus=true
> > > >
> > > > I also see this when building with GCC 11.2.0:
> > > >
> > > > WARNING: modpost: EXPORT symbol "xor_block_inner_neon" [vmlinux] version ...
> > > > Is "xor_block_inner_neon" prototyped in <asm/asm-prototypes.h>?
> > > > aarch64-linux-gnu-ld: arch/arm64/lib/xor-neon.o: relocation R_AARCH64_ABS32 against `__crc_xor_block_inner_neon' can not be used when making a shared object
> > >
> > > I suspect this is another genksyms crash, preventing the
> > > __crc_xor_block_inner_neon symbol from ever being emitted.
> > >
> > > This is a recurring annoyance and I am not sure how to address this
> > > properly. Arnd might have some thoughts on the matter as well.
> >
> > I managed to reproduce this: it's not a crash but definitely a bug in
> > genksyms, as it simply fails to produce the output containing the
> > assignment of __crc_xor_block_inner_neon.
> >
> > Moving the definition of xor_block_inner_neon as below works around the issue.
> >
> > Catalin: would you like me to spin a v3? Or do you prefer to just
> > fold this into the existing one?
>
> I'll fold it in. Thanks.
>

The root cause appears to be that genksyms gives up when it encounters

static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
{

because the types are not defined. This is because our
asm/neon-intrinsics.h header avoids #include'ing arm_neon.h in the
context of genksyms, as doing so does result in a genksyms crash.

I have very little motivation to go and figure out why genksyms
crashes in that case, so I think for now, we can stick with the fix I
proposed. Alternatively, we could typedef uint64x2_t to something
arbitrary if __GENKSYMS__ is defined, or use a macro instead of a
static inline for eor3().

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] arm64/xor: use EOR3 instructions when available
  2021-12-14 12:57           ` Ard Biesheuvel
@ 2021-12-15 15:15             ` Catalin Marinas
  -1 siblings, 0 replies; 14+ messages in thread
From: Catalin Marinas @ 2021-12-15 15:15 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Nathan Chancellor, Arnd Bergmann, Linux ARM, Will Deacon,
	Mark Rutland, llvm

On Tue, Dec 14, 2021 at 01:57:47PM +0100, Ard Biesheuvel wrote:
> On Tue, 14 Dec 2021 at 12:36, Catalin Marinas <catalin.marinas@arm.com> wrote:
> > On Tue, Dec 14, 2021 at 12:05:34PM +0100, Ard Biesheuvel wrote:
> > > On Tue, 14 Dec 2021 at 09:19, Ard Biesheuvel <ardb@kernel.org> wrote:
> > > > On Tue, 14 Dec 2021 at 03:37, Nathan Chancellor <nathan@kernel.org> wrote:
> > > > > On Mon, Dec 13, 2021 at 03:02:52PM +0100, Ard Biesheuvel wrote:
> > > > > > Use the EOR3 instruction to implement xor_blocks() if the instruction is
> > > > > > available, which is the case if the CPU implements the SHA-3 extension.
> > > > > > This is about 20% faster on Apple M1 when using the 5-way version.
> > > > > >
> > > > > > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> > > > >
> > > > > Our CI reported that this patch as commit ce9ba49a2460 ("arm64/xor: use
> > > > > EOR3 instructions when available") in the arm64 tree breaks
> > > > > allyesconfig:
> > > > >
> > > > > https://github.com/ClangBuiltLinux/continuous-integration2/runs/4514540083?check_suite_focus=true
> > > > >
> > > > > I also see this when building with GCC 11.2.0:
> > > > >
> > > > > WARNING: modpost: EXPORT symbol "xor_block_inner_neon" [vmlinux] version ...
> > > > > Is "xor_block_inner_neon" prototyped in <asm/asm-prototypes.h>?
> > > > > aarch64-linux-gnu-ld: arch/arm64/lib/xor-neon.o: relocation R_AARCH64_ABS32 against `__crc_xor_block_inner_neon' can not be used when making a shared object
> > > >
> > > > I suspect this is another genksyms crash, preventing the
> > > > __crc_xor_block_inner_neon symbol from ever being emitted.
> > > >
> > > > This is a recurring annoyance and I am not sure how to address this
> > > > properly. Arnd might have some thoughts on the matter as well.
> > >
> > > I managed to reproduce this: it's not a crash but definitely a bug in
> > > genksyms, as it simply fails to produce the output containing the
> > > assignment of __crc_xor_block_inner_neon.
> > >
> > > Moving the definition of xor_block_inner_neon as below works around the issue.
> > >
> > > Catalin: would you like me to spin a v3? Or do you prefer to just
> > > fold this into the existing one?
> >
> > I'll fold it in. Thanks.
> 
> The root cause appears to be that genksyms gives up when it encounters
> 
> static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
> {
> 
> because the types are not defined. This is because our
> asm/neon-intrinsics.h header avoids #include'ing arm-neon.h in the
> context of genksyms, as doing so does result in a genksyms crash.
> 
> I have very little motivation to go and figure out why genksyms
> crashes in that case, so I think for now, we can stick with the fix I
> proposed. Alternatively, we could typedef uint64x2_t to something
> arbitrary if __GENKSYMS__ is defined, or use a macro instead of a
> static inline for eor3()

I'll stick to the fix you proposed (already folded in). If we ever add
another EXPORT_SYMBOL after the eor3() function, we'd better look into
fixing genksyms or defining a dummy uint64x2_t.

-- 
Catalin

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] arm64/xor: use EOR3 instructions when available
@ 2021-12-15 15:15             ` Catalin Marinas
  0 siblings, 0 replies; 14+ messages in thread
From: Catalin Marinas @ 2021-12-15 15:15 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Nathan Chancellor, Arnd Bergmann, Linux ARM, Will Deacon,
	Mark Rutland, llvm

On Tue, Dec 14, 2021 at 01:57:47PM +0100, Ard Biesheuvel wrote:
> On Tue, 14 Dec 2021 at 12:36, Catalin Marinas <catalin.marinas@arm.com> wrote:
> > On Tue, Dec 14, 2021 at 12:05:34PM +0100, Ard Biesheuvel wrote:
> > > On Tue, 14 Dec 2021 at 09:19, Ard Biesheuvel <ardb@kernel.org> wrote:
> > > > On Tue, 14 Dec 2021 at 03:37, Nathan Chancellor <nathan@kernel.org> wrote:
> > > > > On Mon, Dec 13, 2021 at 03:02:52PM +0100, Ard Biesheuvel wrote:
> > > > > > Use the EOR3 instruction to implement xor_blocks() if the instruction is
> > > > > > available, which is the case if the CPU implements the SHA-3 extension.
> > > > > > This is about 20% faster on Apple M1 when using the 5-way version.
> > > > > >
> > > > > > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> > > > >
> > > > > Our CI reported that this patch as commit ce9ba49a2460 ("arm64/xor: use
> > > > > EOR3 instructions when available") in the arm64 tree breaks
> > > > > allyesconfig:
> > > > >
> > > > > https://github.com/ClangBuiltLinux/continuous-integration2/runs/4514540083?check_suite_focus=true
> > > > >
> > > > > I also see this when building with GCC 11.2.0:
> > > > >
> > > > > WARNING: modpost: EXPORT symbol "xor_block_inner_neon" [vmlinux] version ...
> > > > > Is "xor_block_inner_neon" prototyped in <asm/asm-prototypes.h>?
> > > > > aarch64-linux-gnu-ld: arch/arm64/lib/xor-neon.o: relocation R_AARCH64_ABS32 against `__crc_xor_block_inner_neon' can not be used when making a shared object
> > > >
> > > > I suspect this is another genksyms crash, preventing the
> > > > __crc_xor_block_inner_neon symbol from ever being emitted.
> > > >
> > > > This is a recurring annoyance and I am not sure how to address this
> > > > properly. Arnd might have some thoughts on the matter as well.
> > >
> > > I managed to reproduce this: it's not a crash but definitely a bug in
> > > genksyms, as it simply fails to produce the output containing the
> > > assignment of __crc_xor_block_inner_neon.
> > >
> > > Moving the definition of xor_block_inner_neon as below works around the issue.
> > >
> > > Catalin: would you like me to spin a v3? Or do you prefer to just
> > > fold this into the existing one?
> >
> > I'll fold it in. Thanks.
> 
> The root cause appears to be that genksyms gives up when it encounters
> 
> static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
> {
> 
> because the types are not defined. This is because our
> asm/neon-intrinsics.h header avoids #include'ing arm-neon.h in the
> context of genksyms, as doing so does result in a genksyms crash.
> 
> I have very little motivation to go and figure out why genksyms
> crashes in that case, so I think for now, we can stick with the fix I
> proposed. Alternatively, we could typedef uint64x2_t to something
> arbitrary if __GENKSYMS__ is defined, or use a macro instead of a
> static inline for eor3()

I'll stick to the fix you proposed (already folded in). If we ever add
another EXPORT_SYMBOL after the eor3() function, we'd better look into
fixing genksyms or defining a dummy uint64x2_t.

-- 
Catalin

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2021-12-15 15:50 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-12-13 14:02 [PATCH v2] arm64/xor: use EOR3 instructions when available Ard Biesheuvel
2021-12-13 19:19 ` Catalin Marinas
2021-12-14  2:36 ` Nathan Chancellor
2021-12-14  2:36   ` Nathan Chancellor
2021-12-14  8:19   ` Ard Biesheuvel
2021-12-14  8:19     ` Ard Biesheuvel
2021-12-14 11:05     ` Ard Biesheuvel
2021-12-14 11:05       ` Ard Biesheuvel
2021-12-14 11:36       ` Catalin Marinas
2021-12-14 11:36         ` Catalin Marinas
2021-12-14 12:57         ` Ard Biesheuvel
2021-12-14 12:57           ` Ard Biesheuvel
2021-12-15 15:15           ` Catalin Marinas
2021-12-15 15:15             ` Catalin Marinas

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.