* [PATCH 0/2] arm64: use SHA3 instructions to speed up XOR
From: Ard Biesheuvel @ 2021-11-09 12:03 UTC
To: linux-arm-kernel
Cc: catalin.marinas, will, Ard Biesheuvel, Mark Rutland, Peter Zijlstra
If available, use the 3-way EOR3 instruction that is part of the SHA3
extension rather than the ordinary 2-way EOR NEON instruction. Doing so
speeds up XOR processing by ~20% on Apple M1 when using the 5-way
version.
This is also useful as a canary for spotting regressions in the static
call API, as we export a static call from one module and consume it in
another.
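For the curious, the cross-module pattern boils down to roughly the sketch
below; my_call, my_default and consumer are made-up names for illustration
and do not appear in this series:

  #include <linux/static_call.h>

  /* Exporting module: define the call with a default target, export it. */
  void my_default(int x);

  DEFINE_STATIC_CALL(my_call, my_default);
  EXPORT_STATIC_CALL(my_call);

  /* Consuming module: declare the call and invoke it. static_call()
   * compiles to a direct branch that static_call_update() can repatch
   * at runtime, instead of an indirect call through a function pointer. */
  DECLARE_STATIC_CALL(my_call, my_default);

  void consumer(void)
  {
          static_call(my_call)(42);
  }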
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Ard Biesheuvel (2):
arm64/xor: use static calls for inner NEON helpers
arm64/xor: use EOR3 instructions when available
arch/arm64/Kconfig | 3 +
arch/arm64/include/asm/xor.h | 24 ++-
arch/arm64/lib/xor-neon.c | 165 +++++++++++++++++++-
3 files changed, 179 insertions(+), 13 deletions(-)
--
2.30.2
* [PATCH 1/2] arm64/xor: use static calls for inner NEON helpers
From: Ard Biesheuvel @ 2021-11-09 12:03 UTC
To: linux-arm-kernel
Cc: catalin.marinas, will, Ard Biesheuvel, Mark Rutland, Peter Zijlstra
Call the inner NEON helpers using static calls rather than loading
their addresses from a struct. This will be used in a subsequent patch
to switch between NEON and SHA3 based implementations of the XOR code.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
arch/arm64/include/asm/xor.h | 24 ++++++++++++++++----
arch/arm64/lib/xor-neon.c | 20 +++++++++-------
2 files changed, 31 insertions(+), 13 deletions(-)
diff --git a/arch/arm64/include/asm/xor.h b/arch/arm64/include/asm/xor.h
index 947f6a4f1aa0..f52dbb05b4b1 100644
--- a/arch/arm64/include/asm/xor.h
+++ b/arch/arm64/include/asm/xor.h
@@ -7,19 +7,33 @@
*/
#include <linux/hardirq.h>
+#include <linux/static_call.h>
#include <asm-generic/xor.h>
#include <asm/hwcap.h>
#include <asm/neon.h>
#ifdef CONFIG_KERNEL_MODE_NEON
-extern struct xor_block_template const xor_block_inner_neon;
+void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1,
+ unsigned long *p2);
+void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1,
+ unsigned long *p2, unsigned long *p3);
+void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1,
+ unsigned long *p2, unsigned long *p3,
+ unsigned long *p4);
+void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
+ unsigned long *p2, unsigned long *p3,
+ unsigned long *p4, unsigned long *p5);
+
+DECLARE_STATIC_CALL(xor_arm64_3, xor_arm64_neon_3);
+DECLARE_STATIC_CALL(xor_arm64_4, xor_arm64_neon_4);
+DECLARE_STATIC_CALL(xor_arm64_5, xor_arm64_neon_5);
static void
xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
kernel_neon_begin();
- xor_block_inner_neon.do_2(bytes, p1, p2);
+ xor_arm64_neon_2(bytes, p1, p2);
kernel_neon_end();
}
@@ -28,7 +42,7 @@ xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
{
kernel_neon_begin();
- xor_block_inner_neon.do_3(bytes, p1, p2, p3);
+ static_call(xor_arm64_3)(bytes, p1, p2, p3);
kernel_neon_end();
}
@@ -37,7 +51,7 @@ xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
{
kernel_neon_begin();
- xor_block_inner_neon.do_4(bytes, p1, p2, p3, p4);
+ static_call(xor_arm64_4)(bytes, p1, p2, p3, p4);
kernel_neon_end();
}
@@ -46,7 +60,7 @@ xor_neon_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
kernel_neon_begin();
- xor_block_inner_neon.do_5(bytes, p1, p2, p3, p4, p5);
+ static_call(xor_arm64_5)(bytes, p1, p2, p3, p4, p5);
kernel_neon_end();
}
diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
index 11bf4f8aca68..ee4795f3e166 100644
--- a/arch/arm64/lib/xor-neon.c
+++ b/arch/arm64/lib/xor-neon.c
@@ -7,6 +7,7 @@
*/
#include <linux/raid/xor.h>
+#include <linux/static_call.h>
#include <linux/module.h>
#include <asm/neon-intrinsics.h>
@@ -36,6 +37,7 @@ void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1,
dp2 += 8;
} while (--lines > 0);
}
+EXPORT_SYMBOL(xor_arm64_neon_2);
void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1,
unsigned long *p2, unsigned long *p3)
@@ -71,6 +73,7 @@ void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1,
dp3 += 8;
} while (--lines > 0);
}
+EXPORT_SYMBOL(xor_arm64_neon_3);
void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1,
unsigned long *p2, unsigned long *p3, unsigned long *p4)
@@ -114,6 +117,7 @@ void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1,
dp4 += 8;
} while (--lines > 0);
}
+EXPORT_SYMBOL(xor_arm64_neon_4);
void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
unsigned long *p2, unsigned long *p3,
@@ -166,15 +170,15 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
dp5 += 8;
} while (--lines > 0);
}
+EXPORT_SYMBOL(xor_arm64_neon_5);
-struct xor_block_template const xor_block_inner_neon = {
- .name = "__inner_neon__",
- .do_2 = xor_arm64_neon_2,
- .do_3 = xor_arm64_neon_3,
- .do_4 = xor_arm64_neon_4,
- .do_5 = xor_arm64_neon_5,
-};
-EXPORT_SYMBOL(xor_block_inner_neon);
+DEFINE_STATIC_CALL(xor_arm64_3, xor_arm64_neon_3);
+DEFINE_STATIC_CALL(xor_arm64_4, xor_arm64_neon_4);
+DEFINE_STATIC_CALL(xor_arm64_5, xor_arm64_neon_5);
+
+EXPORT_STATIC_CALL(xor_arm64_3);
+EXPORT_STATIC_CALL(xor_arm64_4);
+EXPORT_STATIC_CALL(xor_arm64_5);
MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
MODULE_DESCRIPTION("ARMv8 XOR Extensions");
--
2.30.2
* [PATCH 2/2] arm64/xor: use EOR3 instructions when available
From: Ard Biesheuvel @ 2021-11-09 12:03 UTC
To: linux-arm-kernel
Cc: catalin.marinas, will, Ard Biesheuvel, Mark Rutland, Peter Zijlstra
Use the EOR3 instruction to implement xor_blocks() if the instruction is
available, which is the case if the CPU implements the SHA-3 extension.
This is about 20% faster on Apple M1 when using the 5-way version.
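As a scalar model of the data flow (illustrative only, not the kernel
code), EOR3 folds a three-way XOR into a single instruction where plain
NEON needs two dependent EORs:

  #include <stdint.h>

  /* Plain NEON: two 2-way XORs, the second depending on the first. */
  uint64_t xor_two_eors(uint64_t p, uint64_t q, uint64_t r)
  {
          uint64_t t = p ^ q;     /* EOR #1 */
          return t ^ r;           /* EOR #2, waits for EOR #1 */
  }

  /* With the SHA-3 extension: one 3-way XOR. */
  uint64_t xor_one_eor3(uint64_t p, uint64_t q, uint64_t r)
  {
          return p ^ q ^ r;       /* maps to a single EOR3 */
  }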
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
arch/arm64/Kconfig | 3 +
arch/arm64/lib/xor-neon.c | 145 ++++++++++++++++++++
2 files changed, 148 insertions(+)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 6f2d3e31fb54..14354acba5b4 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2034,6 +2034,9 @@ config SYSVIPC_COMPAT
def_bool y
depends on COMPAT && SYSVIPC
+config CC_HAVE_SHA3
+ def_bool $(cc-option, -march=armv8.2-a+sha3)
+
menu "Power management options"
source "kernel/power/Kconfig"
diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
index ee4795f3e166..0415cb94c781 100644
--- a/arch/arm64/lib/xor-neon.c
+++ b/arch/arm64/lib/xor-neon.c
@@ -172,6 +172,135 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
}
EXPORT_SYMBOL(xor_arm64_neon_5);
+static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
+{
+ uint64x2_t res;
+
+ asm(".arch armv8.2-a+sha3 \n"
+ "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
+ : "=w"(res) : "w"(p), "w"(q), "w"(r));
+ return res;
+}
+
+static void xor_arm64_eor3_3(unsigned long bytes, unsigned long *p1,
+ unsigned long *p2, unsigned long *p3)
+{
+ uint64_t *dp1 = (uint64_t *)p1;
+ uint64_t *dp2 = (uint64_t *)p2;
+ uint64_t *dp3 = (uint64_t *)p3;
+
+ register uint64x2_t v0, v1, v2, v3;
+ long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+ do {
+ /* p1 ^= p2 ^ p3 */
+ v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+ vld1q_u64(dp3 + 0));
+ v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+ vld1q_u64(dp3 + 2));
+ v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+ vld1q_u64(dp3 + 4));
+ v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+ vld1q_u64(dp3 + 6));
+
+ /* store */
+ vst1q_u64(dp1 + 0, v0);
+ vst1q_u64(dp1 + 2, v1);
+ vst1q_u64(dp1 + 4, v2);
+ vst1q_u64(dp1 + 6, v3);
+
+ dp1 += 8;
+ dp2 += 8;
+ dp3 += 8;
+ } while (--lines > 0);
+}
+
+static void xor_arm64_eor3_4(unsigned long bytes, unsigned long *p1,
+ unsigned long *p2, unsigned long *p3,
+ unsigned long *p4)
+{
+ uint64_t *dp1 = (uint64_t *)p1;
+ uint64_t *dp2 = (uint64_t *)p2;
+ uint64_t *dp3 = (uint64_t *)p3;
+ uint64_t *dp4 = (uint64_t *)p4;
+
+ register uint64x2_t v0, v1, v2, v3;
+ long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+ do {
+ /* p1 ^= p2 ^ p3 */
+ v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+ vld1q_u64(dp3 + 0));
+ v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+ vld1q_u64(dp3 + 2));
+ v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+ vld1q_u64(dp3 + 4));
+ v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+ vld1q_u64(dp3 + 6));
+
+ /* p1 ^= p4 */
+ v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
+ v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
+ v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
+ v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
+
+ /* store */
+ vst1q_u64(dp1 + 0, v0);
+ vst1q_u64(dp1 + 2, v1);
+ vst1q_u64(dp1 + 4, v2);
+ vst1q_u64(dp1 + 6, v3);
+
+ dp1 += 8;
+ dp2 += 8;
+ dp3 += 8;
+ dp4 += 8;
+ } while (--lines > 0);
+}
+
+static void xor_arm64_eor3_5(unsigned long bytes, unsigned long *p1,
+ unsigned long *p2, unsigned long *p3,
+ unsigned long *p4, unsigned long *p5)
+{
+ uint64_t *dp1 = (uint64_t *)p1;
+ uint64_t *dp2 = (uint64_t *)p2;
+ uint64_t *dp3 = (uint64_t *)p3;
+ uint64_t *dp4 = (uint64_t *)p4;
+ uint64_t *dp5 = (uint64_t *)p5;
+
+ register uint64x2_t v0, v1, v2, v3;
+ long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+ do {
+ /* p1 ^= p2 ^ p3 */
+ v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+ vld1q_u64(dp3 + 0));
+ v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+ vld1q_u64(dp3 + 2));
+ v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+ vld1q_u64(dp3 + 4));
+ v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+ vld1q_u64(dp3 + 6));
+
+ /* p1 ^= p4 ^ p5 */
+ v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
+ v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
+ v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
+ v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));
+
+ /* store */
+ vst1q_u64(dp1 + 0, v0);
+ vst1q_u64(dp1 + 2, v1);
+ vst1q_u64(dp1 + 4, v2);
+ vst1q_u64(dp1 + 6, v3);
+
+ dp1 += 8;
+ dp2 += 8;
+ dp3 += 8;
+ dp4 += 8;
+ dp5 += 8;
+ } while (--lines > 0);
+}
+
DEFINE_STATIC_CALL(xor_arm64_3, xor_arm64_neon_3);
DEFINE_STATIC_CALL(xor_arm64_4, xor_arm64_neon_4);
DEFINE_STATIC_CALL(xor_arm64_5, xor_arm64_neon_5);
@@ -180,6 +309,22 @@ EXPORT_STATIC_CALL(xor_arm64_3);
EXPORT_STATIC_CALL(xor_arm64_4);
EXPORT_STATIC_CALL(xor_arm64_5);
+static int __init xor_neon_init(void)
+{
+ if (IS_ENABLED(CONFIG_CC_HAVE_SHA3) && cpu_have_named_feature(SHA3)) {
+ static_call_update(xor_arm64_3, xor_arm64_eor3_3);
+ static_call_update(xor_arm64_4, xor_arm64_eor3_4);
+ static_call_update(xor_arm64_5, xor_arm64_eor3_5);
+ }
+ return 0;
+}
+module_init(xor_neon_init);
+
+static void __exit xor_neon_exit(void)
+{
+}
+module_exit(xor_neon_exit);
+
MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
MODULE_DESCRIPTION("ARMv8 XOR Extensions");
MODULE_LICENSE("GPL");
--
2.30.2
* Re: [PATCH 2/2] arm64/xor: use EOR3 instructions when available
From: Catalin Marinas @ 2021-12-13 13:24 UTC
To: Ard Biesheuvel; +Cc: linux-arm-kernel, will, Mark Rutland, Peter Zijlstra
Hi Ard,
I trust you on the algorithm but some minor issues below.
On Tue, Nov 09, 2021 at 01:03:36PM +0100, Ard Biesheuvel wrote:
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 6f2d3e31fb54..14354acba5b4 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -2034,6 +2034,9 @@ config SYSVIPC_COMPAT
> def_bool y
> depends on COMPAT && SYSVIPC
>
> +config CC_HAVE_SHA3
> + def_bool $(cc-option, -march=armv8.2-a+sha3)
Is it the compiler or the assembler that needs to support this? I
think it's sufficient to only check the latter.
I'd also move it to the ARMv8.2 section.
> +
> menu "Power management options"
>
> source "kernel/power/Kconfig"
> diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
> index ee4795f3e166..0415cb94c781 100644
> --- a/arch/arm64/lib/xor-neon.c
> +++ b/arch/arm64/lib/xor-neon.c
> @@ -172,6 +172,135 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
> }
> EXPORT_SYMBOL(xor_arm64_neon_5);
>
> +static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
> +{
> + uint64x2_t res;
> +
> + asm(".arch armv8.2-a+sha3 \n"
> + "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
> + : "=w"(res) : "w"(p), "w"(q), "w"(r));
> + return res;
> +}
The .arch here may confuse the compiler/assembler since it overrides any
other .arch. I think this diff on top would do but I haven't extensively
tested it. I can fold it in if you give it a try:
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 5adae54c98d8..c5104e8829e5 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1545,6 +1545,12 @@ endmenu
menu "ARMv8.2 architectural features"
+config AS_HAS_ARMV8_2
+ def_bool $(cc-option,-Wa$(comma)-march=armv8.2-a)
+
+config AS_HAS_SHA3
+ def_bool $(as-instr,.arch armv8.2-a+sha3)
+
config ARM64_PMEM
bool "Enable support for persistent memory"
select ARCH_HAS_PMEM_API
@@ -2032,9 +2038,6 @@ config SYSVIPC_COMPAT
def_bool y
depends on COMPAT && SYSVIPC
-config CC_HAVE_SHA3
- def_bool $(cc-option, -march=armv8.2-a+sha3)
-
menu "Power management options"
source "kernel/power/Kconfig"
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index e8cfc5868aa8..2f1de88651e6 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -58,6 +58,11 @@ stack_protector_prepare: prepare0
include/generated/asm-offsets.h))
endif
+ifeq ($(CONFIG_AS_HAS_ARMV8_2), y)
+# make sure to pass the newest target architecture to -march.
+asm-arch := armv8.2-a
+endif
+
# Ensure that if the compiler supports branch protection we default it
# off, this will be overridden if we are using branch protection.
branch-prot-flags-y += $(call cc-option,-mbranch-protection=none)
diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
index 0415cb94c781..2ca823825363 100644
--- a/arch/arm64/lib/xor-neon.c
+++ b/arch/arm64/lib/xor-neon.c
@@ -176,7 +176,7 @@ static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
{
uint64x2_t res;
- asm(".arch armv8.2-a+sha3 \n"
+ asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
"eor3 %0.16b, %1.16b, %2.16b, %3.16b"
: "=w"(res) : "w"(p), "w"(q), "w"(r));
return res;
@@ -311,7 +311,7 @@ EXPORT_STATIC_CALL(xor_arm64_5);
static int __init xor_neon_init(void)
{
- if (IS_ENABLED(CONFIG_CC_HAVE_SHA3) && cpu_have_named_feature(SHA3)) {
+ if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
static_call_update(xor_arm64_3, xor_arm64_eor3_3);
static_call_update(xor_arm64_4, xor_arm64_eor3_4);
static_call_update(xor_arm64_5, xor_arm64_eor3_5);
--
Catalin
* Re: [PATCH 2/2] arm64/xor: use EOR3 instructions when available
From: Ard Biesheuvel @ 2021-12-13 13:33 UTC
To: Catalin Marinas; +Cc: Linux ARM, Will Deacon, Mark Rutland, Peter Zijlstra
On Mon, 13 Dec 2021 at 14:25, Catalin Marinas <catalin.marinas@arm.com> wrote:
>
> Hi Ard,
>
> I trust you on the algorithm but some minor issues below.
>
> On Tue, Nov 09, 2021 at 01:03:36PM +0100, Ard Biesheuvel wrote:
> > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > index 6f2d3e31fb54..14354acba5b4 100644
> > --- a/arch/arm64/Kconfig
> > +++ b/arch/arm64/Kconfig
> > @@ -2034,6 +2034,9 @@ config SYSVIPC_COMPAT
> > def_bool y
> > depends on COMPAT && SYSVIPC
> >
> > +config CC_HAVE_SHA3
> > + def_bool $(cc-option, -march=armv8.2-a+sha3)
>
> Is it the compiler or the assembler that needs to support this? I
> think it's sufficient to only check the latter.
>
> I'd also move it to the ARMv8.2 section.
>
> > +
> > menu "Power management options"
> >
> > source "kernel/power/Kconfig"
> > diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
> > index ee4795f3e166..0415cb94c781 100644
> > --- a/arch/arm64/lib/xor-neon.c
> > +++ b/arch/arm64/lib/xor-neon.c
> > @@ -172,6 +172,135 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
> > }
> > EXPORT_SYMBOL(xor_arm64_neon_5);
> >
> > +static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
> > +{
> > + uint64x2_t res;
> > +
> > + asm(".arch armv8.2-a+sha3 \n"
> > + "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
> > + : "=w"(res) : "w"(p), "w"(q), "w"(r));
> > + return res;
> > +}
>
> The .arch here may confuse the compiler/assembler since it overrides any
> other .arch. I think this diff on top would do but I haven't extensively
> tested it. I can fold it in if you give it a try:
>
I was going to respin this without the static_call changes, since
those are not going to land anytime soon, and for this code, it
doesn't really matter anyway. I'll fold in your diff and test it as
well.
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 5adae54c98d8..c5104e8829e5 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -1545,6 +1545,12 @@ endmenu
>
> menu "ARMv8.2 architectural features"
>
> +config AS_HAS_ARMV8_2
> + def_bool $(cc-option,-Wa$(comma)-march=armv8.2-a)
> +
> +config AS_HAS_SHA3
> + def_bool $(as-instr,.arch armv8.2-a+sha3)
> +
> config ARM64_PMEM
> bool "Enable support for persistent memory"
> select ARCH_HAS_PMEM_API
> @@ -2032,9 +2038,6 @@ config SYSVIPC_COMPAT
> def_bool y
> depends on COMPAT && SYSVIPC
>
> -config CC_HAVE_SHA3
> - def_bool $(cc-option, -march=armv8.2-a+sha3)
> -
> menu "Power management options"
>
> source "kernel/power/Kconfig"
> diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
> index e8cfc5868aa8..2f1de88651e6 100644
> --- a/arch/arm64/Makefile
> +++ b/arch/arm64/Makefile
> @@ -58,6 +58,11 @@ stack_protector_prepare: prepare0
> include/generated/asm-offsets.h))
> endif
>
> +ifeq ($(CONFIG_AS_HAS_ARMV8_2), y)
> +# make sure to pass the newest target architecture to -march.
> +asm-arch := armv8.2-a
> +endif
> +
> # Ensure that if the compiler supports branch protection we default it
> # off, this will be overridden if we are using branch protection.
> branch-prot-flags-y += $(call cc-option,-mbranch-protection=none)
> diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
> index 0415cb94c781..2ca823825363 100644
> --- a/arch/arm64/lib/xor-neon.c
> +++ b/arch/arm64/lib/xor-neon.c
> @@ -176,7 +176,7 @@ static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
> {
> uint64x2_t res;
>
> - asm(".arch armv8.2-a+sha3 \n"
> + asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
> "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
> : "=w"(res) : "w"(p), "w"(q), "w"(r));
> return res;
> @@ -311,7 +311,7 @@ EXPORT_STATIC_CALL(xor_arm64_5);
>
> static int __init xor_neon_init(void)
> {
> - if (IS_ENABLED(CONFIG_CC_HAVE_SHA3) && cpu_have_named_feature(SHA3)) {
> + if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
> static_call_update(xor_arm64_3, xor_arm64_eor3_3);
> static_call_update(xor_arm64_4, xor_arm64_eor3_4);
> static_call_update(xor_arm64_5, xor_arm64_eor3_5);
>
> --
> Catalin
* Re: [PATCH 2/2] arm64/xor: use EOR3 instructions when available
From: Catalin Marinas @ 2021-12-13 15:05 UTC
To: Ard Biesheuvel; +Cc: Linux ARM, Will Deacon, Mark Rutland, Peter Zijlstra
On Mon, Dec 13, 2021 at 02:33:21PM +0100, Ard Biesheuvel wrote:
> On Mon, 13 Dec 2021 at 14:25, Catalin Marinas <catalin.marinas@arm.com> wrote:
> > On Tue, Nov 09, 2021 at 01:03:36PM +0100, Ard Biesheuvel wrote:
> > > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > > index 6f2d3e31fb54..14354acba5b4 100644
> > > --- a/arch/arm64/Kconfig
> > > +++ b/arch/arm64/Kconfig
> > > @@ -2034,6 +2034,9 @@ config SYSVIPC_COMPAT
> > > def_bool y
> > > depends on COMPAT && SYSVIPC
> > >
> > > +config CC_HAVE_SHA3
> > > + def_bool $(cc-option, -march=armv8.2-a+sha3)
> >
> > Is it the compiler or the assembler that needs to support this? I
> > think it's sufficient to only check the latter.
> >
> > I'd also move it to the ARMv8.2 section.
> >
> > > +
> > > menu "Power management options"
> > >
> > > source "kernel/power/Kconfig"
> > > diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
> > > index ee4795f3e166..0415cb94c781 100644
> > > --- a/arch/arm64/lib/xor-neon.c
> > > +++ b/arch/arm64/lib/xor-neon.c
> > > @@ -172,6 +172,135 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
> > > }
> > > EXPORT_SYMBOL(xor_arm64_neon_5);
> > >
> > > +static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
> > > +{
> > > + uint64x2_t res;
> > > +
> > > + asm(".arch armv8.2-a+sha3 \n"
> > > + "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
> > > + : "=w"(res) : "w"(p), "w"(q), "w"(r));
> > > + return res;
> > > +}
> >
> > The .arch here may confuse the compiler/assembler since it overrides any
> > other .arch. I think this diff on top would do but I haven't extensively
> > tested it. I can fold it in if you give it a try:
>
> I was going to respin this without the static_call changes, since
> those are not going to land anytime soon,
I thought the generic implementation still works, though it's not the most
efficient.
> and for this code, it
> doesn't really matter anyway. I'll fold in your diff and test it as
> well.
Sounds fine to me.
--
Catalin
* Re: [PATCH 2/2] arm64/xor: use EOR3 instructions when available
From: Ard Biesheuvel @ 2021-12-13 15:10 UTC
To: Catalin Marinas; +Cc: Linux ARM, Will Deacon, Mark Rutland, Peter Zijlstra
On Mon, 13 Dec 2021 at 16:05, Catalin Marinas <catalin.marinas@arm.com> wrote:
>
> On Mon, Dec 13, 2021 at 02:33:21PM +0100, Ard Biesheuvel wrote:
> > On Mon, 13 Dec 2021 at 14:25, Catalin Marinas <catalin.marinas@arm.com> wrote:
> > > On Tue, Nov 09, 2021 at 01:03:36PM +0100, Ard Biesheuvel wrote:
> > > > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > > > index 6f2d3e31fb54..14354acba5b4 100644
> > > > --- a/arch/arm64/Kconfig
> > > > +++ b/arch/arm64/Kconfig
> > > > @@ -2034,6 +2034,9 @@ config SYSVIPC_COMPAT
> > > > def_bool y
> > > > depends on COMPAT && SYSVIPC
> > > >
> > > > +config CC_HAVE_SHA3
> > > > + def_bool $(cc-option, -march=armv8.2-a+sha3)
> > >
> > > Is it the compiler or the assembler that needs to support this? I
> > > think it's sufficient to only check the latter.
> > >
> > > I'd also move it to the ARMv8.2 section.
> > >
> > > > +
> > > > menu "Power management options"
> > > >
> > > > source "kernel/power/Kconfig"
> > > > diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
> > > > index ee4795f3e166..0415cb94c781 100644
> > > > --- a/arch/arm64/lib/xor-neon.c
> > > > +++ b/arch/arm64/lib/xor-neon.c
> > > > @@ -172,6 +172,135 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
> > > > }
> > > > EXPORT_SYMBOL(xor_arm64_neon_5);
> > > >
> > > > +static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
> > > > +{
> > > > + uint64x2_t res;
> > > > +
> > > > + asm(".arch armv8.2-a+sha3 \n"
> > > > + "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
> > > > + : "=w"(res) : "w"(p), "w"(q), "w"(r));
> > > > + return res;
> > > > +}
> > >
> > > The .arch here may confuse the compiler/assembler since it overrides any
> > > other .arch. I think this diff on top would do but I haven't extensively
> > > tested it. I can fold it in if you give it a try:
> >
> > I was going to respin this without the static_call changes, since
> > those are not going to land anytime soon,
>
> I thought the generic implementation still works, though not the most
> efficient.
>
It does work, but the existing code already uses function pointers, so
at this point, it is just unneeded churn.
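(For context: the dispatch in question is the xor_block_template used by
the generic xor_blocks() machinery. A rough sketch of its shape,
paraphrased from include/linux/raid/xor.h:

  struct xor_block_template {
          struct xor_block_template *next;
          const char *name;
          int speed;
          void (*do_2)(unsigned long, unsigned long *, unsigned long *);
          void (*do_3)(unsigned long, unsigned long *, unsigned long *,
                       unsigned long *);
          /* ... do_4 and do_5 follow the same pattern ... */
  };

so the top-level dispatch is an indirect call regardless.)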
> > and for this code, it
> > doesn't really matter anyway. I'll fold in your diff and test it as
> > well.
>
> Sounds fine to me.
>
> --
> Catalin