All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] crypto/arm64: aes-ce-cipher - move assembler code to .S file
@ 2017-11-21 13:40 ` Ard Biesheuvel
  0 siblings, 0 replies; 14+ messages in thread
From: Ard Biesheuvel @ 2017-11-21 13:40 UTC (permalink / raw)
  To: linux-arm-kernel, linux-crypto
  Cc: mark.rutland, samitolvanen, alxmtvv, ak, ghackmann, keescook,
	michal.lkml, ndesaulniers, ynorov, mka, will.deacon, herbert,
	Ard Biesheuvel

Most crypto drivers involving kernel mode NEON take care to put the code
that actually touches the NEON register file in a separate compilation
unit, to prevent the compiler from reordering code that preserves or
restores the NEON context with code that may corrupt it. This is
necessary because we currently have no way to express the restrictions
imposed upon use of the NEON in kernel mode in a way that the compiler
understands.

However, in the case of aes-ce-cipher, it did not seem unreasonable to
deviate from this rule, given how it does not seem possible for the
compiler to reorder cross object function calls with asm blocks whose
in- and output constraints reflect that it reads from and writes to
memory.

Now that LTO is being proposed for the arm64 kernel, it is time to
revisit this. The link time optimization may replace the function
calls to kernel_neon_begin() and kernel_neon_end() with instantiations
of the IR that make up its implementation, allowing further reordering
with the asm block.

So let's clean this up, and move the asm() blocks into a separate .S
file.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/crypto/Makefile                         |   2 +-
 arch/arm64/crypto/aes-ce-core.S                    |  87 ++++++++++++++++
 .../crypto/{aes-ce-cipher.c => aes-ce-glue.c}      | 115 +++------------------
 3 files changed, 100 insertions(+), 104 deletions(-)
 create mode 100644 arch/arm64/crypto/aes-ce-core.S
 rename arch/arm64/crypto/{aes-ce-cipher.c => aes-ce-glue.c} (62%)

diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index b5edc5918c28..f5e8295fd756 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -24,7 +24,7 @@ obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o
 crc32-ce-y:= crc32-ce-core.o crc32-ce-glue.o
 
 obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
-CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
+aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o
 
 obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o
 aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o
diff --git a/arch/arm64/crypto/aes-ce-core.S b/arch/arm64/crypto/aes-ce-core.S
new file mode 100644
index 000000000000..8efdfdade393
--- /dev/null
+++ b/arch/arm64/crypto/aes-ce-core.S
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.arch		armv8-a+crypto
+
+ENTRY(__aes_ce_encrypt)
+	sub		w3, w3, #2
+	ld1		{v0.16b}, [x2]
+	ld1		{v1.4s}, [x0], #16
+	cmp		w3, #10
+	bmi		0f
+	bne		3f
+	mov		v3.16b, v1.16b
+	b		2f
+0:	mov		v2.16b, v1.16b
+	ld1		{v3.4s}, [x0], #16
+1:	aese		v0.16b, v2.16b
+	aesmc		v0.16b, v0.16b
+2:	ld1		{v1.4s}, [x0], #16
+	aese		v0.16b, v3.16b
+	aesmc		v0.16b, v0.16b
+3:	ld1		{v2.4s}, [x0], #16
+	subs		w3, w3, #3
+	aese		v0.16b, v1.16b
+	aesmc		v0.16b, v0.16b
+	ld1		{v3.4s}, [x0], #16
+	bpl		1b
+	aese		v0.16b, v2.16b
+	eor		v0.16b, v0.16b, v3.16b
+	st1		{v0.16b}, [x1]
+	ret
+ENDPROC(__aes_ce_encrypt)
+
+ENTRY(__aes_ce_decrypt)
+	sub		w3, w3, #2
+	ld1		{v0.16b}, [x2]
+	ld1		{v1.4s}, [x0], #16
+	cmp		w3, #10
+	bmi		0f
+	bne		3f
+	mov		v3.16b, v1.16b
+	b		2f
+0:	mov		v2.16b, v1.16b
+	ld1		{v3.4s}, [x0], #16
+1:	aesd		v0.16b, v2.16b
+	aesimc		v0.16b, v0.16b
+2:	ld1		{v1.4s}, [x0], #16
+	aesd		v0.16b, v3.16b
+	aesimc		v0.16b, v0.16b
+3:	ld1		{v2.4s}, [x0], #16
+	subs		w3, w3, #3
+	aesd		v0.16b, v1.16b
+	aesimc		v0.16b, v0.16b
+	ld1		{v3.4s}, [x0], #16
+	bpl		1b
+	aesd		v0.16b, v2.16b
+	eor		v0.16b, v0.16b, v3.16b
+	st1		{v0.16b}, [x1]
+	ret
+ENDPROC(__aes_ce_decrypt)
+
+/*
+ * __aes_ce_sub() - use the aese instruction to perform the AES sbox
+ *                  substitution on each byte in 'input'
+ */
+ENTRY(__aes_ce_sub)
+	dup		v1.4s, w0
+	movi		v0.16b, #0
+	aese		v0.16b, v1.16b
+	umov		w0, v0.s[0]
+	ret
+ENDPROC(__aes_ce_sub)
+
+ENTRY(__aes_ce_invert)
+	ld1		{v0.4s}, [x1]
+	aesimc		v1.16b, v0.16b
+	st1		{v1.4s}, [x0]
+	ret
+ENDPROC(__aes_ce_invert)
diff --git a/arch/arm64/crypto/aes-ce-cipher.c b/arch/arm64/crypto/aes-ce-glue.c
similarity index 62%
rename from arch/arm64/crypto/aes-ce-cipher.c
rename to arch/arm64/crypto/aes-ce-glue.c
index 6a75cd75ed11..e6b3227bbf57 100644
--- a/arch/arm64/crypto/aes-ce-cipher.c
+++ b/arch/arm64/crypto/aes-ce-glue.c
@@ -29,6 +29,13 @@ struct aes_block {
 	u8 b[AES_BLOCK_SIZE];
 };
 
+asmlinkage void __aes_ce_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
+asmlinkage void __aes_ce_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
+
+asmlinkage u32 __aes_ce_sub(u32 l);
+asmlinkage void __aes_ce_invert(struct aes_block *out,
+				const struct aes_block *in);
+
 static int num_rounds(struct crypto_aes_ctx *ctx)
 {
 	/*
@@ -44,10 +51,6 @@ static int num_rounds(struct crypto_aes_ctx *ctx)
 static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
 {
 	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
-	struct aes_block *out = (struct aes_block *)dst;
-	struct aes_block const *in = (struct aes_block *)src;
-	void *dummy0;
-	int dummy1;
 
 	if (!may_use_simd()) {
 		__aes_arm64_encrypt(ctx->key_enc, dst, src, num_rounds(ctx));
@@ -55,49 +58,13 @@ static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
 	}
 
 	kernel_neon_begin();
-
-	__asm__("	ld1	{v0.16b}, %[in]			;"
-		"	ld1	{v1.4s}, [%[key]], #16		;"
-		"	cmp	%w[rounds], #10			;"
-		"	bmi	0f				;"
-		"	bne	3f				;"
-		"	mov	v3.16b, v1.16b			;"
-		"	b	2f				;"
-		"0:	mov	v2.16b, v1.16b			;"
-		"	ld1	{v3.4s}, [%[key]], #16		;"
-		"1:	aese	v0.16b, v2.16b			;"
-		"	aesmc	v0.16b, v0.16b			;"
-		"2:	ld1	{v1.4s}, [%[key]], #16		;"
-		"	aese	v0.16b, v3.16b			;"
-		"	aesmc	v0.16b, v0.16b			;"
-		"3:	ld1	{v2.4s}, [%[key]], #16		;"
-		"	subs	%w[rounds], %w[rounds], #3	;"
-		"	aese	v0.16b, v1.16b			;"
-		"	aesmc	v0.16b, v0.16b			;"
-		"	ld1	{v3.4s}, [%[key]], #16		;"
-		"	bpl	1b				;"
-		"	aese	v0.16b, v2.16b			;"
-		"	eor	v0.16b, v0.16b, v3.16b		;"
-		"	st1	{v0.16b}, %[out]		;"
-
-	:	[out]		"=Q"(*out),
-		[key]		"=r"(dummy0),
-		[rounds]	"=r"(dummy1)
-	:	[in]		"Q"(*in),
-				"1"(ctx->key_enc),
-				"2"(num_rounds(ctx) - 2)
-	:	"cc");
-
+	__aes_ce_encrypt(ctx->key_enc, dst, src, num_rounds(ctx));
 	kernel_neon_end();
 }
 
 static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
 {
 	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
-	struct aes_block *out = (struct aes_block *)dst;
-	struct aes_block const *in = (struct aes_block *)src;
-	void *dummy0;
-	int dummy1;
 
 	if (!may_use_simd()) {
 		__aes_arm64_decrypt(ctx->key_dec, dst, src, num_rounds(ctx));
@@ -105,62 +72,10 @@ static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
 	}
 
 	kernel_neon_begin();
-
-	__asm__("	ld1	{v0.16b}, %[in]			;"
-		"	ld1	{v1.4s}, [%[key]], #16		;"
-		"	cmp	%w[rounds], #10			;"
-		"	bmi	0f				;"
-		"	bne	3f				;"
-		"	mov	v3.16b, v1.16b			;"
-		"	b	2f				;"
-		"0:	mov	v2.16b, v1.16b			;"
-		"	ld1	{v3.4s}, [%[key]], #16		;"
-		"1:	aesd	v0.16b, v2.16b			;"
-		"	aesimc	v0.16b, v0.16b			;"
-		"2:	ld1	{v1.4s}, [%[key]], #16		;"
-		"	aesd	v0.16b, v3.16b			;"
-		"	aesimc	v0.16b, v0.16b			;"
-		"3:	ld1	{v2.4s}, [%[key]], #16		;"
-		"	subs	%w[rounds], %w[rounds], #3	;"
-		"	aesd	v0.16b, v1.16b			;"
-		"	aesimc	v0.16b, v0.16b			;"
-		"	ld1	{v3.4s}, [%[key]], #16		;"
-		"	bpl	1b				;"
-		"	aesd	v0.16b, v2.16b			;"
-		"	eor	v0.16b, v0.16b, v3.16b		;"
-		"	st1	{v0.16b}, %[out]		;"
-
-	:	[out]		"=Q"(*out),
-		[key]		"=r"(dummy0),
-		[rounds]	"=r"(dummy1)
-	:	[in]		"Q"(*in),
-				"1"(ctx->key_dec),
-				"2"(num_rounds(ctx) - 2)
-	:	"cc");
-
+	__aes_ce_decrypt(ctx->key_dec, dst, src, num_rounds(ctx));
 	kernel_neon_end();
 }
 
-/*
- * aes_sub() - use the aese instruction to perform the AES sbox substitution
- *             on each byte in 'input'
- */
-static u32 aes_sub(u32 input)
-{
-	u32 ret;
-
-	__asm__("dup	v1.4s, %w[in]		;"
-		"movi	v0.16b, #0		;"
-		"aese	v0.16b, v1.16b		;"
-		"umov	%w[out], v0.4s[0]	;"
-
-	:	[out]	"=r"(ret)
-	:	[in]	"r"(input)
-	:		"v0","v1");
-
-	return ret;
-}
-
 int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
 		     unsigned int key_len)
 {
@@ -189,7 +104,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
 		u32 *rki = ctx->key_enc + (i * kwords);
 		u32 *rko = rki + kwords;
 
-		rko[0] = ror32(aes_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0];
+		rko[0] = ror32(__aes_ce_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0];
 		rko[1] = rko[0] ^ rki[1];
 		rko[2] = rko[1] ^ rki[2];
 		rko[3] = rko[2] ^ rki[3];
@@ -202,7 +117,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
 		} else if (key_len == AES_KEYSIZE_256) {
 			if (i >= 6)
 				break;
-			rko[4] = aes_sub(rko[3]) ^ rki[4];
+			rko[4] = __aes_ce_sub(rko[3]) ^ rki[4];
 			rko[5] = rko[4] ^ rki[5];
 			rko[6] = rko[5] ^ rki[6];
 			rko[7] = rko[6] ^ rki[7];
@@ -221,13 +136,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
 
 	key_dec[0] = key_enc[j];
 	for (i = 1, j--; j > 0; i++, j--)
-		__asm__("ld1	{v0.4s}, %[in]		;"
-			"aesimc	v1.16b, v0.16b		;"
-			"st1	{v1.4s}, %[out]	;"
-
-		:	[out]	"=Q"(key_dec[i])
-		:	[in]	"Q"(key_enc[j])
-		:		"v0","v1");
+		__aes_ce_invert(key_dec + i, key_enc + j);
 	key_dec[i] = key_enc[0];
 
 	kernel_neon_end();
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH] crypto/arm64: aes-ce-cipher - move assembler code to .S file
@ 2017-11-21 13:40 ` Ard Biesheuvel
  0 siblings, 0 replies; 14+ messages in thread
From: Ard Biesheuvel @ 2017-11-21 13:40 UTC (permalink / raw)
  To: linux-arm-kernel

Most crypto drivers involving kernel mode NEON take care to put the code
that actually touches the NEON register file in a separate compilation
unit, to prevent the compiler from reordering code that preserves or
restores the NEON context with code that may corrupt it. This is
necessary because we currently have no way to express the restrictions
imposed upon use of the NEON in kernel mode in a way that the compiler
understands.

However, in the case of aes-ce-cipher, it did not seem unreasonable to
deviate from this rule, given how it does not seem possible for the
compiler to reorder cross object function calls with asm blocks whose
in- and output constraints reflect that it reads from and writes to
memory.

Now that LTO is being proposed for the arm64 kernel, it is time to
revisit this. The link time optimization may replace the function
calls to kernel_neon_begin() and kernel_neon_end() with instantiations
of the IR that make up its implementation, allowing further reordering
with the asm block.

So let's clean this up, and move the asm() blocks into a separate .S
file.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/arm64/crypto/Makefile                         |   2 +-
 arch/arm64/crypto/aes-ce-core.S                    |  87 ++++++++++++++++
 .../crypto/{aes-ce-cipher.c => aes-ce-glue.c}      | 115 +++------------------
 3 files changed, 100 insertions(+), 104 deletions(-)
 create mode 100644 arch/arm64/crypto/aes-ce-core.S
 rename arch/arm64/crypto/{aes-ce-cipher.c => aes-ce-glue.c} (62%)

diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index b5edc5918c28..f5e8295fd756 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -24,7 +24,7 @@ obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o
 crc32-ce-y:= crc32-ce-core.o crc32-ce-glue.o
 
 obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
-CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
+aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o
 
 obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o
 aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o
diff --git a/arch/arm64/crypto/aes-ce-core.S b/arch/arm64/crypto/aes-ce-core.S
new file mode 100644
index 000000000000..8efdfdade393
--- /dev/null
+++ b/arch/arm64/crypto/aes-ce-core.S
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.arch		armv8-a+crypto
+
+ENTRY(__aes_ce_encrypt)
+	sub		w3, w3, #2
+	ld1		{v0.16b}, [x2]
+	ld1		{v1.4s}, [x0], #16
+	cmp		w3, #10
+	bmi		0f
+	bne		3f
+	mov		v3.16b, v1.16b
+	b		2f
+0:	mov		v2.16b, v1.16b
+	ld1		{v3.4s}, [x0], #16
+1:	aese		v0.16b, v2.16b
+	aesmc		v0.16b, v0.16b
+2:	ld1		{v1.4s}, [x0], #16
+	aese		v0.16b, v3.16b
+	aesmc		v0.16b, v0.16b
+3:	ld1		{v2.4s}, [x0], #16
+	subs		w3, w3, #3
+	aese		v0.16b, v1.16b
+	aesmc		v0.16b, v0.16b
+	ld1		{v3.4s}, [x0], #16
+	bpl		1b
+	aese		v0.16b, v2.16b
+	eor		v0.16b, v0.16b, v3.16b
+	st1		{v0.16b}, [x1]
+	ret
+ENDPROC(__aes_ce_encrypt)
+
+ENTRY(__aes_ce_decrypt)
+	sub		w3, w3, #2
+	ld1		{v0.16b}, [x2]
+	ld1		{v1.4s}, [x0], #16
+	cmp		w3, #10
+	bmi		0f
+	bne		3f
+	mov		v3.16b, v1.16b
+	b		2f
+0:	mov		v2.16b, v1.16b
+	ld1		{v3.4s}, [x0], #16
+1:	aesd		v0.16b, v2.16b
+	aesimc		v0.16b, v0.16b
+2:	ld1		{v1.4s}, [x0], #16
+	aesd		v0.16b, v3.16b
+	aesimc		v0.16b, v0.16b
+3:	ld1		{v2.4s}, [x0], #16
+	subs		w3, w3, #3
+	aesd		v0.16b, v1.16b
+	aesimc		v0.16b, v0.16b
+	ld1		{v3.4s}, [x0], #16
+	bpl		1b
+	aesd		v0.16b, v2.16b
+	eor		v0.16b, v0.16b, v3.16b
+	st1		{v0.16b}, [x1]
+	ret
+ENDPROC(__aes_ce_decrypt)
+
+/*
+ * __aes_ce_sub() - use the aese instruction to perform the AES sbox
+ *                  substitution on each byte in 'input'
+ */
+ENTRY(__aes_ce_sub)
+	dup		v1.4s, w0
+	movi		v0.16b, #0
+	aese		v0.16b, v1.16b
+	umov		w0, v0.s[0]
+	ret
+ENDPROC(__aes_ce_sub)
+
+ENTRY(__aes_ce_invert)
+	ld1		{v0.4s}, [x1]
+	aesimc		v1.16b, v0.16b
+	st1		{v1.4s}, [x0]
+	ret
+ENDPROC(__aes_ce_invert)
diff --git a/arch/arm64/crypto/aes-ce-cipher.c b/arch/arm64/crypto/aes-ce-glue.c
similarity index 62%
rename from arch/arm64/crypto/aes-ce-cipher.c
rename to arch/arm64/crypto/aes-ce-glue.c
index 6a75cd75ed11..e6b3227bbf57 100644
--- a/arch/arm64/crypto/aes-ce-cipher.c
+++ b/arch/arm64/crypto/aes-ce-glue.c
@@ -29,6 +29,13 @@ struct aes_block {
 	u8 b[AES_BLOCK_SIZE];
 };
 
+asmlinkage void __aes_ce_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
+asmlinkage void __aes_ce_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
+
+asmlinkage u32 __aes_ce_sub(u32 l);
+asmlinkage void __aes_ce_invert(struct aes_block *out,
+				const struct aes_block *in);
+
 static int num_rounds(struct crypto_aes_ctx *ctx)
 {
 	/*
@@ -44,10 +51,6 @@ static int num_rounds(struct crypto_aes_ctx *ctx)
 static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
 {
 	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
-	struct aes_block *out = (struct aes_block *)dst;
-	struct aes_block const *in = (struct aes_block *)src;
-	void *dummy0;
-	int dummy1;
 
 	if (!may_use_simd()) {
 		__aes_arm64_encrypt(ctx->key_enc, dst, src, num_rounds(ctx));
@@ -55,49 +58,13 @@ static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
 	}
 
 	kernel_neon_begin();
-
-	__asm__("	ld1	{v0.16b}, %[in]			;"
-		"	ld1	{v1.4s}, [%[key]], #16		;"
-		"	cmp	%w[rounds], #10			;"
-		"	bmi	0f				;"
-		"	bne	3f				;"
-		"	mov	v3.16b, v1.16b			;"
-		"	b	2f				;"
-		"0:	mov	v2.16b, v1.16b			;"
-		"	ld1	{v3.4s}, [%[key]], #16		;"
-		"1:	aese	v0.16b, v2.16b			;"
-		"	aesmc	v0.16b, v0.16b			;"
-		"2:	ld1	{v1.4s}, [%[key]], #16		;"
-		"	aese	v0.16b, v3.16b			;"
-		"	aesmc	v0.16b, v0.16b			;"
-		"3:	ld1	{v2.4s}, [%[key]], #16		;"
-		"	subs	%w[rounds], %w[rounds], #3	;"
-		"	aese	v0.16b, v1.16b			;"
-		"	aesmc	v0.16b, v0.16b			;"
-		"	ld1	{v3.4s}, [%[key]], #16		;"
-		"	bpl	1b				;"
-		"	aese	v0.16b, v2.16b			;"
-		"	eor	v0.16b, v0.16b, v3.16b		;"
-		"	st1	{v0.16b}, %[out]		;"
-
-	:	[out]		"=Q"(*out),
-		[key]		"=r"(dummy0),
-		[rounds]	"=r"(dummy1)
-	:	[in]		"Q"(*in),
-				"1"(ctx->key_enc),
-				"2"(num_rounds(ctx) - 2)
-	:	"cc");
-
+	__aes_ce_encrypt(ctx->key_enc, dst, src, num_rounds(ctx));
 	kernel_neon_end();
 }
 
 static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
 {
 	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
-	struct aes_block *out = (struct aes_block *)dst;
-	struct aes_block const *in = (struct aes_block *)src;
-	void *dummy0;
-	int dummy1;
 
 	if (!may_use_simd()) {
 		__aes_arm64_decrypt(ctx->key_dec, dst, src, num_rounds(ctx));
@@ -105,62 +72,10 @@ static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
 	}
 
 	kernel_neon_begin();
-
-	__asm__("	ld1	{v0.16b}, %[in]			;"
-		"	ld1	{v1.4s}, [%[key]], #16		;"
-		"	cmp	%w[rounds], #10			;"
-		"	bmi	0f				;"
-		"	bne	3f				;"
-		"	mov	v3.16b, v1.16b			;"
-		"	b	2f				;"
-		"0:	mov	v2.16b, v1.16b			;"
-		"	ld1	{v3.4s}, [%[key]], #16		;"
-		"1:	aesd	v0.16b, v2.16b			;"
-		"	aesimc	v0.16b, v0.16b			;"
-		"2:	ld1	{v1.4s}, [%[key]], #16		;"
-		"	aesd	v0.16b, v3.16b			;"
-		"	aesimc	v0.16b, v0.16b			;"
-		"3:	ld1	{v2.4s}, [%[key]], #16		;"
-		"	subs	%w[rounds], %w[rounds], #3	;"
-		"	aesd	v0.16b, v1.16b			;"
-		"	aesimc	v0.16b, v0.16b			;"
-		"	ld1	{v3.4s}, [%[key]], #16		;"
-		"	bpl	1b				;"
-		"	aesd	v0.16b, v2.16b			;"
-		"	eor	v0.16b, v0.16b, v3.16b		;"
-		"	st1	{v0.16b}, %[out]		;"
-
-	:	[out]		"=Q"(*out),
-		[key]		"=r"(dummy0),
-		[rounds]	"=r"(dummy1)
-	:	[in]		"Q"(*in),
-				"1"(ctx->key_dec),
-				"2"(num_rounds(ctx) - 2)
-	:	"cc");
-
+	__aes_ce_decrypt(ctx->key_dec, dst, src, num_rounds(ctx));
 	kernel_neon_end();
 }
 
-/*
- * aes_sub() - use the aese instruction to perform the AES sbox substitution
- *             on each byte in 'input'
- */
-static u32 aes_sub(u32 input)
-{
-	u32 ret;
-
-	__asm__("dup	v1.4s, %w[in]		;"
-		"movi	v0.16b, #0		;"
-		"aese	v0.16b, v1.16b		;"
-		"umov	%w[out], v0.4s[0]	;"
-
-	:	[out]	"=r"(ret)
-	:	[in]	"r"(input)
-	:		"v0","v1");
-
-	return ret;
-}
-
 int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
 		     unsigned int key_len)
 {
@@ -189,7 +104,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
 		u32 *rki = ctx->key_enc + (i * kwords);
 		u32 *rko = rki + kwords;
 
-		rko[0] = ror32(aes_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0];
+		rko[0] = ror32(__aes_ce_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0];
 		rko[1] = rko[0] ^ rki[1];
 		rko[2] = rko[1] ^ rki[2];
 		rko[3] = rko[2] ^ rki[3];
@@ -202,7 +117,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
 		} else if (key_len == AES_KEYSIZE_256) {
 			if (i >= 6)
 				break;
-			rko[4] = aes_sub(rko[3]) ^ rki[4];
+			rko[4] = __aes_ce_sub(rko[3]) ^ rki[4];
 			rko[5] = rko[4] ^ rki[5];
 			rko[6] = rko[5] ^ rki[6];
 			rko[7] = rko[6] ^ rki[7];
@@ -221,13 +136,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
 
 	key_dec[0] = key_enc[j];
 	for (i = 1, j--; j > 0; i++, j--)
-		__asm__("ld1	{v0.4s}, %[in]		;"
-			"aesimc	v1.16b, v0.16b		;"
-			"st1	{v1.4s}, %[out]	;"
-
-		:	[out]	"=Q"(key_dec[i])
-		:	[in]	"Q"(key_enc[j])
-		:		"v0","v1");
+		__aes_ce_invert(key_dec + i, key_enc + j);
 	key_dec[i] = key_enc[0];
 
 	kernel_neon_end();
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH] crypto/arm64: aes-ce-cipher - move assembler code to .S file
  2017-11-21 13:40 ` Ard Biesheuvel
@ 2017-11-21 17:32   ` Nick Desaulniers
  -1 siblings, 0 replies; 14+ messages in thread
From: Nick Desaulniers @ 2017-11-21 17:32 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: linux-arm-kernel, linux-crypto, Mark Rutland, Sami Tolvanen,
	Alex Matveev, Andi Kleen, Greg Hackmann, Kees Cook, Michal Marek,
	Yury Norov, Matthias Kaehlcke, Will Deacon, herbert

Checked for transcription errors, calling convention, and removal of
temporary locals. LGTM.

Reviewed-By: Nick Desaulniers <ndesaulniers@google.com>

On Tue, Nov 21, 2017 at 5:40 AM, Ard Biesheuvel
<ard.biesheuvel@linaro.org> wrote:
> Most crypto drivers involving kernel mode NEON take care to put the code
> that actually touches the NEON register file in a separate compilation
> unit, to prevent the compiler from reordering code that preserves or
> restores the NEON context with code that may corrupt it. This is
> necessary because we currently have no way to express the restrictions
> imposed upon use of the NEON in kernel mode in a way that the compiler
> understands.
>
> However, in the case of aes-ce-cipher, it did not seem unreasonable to
> deviate from this rule, given how it does not seem possible for the
> compiler to reorder cross object function calls with asm blocks whose
> in- and output constraints reflect that it reads from and writes to
> memory.
>
> Now that LTO is being proposed for the arm64 kernel, it is time to
> revisit this. The link time optimization may replace the function
> calls to kernel_neon_begin() and kernel_neon_end() with instantiations
> of the IR that make up its implementation, allowing further reordering
> with the asm block.
>
> So let's clean this up, and move the asm() blocks into a separate .S
> file.
>
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> ---
>  arch/arm64/crypto/Makefile                         |   2 +-
>  arch/arm64/crypto/aes-ce-core.S                    |  87 ++++++++++++++++
>  .../crypto/{aes-ce-cipher.c => aes-ce-glue.c}      | 115 +++------------------
>  3 files changed, 100 insertions(+), 104 deletions(-)
>  create mode 100644 arch/arm64/crypto/aes-ce-core.S
>  rename arch/arm64/crypto/{aes-ce-cipher.c => aes-ce-glue.c} (62%)
>
> diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
> index b5edc5918c28..f5e8295fd756 100644
> --- a/arch/arm64/crypto/Makefile
> +++ b/arch/arm64/crypto/Makefile
> @@ -24,7 +24,7 @@ obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o
>  crc32-ce-y:= crc32-ce-core.o crc32-ce-glue.o
>
>  obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
> -CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
> +aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o
>
>  obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o
>  aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o
> diff --git a/arch/arm64/crypto/aes-ce-core.S b/arch/arm64/crypto/aes-ce-core.S
> new file mode 100644
> index 000000000000..8efdfdade393
> --- /dev/null
> +++ b/arch/arm64/crypto/aes-ce-core.S
> @@ -0,0 +1,87 @@
> +/*
> + * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/assembler.h>
> +
> +       .arch           armv8-a+crypto
> +
> +ENTRY(__aes_ce_encrypt)
> +       sub             w3, w3, #2
> +       ld1             {v0.16b}, [x2]
> +       ld1             {v1.4s}, [x0], #16
> +       cmp             w3, #10
> +       bmi             0f
> +       bne             3f
> +       mov             v3.16b, v1.16b
> +       b               2f
> +0:     mov             v2.16b, v1.16b
> +       ld1             {v3.4s}, [x0], #16
> +1:     aese            v0.16b, v2.16b
> +       aesmc           v0.16b, v0.16b
> +2:     ld1             {v1.4s}, [x0], #16
> +       aese            v0.16b, v3.16b
> +       aesmc           v0.16b, v0.16b
> +3:     ld1             {v2.4s}, [x0], #16
> +       subs            w3, w3, #3
> +       aese            v0.16b, v1.16b
> +       aesmc           v0.16b, v0.16b
> +       ld1             {v3.4s}, [x0], #16
> +       bpl             1b
> +       aese            v0.16b, v2.16b
> +       eor             v0.16b, v0.16b, v3.16b
> +       st1             {v0.16b}, [x1]
> +       ret
> +ENDPROC(__aes_ce_encrypt)
> +
> +ENTRY(__aes_ce_decrypt)
> +       sub             w3, w3, #2
> +       ld1             {v0.16b}, [x2]
> +       ld1             {v1.4s}, [x0], #16
> +       cmp             w3, #10
> +       bmi             0f
> +       bne             3f
> +       mov             v3.16b, v1.16b
> +       b               2f
> +0:     mov             v2.16b, v1.16b
> +       ld1             {v3.4s}, [x0], #16
> +1:     aesd            v0.16b, v2.16b
> +       aesimc          v0.16b, v0.16b
> +2:     ld1             {v1.4s}, [x0], #16
> +       aesd            v0.16b, v3.16b
> +       aesimc          v0.16b, v0.16b
> +3:     ld1             {v2.4s}, [x0], #16
> +       subs            w3, w3, #3
> +       aesd            v0.16b, v1.16b
> +       aesimc          v0.16b, v0.16b
> +       ld1             {v3.4s}, [x0], #16
> +       bpl             1b
> +       aesd            v0.16b, v2.16b
> +       eor             v0.16b, v0.16b, v3.16b
> +       st1             {v0.16b}, [x1]
> +       ret
> +ENDPROC(__aes_ce_decrypt)
> +
> +/*
> + * __aes_ce_sub() - use the aese instruction to perform the AES sbox
> + *                  substitution on each byte in 'input'
> + */
> +ENTRY(__aes_ce_sub)
> +       dup             v1.4s, w0
> +       movi            v0.16b, #0
> +       aese            v0.16b, v1.16b
> +       umov            w0, v0.s[0]
> +       ret
> +ENDPROC(__aes_ce_sub)
> +
> +ENTRY(__aes_ce_invert)
> +       ld1             {v0.4s}, [x1]
> +       aesimc          v1.16b, v0.16b
> +       st1             {v1.4s}, [x0]
> +       ret
> +ENDPROC(__aes_ce_invert)
> diff --git a/arch/arm64/crypto/aes-ce-cipher.c b/arch/arm64/crypto/aes-ce-glue.c
> similarity index 62%
> rename from arch/arm64/crypto/aes-ce-cipher.c
> rename to arch/arm64/crypto/aes-ce-glue.c
> index 6a75cd75ed11..e6b3227bbf57 100644
> --- a/arch/arm64/crypto/aes-ce-cipher.c
> +++ b/arch/arm64/crypto/aes-ce-glue.c
> @@ -29,6 +29,13 @@ struct aes_block {
>         u8 b[AES_BLOCK_SIZE];
>  };
>
> +asmlinkage void __aes_ce_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
> +asmlinkage void __aes_ce_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
> +
> +asmlinkage u32 __aes_ce_sub(u32 l);
> +asmlinkage void __aes_ce_invert(struct aes_block *out,
> +                               const struct aes_block *in);
> +
>  static int num_rounds(struct crypto_aes_ctx *ctx)
>  {
>         /*
> @@ -44,10 +51,6 @@ static int num_rounds(struct crypto_aes_ctx *ctx)
>  static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
>  {
>         struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
> -       struct aes_block *out = (struct aes_block *)dst;
> -       struct aes_block const *in = (struct aes_block *)src;
> -       void *dummy0;
> -       int dummy1;
>
>         if (!may_use_simd()) {
>                 __aes_arm64_encrypt(ctx->key_enc, dst, src, num_rounds(ctx));
> @@ -55,49 +58,13 @@ static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
>         }
>
>         kernel_neon_begin();
> -
> -       __asm__("       ld1     {v0.16b}, %[in]                 ;"
> -               "       ld1     {v1.4s}, [%[key]], #16          ;"
> -               "       cmp     %w[rounds], #10                 ;"
> -               "       bmi     0f                              ;"
> -               "       bne     3f                              ;"
> -               "       mov     v3.16b, v1.16b                  ;"
> -               "       b       2f                              ;"
> -               "0:     mov     v2.16b, v1.16b                  ;"
> -               "       ld1     {v3.4s}, [%[key]], #16          ;"
> -               "1:     aese    v0.16b, v2.16b                  ;"
> -               "       aesmc   v0.16b, v0.16b                  ;"
> -               "2:     ld1     {v1.4s}, [%[key]], #16          ;"
> -               "       aese    v0.16b, v3.16b                  ;"
> -               "       aesmc   v0.16b, v0.16b                  ;"
> -               "3:     ld1     {v2.4s}, [%[key]], #16          ;"
> -               "       subs    %w[rounds], %w[rounds], #3      ;"
> -               "       aese    v0.16b, v1.16b                  ;"
> -               "       aesmc   v0.16b, v0.16b                  ;"
> -               "       ld1     {v3.4s}, [%[key]], #16          ;"
> -               "       bpl     1b                              ;"
> -               "       aese    v0.16b, v2.16b                  ;"
> -               "       eor     v0.16b, v0.16b, v3.16b          ;"
> -               "       st1     {v0.16b}, %[out]                ;"
> -
> -       :       [out]           "=Q"(*out),
> -               [key]           "=r"(dummy0),
> -               [rounds]        "=r"(dummy1)
> -       :       [in]            "Q"(*in),
> -                               "1"(ctx->key_enc),
> -                               "2"(num_rounds(ctx) - 2)
> -       :       "cc");
> -
> +       __aes_ce_encrypt(ctx->key_enc, dst, src, num_rounds(ctx));
>         kernel_neon_end();
>  }
>
>  static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
>  {
>         struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
> -       struct aes_block *out = (struct aes_block *)dst;
> -       struct aes_block const *in = (struct aes_block *)src;
> -       void *dummy0;
> -       int dummy1;
>
>         if (!may_use_simd()) {
>                 __aes_arm64_decrypt(ctx->key_dec, dst, src, num_rounds(ctx));
> @@ -105,62 +72,10 @@ static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
>         }
>
>         kernel_neon_begin();
> -
> -       __asm__("       ld1     {v0.16b}, %[in]                 ;"
> -               "       ld1     {v1.4s}, [%[key]], #16          ;"
> -               "       cmp     %w[rounds], #10                 ;"
> -               "       bmi     0f                              ;"
> -               "       bne     3f                              ;"
> -               "       mov     v3.16b, v1.16b                  ;"
> -               "       b       2f                              ;"
> -               "0:     mov     v2.16b, v1.16b                  ;"
> -               "       ld1     {v3.4s}, [%[key]], #16          ;"
> -               "1:     aesd    v0.16b, v2.16b                  ;"
> -               "       aesimc  v0.16b, v0.16b                  ;"
> -               "2:     ld1     {v1.4s}, [%[key]], #16          ;"
> -               "       aesd    v0.16b, v3.16b                  ;"
> -               "       aesimc  v0.16b, v0.16b                  ;"
> -               "3:     ld1     {v2.4s}, [%[key]], #16          ;"
> -               "       subs    %w[rounds], %w[rounds], #3      ;"
> -               "       aesd    v0.16b, v1.16b                  ;"
> -               "       aesimc  v0.16b, v0.16b                  ;"
> -               "       ld1     {v3.4s}, [%[key]], #16          ;"
> -               "       bpl     1b                              ;"
> -               "       aesd    v0.16b, v2.16b                  ;"
> -               "       eor     v0.16b, v0.16b, v3.16b          ;"
> -               "       st1     {v0.16b}, %[out]                ;"
> -
> -       :       [out]           "=Q"(*out),
> -               [key]           "=r"(dummy0),
> -               [rounds]        "=r"(dummy1)
> -       :       [in]            "Q"(*in),
> -                               "1"(ctx->key_dec),
> -                               "2"(num_rounds(ctx) - 2)
> -       :       "cc");
> -
> +       __aes_ce_decrypt(ctx->key_dec, dst, src, num_rounds(ctx));
>         kernel_neon_end();
>  }
>
> -/*
> - * aes_sub() - use the aese instruction to perform the AES sbox substitution
> - *             on each byte in 'input'
> - */
> -static u32 aes_sub(u32 input)
> -{
> -       u32 ret;
> -
> -       __asm__("dup    v1.4s, %w[in]           ;"
> -               "movi   v0.16b, #0              ;"
> -               "aese   v0.16b, v1.16b          ;"
> -               "umov   %w[out], v0.4s[0]       ;"
> -
> -       :       [out]   "=r"(ret)
> -       :       [in]    "r"(input)
> -       :               "v0","v1");
> -
> -       return ret;
> -}
> -
>  int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
>                      unsigned int key_len)
>  {
> @@ -189,7 +104,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
>                 u32 *rki = ctx->key_enc + (i * kwords);
>                 u32 *rko = rki + kwords;
>
> -               rko[0] = ror32(aes_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0];
> +               rko[0] = ror32(__aes_ce_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0];
>                 rko[1] = rko[0] ^ rki[1];
>                 rko[2] = rko[1] ^ rki[2];
>                 rko[3] = rko[2] ^ rki[3];
> @@ -202,7 +117,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
>                 } else if (key_len == AES_KEYSIZE_256) {
>                         if (i >= 6)
>                                 break;
> -                       rko[4] = aes_sub(rko[3]) ^ rki[4];
> +                       rko[4] = __aes_ce_sub(rko[3]) ^ rki[4];
>                         rko[5] = rko[4] ^ rki[5];
>                         rko[6] = rko[5] ^ rki[6];
>                         rko[7] = rko[6] ^ rki[7];
> @@ -221,13 +136,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
>
>         key_dec[0] = key_enc[j];
>         for (i = 1, j--; j > 0; i++, j--)
> -               __asm__("ld1    {v0.4s}, %[in]          ;"
> -                       "aesimc v1.16b, v0.16b          ;"
> -                       "st1    {v1.4s}, %[out] ;"
> -
> -               :       [out]   "=Q"(key_dec[i])
> -               :       [in]    "Q"(key_enc[j])
> -               :               "v0","v1");
> +               __aes_ce_invert(key_dec + i, key_enc + j);
>         key_dec[i] = key_enc[0];
>
>         kernel_neon_end();
> --
> 2.11.0
>



-- 
Thanks,
~Nick Desaulniers

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] crypto/arm64: aes-ce-cipher - move assembler code to .S file
@ 2017-11-21 17:32   ` Nick Desaulniers
  0 siblings, 0 replies; 14+ messages in thread
From: Nick Desaulniers @ 2017-11-21 17:32 UTC (permalink / raw)
  To: linux-arm-kernel

Checked for transcription errors, calling convention, and removal of
temporary locals. LGTM.

Reviewed-By: Nick Desaulniers <ndesaulniers@google.com>

On Tue, Nov 21, 2017 at 5:40 AM, Ard Biesheuvel
<ard.biesheuvel@linaro.org> wrote:
> Most crypto drivers involving kernel mode NEON take care to put the code
> that actually touches the NEON register file in a separate compilation
> unit, to prevent the compiler from reordering code that preserves or
> restores the NEON context with code that may corrupt it. This is
> necessary because we currently have no way to express the restrictions
> imposed upon use of the NEON in kernel mode in a way that the compiler
> understands.
>
> However, in the case of aes-ce-cipher, it did not seem unreasonable to
> deviate from this rule, given how it does not seem possible for the
> compiler to reorder cross object function calls with asm blocks whose
> in- and output constraints reflect that it reads from and writes to
> memory.
>
> Now that LTO is being proposed for the arm64 kernel, it is time to
> revisit this. The link time optimization may replace the function
> calls to kernel_neon_begin() and kernel_neon_end() with instantiations
> of the IR that make up its implementation, allowing further reordering
> with the asm block.
>
> So let's clean this up, and move the asm() blocks into a separate .S
> file.
>
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> ---
>  arch/arm64/crypto/Makefile                         |   2 +-
>  arch/arm64/crypto/aes-ce-core.S                    |  87 ++++++++++++++++
>  .../crypto/{aes-ce-cipher.c => aes-ce-glue.c}      | 115 +++------------------
>  3 files changed, 100 insertions(+), 104 deletions(-)
>  create mode 100644 arch/arm64/crypto/aes-ce-core.S
>  rename arch/arm64/crypto/{aes-ce-cipher.c => aes-ce-glue.c} (62%)
>
> diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
> index b5edc5918c28..f5e8295fd756 100644
> --- a/arch/arm64/crypto/Makefile
> +++ b/arch/arm64/crypto/Makefile
> @@ -24,7 +24,7 @@ obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o
>  crc32-ce-y:= crc32-ce-core.o crc32-ce-glue.o
>
>  obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
> -CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
> +aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o
>
>  obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o
>  aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o
> diff --git a/arch/arm64/crypto/aes-ce-core.S b/arch/arm64/crypto/aes-ce-core.S
> new file mode 100644
> index 000000000000..8efdfdade393
> --- /dev/null
> +++ b/arch/arm64/crypto/aes-ce-core.S
> @@ -0,0 +1,87 @@
> +/*
> + * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/assembler.h>
> +
> +       .arch           armv8-a+crypto
> +
> +ENTRY(__aes_ce_encrypt)
> +       sub             w3, w3, #2
> +       ld1             {v0.16b}, [x2]
> +       ld1             {v1.4s}, [x0], #16
> +       cmp             w3, #10
> +       bmi             0f
> +       bne             3f
> +       mov             v3.16b, v1.16b
> +       b               2f
> +0:     mov             v2.16b, v1.16b
> +       ld1             {v3.4s}, [x0], #16
> +1:     aese            v0.16b, v2.16b
> +       aesmc           v0.16b, v0.16b
> +2:     ld1             {v1.4s}, [x0], #16
> +       aese            v0.16b, v3.16b
> +       aesmc           v0.16b, v0.16b
> +3:     ld1             {v2.4s}, [x0], #16
> +       subs            w3, w3, #3
> +       aese            v0.16b, v1.16b
> +       aesmc           v0.16b, v0.16b
> +       ld1             {v3.4s}, [x0], #16
> +       bpl             1b
> +       aese            v0.16b, v2.16b
> +       eor             v0.16b, v0.16b, v3.16b
> +       st1             {v0.16b}, [x1]
> +       ret
> +ENDPROC(__aes_ce_encrypt)
> +
> +ENTRY(__aes_ce_decrypt)
> +       sub             w3, w3, #2
> +       ld1             {v0.16b}, [x2]
> +       ld1             {v1.4s}, [x0], #16
> +       cmp             w3, #10
> +       bmi             0f
> +       bne             3f
> +       mov             v3.16b, v1.16b
> +       b               2f
> +0:     mov             v2.16b, v1.16b
> +       ld1             {v3.4s}, [x0], #16
> +1:     aesd            v0.16b, v2.16b
> +       aesimc          v0.16b, v0.16b
> +2:     ld1             {v1.4s}, [x0], #16
> +       aesd            v0.16b, v3.16b
> +       aesimc          v0.16b, v0.16b
> +3:     ld1             {v2.4s}, [x0], #16
> +       subs            w3, w3, #3
> +       aesd            v0.16b, v1.16b
> +       aesimc          v0.16b, v0.16b
> +       ld1             {v3.4s}, [x0], #16
> +       bpl             1b
> +       aesd            v0.16b, v2.16b
> +       eor             v0.16b, v0.16b, v3.16b
> +       st1             {v0.16b}, [x1]
> +       ret
> +ENDPROC(__aes_ce_decrypt)
> +
> +/*
> + * __aes_ce_sub() - use the aese instruction to perform the AES sbox
> + *                  substitution on each byte in 'input'
> + */
> +ENTRY(__aes_ce_sub)
> +       dup             v1.4s, w0
> +       movi            v0.16b, #0
> +       aese            v0.16b, v1.16b
> +       umov            w0, v0.s[0]
> +       ret
> +ENDPROC(__aes_ce_sub)
> +
> +ENTRY(__aes_ce_invert)
> +       ld1             {v0.4s}, [x1]
> +       aesimc          v1.16b, v0.16b
> +       st1             {v1.4s}, [x0]
> +       ret
> +ENDPROC(__aes_ce_invert)
> diff --git a/arch/arm64/crypto/aes-ce-cipher.c b/arch/arm64/crypto/aes-ce-glue.c
> similarity index 62%
> rename from arch/arm64/crypto/aes-ce-cipher.c
> rename to arch/arm64/crypto/aes-ce-glue.c
> index 6a75cd75ed11..e6b3227bbf57 100644
> --- a/arch/arm64/crypto/aes-ce-cipher.c
> +++ b/arch/arm64/crypto/aes-ce-glue.c
> @@ -29,6 +29,13 @@ struct aes_block {
>         u8 b[AES_BLOCK_SIZE];
>  };
>
> +asmlinkage void __aes_ce_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
> +asmlinkage void __aes_ce_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
> +
> +asmlinkage u32 __aes_ce_sub(u32 l);
> +asmlinkage void __aes_ce_invert(struct aes_block *out,
> +                               const struct aes_block *in);
> +
>  static int num_rounds(struct crypto_aes_ctx *ctx)
>  {
>         /*
> @@ -44,10 +51,6 @@ static int num_rounds(struct crypto_aes_ctx *ctx)
>  static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
>  {
>         struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
> -       struct aes_block *out = (struct aes_block *)dst;
> -       struct aes_block const *in = (struct aes_block *)src;
> -       void *dummy0;
> -       int dummy1;
>
>         if (!may_use_simd()) {
>                 __aes_arm64_encrypt(ctx->key_enc, dst, src, num_rounds(ctx));
> @@ -55,49 +58,13 @@ static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
>         }
>
>         kernel_neon_begin();
> -
> -       __asm__("       ld1     {v0.16b}, %[in]                 ;"
> -               "       ld1     {v1.4s}, [%[key]], #16          ;"
> -               "       cmp     %w[rounds], #10                 ;"
> -               "       bmi     0f                              ;"
> -               "       bne     3f                              ;"
> -               "       mov     v3.16b, v1.16b                  ;"
> -               "       b       2f                              ;"
> -               "0:     mov     v2.16b, v1.16b                  ;"
> -               "       ld1     {v3.4s}, [%[key]], #16          ;"
> -               "1:     aese    v0.16b, v2.16b                  ;"
> -               "       aesmc   v0.16b, v0.16b                  ;"
> -               "2:     ld1     {v1.4s}, [%[key]], #16          ;"
> -               "       aese    v0.16b, v3.16b                  ;"
> -               "       aesmc   v0.16b, v0.16b                  ;"
> -               "3:     ld1     {v2.4s}, [%[key]], #16          ;"
> -               "       subs    %w[rounds], %w[rounds], #3      ;"
> -               "       aese    v0.16b, v1.16b                  ;"
> -               "       aesmc   v0.16b, v0.16b                  ;"
> -               "       ld1     {v3.4s}, [%[key]], #16          ;"
> -               "       bpl     1b                              ;"
> -               "       aese    v0.16b, v2.16b                  ;"
> -               "       eor     v0.16b, v0.16b, v3.16b          ;"
> -               "       st1     {v0.16b}, %[out]                ;"
> -
> -       :       [out]           "=Q"(*out),
> -               [key]           "=r"(dummy0),
> -               [rounds]        "=r"(dummy1)
> -       :       [in]            "Q"(*in),
> -                               "1"(ctx->key_enc),
> -                               "2"(num_rounds(ctx) - 2)
> -       :       "cc");
> -
> +       __aes_ce_encrypt(ctx->key_enc, dst, src, num_rounds(ctx));
>         kernel_neon_end();
>  }
>
>  static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
>  {
>         struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
> -       struct aes_block *out = (struct aes_block *)dst;
> -       struct aes_block const *in = (struct aes_block *)src;
> -       void *dummy0;
> -       int dummy1;
>
>         if (!may_use_simd()) {
>                 __aes_arm64_decrypt(ctx->key_dec, dst, src, num_rounds(ctx));
> @@ -105,62 +72,10 @@ static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
>         }
>
>         kernel_neon_begin();
> -
> -       __asm__("       ld1     {v0.16b}, %[in]                 ;"
> -               "       ld1     {v1.4s}, [%[key]], #16          ;"
> -               "       cmp     %w[rounds], #10                 ;"
> -               "       bmi     0f                              ;"
> -               "       bne     3f                              ;"
> -               "       mov     v3.16b, v1.16b                  ;"
> -               "       b       2f                              ;"
> -               "0:     mov     v2.16b, v1.16b                  ;"
> -               "       ld1     {v3.4s}, [%[key]], #16          ;"
> -               "1:     aesd    v0.16b, v2.16b                  ;"
> -               "       aesimc  v0.16b, v0.16b                  ;"
> -               "2:     ld1     {v1.4s}, [%[key]], #16          ;"
> -               "       aesd    v0.16b, v3.16b                  ;"
> -               "       aesimc  v0.16b, v0.16b                  ;"
> -               "3:     ld1     {v2.4s}, [%[key]], #16          ;"
> -               "       subs    %w[rounds], %w[rounds], #3      ;"
> -               "       aesd    v0.16b, v1.16b                  ;"
> -               "       aesimc  v0.16b, v0.16b                  ;"
> -               "       ld1     {v3.4s}, [%[key]], #16          ;"
> -               "       bpl     1b                              ;"
> -               "       aesd    v0.16b, v2.16b                  ;"
> -               "       eor     v0.16b, v0.16b, v3.16b          ;"
> -               "       st1     {v0.16b}, %[out]                ;"
> -
> -       :       [out]           "=Q"(*out),
> -               [key]           "=r"(dummy0),
> -               [rounds]        "=r"(dummy1)
> -       :       [in]            "Q"(*in),
> -                               "1"(ctx->key_dec),
> -                               "2"(num_rounds(ctx) - 2)
> -       :       "cc");
> -
> +       __aes_ce_decrypt(ctx->key_dec, dst, src, num_rounds(ctx));
>         kernel_neon_end();
>  }
>
> -/*
> - * aes_sub() - use the aese instruction to perform the AES sbox substitution
> - *             on each byte in 'input'
> - */
> -static u32 aes_sub(u32 input)
> -{
> -       u32 ret;
> -
> -       __asm__("dup    v1.4s, %w[in]           ;"
> -               "movi   v0.16b, #0              ;"
> -               "aese   v0.16b, v1.16b          ;"
> -               "umov   %w[out], v0.4s[0]       ;"
> -
> -       :       [out]   "=r"(ret)
> -       :       [in]    "r"(input)
> -       :               "v0","v1");
> -
> -       return ret;
> -}
> -
>  int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
>                      unsigned int key_len)
>  {
> @@ -189,7 +104,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
>                 u32 *rki = ctx->key_enc + (i * kwords);
>                 u32 *rko = rki + kwords;
>
> -               rko[0] = ror32(aes_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0];
> +               rko[0] = ror32(__aes_ce_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0];
>                 rko[1] = rko[0] ^ rki[1];
>                 rko[2] = rko[1] ^ rki[2];
>                 rko[3] = rko[2] ^ rki[3];
> @@ -202,7 +117,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
>                 } else if (key_len == AES_KEYSIZE_256) {
>                         if (i >= 6)
>                                 break;
> -                       rko[4] = aes_sub(rko[3]) ^ rki[4];
> +                       rko[4] = __aes_ce_sub(rko[3]) ^ rki[4];
>                         rko[5] = rko[4] ^ rki[5];
>                         rko[6] = rko[5] ^ rki[6];
>                         rko[7] = rko[6] ^ rki[7];
> @@ -221,13 +136,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
>
>         key_dec[0] = key_enc[j];
>         for (i = 1, j--; j > 0; i++, j--)
> -               __asm__("ld1    {v0.4s}, %[in]          ;"
> -                       "aesimc v1.16b, v0.16b          ;"
> -                       "st1    {v1.4s}, %[out] ;"
> -
> -               :       [out]   "=Q"(key_dec[i])
> -               :       [in]    "Q"(key_enc[j])
> -               :               "v0","v1");
> +               __aes_ce_invert(key_dec + i, key_enc + j);
>         key_dec[i] = key_enc[0];
>
>         kernel_neon_end();
> --
> 2.11.0
>



-- 
Thanks,
~Nick Desaulniers

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] crypto/arm64: aes-ce-cipher - move assembler code to .S file
  2017-11-21 13:40 ` Ard Biesheuvel
@ 2017-11-22 10:05   ` Alex Matveev
  -1 siblings, 0 replies; 14+ messages in thread
From: Alex Matveev @ 2017-11-22 10:05 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: linux-arm-kernel, linux-crypto, mark.rutland, samitolvanen, ak,
	ghackmann, keescook, michal.lkml, ndesaulniers, ynorov, mka,
	will.deacon, herbert

This is better than my simple fix, thank you.

Out of curiosity, why doesn't NEON code use barrier() to prevent
reordering?

On Tue, 21 Nov 2017 13:40:17 +0000
Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:

> Most crypto drivers involving kernel mode NEON take care to put the
> code that actually touches the NEON register file in a separate
> compilation unit, to prevent the compiler from reordering code that
> preserves or restores the NEON context with code that may corrupt it.
> This is necessary because we currently have no way to express the
> restrictions imposed upon use of the NEON in kernel mode in a way
> that the compiler understands.
> 
> However, in the case of aes-ce-cipher, it did not seem unreasonable to
> deviate from this rule, given how it does not seem possible for the
> compiler to reorder cross object function calls with asm blocks whose
> in- and output constraints reflect that it reads from and writes to
> memory.
> 
> Now that LTO is being proposed for the arm64 kernel, it is time to
> revisit this. The link time optimization may replace the function
> calls to kernel_neon_begin() and kernel_neon_end() with instantiations
> of the IR that make up its implementation, allowing further reordering
> with the asm block.
> 
> So let's clean this up, and move the asm() blocks into a separate .S
> file.
> 
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> ---
>  arch/arm64/crypto/Makefile                         |   2 +-
>  arch/arm64/crypto/aes-ce-core.S                    |  87
> ++++++++++++++++ .../crypto/{aes-ce-cipher.c => aes-ce-glue.c}      |
> 115 +++------------------ 3 files changed, 100 insertions(+), 104
> deletions(-) create mode 100644 arch/arm64/crypto/aes-ce-core.S
>  rename arch/arm64/crypto/{aes-ce-cipher.c => aes-ce-glue.c} (62%)
> 
> diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
> index b5edc5918c28..f5e8295fd756 100644
> --- a/arch/arm64/crypto/Makefile
> +++ b/arch/arm64/crypto/Makefile
> @@ -24,7 +24,7 @@ obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o
>  crc32-ce-y:= crc32-ce-core.o crc32-ce-glue.o
>  
>  obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
> -CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
> +aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o
>  
>  obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o
>  aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o
> diff --git a/arch/arm64/crypto/aes-ce-core.S
> b/arch/arm64/crypto/aes-ce-core.S new file mode 100644
> index 000000000000..8efdfdade393
> --- /dev/null
> +++ b/arch/arm64/crypto/aes-ce-core.S
> @@ -0,0 +1,87 @@
> +/*
> + * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
> + *
> + * This program is free software; you can redistribute it and/or
> modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/assembler.h>
> +
> +	.arch		armv8-a+crypto
> +
> +ENTRY(__aes_ce_encrypt)
> +	sub		w3, w3, #2
> +	ld1		{v0.16b}, [x2]
> +	ld1		{v1.4s}, [x0], #16
> +	cmp		w3, #10
> +	bmi		0f
> +	bne		3f
> +	mov		v3.16b, v1.16b
> +	b		2f
> +0:	mov		v2.16b, v1.16b
> +	ld1		{v3.4s}, [x0], #16
> +1:	aese		v0.16b, v2.16b
> +	aesmc		v0.16b, v0.16b
> +2:	ld1		{v1.4s}, [x0], #16
> +	aese		v0.16b, v3.16b
> +	aesmc		v0.16b, v0.16b
> +3:	ld1		{v2.4s}, [x0], #16
> +	subs		w3, w3, #3
> +	aese		v0.16b, v1.16b
> +	aesmc		v0.16b, v0.16b
> +	ld1		{v3.4s}, [x0], #16
> +	bpl		1b
> +	aese		v0.16b, v2.16b
> +	eor		v0.16b, v0.16b, v3.16b
> +	st1		{v0.16b}, [x1]
> +	ret
> +ENDPROC(__aes_ce_encrypt)
> +
> +ENTRY(__aes_ce_decrypt)
> +	sub		w3, w3, #2
> +	ld1		{v0.16b}, [x2]
> +	ld1		{v1.4s}, [x0], #16
> +	cmp		w3, #10
> +	bmi		0f
> +	bne		3f
> +	mov		v3.16b, v1.16b
> +	b		2f
> +0:	mov		v2.16b, v1.16b
> +	ld1		{v3.4s}, [x0], #16
> +1:	aesd		v0.16b, v2.16b
> +	aesimc		v0.16b, v0.16b
> +2:	ld1		{v1.4s}, [x0], #16
> +	aesd		v0.16b, v3.16b
> +	aesimc		v0.16b, v0.16b
> +3:	ld1		{v2.4s}, [x0], #16
> +	subs		w3, w3, #3
> +	aesd		v0.16b, v1.16b
> +	aesimc		v0.16b, v0.16b
> +	ld1		{v3.4s}, [x0], #16
> +	bpl		1b
> +	aesd		v0.16b, v2.16b
> +	eor		v0.16b, v0.16b, v3.16b
> +	st1		{v0.16b}, [x1]
> +	ret
> +ENDPROC(__aes_ce_decrypt)
> +
> +/*
> + * __aes_ce_sub() - use the aese instruction to perform the AES sbox
> + *                  substitution on each byte in 'input'
> + */
> +ENTRY(__aes_ce_sub)
> +	dup		v1.4s, w0
> +	movi		v0.16b, #0
> +	aese		v0.16b, v1.16b
> +	umov		w0, v0.s[0]
> +	ret
> +ENDPROC(__aes_ce_sub)
> +
> +ENTRY(__aes_ce_invert)
> +	ld1		{v0.4s}, [x1]
> +	aesimc		v1.16b, v0.16b
> +	st1		{v1.4s}, [x0]
> +	ret
> +ENDPROC(__aes_ce_invert)
> diff --git a/arch/arm64/crypto/aes-ce-cipher.c
> b/arch/arm64/crypto/aes-ce-glue.c similarity index 62%
> rename from arch/arm64/crypto/aes-ce-cipher.c
> rename to arch/arm64/crypto/aes-ce-glue.c
> index 6a75cd75ed11..e6b3227bbf57 100644
> --- a/arch/arm64/crypto/aes-ce-cipher.c
> +++ b/arch/arm64/crypto/aes-ce-glue.c
> @@ -29,6 +29,13 @@ struct aes_block {
>  	u8 b[AES_BLOCK_SIZE];
>  };
>  
> +asmlinkage void __aes_ce_encrypt(u32 *rk, u8 *out, const u8 *in, int
> rounds); +asmlinkage void __aes_ce_decrypt(u32 *rk, u8 *out, const u8
> *in, int rounds); +
> +asmlinkage u32 __aes_ce_sub(u32 l);
> +asmlinkage void __aes_ce_invert(struct aes_block *out,
> +				const struct aes_block *in);
> +
>  static int num_rounds(struct crypto_aes_ctx *ctx)
>  {
>  	/*
> @@ -44,10 +51,6 @@ static int num_rounds(struct crypto_aes_ctx *ctx)
>  static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8
> const src[]) {
>  	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
> -	struct aes_block *out = (struct aes_block *)dst;
> -	struct aes_block const *in = (struct aes_block *)src;
> -	void *dummy0;
> -	int dummy1;
>  
>  	if (!may_use_simd()) {
>  		__aes_arm64_encrypt(ctx->key_enc, dst, src,
> num_rounds(ctx)); @@ -55,49 +58,13 @@ static void
> aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) }
>  
>  	kernel_neon_begin();
> -
> -	__asm__("	ld1	{v0.16b},
> %[in]			;"
> -		"	ld1	{v1.4s}, [%[key]],
> #16		;"
> -		"	cmp	%w[rounds],
> #10			;"
> -		"	bmi
> 0f				;"
> -		"	bne
> 3f				;"
> -		"	mov	v3.16b,
> v1.16b			;"
> -		"	b
> 2f				;"
> -		"0:	mov	v2.16b,
> v1.16b			;"
> -		"	ld1	{v3.4s}, [%[key]],
> #16		;"
> -		"1:	aese	v0.16b,
> v2.16b			;"
> -		"	aesmc	v0.16b,
> v0.16b			;"
> -		"2:	ld1	{v1.4s}, [%[key]],
> #16		;"
> -		"	aese	v0.16b,
> v3.16b			;"
> -		"	aesmc	v0.16b,
> v0.16b			;"
> -		"3:	ld1	{v2.4s}, [%[key]],
> #16		;"
> -		"	subs	%w[rounds], %w[rounds],
> #3	;"
> -		"	aese	v0.16b,
> v1.16b			;"
> -		"	aesmc	v0.16b,
> v0.16b			;"
> -		"	ld1	{v3.4s}, [%[key]],
> #16		;"
> -		"	bpl
> 1b				;"
> -		"	aese	v0.16b,
> v2.16b			;"
> -		"	eor	v0.16b, v0.16b,
> v3.16b		;"
> -		"	st1	{v0.16b},
> %[out]		;" -
> -	:	[out]		"=Q"(*out),
> -		[key]		"=r"(dummy0),
> -		[rounds]	"=r"(dummy1)
> -	:	[in]		"Q"(*in),
> -				"1"(ctx->key_enc),
> -				"2"(num_rounds(ctx) - 2)
> -	:	"cc");
> -
> +	__aes_ce_encrypt(ctx->key_enc, dst, src, num_rounds(ctx));
>  	kernel_neon_end();
>  }
>  
>  static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8
> const src[]) {
>  	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
> -	struct aes_block *out = (struct aes_block *)dst;
> -	struct aes_block const *in = (struct aes_block *)src;
> -	void *dummy0;
> -	int dummy1;
>  
>  	if (!may_use_simd()) {
>  		__aes_arm64_decrypt(ctx->key_dec, dst, src,
> num_rounds(ctx)); @@ -105,62 +72,10 @@ static void
> aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) }
>  
>  	kernel_neon_begin();
> -
> -	__asm__("	ld1	{v0.16b},
> %[in]			;"
> -		"	ld1	{v1.4s}, [%[key]],
> #16		;"
> -		"	cmp	%w[rounds],
> #10			;"
> -		"	bmi
> 0f				;"
> -		"	bne
> 3f				;"
> -		"	mov	v3.16b,
> v1.16b			;"
> -		"	b
> 2f				;"
> -		"0:	mov	v2.16b,
> v1.16b			;"
> -		"	ld1	{v3.4s}, [%[key]],
> #16		;"
> -		"1:	aesd	v0.16b,
> v2.16b			;"
> -		"	aesimc	v0.16b,
> v0.16b			;"
> -		"2:	ld1	{v1.4s}, [%[key]],
> #16		;"
> -		"	aesd	v0.16b,
> v3.16b			;"
> -		"	aesimc	v0.16b,
> v0.16b			;"
> -		"3:	ld1	{v2.4s}, [%[key]],
> #16		;"
> -		"	subs	%w[rounds], %w[rounds],
> #3	;"
> -		"	aesd	v0.16b,
> v1.16b			;"
> -		"	aesimc	v0.16b,
> v0.16b			;"
> -		"	ld1	{v3.4s}, [%[key]],
> #16		;"
> -		"	bpl
> 1b				;"
> -		"	aesd	v0.16b,
> v2.16b			;"
> -		"	eor	v0.16b, v0.16b,
> v3.16b		;"
> -		"	st1	{v0.16b},
> %[out]		;" -
> -	:	[out]		"=Q"(*out),
> -		[key]		"=r"(dummy0),
> -		[rounds]	"=r"(dummy1)
> -	:	[in]		"Q"(*in),
> -				"1"(ctx->key_dec),
> -				"2"(num_rounds(ctx) - 2)
> -	:	"cc");
> -
> +	__aes_ce_decrypt(ctx->key_dec, dst, src, num_rounds(ctx));
>  	kernel_neon_end();
>  }
>  
> -/*
> - * aes_sub() - use the aese instruction to perform the AES sbox
> substitution
> - *             on each byte in 'input'
> - */
> -static u32 aes_sub(u32 input)
> -{
> -	u32 ret;
> -
> -	__asm__("dup	v1.4s, %w[in]		;"
> -		"movi	v0.16b, #0		;"
> -		"aese	v0.16b, v1.16b		;"
> -		"umov	%w[out], v0.4s[0]	;"
> -
> -	:	[out]	"=r"(ret)
> -	:	[in]	"r"(input)
> -	:		"v0","v1");
> -
> -	return ret;
> -}
> -
>  int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
>  		     unsigned int key_len)
>  {
> @@ -189,7 +104,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx,
> const u8 *in_key, u32 *rki = ctx->key_enc + (i * kwords);
>  		u32 *rko = rki + kwords;
>  
> -		rko[0] = ror32(aes_sub(rki[kwords - 1]), 8) ^
> rcon[i] ^ rki[0];
> +		rko[0] = ror32(__aes_ce_sub(rki[kwords - 1]), 8) ^
> rcon[i] ^ rki[0]; rko[1] = rko[0] ^ rki[1];
>  		rko[2] = rko[1] ^ rki[2];
>  		rko[3] = rko[2] ^ rki[3];
> @@ -202,7 +117,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx,
> const u8 *in_key, } else if (key_len == AES_KEYSIZE_256) {
>  			if (i >= 6)
>  				break;
> -			rko[4] = aes_sub(rko[3]) ^ rki[4];
> +			rko[4] = __aes_ce_sub(rko[3]) ^ rki[4];
>  			rko[5] = rko[4] ^ rki[5];
>  			rko[6] = rko[5] ^ rki[6];
>  			rko[7] = rko[6] ^ rki[7];
> @@ -221,13 +136,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx,
> const u8 *in_key, 
>  	key_dec[0] = key_enc[j];
>  	for (i = 1, j--; j > 0; i++, j--)
> -		__asm__("ld1	{v0.4s}, %[in]		;"
> -			"aesimc	v1.16b,
> v0.16b		;"
> -			"st1	{v1.4s}, %[out]	;"
> -
> -		:	[out]	"=Q"(key_dec[i])
> -		:	[in]	"Q"(key_enc[j])
> -		:		"v0","v1");
> +		__aes_ce_invert(key_dec + i, key_enc + j);
>  	key_dec[i] = key_enc[0];
>  
>  	kernel_neon_end();

Regards,
Alex

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] crypto/arm64: aes-ce-cipher - move assembler code to .S file
@ 2017-11-22 10:05   ` Alex Matveev
  0 siblings, 0 replies; 14+ messages in thread
From: Alex Matveev @ 2017-11-22 10:05 UTC (permalink / raw)
  To: linux-arm-kernel

This is better than my simple fix, thank you.

Out of curiosity, why doesn't NEON code use barrier() to prevent
reordering?

On Tue, 21 Nov 2017 13:40:17 +0000
Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:

> Most crypto drivers involving kernel mode NEON take care to put the
> code that actually touches the NEON register file in a separate
> compilation unit, to prevent the compiler from reordering code that
> preserves or restores the NEON context with code that may corrupt it.
> This is necessary because we currently have no way to express the
> restrictions imposed upon use of the NEON in kernel mode in a way
> that the compiler understands.
> 
> However, in the case of aes-ce-cipher, it did not seem unreasonable to
> deviate from this rule, given how it does not seem possible for the
> compiler to reorder cross object function calls with asm blocks whose
> in- and output constraints reflect that it reads from and writes to
> memory.
> 
> Now that LTO is being proposed for the arm64 kernel, it is time to
> revisit this. The link time optimization may replace the function
> calls to kernel_neon_begin() and kernel_neon_end() with instantiations
> of the IR that make up its implementation, allowing further reordering
> with the asm block.
> 
> So let's clean this up, and move the asm() blocks into a separate .S
> file.
> 
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> ---
>  arch/arm64/crypto/Makefile                         |   2 +-
>  arch/arm64/crypto/aes-ce-core.S                    |  87
> ++++++++++++++++ .../crypto/{aes-ce-cipher.c => aes-ce-glue.c}      |
> 115 +++------------------ 3 files changed, 100 insertions(+), 104
> deletions(-) create mode 100644 arch/arm64/crypto/aes-ce-core.S
>  rename arch/arm64/crypto/{aes-ce-cipher.c => aes-ce-glue.c} (62%)
> 
> diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
> index b5edc5918c28..f5e8295fd756 100644
> --- a/arch/arm64/crypto/Makefile
> +++ b/arch/arm64/crypto/Makefile
> @@ -24,7 +24,7 @@ obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o
>  crc32-ce-y:= crc32-ce-core.o crc32-ce-glue.o
>  
>  obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
> -CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
> +aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o
>  
>  obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o
>  aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o
> diff --git a/arch/arm64/crypto/aes-ce-core.S
> b/arch/arm64/crypto/aes-ce-core.S new file mode 100644
> index 000000000000..8efdfdade393
> --- /dev/null
> +++ b/arch/arm64/crypto/aes-ce-core.S
> @@ -0,0 +1,87 @@
> +/*
> + * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
> + *
> + * This program is free software; you can redistribute it and/or
> modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/assembler.h>
> +
> +	.arch		armv8-a+crypto
> +
> +ENTRY(__aes_ce_encrypt)
> +	sub		w3, w3, #2
> +	ld1		{v0.16b}, [x2]
> +	ld1		{v1.4s}, [x0], #16
> +	cmp		w3, #10
> +	bmi		0f
> +	bne		3f
> +	mov		v3.16b, v1.16b
> +	b		2f
> +0:	mov		v2.16b, v1.16b
> +	ld1		{v3.4s}, [x0], #16
> +1:	aese		v0.16b, v2.16b
> +	aesmc		v0.16b, v0.16b
> +2:	ld1		{v1.4s}, [x0], #16
> +	aese		v0.16b, v3.16b
> +	aesmc		v0.16b, v0.16b
> +3:	ld1		{v2.4s}, [x0], #16
> +	subs		w3, w3, #3
> +	aese		v0.16b, v1.16b
> +	aesmc		v0.16b, v0.16b
> +	ld1		{v3.4s}, [x0], #16
> +	bpl		1b
> +	aese		v0.16b, v2.16b
> +	eor		v0.16b, v0.16b, v3.16b
> +	st1		{v0.16b}, [x1]
> +	ret
> +ENDPROC(__aes_ce_encrypt)
> +
> +ENTRY(__aes_ce_decrypt)
> +	sub		w3, w3, #2
> +	ld1		{v0.16b}, [x2]
> +	ld1		{v1.4s}, [x0], #16
> +	cmp		w3, #10
> +	bmi		0f
> +	bne		3f
> +	mov		v3.16b, v1.16b
> +	b		2f
> +0:	mov		v2.16b, v1.16b
> +	ld1		{v3.4s}, [x0], #16
> +1:	aesd		v0.16b, v2.16b
> +	aesimc		v0.16b, v0.16b
> +2:	ld1		{v1.4s}, [x0], #16
> +	aesd		v0.16b, v3.16b
> +	aesimc		v0.16b, v0.16b
> +3:	ld1		{v2.4s}, [x0], #16
> +	subs		w3, w3, #3
> +	aesd		v0.16b, v1.16b
> +	aesimc		v0.16b, v0.16b
> +	ld1		{v3.4s}, [x0], #16
> +	bpl		1b
> +	aesd		v0.16b, v2.16b
> +	eor		v0.16b, v0.16b, v3.16b
> +	st1		{v0.16b}, [x1]
> +	ret
> +ENDPROC(__aes_ce_decrypt)
> +
> +/*
> + * __aes_ce_sub() - use the aese instruction to perform the AES sbox
> + *                  substitution on each byte in 'input'
> + */
> +ENTRY(__aes_ce_sub)
> +	dup		v1.4s, w0
> +	movi		v0.16b, #0
> +	aese		v0.16b, v1.16b
> +	umov		w0, v0.s[0]
> +	ret
> +ENDPROC(__aes_ce_sub)
> +
> +ENTRY(__aes_ce_invert)
> +	ld1		{v0.4s}, [x1]
> +	aesimc		v1.16b, v0.16b
> +	st1		{v1.4s}, [x0]
> +	ret
> +ENDPROC(__aes_ce_invert)
> diff --git a/arch/arm64/crypto/aes-ce-cipher.c
> b/arch/arm64/crypto/aes-ce-glue.c similarity index 62%
> rename from arch/arm64/crypto/aes-ce-cipher.c
> rename to arch/arm64/crypto/aes-ce-glue.c
> index 6a75cd75ed11..e6b3227bbf57 100644
> --- a/arch/arm64/crypto/aes-ce-cipher.c
> +++ b/arch/arm64/crypto/aes-ce-glue.c
> @@ -29,6 +29,13 @@ struct aes_block {
>  	u8 b[AES_BLOCK_SIZE];
>  };
>  
> +asmlinkage void __aes_ce_encrypt(u32 *rk, u8 *out, const u8 *in, int
> rounds); +asmlinkage void __aes_ce_decrypt(u32 *rk, u8 *out, const u8
> *in, int rounds); +
> +asmlinkage u32 __aes_ce_sub(u32 l);
> +asmlinkage void __aes_ce_invert(struct aes_block *out,
> +				const struct aes_block *in);
> +
>  static int num_rounds(struct crypto_aes_ctx *ctx)
>  {
>  	/*
> @@ -44,10 +51,6 @@ static int num_rounds(struct crypto_aes_ctx *ctx)
>  static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8
> const src[]) {
>  	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
> -	struct aes_block *out = (struct aes_block *)dst;
> -	struct aes_block const *in = (struct aes_block *)src;
> -	void *dummy0;
> -	int dummy1;
>  
>  	if (!may_use_simd()) {
>  		__aes_arm64_encrypt(ctx->key_enc, dst, src,
> num_rounds(ctx)); @@ -55,49 +58,13 @@ static void
> aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) }
>  
>  	kernel_neon_begin();
> -
> -	__asm__("	ld1	{v0.16b},
> %[in]			;"
> -		"	ld1	{v1.4s}, [%[key]],
> #16		;"
> -		"	cmp	%w[rounds],
> #10			;"
> -		"	bmi
> 0f				;"
> -		"	bne
> 3f				;"
> -		"	mov	v3.16b,
> v1.16b			;"
> -		"	b
> 2f				;"
> -		"0:	mov	v2.16b,
> v1.16b			;"
> -		"	ld1	{v3.4s}, [%[key]],
> #16		;"
> -		"1:	aese	v0.16b,
> v2.16b			;"
> -		"	aesmc	v0.16b,
> v0.16b			;"
> -		"2:	ld1	{v1.4s}, [%[key]],
> #16		;"
> -		"	aese	v0.16b,
> v3.16b			;"
> -		"	aesmc	v0.16b,
> v0.16b			;"
> -		"3:	ld1	{v2.4s}, [%[key]],
> #16		;"
> -		"	subs	%w[rounds], %w[rounds],
> #3	;"
> -		"	aese	v0.16b,
> v1.16b			;"
> -		"	aesmc	v0.16b,
> v0.16b			;"
> -		"	ld1	{v3.4s}, [%[key]],
> #16		;"
> -		"	bpl
> 1b				;"
> -		"	aese	v0.16b,
> v2.16b			;"
> -		"	eor	v0.16b, v0.16b,
> v3.16b		;"
> -		"	st1	{v0.16b},
> %[out]		;" -
> -	:	[out]		"=Q"(*out),
> -		[key]		"=r"(dummy0),
> -		[rounds]	"=r"(dummy1)
> -	:	[in]		"Q"(*in),
> -				"1"(ctx->key_enc),
> -				"2"(num_rounds(ctx) - 2)
> -	:	"cc");
> -
> +	__aes_ce_encrypt(ctx->key_enc, dst, src, num_rounds(ctx));
>  	kernel_neon_end();
>  }
>  
>  static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8
> const src[]) {
>  	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
> -	struct aes_block *out = (struct aes_block *)dst;
> -	struct aes_block const *in = (struct aes_block *)src;
> -	void *dummy0;
> -	int dummy1;
>  
>  	if (!may_use_simd()) {
>  		__aes_arm64_decrypt(ctx->key_dec, dst, src,
> num_rounds(ctx)); @@ -105,62 +72,10 @@ static void
> aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) }
>  
>  	kernel_neon_begin();
> -
> -	__asm__("	ld1	{v0.16b},
> %[in]			;"
> -		"	ld1	{v1.4s}, [%[key]],
> #16		;"
> -		"	cmp	%w[rounds],
> #10			;"
> -		"	bmi
> 0f				;"
> -		"	bne
> 3f				;"
> -		"	mov	v3.16b,
> v1.16b			;"
> -		"	b
> 2f				;"
> -		"0:	mov	v2.16b,
> v1.16b			;"
> -		"	ld1	{v3.4s}, [%[key]],
> #16		;"
> -		"1:	aesd	v0.16b,
> v2.16b			;"
> -		"	aesimc	v0.16b,
> v0.16b			;"
> -		"2:	ld1	{v1.4s}, [%[key]],
> #16		;"
> -		"	aesd	v0.16b,
> v3.16b			;"
> -		"	aesimc	v0.16b,
> v0.16b			;"
> -		"3:	ld1	{v2.4s}, [%[key]],
> #16		;"
> -		"	subs	%w[rounds], %w[rounds],
> #3	;"
> -		"	aesd	v0.16b,
> v1.16b			;"
> -		"	aesimc	v0.16b,
> v0.16b			;"
> -		"	ld1	{v3.4s}, [%[key]],
> #16		;"
> -		"	bpl
> 1b				;"
> -		"	aesd	v0.16b,
> v2.16b			;"
> -		"	eor	v0.16b, v0.16b,
> v3.16b		;"
> -		"	st1	{v0.16b},
> %[out]		;" -
> -	:	[out]		"=Q"(*out),
> -		[key]		"=r"(dummy0),
> -		[rounds]	"=r"(dummy1)
> -	:	[in]		"Q"(*in),
> -				"1"(ctx->key_dec),
> -				"2"(num_rounds(ctx) - 2)
> -	:	"cc");
> -
> +	__aes_ce_decrypt(ctx->key_dec, dst, src, num_rounds(ctx));
>  	kernel_neon_end();
>  }
>  
> -/*
> - * aes_sub() - use the aese instruction to perform the AES sbox
> substitution
> - *             on each byte in 'input'
> - */
> -static u32 aes_sub(u32 input)
> -{
> -	u32 ret;
> -
> -	__asm__("dup	v1.4s, %w[in]		;"
> -		"movi	v0.16b, #0		;"
> -		"aese	v0.16b, v1.16b		;"
> -		"umov	%w[out], v0.4s[0]	;"
> -
> -	:	[out]	"=r"(ret)
> -	:	[in]	"r"(input)
> -	:		"v0","v1");
> -
> -	return ret;
> -}
> -
>  int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
>  		     unsigned int key_len)
>  {
> @@ -189,7 +104,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx,
> const u8 *in_key, u32 *rki = ctx->key_enc + (i * kwords);
>  		u32 *rko = rki + kwords;
>  
> -		rko[0] = ror32(aes_sub(rki[kwords - 1]), 8) ^
> rcon[i] ^ rki[0];
> +		rko[0] = ror32(__aes_ce_sub(rki[kwords - 1]), 8) ^
> rcon[i] ^ rki[0]; rko[1] = rko[0] ^ rki[1];
>  		rko[2] = rko[1] ^ rki[2];
>  		rko[3] = rko[2] ^ rki[3];
> @@ -202,7 +117,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx,
> const u8 *in_key, } else if (key_len == AES_KEYSIZE_256) {
>  			if (i >= 6)
>  				break;
> -			rko[4] = aes_sub(rko[3]) ^ rki[4];
> +			rko[4] = __aes_ce_sub(rko[3]) ^ rki[4];
>  			rko[5] = rko[4] ^ rki[5];
>  			rko[6] = rko[5] ^ rki[6];
>  			rko[7] = rko[6] ^ rki[7];
> @@ -221,13 +136,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx,
> const u8 *in_key, 
>  	key_dec[0] = key_enc[j];
>  	for (i = 1, j--; j > 0; i++, j--)
> -		__asm__("ld1	{v0.4s}, %[in]		;"
> -			"aesimc	v1.16b,
> v0.16b		;"
> -			"st1	{v1.4s}, %[out]	;"
> -
> -		:	[out]	"=Q"(key_dec[i])
> -		:	[in]	"Q"(key_enc[j])
> -		:		"v0","v1");
> +		__aes_ce_invert(key_dec + i, key_enc + j);
>  	key_dec[i] = key_enc[0];
>  
>  	kernel_neon_end();

Regards,
Alex

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] crypto/arm64: aes-ce-cipher - move assembler code to .S file
  2017-11-22 10:05   ` Alex Matveev
@ 2017-11-22 10:12     ` Ard Biesheuvel
  -1 siblings, 0 replies; 14+ messages in thread
From: Ard Biesheuvel @ 2017-11-22 10:12 UTC (permalink / raw)
  To: Alex Matveev
  Cc: linux-arm-kernel, linux-crypto, Mark Rutland, Sami Tolvanen,
	Andi Kleen, Greg Hackmann, Kees Cook, Michal Marek,
	Nick Desaulniers, Yury Norov, Matthias Kaehlcke, Will Deacon,
	Herbert Xu

On 22 November 2017 at 10:05, Alex Matveev <alxmtvv@gmail.com> wrote:
> This is better than my simple fix, thank you.
>
> Out of curiosity, why doesn't NEON code use barrier() to prevent
> reordering?
>

Because barrier() affects ordering of memory accesses, not register accesses.

> On Tue, 21 Nov 2017 13:40:17 +0000
> Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
>
>> Most crypto drivers involving kernel mode NEON take care to put the
>> code that actually touches the NEON register file in a separate
>> compilation unit, to prevent the compiler from reordering code that
>> preserves or restores the NEON context with code that may corrupt it.
>> This is necessary because we currently have no way to express the
>> restrictions imposed upon use of the NEON in kernel mode in a way
>> that the compiler understands.
>>
>> However, in the case of aes-ce-cipher, it did not seem unreasonable to
>> deviate from this rule, given how it does not seem possible for the
>> compiler to reorder cross object function calls with asm blocks whose
>> in- and output constraints reflect that it reads from and writes to
>> memory.
>>
>> Now that LTO is being proposed for the arm64 kernel, it is time to
>> revisit this. The link time optimization may replace the function
>> calls to kernel_neon_begin() and kernel_neon_end() with instantiations
>> of the IR that make up its implementation, allowing further reordering
>> with the asm block.
>>
>> So let's clean this up, and move the asm() blocks into a separate .S
>> file.
>>
>> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
>> ---
>>  arch/arm64/crypto/Makefile                         |   2 +-
>>  arch/arm64/crypto/aes-ce-core.S                    |  87
>> ++++++++++++++++ .../crypto/{aes-ce-cipher.c => aes-ce-glue.c}      |
>> 115 +++------------------ 3 files changed, 100 insertions(+), 104
>> deletions(-) create mode 100644 arch/arm64/crypto/aes-ce-core.S
>>  rename arch/arm64/crypto/{aes-ce-cipher.c => aes-ce-glue.c} (62%)
>>
>> diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
>> index b5edc5918c28..f5e8295fd756 100644
>> --- a/arch/arm64/crypto/Makefile
>> +++ b/arch/arm64/crypto/Makefile
>> @@ -24,7 +24,7 @@ obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o
>>  crc32-ce-y:= crc32-ce-core.o crc32-ce-glue.o
>>
>>  obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
>> -CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
>> +aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o
>>
>>  obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o
>>  aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o
>> diff --git a/arch/arm64/crypto/aes-ce-core.S
>> b/arch/arm64/crypto/aes-ce-core.S new file mode 100644
>> index 000000000000..8efdfdade393
>> --- /dev/null
>> +++ b/arch/arm64/crypto/aes-ce-core.S
>> @@ -0,0 +1,87 @@
>> +/*
>> + * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
>> + *
>> + * This program is free software; you can redistribute it and/or
>> modify
>> + * it under the terms of the GNU General Public License version 2 as
>> + * published by the Free Software Foundation.
>> + */
>> +
>> +#include <linux/linkage.h>
>> +#include <asm/assembler.h>
>> +
>> +     .arch           armv8-a+crypto
>> +
>> +ENTRY(__aes_ce_encrypt)
>> +     sub             w3, w3, #2
>> +     ld1             {v0.16b}, [x2]
>> +     ld1             {v1.4s}, [x0], #16
>> +     cmp             w3, #10
>> +     bmi             0f
>> +     bne             3f
>> +     mov             v3.16b, v1.16b
>> +     b               2f
>> +0:   mov             v2.16b, v1.16b
>> +     ld1             {v3.4s}, [x0], #16
>> +1:   aese            v0.16b, v2.16b
>> +     aesmc           v0.16b, v0.16b
>> +2:   ld1             {v1.4s}, [x0], #16
>> +     aese            v0.16b, v3.16b
>> +     aesmc           v0.16b, v0.16b
>> +3:   ld1             {v2.4s}, [x0], #16
>> +     subs            w3, w3, #3
>> +     aese            v0.16b, v1.16b
>> +     aesmc           v0.16b, v0.16b
>> +     ld1             {v3.4s}, [x0], #16
>> +     bpl             1b
>> +     aese            v0.16b, v2.16b
>> +     eor             v0.16b, v0.16b, v3.16b
>> +     st1             {v0.16b}, [x1]
>> +     ret
>> +ENDPROC(__aes_ce_encrypt)
>> +
>> +ENTRY(__aes_ce_decrypt)
>> +     sub             w3, w3, #2
>> +     ld1             {v0.16b}, [x2]
>> +     ld1             {v1.4s}, [x0], #16
>> +     cmp             w3, #10
>> +     bmi             0f
>> +     bne             3f
>> +     mov             v3.16b, v1.16b
>> +     b               2f
>> +0:   mov             v2.16b, v1.16b
>> +     ld1             {v3.4s}, [x0], #16
>> +1:   aesd            v0.16b, v2.16b
>> +     aesimc          v0.16b, v0.16b
>> +2:   ld1             {v1.4s}, [x0], #16
>> +     aesd            v0.16b, v3.16b
>> +     aesimc          v0.16b, v0.16b
>> +3:   ld1             {v2.4s}, [x0], #16
>> +     subs            w3, w3, #3
>> +     aesd            v0.16b, v1.16b
>> +     aesimc          v0.16b, v0.16b
>> +     ld1             {v3.4s}, [x0], #16
>> +     bpl             1b
>> +     aesd            v0.16b, v2.16b
>> +     eor             v0.16b, v0.16b, v3.16b
>> +     st1             {v0.16b}, [x1]
>> +     ret
>> +ENDPROC(__aes_ce_decrypt)
>> +
>> +/*
>> + * __aes_ce_sub() - use the aese instruction to perform the AES sbox
>> + *                  substitution on each byte in 'input'
>> + */
>> +ENTRY(__aes_ce_sub)
>> +     dup             v1.4s, w0
>> +     movi            v0.16b, #0
>> +     aese            v0.16b, v1.16b
>> +     umov            w0, v0.s[0]
>> +     ret
>> +ENDPROC(__aes_ce_sub)
>> +
>> +ENTRY(__aes_ce_invert)
>> +     ld1             {v0.4s}, [x1]
>> +     aesimc          v1.16b, v0.16b
>> +     st1             {v1.4s}, [x0]
>> +     ret
>> +ENDPROC(__aes_ce_invert)
>> diff --git a/arch/arm64/crypto/aes-ce-cipher.c
>> b/arch/arm64/crypto/aes-ce-glue.c similarity index 62%
>> rename from arch/arm64/crypto/aes-ce-cipher.c
>> rename to arch/arm64/crypto/aes-ce-glue.c
>> index 6a75cd75ed11..e6b3227bbf57 100644
>> --- a/arch/arm64/crypto/aes-ce-cipher.c
>> +++ b/arch/arm64/crypto/aes-ce-glue.c
>> @@ -29,6 +29,13 @@ struct aes_block {
>>       u8 b[AES_BLOCK_SIZE];
>>  };
>>
>> +asmlinkage void __aes_ce_encrypt(u32 *rk, u8 *out, const u8 *in, int
>> rounds); +asmlinkage void __aes_ce_decrypt(u32 *rk, u8 *out, const u8
>> *in, int rounds); +
>> +asmlinkage u32 __aes_ce_sub(u32 l);
>> +asmlinkage void __aes_ce_invert(struct aes_block *out,
>> +                             const struct aes_block *in);
>> +
>>  static int num_rounds(struct crypto_aes_ctx *ctx)
>>  {
>>       /*
>> @@ -44,10 +51,6 @@ static int num_rounds(struct crypto_aes_ctx *ctx)
>>  static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8
>> const src[]) {
>>       struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
>> -     struct aes_block *out = (struct aes_block *)dst;
>> -     struct aes_block const *in = (struct aes_block *)src;
>> -     void *dummy0;
>> -     int dummy1;
>>
>>       if (!may_use_simd()) {
>>               __aes_arm64_encrypt(ctx->key_enc, dst, src,
>> num_rounds(ctx)); @@ -55,49 +58,13 @@ static void
>> aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) }
>>
>>       kernel_neon_begin();
>> -
>> -     __asm__("       ld1     {v0.16b},
>> %[in]                 ;"
>> -             "       ld1     {v1.4s}, [%[key]],
>> #16           ;"
>> -             "       cmp     %w[rounds],
>> #10                   ;"
>> -             "       bmi
>> 0f                            ;"
>> -             "       bne
>> 3f                            ;"
>> -             "       mov     v3.16b,
>> v1.16b                        ;"
>> -             "       b
>> 2f                            ;"
>> -             "0:     mov     v2.16b,
>> v1.16b                        ;"
>> -             "       ld1     {v3.4s}, [%[key]],
>> #16           ;"
>> -             "1:     aese    v0.16b,
>> v2.16b                        ;"
>> -             "       aesmc   v0.16b,
>> v0.16b                        ;"
>> -             "2:     ld1     {v1.4s}, [%[key]],
>> #16           ;"
>> -             "       aese    v0.16b,
>> v3.16b                        ;"
>> -             "       aesmc   v0.16b,
>> v0.16b                        ;"
>> -             "3:     ld1     {v2.4s}, [%[key]],
>> #16           ;"
>> -             "       subs    %w[rounds], %w[rounds],
>> #3    ;"
>> -             "       aese    v0.16b,
>> v1.16b                        ;"
>> -             "       aesmc   v0.16b,
>> v0.16b                        ;"
>> -             "       ld1     {v3.4s}, [%[key]],
>> #16           ;"
>> -             "       bpl
>> 1b                            ;"
>> -             "       aese    v0.16b,
>> v2.16b                        ;"
>> -             "       eor     v0.16b, v0.16b,
>> v3.16b                ;"
>> -             "       st1     {v0.16b},
>> %[out]                ;" -
>> -     :       [out]           "=Q"(*out),
>> -             [key]           "=r"(dummy0),
>> -             [rounds]        "=r"(dummy1)
>> -     :       [in]            "Q"(*in),
>> -                             "1"(ctx->key_enc),
>> -                             "2"(num_rounds(ctx) - 2)
>> -     :       "cc");
>> -
>> +     __aes_ce_encrypt(ctx->key_enc, dst, src, num_rounds(ctx));
>>       kernel_neon_end();
>>  }
>>
>>  static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8
>> const src[]) {
>>       struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
>> -     struct aes_block *out = (struct aes_block *)dst;
>> -     struct aes_block const *in = (struct aes_block *)src;
>> -     void *dummy0;
>> -     int dummy1;
>>
>>       if (!may_use_simd()) {
>>               __aes_arm64_decrypt(ctx->key_dec, dst, src,
>> num_rounds(ctx)); @@ -105,62 +72,10 @@ static void
>> aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) }
>>
>>       kernel_neon_begin();
>> -
>> -     __asm__("       ld1     {v0.16b},
>> %[in]                 ;"
>> -             "       ld1     {v1.4s}, [%[key]],
>> #16           ;"
>> -             "       cmp     %w[rounds],
>> #10                   ;"
>> -             "       bmi
>> 0f                            ;"
>> -             "       bne
>> 3f                            ;"
>> -             "       mov     v3.16b,
>> v1.16b                        ;"
>> -             "       b
>> 2f                            ;"
>> -             "0:     mov     v2.16b,
>> v1.16b                        ;"
>> -             "       ld1     {v3.4s}, [%[key]],
>> #16           ;"
>> -             "1:     aesd    v0.16b,
>> v2.16b                        ;"
>> -             "       aesimc  v0.16b,
>> v0.16b                        ;"
>> -             "2:     ld1     {v1.4s}, [%[key]],
>> #16           ;"
>> -             "       aesd    v0.16b,
>> v3.16b                        ;"
>> -             "       aesimc  v0.16b,
>> v0.16b                        ;"
>> -             "3:     ld1     {v2.4s}, [%[key]],
>> #16           ;"
>> -             "       subs    %w[rounds], %w[rounds],
>> #3    ;"
>> -             "       aesd    v0.16b,
>> v1.16b                        ;"
>> -             "       aesimc  v0.16b,
>> v0.16b                        ;"
>> -             "       ld1     {v3.4s}, [%[key]],
>> #16           ;"
>> -             "       bpl
>> 1b                            ;"
>> -             "       aesd    v0.16b,
>> v2.16b                        ;"
>> -             "       eor     v0.16b, v0.16b,
>> v3.16b                ;"
>> -             "       st1     {v0.16b},
>> %[out]                ;" -
>> -     :       [out]           "=Q"(*out),
>> -             [key]           "=r"(dummy0),
>> -             [rounds]        "=r"(dummy1)
>> -     :       [in]            "Q"(*in),
>> -                             "1"(ctx->key_dec),
>> -                             "2"(num_rounds(ctx) - 2)
>> -     :       "cc");
>> -
>> +     __aes_ce_decrypt(ctx->key_dec, dst, src, num_rounds(ctx));
>>       kernel_neon_end();
>>  }
>>
>> -/*
>> - * aes_sub() - use the aese instruction to perform the AES sbox
>> substitution
>> - *             on each byte in 'input'
>> - */
>> -static u32 aes_sub(u32 input)
>> -{
>> -     u32 ret;
>> -
>> -     __asm__("dup    v1.4s, %w[in]           ;"
>> -             "movi   v0.16b, #0              ;"
>> -             "aese   v0.16b, v1.16b          ;"
>> -             "umov   %w[out], v0.4s[0]       ;"
>> -
>> -     :       [out]   "=r"(ret)
>> -     :       [in]    "r"(input)
>> -     :               "v0","v1");
>> -
>> -     return ret;
>> -}
>> -
>>  int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
>>                    unsigned int key_len)
>>  {
>> @@ -189,7 +104,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx,
>> const u8 *in_key, u32 *rki = ctx->key_enc + (i * kwords);
>>               u32 *rko = rki + kwords;
>>
>> -             rko[0] = ror32(aes_sub(rki[kwords - 1]), 8) ^
>> rcon[i] ^ rki[0];
>> +             rko[0] = ror32(__aes_ce_sub(rki[kwords - 1]), 8) ^
>> rcon[i] ^ rki[0]; rko[1] = rko[0] ^ rki[1];
>>               rko[2] = rko[1] ^ rki[2];
>>               rko[3] = rko[2] ^ rki[3];
>> @@ -202,7 +117,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx,
>> const u8 *in_key, } else if (key_len == AES_KEYSIZE_256) {
>>                       if (i >= 6)
>>                               break;
>> -                     rko[4] = aes_sub(rko[3]) ^ rki[4];
>> +                     rko[4] = __aes_ce_sub(rko[3]) ^ rki[4];
>>                       rko[5] = rko[4] ^ rki[5];
>>                       rko[6] = rko[5] ^ rki[6];
>>                       rko[7] = rko[6] ^ rki[7];
>> @@ -221,13 +136,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx,
>> const u8 *in_key,
>>       key_dec[0] = key_enc[j];
>>       for (i = 1, j--; j > 0; i++, j--)
>> -             __asm__("ld1    {v0.4s}, %[in]          ;"
>> -                     "aesimc v1.16b,
>> v0.16b                ;"
>> -                     "st1    {v1.4s}, %[out] ;"
>> -
>> -             :       [out]   "=Q"(key_dec[i])
>> -             :       [in]    "Q"(key_enc[j])
>> -             :               "v0","v1");
>> +             __aes_ce_invert(key_dec + i, key_enc + j);
>>       key_dec[i] = key_enc[0];
>>
>>       kernel_neon_end();
>
> Regards,
> Alex

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] crypto/arm64: aes-ce-cipher - move assembler code to .S file
@ 2017-11-22 10:12     ` Ard Biesheuvel
  0 siblings, 0 replies; 14+ messages in thread
From: Ard Biesheuvel @ 2017-11-22 10:12 UTC (permalink / raw)
  To: linux-arm-kernel

On 22 November 2017 at 10:05, Alex Matveev <alxmtvv@gmail.com> wrote:
> This is better than my simple fix, thank you.
>
> Out of curiosity, why doesn't NEON code use barrier() to prevent
> reordering?
>

Because barrier() affects ordering of memory accesses, not register accesses.

> On Tue, 21 Nov 2017 13:40:17 +0000
> Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
>
>> Most crypto drivers involving kernel mode NEON take care to put the
>> code that actually touches the NEON register file in a separate
>> compilation unit, to prevent the compiler from reordering code that
>> preserves or restores the NEON context with code that may corrupt it.
>> This is necessary because we currently have no way to express the
>> restrictions imposed upon use of the NEON in kernel mode in a way
>> that the compiler understands.
>>
>> However, in the case of aes-ce-cipher, it did not seem unreasonable to
>> deviate from this rule, given how it does not seem possible for the
>> compiler to reorder cross object function calls with asm blocks whose
>> in- and output constraints reflect that it reads from and writes to
>> memory.
>>
>> Now that LTO is being proposed for the arm64 kernel, it is time to
>> revisit this. The link time optimization may replace the function
>> calls to kernel_neon_begin() and kernel_neon_end() with instantiations
>> of the IR that make up its implementation, allowing further reordering
>> with the asm block.
>>
>> So let's clean this up, and move the asm() blocks into a separate .S
>> file.
>>
>> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
>> ---
>>  arch/arm64/crypto/Makefile                         |   2 +-
>>  arch/arm64/crypto/aes-ce-core.S                    |  87
>> ++++++++++++++++ .../crypto/{aes-ce-cipher.c => aes-ce-glue.c}      |
>> 115 +++------------------ 3 files changed, 100 insertions(+), 104
>> deletions(-) create mode 100644 arch/arm64/crypto/aes-ce-core.S
>>  rename arch/arm64/crypto/{aes-ce-cipher.c => aes-ce-glue.c} (62%)
>>
>> diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
>> index b5edc5918c28..f5e8295fd756 100644
>> --- a/arch/arm64/crypto/Makefile
>> +++ b/arch/arm64/crypto/Makefile
>> @@ -24,7 +24,7 @@ obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o
>>  crc32-ce-y:= crc32-ce-core.o crc32-ce-glue.o
>>
>>  obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
>> -CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
>> +aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o
>>
>>  obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o
>>  aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o
>> diff --git a/arch/arm64/crypto/aes-ce-core.S
>> b/arch/arm64/crypto/aes-ce-core.S new file mode 100644
>> index 000000000000..8efdfdade393
>> --- /dev/null
>> +++ b/arch/arm64/crypto/aes-ce-core.S
>> @@ -0,0 +1,87 @@
>> +/*
>> + * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
>> + *
>> + * This program is free software; you can redistribute it and/or
>> modify
>> + * it under the terms of the GNU General Public License version 2 as
>> + * published by the Free Software Foundation.
>> + */
>> +
>> +#include <linux/linkage.h>
>> +#include <asm/assembler.h>
>> +
>> +     .arch           armv8-a+crypto
>> +
>> +ENTRY(__aes_ce_encrypt)
>> +     sub             w3, w3, #2
>> +     ld1             {v0.16b}, [x2]
>> +     ld1             {v1.4s}, [x0], #16
>> +     cmp             w3, #10
>> +     bmi             0f
>> +     bne             3f
>> +     mov             v3.16b, v1.16b
>> +     b               2f
>> +0:   mov             v2.16b, v1.16b
>> +     ld1             {v3.4s}, [x0], #16
>> +1:   aese            v0.16b, v2.16b
>> +     aesmc           v0.16b, v0.16b
>> +2:   ld1             {v1.4s}, [x0], #16
>> +     aese            v0.16b, v3.16b
>> +     aesmc           v0.16b, v0.16b
>> +3:   ld1             {v2.4s}, [x0], #16
>> +     subs            w3, w3, #3
>> +     aese            v0.16b, v1.16b
>> +     aesmc           v0.16b, v0.16b
>> +     ld1             {v3.4s}, [x0], #16
>> +     bpl             1b
>> +     aese            v0.16b, v2.16b
>> +     eor             v0.16b, v0.16b, v3.16b
>> +     st1             {v0.16b}, [x1]
>> +     ret
>> +ENDPROC(__aes_ce_encrypt)
>> +
>> +ENTRY(__aes_ce_decrypt)
>> +     sub             w3, w3, #2
>> +     ld1             {v0.16b}, [x2]
>> +     ld1             {v1.4s}, [x0], #16
>> +     cmp             w3, #10
>> +     bmi             0f
>> +     bne             3f
>> +     mov             v3.16b, v1.16b
>> +     b               2f
>> +0:   mov             v2.16b, v1.16b
>> +     ld1             {v3.4s}, [x0], #16
>> +1:   aesd            v0.16b, v2.16b
>> +     aesimc          v0.16b, v0.16b
>> +2:   ld1             {v1.4s}, [x0], #16
>> +     aesd            v0.16b, v3.16b
>> +     aesimc          v0.16b, v0.16b
>> +3:   ld1             {v2.4s}, [x0], #16
>> +     subs            w3, w3, #3
>> +     aesd            v0.16b, v1.16b
>> +     aesimc          v0.16b, v0.16b
>> +     ld1             {v3.4s}, [x0], #16
>> +     bpl             1b
>> +     aesd            v0.16b, v2.16b
>> +     eor             v0.16b, v0.16b, v3.16b
>> +     st1             {v0.16b}, [x1]
>> +     ret
>> +ENDPROC(__aes_ce_decrypt)
>> +
>> +/*
>> + * __aes_ce_sub() - use the aese instruction to perform the AES sbox
>> + *                  substitution on each byte in 'input'
>> + */
>> +ENTRY(__aes_ce_sub)
>> +     dup             v1.4s, w0
>> +     movi            v0.16b, #0
>> +     aese            v0.16b, v1.16b
>> +     umov            w0, v0.s[0]
>> +     ret
>> +ENDPROC(__aes_ce_sub)
>> +
>> +ENTRY(__aes_ce_invert)
>> +     ld1             {v0.4s}, [x1]
>> +     aesimc          v1.16b, v0.16b
>> +     st1             {v1.4s}, [x0]
>> +     ret
>> +ENDPROC(__aes_ce_invert)
>> diff --git a/arch/arm64/crypto/aes-ce-cipher.c
>> b/arch/arm64/crypto/aes-ce-glue.c similarity index 62%
>> rename from arch/arm64/crypto/aes-ce-cipher.c
>> rename to arch/arm64/crypto/aes-ce-glue.c
>> index 6a75cd75ed11..e6b3227bbf57 100644
>> --- a/arch/arm64/crypto/aes-ce-cipher.c
>> +++ b/arch/arm64/crypto/aes-ce-glue.c
>> @@ -29,6 +29,13 @@ struct aes_block {
>>       u8 b[AES_BLOCK_SIZE];
>>  };
>>
>> +asmlinkage void __aes_ce_encrypt(u32 *rk, u8 *out, const u8 *in, int
>> rounds); +asmlinkage void __aes_ce_decrypt(u32 *rk, u8 *out, const u8
>> *in, int rounds); +
>> +asmlinkage u32 __aes_ce_sub(u32 l);
>> +asmlinkage void __aes_ce_invert(struct aes_block *out,
>> +                             const struct aes_block *in);
>> +
>>  static int num_rounds(struct crypto_aes_ctx *ctx)
>>  {
>>       /*
>> @@ -44,10 +51,6 @@ static int num_rounds(struct crypto_aes_ctx *ctx)
>>  static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8
>> const src[]) {
>>       struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
>> -     struct aes_block *out = (struct aes_block *)dst;
>> -     struct aes_block const *in = (struct aes_block *)src;
>> -     void *dummy0;
>> -     int dummy1;
>>
>>       if (!may_use_simd()) {
>>               __aes_arm64_encrypt(ctx->key_enc, dst, src,
>> num_rounds(ctx)); @@ -55,49 +58,13 @@ static void
>> aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) }
>>
>>       kernel_neon_begin();
>> -
>> -     __asm__("       ld1     {v0.16b},
>> %[in]                 ;"
>> -             "       ld1     {v1.4s}, [%[key]],
>> #16           ;"
>> -             "       cmp     %w[rounds],
>> #10                   ;"
>> -             "       bmi
>> 0f                            ;"
>> -             "       bne
>> 3f                            ;"
>> -             "       mov     v3.16b,
>> v1.16b                        ;"
>> -             "       b
>> 2f                            ;"
>> -             "0:     mov     v2.16b,
>> v1.16b                        ;"
>> -             "       ld1     {v3.4s}, [%[key]],
>> #16           ;"
>> -             "1:     aese    v0.16b,
>> v2.16b                        ;"
>> -             "       aesmc   v0.16b,
>> v0.16b                        ;"
>> -             "2:     ld1     {v1.4s}, [%[key]],
>> #16           ;"
>> -             "       aese    v0.16b,
>> v3.16b                        ;"
>> -             "       aesmc   v0.16b,
>> v0.16b                        ;"
>> -             "3:     ld1     {v2.4s}, [%[key]],
>> #16           ;"
>> -             "       subs    %w[rounds], %w[rounds],
>> #3    ;"
>> -             "       aese    v0.16b,
>> v1.16b                        ;"
>> -             "       aesmc   v0.16b,
>> v0.16b                        ;"
>> -             "       ld1     {v3.4s}, [%[key]],
>> #16           ;"
>> -             "       bpl
>> 1b                            ;"
>> -             "       aese    v0.16b,
>> v2.16b                        ;"
>> -             "       eor     v0.16b, v0.16b,
>> v3.16b                ;"
>> -             "       st1     {v0.16b},
>> %[out]                ;" -
>> -     :       [out]           "=Q"(*out),
>> -             [key]           "=r"(dummy0),
>> -             [rounds]        "=r"(dummy1)
>> -     :       [in]            "Q"(*in),
>> -                             "1"(ctx->key_enc),
>> -                             "2"(num_rounds(ctx) - 2)
>> -     :       "cc");
>> -
>> +     __aes_ce_encrypt(ctx->key_enc, dst, src, num_rounds(ctx));
>>       kernel_neon_end();
>>  }
>>
>>  static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8
>> const src[]) {
>>       struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
>> -     struct aes_block *out = (struct aes_block *)dst;
>> -     struct aes_block const *in = (struct aes_block *)src;
>> -     void *dummy0;
>> -     int dummy1;
>>
>>       if (!may_use_simd()) {
>>               __aes_arm64_decrypt(ctx->key_dec, dst, src,
>> num_rounds(ctx)); @@ -105,62 +72,10 @@ static void
>> aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) }
>>
>>       kernel_neon_begin();
>> -
>> -     __asm__("       ld1     {v0.16b},
>> %[in]                 ;"
>> -             "       ld1     {v1.4s}, [%[key]],
>> #16           ;"
>> -             "       cmp     %w[rounds],
>> #10                   ;"
>> -             "       bmi
>> 0f                            ;"
>> -             "       bne
>> 3f                            ;"
>> -             "       mov     v3.16b,
>> v1.16b                        ;"
>> -             "       b
>> 2f                            ;"
>> -             "0:     mov     v2.16b,
>> v1.16b                        ;"
>> -             "       ld1     {v3.4s}, [%[key]],
>> #16           ;"
>> -             "1:     aesd    v0.16b,
>> v2.16b                        ;"
>> -             "       aesimc  v0.16b,
>> v0.16b                        ;"
>> -             "2:     ld1     {v1.4s}, [%[key]],
>> #16           ;"
>> -             "       aesd    v0.16b,
>> v3.16b                        ;"
>> -             "       aesimc  v0.16b,
>> v0.16b                        ;"
>> -             "3:     ld1     {v2.4s}, [%[key]],
>> #16           ;"
>> -             "       subs    %w[rounds], %w[rounds],
>> #3    ;"
>> -             "       aesd    v0.16b,
>> v1.16b                        ;"
>> -             "       aesimc  v0.16b,
>> v0.16b                        ;"
>> -             "       ld1     {v3.4s}, [%[key]],
>> #16           ;"
>> -             "       bpl
>> 1b                            ;"
>> -             "       aesd    v0.16b,
>> v2.16b                        ;"
>> -             "       eor     v0.16b, v0.16b,
>> v3.16b                ;"
>> -             "       st1     {v0.16b},
>> %[out]                ;" -
>> -     :       [out]           "=Q"(*out),
>> -             [key]           "=r"(dummy0),
>> -             [rounds]        "=r"(dummy1)
>> -     :       [in]            "Q"(*in),
>> -                             "1"(ctx->key_dec),
>> -                             "2"(num_rounds(ctx) - 2)
>> -     :       "cc");
>> -
>> +     __aes_ce_decrypt(ctx->key_dec, dst, src, num_rounds(ctx));
>>       kernel_neon_end();
>>  }
>>
>> -/*
>> - * aes_sub() - use the aese instruction to perform the AES sbox
>> substitution
>> - *             on each byte in 'input'
>> - */
>> -static u32 aes_sub(u32 input)
>> -{
>> -     u32 ret;
>> -
>> -     __asm__("dup    v1.4s, %w[in]           ;"
>> -             "movi   v0.16b, #0              ;"
>> -             "aese   v0.16b, v1.16b          ;"
>> -             "umov   %w[out], v0.4s[0]       ;"
>> -
>> -     :       [out]   "=r"(ret)
>> -     :       [in]    "r"(input)
>> -     :               "v0","v1");
>> -
>> -     return ret;
>> -}
>> -
>>  int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
>>                    unsigned int key_len)
>>  {
>> @@ -189,7 +104,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx,
>> const u8 *in_key, u32 *rki = ctx->key_enc + (i * kwords);
>>               u32 *rko = rki + kwords;
>>
>> -             rko[0] = ror32(aes_sub(rki[kwords - 1]), 8) ^
>> rcon[i] ^ rki[0];
>> +             rko[0] = ror32(__aes_ce_sub(rki[kwords - 1]), 8) ^
>> rcon[i] ^ rki[0]; rko[1] = rko[0] ^ rki[1];
>>               rko[2] = rko[1] ^ rki[2];
>>               rko[3] = rko[2] ^ rki[3];
>> @@ -202,7 +117,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx,
>> const u8 *in_key, } else if (key_len == AES_KEYSIZE_256) {
>>                       if (i >= 6)
>>                               break;
>> -                     rko[4] = aes_sub(rko[3]) ^ rki[4];
>> +                     rko[4] = __aes_ce_sub(rko[3]) ^ rki[4];
>>                       rko[5] = rko[4] ^ rki[5];
>>                       rko[6] = rko[5] ^ rki[6];
>>                       rko[7] = rko[6] ^ rki[7];
>> @@ -221,13 +136,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx,
>> const u8 *in_key,
>>       key_dec[0] = key_enc[j];
>>       for (i = 1, j--; j > 0; i++, j--)
>> -             __asm__("ld1    {v0.4s}, %[in]          ;"
>> -                     "aesimc v1.16b,
>> v0.16b                ;"
>> -                     "st1    {v1.4s}, %[out] ;"
>> -
>> -             :       [out]   "=Q"(key_dec[i])
>> -             :       [in]    "Q"(key_enc[j])
>> -             :               "v0","v1");
>> +             __aes_ce_invert(key_dec + i, key_enc + j);
>>       key_dec[i] = key_enc[0];
>>
>>       kernel_neon_end();
>
> Regards,
> Alex

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] crypto/arm64: aes-ce-cipher - move assembler code to .S file
  2017-11-22 10:12     ` Ard Biesheuvel
@ 2017-11-22 12:04       ` Mark Rutland
  -1 siblings, 0 replies; 14+ messages in thread
From: Mark Rutland @ 2017-11-22 12:04 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Alex Matveev, linux-arm-kernel, linux-crypto, Sami Tolvanen,
	Andi Kleen, Greg Hackmann, Kees Cook, Michal Marek,
	Nick Desaulniers, Yury Norov, Matthias Kaehlcke, Will Deacon,
	Herbert Xu

On Wed, Nov 22, 2017 at 10:12:17AM +0000, Ard Biesheuvel wrote:
> On 22 November 2017 at 10:05, Alex Matveev <alxmtvv@gmail.com> wrote:
> > This is better than my simple fix, thank you.
> >
> > Out of curiosity, why doesn't NEON code use barrier() to prevent
> > reordering?
> 
> Because barrier() affects ordering of memory accesses, not register accesses.

My understanding was that barrier() also hazards with any volatile asm
(and that in general, volatile asm block couldn't be re-ordered w.r.t.
one another). I'm sure we rely on this, even if it's not true. :/

So making the asm statements volatile might work too.

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] crypto/arm64: aes-ce-cipher - move assembler code to .S file
@ 2017-11-22 12:04       ` Mark Rutland
  0 siblings, 0 replies; 14+ messages in thread
From: Mark Rutland @ 2017-11-22 12:04 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Nov 22, 2017 at 10:12:17AM +0000, Ard Biesheuvel wrote:
> On 22 November 2017 at 10:05, Alex Matveev <alxmtvv@gmail.com> wrote:
> > This is better than my simple fix, thank you.
> >
> > Out of curiosity, why doesn't NEON code use barrier() to prevent
> > reordering?
> 
> Because barrier() affects ordering of memory accesses, not register accesses.

My understanding was that barrier() also hazards with any volatile asm
(and that in general, volatile asm block couldn't be re-ordered w.r.t.
one another). I'm sure we rely on this, even if it's not true. :/

So making the asm statements volatile might work too.

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] crypto/arm64: aes-ce-cipher - move assembler code to .S file
  2017-11-22 12:04       ` Mark Rutland
@ 2017-11-22 12:17         ` Mark Rutland
  -1 siblings, 0 replies; 14+ messages in thread
From: Mark Rutland @ 2017-11-22 12:17 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Andi Kleen, Kees Cook, Will Deacon, Nick Desaulniers,
	Greg Hackmann, Michal Marek, Yury Norov, linux-crypto,
	Sami Tolvanen, Alex Matveev, Matthias Kaehlcke, linux-arm-kernel,
	Herbert Xu

On Wed, Nov 22, 2017 at 12:04:58PM +0000, Mark Rutland wrote:
> On Wed, Nov 22, 2017 at 10:12:17AM +0000, Ard Biesheuvel wrote:
> > On 22 November 2017 at 10:05, Alex Matveev <alxmtvv@gmail.com> wrote:
> > > This is better than my simple fix, thank you.
> > >
> > > Out of curiosity, why doesn't NEON code use barrier() to prevent
> > > reordering?
> > 
> > Because barrier() affects ordering of memory accesses, not register accesses.
> 
> My understanding was that barrier() also hazards with any volatile asm
> (and that in general, volatile asm block couldn't be re-ordered w.r.t.
> one another). I'm sure we rely on this, even if it's not true. :/

>From a quick scan, certainly in tls_thread_flush(),
compat_arm_syscall(), elsewhere where we have isb() and other helpers
like {read,write}_sysreg().

Mark.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] crypto/arm64: aes-ce-cipher - move assembler code to .S file
@ 2017-11-22 12:17         ` Mark Rutland
  0 siblings, 0 replies; 14+ messages in thread
From: Mark Rutland @ 2017-11-22 12:17 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Nov 22, 2017 at 12:04:58PM +0000, Mark Rutland wrote:
> On Wed, Nov 22, 2017 at 10:12:17AM +0000, Ard Biesheuvel wrote:
> > On 22 November 2017 at 10:05, Alex Matveev <alxmtvv@gmail.com> wrote:
> > > This is better than my simple fix, thank you.
> > >
> > > Out of curiosity, why doesn't NEON code use barrier() to prevent
> > > reordering?
> > 
> > Because barrier() affects ordering of memory accesses, not register accesses.
> 
> My understanding was that barrier() also hazards with any volatile asm
> (and that in general, volatile asm block couldn't be re-ordered w.r.t.
> one another). I'm sure we rely on this, even if it's not true. :/

>From a quick scan, certainly in tls_thread_flush(),
compat_arm_syscall(), elsewhere where we have isb() and other helpers
like {read,write}_sysreg().

Mark.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] crypto/arm64: aes-ce-cipher - move assembler code to .S file
  2017-11-21 13:40 ` Ard Biesheuvel
@ 2017-11-29  6:38   ` Herbert Xu
  -1 siblings, 0 replies; 14+ messages in thread
From: Herbert Xu @ 2017-11-29  6:38 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: linux-arm-kernel, linux-crypto, mark.rutland, samitolvanen,
	alxmtvv, ak, ghackmann, keescook, michal.lkml, ndesaulniers,
	ynorov, mka, will.deacon

On Tue, Nov 21, 2017 at 01:40:17PM +0000, Ard Biesheuvel wrote:
> Most crypto drivers involving kernel mode NEON take care to put the code
> that actually touches the NEON register file in a separate compilation
> unit, to prevent the compiler from reordering code that preserves or
> restores the NEON context with code that may corrupt it. This is
> necessary because we currently have no way to express the restrictions
> imposed upon use of the NEON in kernel mode in a way that the compiler
> understands.
> 
> However, in the case of aes-ce-cipher, it did not seem unreasonable to
> deviate from this rule, given how it does not seem possible for the
> compiler to reorder cross object function calls with asm blocks whose
> in- and output constraints reflect that it reads from and writes to
> memory.
> 
> Now that LTO is being proposed for the arm64 kernel, it is time to
> revisit this. The link time optimization may replace the function
> calls to kernel_neon_begin() and kernel_neon_end() with instantiations
> of the IR that make up its implementation, allowing further reordering
> with the asm block.
> 
> So let's clean this up, and move the asm() blocks into a separate .S
> file.
> 
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

Patch applied.  Thanks.
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] crypto/arm64: aes-ce-cipher - move assembler code to .S file
@ 2017-11-29  6:38   ` Herbert Xu
  0 siblings, 0 replies; 14+ messages in thread
From: Herbert Xu @ 2017-11-29  6:38 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Nov 21, 2017 at 01:40:17PM +0000, Ard Biesheuvel wrote:
> Most crypto drivers involving kernel mode NEON take care to put the code
> that actually touches the NEON register file in a separate compilation
> unit, to prevent the compiler from reordering code that preserves or
> restores the NEON context with code that may corrupt it. This is
> necessary because we currently have no way to express the restrictions
> imposed upon use of the NEON in kernel mode in a way that the compiler
> understands.
> 
> However, in the case of aes-ce-cipher, it did not seem unreasonable to
> deviate from this rule, given how it does not seem possible for the
> compiler to reorder cross object function calls with asm blocks whose
> in- and output constraints reflect that it reads from and writes to
> memory.
> 
> Now that LTO is being proposed for the arm64 kernel, it is time to
> revisit this. The link time optimization may replace the function
> calls to kernel_neon_begin() and kernel_neon_end() with instantiations
> of the IR that make up its implementation, allowing further reordering
> with the asm block.
> 
> So let's clean this up, and move the asm() blocks into a separate .S
> file.
> 
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

Patch applied.  Thanks.
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2017-11-29  6:39 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-11-21 13:40 [PATCH] crypto/arm64: aes-ce-cipher - move assembler code to .S file Ard Biesheuvel
2017-11-21 13:40 ` Ard Biesheuvel
2017-11-21 17:32 ` Nick Desaulniers
2017-11-21 17:32   ` Nick Desaulniers
2017-11-22 10:05 ` Alex Matveev
2017-11-22 10:05   ` Alex Matveev
2017-11-22 10:12   ` Ard Biesheuvel
2017-11-22 10:12     ` Ard Biesheuvel
2017-11-22 12:04     ` Mark Rutland
2017-11-22 12:04       ` Mark Rutland
2017-11-22 12:17       ` Mark Rutland
2017-11-22 12:17         ` Mark Rutland
2017-11-29  6:38 ` Herbert Xu
2017-11-29  6:38   ` Herbert Xu

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.