Linux-Crypto Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH 0/3] crypto: aegis128 followup
@ 2019-08-02 15:15 Ard Biesheuvel
  2019-08-02 15:15 ` [PATCH resend 1/3] crypto: aegis128 - add support for SIMD acceleration Ard Biesheuvel
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Ard Biesheuvel @ 2019-08-02 15:15 UTC (permalink / raw)
  To: linux-crypto; +Cc: herbert, ebiggers, Ard Biesheuvel

This series resubmits the aegis128 SIMD patches that were reverted due to
the fact that the compiler's optimization behavior wrt variables with static
linkage does not turn out to guarantee that function calls that are
conditional on the value of such a variable are optimized away if the value
is a compile time constant and the condition evaluates to false at compile
time as well.

Patch #1 reintroduces the changes to the generic code to permit SIMD
routines to be attached to the aegis128 driver. This time, the conditional
check is pulled into a helper function which collapses to 'return false'
if the CONFIG_CRYPTO_AEGIS128_SIMD Kconfig symbol is not set. (This has
been confirmed by one of the reporters of the original issue as sufficient
to address the problem).

Patch #2 is mostly unchanged wrt the version that got reverted, only some
inline annotations were added back.

Patch #3 is new and is included as an RFC. It implements the SIMD routines
for arm64 without using the optional AES instructions, but using plain SIMD
arithmetic instead. This is much slower than AES instructions, but still
substantially more efficient than table based scalar AES on systems where
memory accesses are expensive, such as the Raspberry Pi 3 (which does not
implement the AES instructions)

Ard Biesheuvel (3):
  crypto: aegis128 - add support for SIMD acceleration
  crypto: aegis128 - provide a SIMD implementation based on NEON
    intrinsics
  crypto: arm64/aegis128 - implement plain NEON version

 crypto/Kconfig                         |   5 +
 crypto/Makefile                        |  18 ++
 crypto/{aegis128.c => aegis128-core.c} |  52 ++++-
 crypto/aegis128-neon-inner.c           | 204 ++++++++++++++++++++
 crypto/aegis128-neon.c                 |  57 ++++++
 5 files changed, 332 insertions(+), 4 deletions(-)
 rename crypto/{aegis128.c => aegis128-core.c} (89%)
 create mode 100644 crypto/aegis128-neon-inner.c
 create mode 100644 crypto/aegis128-neon.c

-- 
2.17.1


^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH resend 1/3] crypto: aegis128 - add support for SIMD acceleration
  2019-08-02 15:15 [PATCH 0/3] crypto: aegis128 followup Ard Biesheuvel
@ 2019-08-02 15:15 ` Ard Biesheuvel
  2019-08-02 15:15 ` [PATCH resend 2/3] crypto: aegis128 - provide a SIMD implementation based on NEON intrinsics Ard Biesheuvel
  2019-08-02 15:15 ` [PATCH RFC 3/3] crypto: arm64/aegis128 - implement plain NEON version Ard Biesheuvel
  2 siblings, 0 replies; 6+ messages in thread
From: Ard Biesheuvel @ 2019-08-02 15:15 UTC (permalink / raw)
  To: linux-crypto; +Cc: herbert, ebiggers, Ard Biesheuvel

Add some plumbing to allow the AEGIS128 code to be built with SIMD
routines for acceleration.

Reviewed-by: Ondrej Mosnacek <omosnace@redhat.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/Makefile                        |  1 +
 crypto/{aegis128.c => aegis128-core.c} | 52 ++++++++++++++++++--
 2 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/crypto/Makefile b/crypto/Makefile
index cfcc954e59f9..92e985714ff6 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -90,6 +90,7 @@ obj-$(CONFIG_CRYPTO_GCM) += gcm.o
 obj-$(CONFIG_CRYPTO_CCM) += ccm.o
 obj-$(CONFIG_CRYPTO_CHACHA20POLY1305) += chacha20poly1305.o
 obj-$(CONFIG_CRYPTO_AEGIS128) += aegis128.o
+aegis128-y := aegis128-core.o
 obj-$(CONFIG_CRYPTO_PCRYPT) += pcrypt.o
 obj-$(CONFIG_CRYPTO_CRYPTD) += cryptd.o
 obj-$(CONFIG_CRYPTO_DES) += des_generic.o
diff --git a/crypto/aegis128.c b/crypto/aegis128-core.c
similarity index 89%
rename from crypto/aegis128.c
rename to crypto/aegis128-core.c
index 32840d5e7f65..fa69e99968e2 100644
--- a/crypto/aegis128.c
+++ b/crypto/aegis128-core.c
@@ -8,6 +8,7 @@
 
 #include <crypto/algapi.h>
 #include <crypto/internal/aead.h>
+#include <crypto/internal/simd.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/scatterwalk.h>
 #include <linux/err.h>
@@ -16,6 +17,8 @@
 #include <linux/module.h>
 #include <linux/scatterlist.h>
 
+#include <asm/simd.h>
+
 #include "aegis.h"
 
 #define AEGIS128_NONCE_SIZE 16
@@ -40,6 +43,24 @@ struct aegis128_ops {
 			    const u8 *src, unsigned int size);
 };
 
+static bool have_simd;
+
+static bool aegis128_do_simd(void)
+{
+#ifdef CONFIG_CRYPTO_AEGIS128_SIMD
+	if (have_simd)
+		return crypto_simd_usable();
+#endif
+	return false;
+}
+
+bool crypto_aegis128_have_simd(void);
+void crypto_aegis128_update_simd(struct aegis_state *state, const void *msg);
+void crypto_aegis128_encrypt_chunk_simd(struct aegis_state *state, u8 *dst,
+					const u8 *src, unsigned int size);
+void crypto_aegis128_decrypt_chunk_simd(struct aegis_state *state, u8 *dst,
+					const u8 *src, unsigned int size);
+
 static void crypto_aegis128_update(struct aegis_state *state)
 {
 	union aegis_block tmp;
@@ -55,12 +76,22 @@ static void crypto_aegis128_update(struct aegis_state *state)
 static void crypto_aegis128_update_a(struct aegis_state *state,
 				     const union aegis_block *msg)
 {
+	if (aegis128_do_simd()) {
+		crypto_aegis128_update_simd(state, msg);
+		return;
+	}
+
 	crypto_aegis128_update(state);
 	crypto_aegis_block_xor(&state->blocks[0], msg);
 }
 
 static void crypto_aegis128_update_u(struct aegis_state *state, const void *msg)
 {
+	if (aegis128_do_simd()) {
+		crypto_aegis128_update_simd(state, msg);
+		return;
+	}
+
 	crypto_aegis128_update(state);
 	crypto_xor(state->blocks[0].bytes, msg, AEGIS_BLOCK_SIZE);
 }
@@ -365,7 +396,7 @@ static void crypto_aegis128_crypt(struct aead_request *req,
 
 static int crypto_aegis128_encrypt(struct aead_request *req)
 {
-	static const struct aegis128_ops ops = {
+	const struct aegis128_ops *ops = &(struct aegis128_ops){
 		.skcipher_walk_init = skcipher_walk_aead_encrypt,
 		.crypt_chunk = crypto_aegis128_encrypt_chunk,
 	};
@@ -375,7 +406,12 @@ static int crypto_aegis128_encrypt(struct aead_request *req)
 	unsigned int authsize = crypto_aead_authsize(tfm);
 	unsigned int cryptlen = req->cryptlen;
 
-	crypto_aegis128_crypt(req, &tag, cryptlen, &ops);
+	if (aegis128_do_simd())
+		ops = &(struct aegis128_ops){
+			.skcipher_walk_init = skcipher_walk_aead_encrypt,
+			.crypt_chunk = crypto_aegis128_encrypt_chunk_simd };
+
+	crypto_aegis128_crypt(req, &tag, cryptlen, ops);
 
 	scatterwalk_map_and_copy(tag.bytes, req->dst, req->assoclen + cryptlen,
 				 authsize, 1);
@@ -384,7 +420,7 @@ static int crypto_aegis128_encrypt(struct aead_request *req)
 
 static int crypto_aegis128_decrypt(struct aead_request *req)
 {
-	static const struct aegis128_ops ops = {
+	const struct aegis128_ops *ops = &(struct aegis128_ops){
 		.skcipher_walk_init = skcipher_walk_aead_decrypt,
 		.crypt_chunk = crypto_aegis128_decrypt_chunk,
 	};
@@ -398,7 +434,12 @@ static int crypto_aegis128_decrypt(struct aead_request *req)
 	scatterwalk_map_and_copy(tag.bytes, req->src, req->assoclen + cryptlen,
 				 authsize, 0);
 
-	crypto_aegis128_crypt(req, &tag, cryptlen, &ops);
+	if (aegis128_do_simd())
+		ops = &(struct aegis128_ops){
+			.skcipher_walk_init = skcipher_walk_aead_decrypt,
+			.crypt_chunk = crypto_aegis128_decrypt_chunk_simd };
+
+	crypto_aegis128_crypt(req, &tag, cryptlen, ops);
 
 	return crypto_memneq(tag.bytes, zeros, authsize) ? -EBADMSG : 0;
 }
@@ -429,6 +470,9 @@ static struct aead_alg crypto_aegis128_alg = {
 
 static int __init crypto_aegis128_module_init(void)
 {
+	if (IS_ENABLED(CONFIG_CRYPTO_AEGIS128_SIMD))
+		have_simd = crypto_aegis128_have_simd();
+
 	return crypto_register_aead(&crypto_aegis128_alg);
 }
 
-- 
2.17.1


^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH resend 2/3] crypto: aegis128 - provide a SIMD implementation based on NEON intrinsics
  2019-08-02 15:15 [PATCH 0/3] crypto: aegis128 followup Ard Biesheuvel
  2019-08-02 15:15 ` [PATCH resend 1/3] crypto: aegis128 - add support for SIMD acceleration Ard Biesheuvel
@ 2019-08-02 15:15 ` Ard Biesheuvel
  2019-08-02 15:15 ` [PATCH RFC 3/3] crypto: arm64/aegis128 - implement plain NEON version Ard Biesheuvel
  2 siblings, 0 replies; 6+ messages in thread
From: Ard Biesheuvel @ 2019-08-02 15:15 UTC (permalink / raw)
  To: linux-crypto; +Cc: herbert, ebiggers, Ard Biesheuvel

Provide an accelerated implementation of aegis128 by wiring up the
SIMD hooks in the generic driver to an implementation based on NEON
intrinsics, which can be compiled to both ARM and arm64 code.

This results in a performance of 2.2 cycles per byte on Cortex-A53,
which is a performance increase of ~11x compared to the generic
code.

Reviewed-by: Ondrej Mosnacek <omosnace@redhat.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/Kconfig               |   5 +
 crypto/Makefile              |  12 ++
 crypto/aegis128-neon-inner.c | 151 ++++++++++++++++++++
 crypto/aegis128-neon.c       |  43 ++++++
 4 files changed, 211 insertions(+)

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 8880c1fc51d8..455a3354e291 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -306,6 +306,11 @@ config CRYPTO_AEGIS128
 	help
 	 Support for the AEGIS-128 dedicated AEAD algorithm.
 
+config CRYPTO_AEGIS128_SIMD
+	bool "Support SIMD acceleration for AEGIS-128"
+	depends on CRYPTO_AEGIS128 && ((ARM || ARM64) && KERNEL_MODE_NEON)
+	default y
+
 config CRYPTO_AEGIS128_AESNI_SSE2
 	tristate "AEGIS-128 AEAD algorithm (x86_64 AESNI+SSE2 implementation)"
 	depends on X86 && 64BIT
diff --git a/crypto/Makefile b/crypto/Makefile
index 92e985714ff6..99a9fa9087d1 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -91,6 +91,18 @@ obj-$(CONFIG_CRYPTO_CCM) += ccm.o
 obj-$(CONFIG_CRYPTO_CHACHA20POLY1305) += chacha20poly1305.o
 obj-$(CONFIG_CRYPTO_AEGIS128) += aegis128.o
 aegis128-y := aegis128-core.o
+
+ifeq ($(ARCH),arm)
+CFLAGS_aegis128-neon-inner.o += -ffreestanding -march=armv7-a -mfloat-abi=softfp
+CFLAGS_aegis128-neon-inner.o += -mfpu=crypto-neon-fp-armv8
+aegis128-$(CONFIG_CRYPTO_AEGIS128_SIMD) += aegis128-neon.o aegis128-neon-inner.o
+endif
+ifeq ($(ARCH),arm64)
+CFLAGS_aegis128-neon-inner.o += -ffreestanding -mcpu=generic+crypto
+CFLAGS_REMOVE_aegis128-neon-inner.o += -mgeneral-regs-only
+aegis128-$(CONFIG_CRYPTO_AEGIS128_SIMD) += aegis128-neon.o aegis128-neon-inner.o
+endif
+
 obj-$(CONFIG_CRYPTO_PCRYPT) += pcrypt.o
 obj-$(CONFIG_CRYPTO_CRYPTD) += cryptd.o
 obj-$(CONFIG_CRYPTO_DES) += des_generic.o
diff --git a/crypto/aegis128-neon-inner.c b/crypto/aegis128-neon-inner.c
new file mode 100644
index 000000000000..6aca2f425b6d
--- /dev/null
+++ b/crypto/aegis128-neon-inner.c
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#ifdef CONFIG_ARM64
+#include <asm/neon-intrinsics.h>
+
+#define AES_ROUND	"aese %0.16b, %1.16b \n\t aesmc %0.16b, %0.16b"
+#else
+#include <arm_neon.h>
+
+#define AES_ROUND	"aese.8 %q0, %q1 \n\t aesmc.8 %q0, %q0"
+#endif
+
+#define AEGIS_BLOCK_SIZE	16
+
+#include <stddef.h>
+
+void *memcpy(void *dest, const void *src, size_t n);
+void *memset(void *s, int c, size_t n);
+
+struct aegis128_state {
+	uint8x16_t v[5];
+};
+
+static struct aegis128_state aegis128_load_state_neon(const void *state)
+{
+	return (struct aegis128_state){ {
+		vld1q_u8(state),
+		vld1q_u8(state + 16),
+		vld1q_u8(state + 32),
+		vld1q_u8(state + 48),
+		vld1q_u8(state + 64)
+	} };
+}
+
+static void aegis128_save_state_neon(struct aegis128_state st, void *state)
+{
+	vst1q_u8(state, st.v[0]);
+	vst1q_u8(state + 16, st.v[1]);
+	vst1q_u8(state + 32, st.v[2]);
+	vst1q_u8(state + 48, st.v[3]);
+	vst1q_u8(state + 64, st.v[4]);
+}
+
+static inline __attribute__((always_inline))
+uint8x16_t aegis_aes_round(uint8x16_t w)
+{
+	uint8x16_t z = {};
+
+	/*
+	 * We use inline asm here instead of the vaeseq_u8/vaesmcq_u8 intrinsics
+	 * to force the compiler to issue the aese/aesmc instructions in pairs.
+	 * This is much faster on many cores, where the instruction pair can
+	 * execute in a single cycle.
+	 */
+	asm(AES_ROUND : "+w"(w) : "w"(z));
+	return w;
+}
+
+static inline __attribute__((always_inline))
+struct aegis128_state aegis128_update_neon(struct aegis128_state st,
+					   uint8x16_t m)
+{
+	uint8x16_t t;
+
+	t        = aegis_aes_round(st.v[3]);
+	st.v[3] ^= aegis_aes_round(st.v[2]);
+	st.v[2] ^= aegis_aes_round(st.v[1]);
+	st.v[1] ^= aegis_aes_round(st.v[0]);
+	st.v[0] ^= aegis_aes_round(st.v[4]) ^ m;
+	st.v[4] ^= t;
+
+	return st;
+}
+
+void crypto_aegis128_update_neon(void *state, const void *msg)
+{
+	struct aegis128_state st = aegis128_load_state_neon(state);
+
+	st = aegis128_update_neon(st, vld1q_u8(msg));
+
+	aegis128_save_state_neon(st, state);
+}
+
+void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src,
+					unsigned int size)
+{
+	struct aegis128_state st = aegis128_load_state_neon(state);
+	uint8x16_t tmp;
+
+	while (size >= AEGIS_BLOCK_SIZE) {
+		uint8x16_t s = vld1q_u8(src);
+
+		tmp = s ^ st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
+		st = aegis128_update_neon(st, s);
+		vst1q_u8(dst, tmp);
+
+		size -= AEGIS_BLOCK_SIZE;
+		src += AEGIS_BLOCK_SIZE;
+		dst += AEGIS_BLOCK_SIZE;
+	}
+
+	if (size > 0) {
+		uint8_t buf[AEGIS_BLOCK_SIZE] = {};
+		uint8x16_t msg;
+
+		memcpy(buf, src, size);
+		msg = vld1q_u8(buf);
+		tmp = msg ^ st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
+		st = aegis128_update_neon(st, msg);
+		vst1q_u8(buf, tmp);
+		memcpy(dst, buf, size);
+	}
+
+	aegis128_save_state_neon(st, state);
+}
+
+void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src,
+					unsigned int size)
+{
+	struct aegis128_state st = aegis128_load_state_neon(state);
+	uint8x16_t tmp;
+
+	while (size >= AEGIS_BLOCK_SIZE) {
+		tmp = vld1q_u8(src) ^ st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
+		st = aegis128_update_neon(st, tmp);
+		vst1q_u8(dst, tmp);
+
+		size -= AEGIS_BLOCK_SIZE;
+		src += AEGIS_BLOCK_SIZE;
+		dst += AEGIS_BLOCK_SIZE;
+	}
+
+	if (size > 0) {
+		uint8_t buf[AEGIS_BLOCK_SIZE] = {};
+		uint8x16_t msg;
+
+		memcpy(buf, src, size);
+		msg = vld1q_u8(buf) ^ st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
+		vst1q_u8(buf, msg);
+		memcpy(dst, buf, size);
+
+		memset(buf + size, 0, AEGIS_BLOCK_SIZE - size);
+		msg = vld1q_u8(buf);
+		st = aegis128_update_neon(st, msg);
+	}
+
+	aegis128_save_state_neon(st, state);
+}
diff --git a/crypto/aegis128-neon.c b/crypto/aegis128-neon.c
new file mode 100644
index 000000000000..c1c0a1686f67
--- /dev/null
+++ b/crypto/aegis128-neon.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2019 Linaro Ltd <ard.biesheuvel@linaro.org>
+ */
+
+#include <asm/cpufeature.h>
+#include <asm/neon.h>
+
+#include "aegis.h"
+
+void crypto_aegis128_update_neon(void *state, const void *msg);
+void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src,
+					unsigned int size);
+void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src,
+					unsigned int size);
+
+bool crypto_aegis128_have_simd(void)
+{
+	return cpu_have_feature(cpu_feature(AES));
+}
+
+void crypto_aegis128_update_simd(union aegis_block *state, const void *msg)
+{
+	kernel_neon_begin();
+	crypto_aegis128_update_neon(state, msg);
+	kernel_neon_end();
+}
+
+void crypto_aegis128_encrypt_chunk_simd(union aegis_block *state, u8 *dst,
+					const u8 *src, unsigned int size)
+{
+	kernel_neon_begin();
+	crypto_aegis128_encrypt_chunk_neon(state, dst, src, size);
+	kernel_neon_end();
+}
+
+void crypto_aegis128_decrypt_chunk_simd(union aegis_block *state, u8 *dst,
+					const u8 *src, unsigned int size)
+{
+	kernel_neon_begin();
+	crypto_aegis128_decrypt_chunk_neon(state, dst, src, size);
+	kernel_neon_end();
+}
-- 
2.17.1


^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH RFC 3/3] crypto: arm64/aegis128 - implement plain NEON version
  2019-08-02 15:15 [PATCH 0/3] crypto: aegis128 followup Ard Biesheuvel
  2019-08-02 15:15 ` [PATCH resend 1/3] crypto: aegis128 - add support for SIMD acceleration Ard Biesheuvel
  2019-08-02 15:15 ` [PATCH resend 2/3] crypto: aegis128 - provide a SIMD implementation based on NEON intrinsics Ard Biesheuvel
@ 2019-08-02 15:15 ` Ard Biesheuvel
  2019-08-08 22:31   ` Nick Desaulniers
  2 siblings, 1 reply; 6+ messages in thread
From: Ard Biesheuvel @ 2019-08-02 15:15 UTC (permalink / raw)
  To: linux-crypto; +Cc: herbert, ebiggers, Ard Biesheuvel, Nick Desaulniers

Provide a version of the core AES transform to the aegis128 SIMD
code that does not rely on the special AES instructions, but uses
plain NEON instructions instead. This allows the SIMD version of
the aegis128 driver to be used on arm64 systems that do not
implement those instructions (which are not mandatory in the
architecture), such as the Raspberry Pi 3.

Cc: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 crypto/Makefile              |  5 ++
 crypto/aegis128-neon-inner.c | 53 ++++++++++++++++++++
 crypto/aegis128-neon.c       | 16 +++++-
 3 files changed, 73 insertions(+), 1 deletion(-)

diff --git a/crypto/Makefile b/crypto/Makefile
index 99a9fa9087d1..c3760c7616ac 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -99,6 +99,11 @@ aegis128-$(CONFIG_CRYPTO_AEGIS128_SIMD) += aegis128-neon.o aegis128-neon-inner.o
 endif
 ifeq ($(ARCH),arm64)
 CFLAGS_aegis128-neon-inner.o += -ffreestanding -mcpu=generic+crypto
+CFLAGS_aegis128-neon-inner.o += -ffixed-q14 -ffixed-q15
+CFLAGS_aegis128-neon-inner.o += -ffixed-q16 -ffixed-q17 -ffixed-q18 -ffixed-q19
+CFLAGS_aegis128-neon-inner.o += -ffixed-q20 -ffixed-q21 -ffixed-q22 -ffixed-q23
+CFLAGS_aegis128-neon-inner.o += -ffixed-q24 -ffixed-q25 -ffixed-q26 -ffixed-q27
+CFLAGS_aegis128-neon-inner.o += -ffixed-q28 -ffixed-q29 -ffixed-q30 -ffixed-q31
 CFLAGS_REMOVE_aegis128-neon-inner.o += -mgeneral-regs-only
 aegis128-$(CONFIG_CRYPTO_AEGIS128_SIMD) += aegis128-neon.o aegis128-neon-inner.o
 endif
diff --git a/crypto/aegis128-neon-inner.c b/crypto/aegis128-neon-inner.c
index 6aca2f425b6d..7aa4cef3c2de 100644
--- a/crypto/aegis128-neon-inner.c
+++ b/crypto/aegis128-neon-inner.c
@@ -17,6 +17,8 @@
 
 #include <stddef.h>
 
+extern int aegis128_have_aes_insn;
+
 void *memcpy(void *dest, const void *src, size_t n);
 void *memset(void *s, int c, size_t n);
 
@@ -49,6 +51,32 @@ uint8x16_t aegis_aes_round(uint8x16_t w)
 {
 	uint8x16_t z = {};
 
+#ifdef CONFIG_ARM64
+	if (!__builtin_expect(aegis128_have_aes_insn, 1)) {
+		uint8x16_t v;
+
+		// shift rows
+		asm("tbl %0.16b, {%0.16b}, v14.16b" : "+w"(w));
+
+		// sub bytes
+		asm("tbl %0.16b, {v16.16b-v19.16b}, %1.16b" : "=w"(v) : "w"(w));
+		w -= 0x40;
+		asm("tbx %0.16b, {v20.16b-v23.16b}, %1.16b" : "+w"(v) : "w"(w));
+		w -= 0x40;
+		asm("tbx %0.16b, {v24.16b-v27.16b}, %1.16b" : "+w"(v) : "w"(w));
+		w -= 0x40;
+		asm("tbx %0.16b, {v28.16b-v31.16b}, %1.16b" : "+w"(v) : "w"(w));
+
+		// mix columns
+		w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b);
+		w ^= (uint8x16_t)vrev32q_u16((uint16x8_t)v);
+		asm("tbl %0.16b, {%1.16b}, v15.16b" : "=w"(v) : "w"(v ^ w));
+		w ^= v;
+
+		return w;
+	}
+#endif
+
 	/*
 	 * We use inline asm here instead of the vaeseq_u8/vaesmcq_u8 intrinsics
 	 * to force the compiler to issue the aese/aesmc instructions in pairs.
@@ -149,3 +177,28 @@ void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src,
 
 	aegis128_save_state_neon(st, state);
 }
+
+#ifdef CONFIG_ARM64
+void crypto_aegis128_init_neon(void)
+{
+	u64 tmp;
+
+	asm volatile(
+	    "adrp		%0, crypto_aes_sbox		\n\t"
+	    "add		%0, %0, :lo12:crypto_aes_sbox	\n\t"
+	    "mov		v14.16b, %1.16b			\n\t"
+	    "mov		v15.16b, %2.16b			\n\t"
+	    "ld1		{v16.16b-v19.16b}, [%0], #64	\n\t"
+	    "ld1		{v20.16b-v23.16b}, [%0], #64	\n\t"
+	    "ld1		{v24.16b-v27.16b}, [%0], #64	\n\t"
+	    "ld1		{v28.16b-v31.16b}, [%0]		\n\t"
+	    : "=&r"(tmp)
+	    : "w"((uint8x16_t){ // shift rows permutation vector
+			0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
+			0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb, }),
+	      "w"((uint8x16_t){ // ror32 permutation vector
+			0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+			0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,	})
+	);
+}
+#endif
diff --git a/crypto/aegis128-neon.c b/crypto/aegis128-neon.c
index c1c0a1686f67..72f9d48e4963 100644
--- a/crypto/aegis128-neon.c
+++ b/crypto/aegis128-neon.c
@@ -14,14 +14,24 @@ void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src,
 void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src,
 					unsigned int size);
 
+void crypto_aegis128_init_neon(void);
+
+int aegis128_have_aes_insn __ro_after_init;
+
 bool crypto_aegis128_have_simd(void)
 {
-	return cpu_have_feature(cpu_feature(AES));
+	if (cpu_have_feature(cpu_feature(AES))) {
+		aegis128_have_aes_insn = 1;
+		return true;
+	}
+	return IS_ENABLED(CONFIG_ARM64);
 }
 
 void crypto_aegis128_update_simd(union aegis_block *state, const void *msg)
 {
 	kernel_neon_begin();
+	if (IS_ENABLED(CONFIG_ARM64) && !aegis128_have_aes_insn)
+		crypto_aegis128_init_neon();
 	crypto_aegis128_update_neon(state, msg);
 	kernel_neon_end();
 }
@@ -30,6 +40,8 @@ void crypto_aegis128_encrypt_chunk_simd(union aegis_block *state, u8 *dst,
 					const u8 *src, unsigned int size)
 {
 	kernel_neon_begin();
+	if (IS_ENABLED(CONFIG_ARM64) && !aegis128_have_aes_insn)
+		crypto_aegis128_init_neon();
 	crypto_aegis128_encrypt_chunk_neon(state, dst, src, size);
 	kernel_neon_end();
 }
@@ -38,6 +50,8 @@ void crypto_aegis128_decrypt_chunk_simd(union aegis_block *state, u8 *dst,
 					const u8 *src, unsigned int size)
 {
 	kernel_neon_begin();
+	if (IS_ENABLED(CONFIG_ARM64) && !aegis128_have_aes_insn)
+		crypto_aegis128_init_neon();
 	crypto_aegis128_decrypt_chunk_neon(state, dst, src, size);
 	kernel_neon_end();
 }
-- 
2.17.1


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH RFC 3/3] crypto: arm64/aegis128 - implement plain NEON version
  2019-08-02 15:15 ` [PATCH RFC 3/3] crypto: arm64/aegis128 - implement plain NEON version Ard Biesheuvel
@ 2019-08-08 22:31   ` Nick Desaulniers
  2019-08-09 17:20     ` Ard Biesheuvel
  0 siblings, 1 reply; 6+ messages in thread
From: Nick Desaulniers @ 2019-08-08 22:31 UTC (permalink / raw)
  To: Ard Biesheuvel; +Cc: linux-crypto, Herbert Xu, Eric Biggers, Tri Vo, Petr Hosek

On Fri, Aug 2, 2019 at 8:15 AM Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
>
> Provide a version of the core AES transform to the aegis128 SIMD
> code that does not rely on the special AES instructions, but uses
> plain NEON instructions instead. This allows the SIMD version of
> the aegis128 driver to be used on arm64 systems that do not
> implement those instructions (which are not mandatory in the
> architecture), such as the Raspberry Pi 3.
>
> Cc: Nick Desaulniers <ndesaulniers@google.com>

Thanks for the heads up, thoughts below:

> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> ---
>  crypto/Makefile              |  5 ++
>  crypto/aegis128-neon-inner.c | 53 ++++++++++++++++++++
>  crypto/aegis128-neon.c       | 16 +++++-
>  3 files changed, 73 insertions(+), 1 deletion(-)
>
> diff --git a/crypto/Makefile b/crypto/Makefile
> index 99a9fa9087d1..c3760c7616ac 100644
> --- a/crypto/Makefile
> +++ b/crypto/Makefile
> @@ -99,6 +99,11 @@ aegis128-$(CONFIG_CRYPTO_AEGIS128_SIMD) += aegis128-neon.o aegis128-neon-inner.o
>  endif
>  ifeq ($(ARCH),arm64)
>  CFLAGS_aegis128-neon-inner.o += -ffreestanding -mcpu=generic+crypto
> +CFLAGS_aegis128-neon-inner.o += -ffixed-q14 -ffixed-q15
> +CFLAGS_aegis128-neon-inner.o += -ffixed-q16 -ffixed-q17 -ffixed-q18 -ffixed-q19
> +CFLAGS_aegis128-neon-inner.o += -ffixed-q20 -ffixed-q21 -ffixed-q22 -ffixed-q23
> +CFLAGS_aegis128-neon-inner.o += -ffixed-q24 -ffixed-q25 -ffixed-q26 -ffixed-q27
> +CFLAGS_aegis128-neon-inner.o += -ffixed-q28 -ffixed-q29 -ffixed-q30 -ffixed-q31

So Tri implemented support for -ffixed-x*, but Clang currently lacks
support for -ffixed-q*.  Petr recently made this slightly more
generic:
https://reviews.llvm.org/D56305
but Clang still doesn't allow specifying any register number + width
for each supported arch.  The arm64 support for x registers was
manually added.

I'm guessing that for arm64 that if:
* w* is 32b registers
* x* is 64b registers
then:
* q* is 128b NEON registers?

I'm curious as to why we need to reserve these registers when calling
functions in this TU?  I assume this has to do with the calling
convention for uint8x16_t?  Can you point me to documentation about
that (that way I can reference in any patch to Clang/LLVM)?

>  CFLAGS_REMOVE_aegis128-neon-inner.o += -mgeneral-regs-only
>  aegis128-$(CONFIG_CRYPTO_AEGIS128_SIMD) += aegis128-neon.o aegis128-neon-inner.o
>  endif
> diff --git a/crypto/aegis128-neon-inner.c b/crypto/aegis128-neon-inner.c
> index 6aca2f425b6d..7aa4cef3c2de 100644
> --- a/crypto/aegis128-neon-inner.c
> +++ b/crypto/aegis128-neon-inner.c
> @@ -17,6 +17,8 @@
>
>  #include <stddef.h>
>
> +extern int aegis128_have_aes_insn;
> +
>  void *memcpy(void *dest, const void *src, size_t n);
>  void *memset(void *s, int c, size_t n);
>
> @@ -49,6 +51,32 @@ uint8x16_t aegis_aes_round(uint8x16_t w)
>  {
>         uint8x16_t z = {};
>
> +#ifdef CONFIG_ARM64
> +       if (!__builtin_expect(aegis128_have_aes_insn, 1)) {

Can we use a likely/unlikely here?  It always takes me a minute to decode these.

> +               uint8x16_t v;
> +
> +               // shift rows
> +               asm("tbl %0.16b, {%0.16b}, v14.16b" : "+w"(w));
> +
> +               // sub bytes
> +               asm("tbl %0.16b, {v16.16b-v19.16b}, %1.16b" : "=w"(v) : "w"(w));
> +               w -= 0x40;
> +               asm("tbx %0.16b, {v20.16b-v23.16b}, %1.16b" : "+w"(v) : "w"(w));
> +               w -= 0x40;
> +               asm("tbx %0.16b, {v24.16b-v27.16b}, %1.16b" : "+w"(v) : "w"(w));
> +               w -= 0x40;
> +               asm("tbx %0.16b, {v28.16b-v31.16b}, %1.16b" : "+w"(v) : "w"(w));
> +
> +               // mix columns
> +               w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b);

What does it mean to right shift a int8x16_t?  Is that elementwise
right shift or do the bits shift from one element to another?

> +               w ^= (uint8x16_t)vrev32q_u16((uint16x8_t)v);
> +               asm("tbl %0.16b, {%1.16b}, v15.16b" : "=w"(v) : "w"(v ^ w));
> +               w ^= v;
> +
> +               return w;
> +       }
> +#endif
> +
>         /*
>          * We use inline asm here instead of the vaeseq_u8/vaesmcq_u8 intrinsics
>          * to force the compiler to issue the aese/aesmc instructions in pairs.
> @@ -149,3 +177,28 @@ void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src,
>
>         aegis128_save_state_neon(st, state);
>  }
> +
> +#ifdef CONFIG_ARM64
> +void crypto_aegis128_init_neon(void)
> +{
> +       u64 tmp;
> +
> +       asm volatile(
> +           "adrp               %0, crypto_aes_sbox             \n\t"
> +           "add                %0, %0, :lo12:crypto_aes_sbox   \n\t"
> +           "mov                v14.16b, %1.16b                 \n\t"
> +           "mov                v15.16b, %2.16b                 \n\t"
> +           "ld1                {v16.16b-v19.16b}, [%0], #64    \n\t"
> +           "ld1                {v20.16b-v23.16b}, [%0], #64    \n\t"
> +           "ld1                {v24.16b-v27.16b}, [%0], #64    \n\t"
> +           "ld1                {v28.16b-v31.16b}, [%0]         \n\t"
> +           : "=&r"(tmp)
> +           : "w"((uint8x16_t){ // shift rows permutation vector
> +                       0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
> +                       0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb, }),
> +             "w"((uint8x16_t){ // ror32 permutation vector
> +                       0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
> +                       0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc, })
> +       );
> +}
> +#endif
> diff --git a/crypto/aegis128-neon.c b/crypto/aegis128-neon.c
> index c1c0a1686f67..72f9d48e4963 100644
> --- a/crypto/aegis128-neon.c
> +++ b/crypto/aegis128-neon.c
> @@ -14,14 +14,24 @@ void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src,
>  void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src,
>                                         unsigned int size);
>
> +void crypto_aegis128_init_neon(void);
> +
> +int aegis128_have_aes_insn __ro_after_init;
> +
>  bool crypto_aegis128_have_simd(void)
>  {
> -       return cpu_have_feature(cpu_feature(AES));
> +       if (cpu_have_feature(cpu_feature(AES))) {
> +               aegis128_have_aes_insn = 1;
> +               return true;

This could just fall through right? (if you removed the return
statement, I assume IS_ENABLED doesn't have runtime overhead but is
just a preprocessor check?)

> +       }
> +       return IS_ENABLED(CONFIG_ARM64);
>  }
>
>  void crypto_aegis128_update_simd(union aegis_block *state, const void *msg)
>  {
>         kernel_neon_begin();
> +       if (IS_ENABLED(CONFIG_ARM64) && !aegis128_have_aes_insn)
> +               crypto_aegis128_init_neon();
>         crypto_aegis128_update_neon(state, msg);
>         kernel_neon_end();
>  }
> @@ -30,6 +40,8 @@ void crypto_aegis128_encrypt_chunk_simd(union aegis_block *state, u8 *dst,
>                                         const u8 *src, unsigned int size)
>  {
>         kernel_neon_begin();
> +       if (IS_ENABLED(CONFIG_ARM64) && !aegis128_have_aes_insn)
> +               crypto_aegis128_init_neon();
>         crypto_aegis128_encrypt_chunk_neon(state, dst, src, size);
>         kernel_neon_end();
>  }
> @@ -38,6 +50,8 @@ void crypto_aegis128_decrypt_chunk_simd(union aegis_block *state, u8 *dst,
>                                         const u8 *src, unsigned int size)
>  {
>         kernel_neon_begin();
> +       if (IS_ENABLED(CONFIG_ARM64) && !aegis128_have_aes_insn)
> +               crypto_aegis128_init_neon();
>         crypto_aegis128_decrypt_chunk_neon(state, dst, src, size);
>         kernel_neon_end();
>  }
> --
> 2.17.1
>


-- 
Thanks,
~Nick Desaulniers

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH RFC 3/3] crypto: arm64/aegis128 - implement plain NEON version
  2019-08-08 22:31   ` Nick Desaulniers
@ 2019-08-09 17:20     ` Ard Biesheuvel
  0 siblings, 0 replies; 6+ messages in thread
From: Ard Biesheuvel @ 2019-08-09 17:20 UTC (permalink / raw)
  To: Nick Desaulniers
  Cc: open list:HARDWARE RANDOM NUMBER GENERATOR CORE, Herbert Xu,
	Eric Biggers, Tri Vo, Petr Hosek

On Fri, 9 Aug 2019 at 01:31, Nick Desaulniers <ndesaulniers@google.com> wrote:
>
> On Fri, Aug 2, 2019 at 8:15 AM Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> >
> > Provide a version of the core AES transform to the aegis128 SIMD
> > code that does not rely on the special AES instructions, but uses
> > plain NEON instructions instead. This allows the SIMD version of
> > the aegis128 driver to be used on arm64 systems that do not
> > implement those instructions (which are not mandatory in the
> > architecture), such as the Raspberry Pi 3.
> >
> > Cc: Nick Desaulniers <ndesaulniers@google.com>
>
> Thanks for the heads up, thoughts below:
>
> > Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> > ---
> >  crypto/Makefile              |  5 ++
> >  crypto/aegis128-neon-inner.c | 53 ++++++++++++++++++++
> >  crypto/aegis128-neon.c       | 16 +++++-
> >  3 files changed, 73 insertions(+), 1 deletion(-)
> >
> > diff --git a/crypto/Makefile b/crypto/Makefile
> > index 99a9fa9087d1..c3760c7616ac 100644
> > --- a/crypto/Makefile
> > +++ b/crypto/Makefile
> > @@ -99,6 +99,11 @@ aegis128-$(CONFIG_CRYPTO_AEGIS128_SIMD) += aegis128-neon.o aegis128-neon-inner.o
> >  endif
> >  ifeq ($(ARCH),arm64)
> >  CFLAGS_aegis128-neon-inner.o += -ffreestanding -mcpu=generic+crypto
> > +CFLAGS_aegis128-neon-inner.o += -ffixed-q14 -ffixed-q15
> > +CFLAGS_aegis128-neon-inner.o += -ffixed-q16 -ffixed-q17 -ffixed-q18 -ffixed-q19
> > +CFLAGS_aegis128-neon-inner.o += -ffixed-q20 -ffixed-q21 -ffixed-q22 -ffixed-q23
> > +CFLAGS_aegis128-neon-inner.o += -ffixed-q24 -ffixed-q25 -ffixed-q26 -ffixed-q27
> > +CFLAGS_aegis128-neon-inner.o += -ffixed-q28 -ffixed-q29 -ffixed-q30 -ffixed-q31
>
> So Tri implemented support for -ffixed-x*, but Clang currently lacks
> support for -ffixed-q*.  Petr recently made this slightly more
> generic:
> https://reviews.llvm.org/D56305
> but Clang still doesn't allow specifying any register number + width
> for each supported arch.  The arm64 support for x registers was
> manually added.
>
> I'm guessing that for arm64 that if:
> * w* is 32b registers
> * x* is 64b registers
> then:
> * q* is 128b NEON registers?
>
> I'm curious as to why we need to reserve these registers when calling
> functions in this TU?  I assume this has to do with the calling
> convention for uint8x16_t?  Can you point me to documentation about
> that (that way I can reference in any patch to Clang/LLVM)?
>

This is to allow the AES sbox to remain loaded in registers.
Otherwise, the compiler will reload all 256 bytes into 16 NEON
registers for every AES block processed, which is suboptimal.

> >  CFLAGS_REMOVE_aegis128-neon-inner.o += -mgeneral-regs-only
> >  aegis128-$(CONFIG_CRYPTO_AEGIS128_SIMD) += aegis128-neon.o aegis128-neon-inner.o
> >  endif
> > diff --git a/crypto/aegis128-neon-inner.c b/crypto/aegis128-neon-inner.c
> > index 6aca2f425b6d..7aa4cef3c2de 100644
> > --- a/crypto/aegis128-neon-inner.c
> > +++ b/crypto/aegis128-neon-inner.c
> > @@ -17,6 +17,8 @@
> >
> >  #include <stddef.h>
> >
> > +extern int aegis128_have_aes_insn;
> > +
> >  void *memcpy(void *dest, const void *src, size_t n);
> >  void *memset(void *s, int c, size_t n);
> >
> > @@ -49,6 +51,32 @@ uint8x16_t aegis_aes_round(uint8x16_t w)
> >  {
> >         uint8x16_t z = {};
> >
> > +#ifdef CONFIG_ARM64
> > +       if (!__builtin_expect(aegis128_have_aes_insn, 1)) {
>
> Can we use a likely/unlikely here?  It always takes me a minute to decode these.
>

I am avoiding ordinary kernel headers in this TU, so not really.

> > +               uint8x16_t v;
> > +
> > +               // shift rows
> > +               asm("tbl %0.16b, {%0.16b}, v14.16b" : "+w"(w));
> > +
> > +               // sub bytes
> > +               asm("tbl %0.16b, {v16.16b-v19.16b}, %1.16b" : "=w"(v) : "w"(w));
> > +               w -= 0x40;
> > +               asm("tbx %0.16b, {v20.16b-v23.16b}, %1.16b" : "+w"(v) : "w"(w));
> > +               w -= 0x40;
> > +               asm("tbx %0.16b, {v24.16b-v27.16b}, %1.16b" : "+w"(v) : "w"(w));
> > +               w -= 0x40;
> > +               asm("tbx %0.16b, {v28.16b-v31.16b}, %1.16b" : "+w"(v) : "w"(w));
> > +
> > +               // mix columns
> > +               w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b);
>
> What does it mean to right shift a int8x16_t?  Is that elementwise
> right shift or do the bits shift from one element to another?
>

Element wise. This applies to all uint8x16_t arithmetic, which is kind
of the point.

> > +               w ^= (uint8x16_t)vrev32q_u16((uint16x8_t)v);
> > +               asm("tbl %0.16b, {%1.16b}, v15.16b" : "=w"(v) : "w"(v ^ w));
> > +               w ^= v;
> > +
> > +               return w;
> > +       }
> > +#endif
> > +
> >         /*
> >          * We use inline asm here instead of the vaeseq_u8/vaesmcq_u8 intrinsics
> >          * to force the compiler to issue the aese/aesmc instructions in pairs.
> > @@ -149,3 +177,28 @@ void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src,
> >
> >         aegis128_save_state_neon(st, state);
> >  }
> > +
> > +#ifdef CONFIG_ARM64
> > +void crypto_aegis128_init_neon(void)
> > +{
> > +       u64 tmp;
> > +
> > +       asm volatile(
> > +           "adrp               %0, crypto_aes_sbox             \n\t"
> > +           "add                %0, %0, :lo12:crypto_aes_sbox   \n\t"
> > +           "mov                v14.16b, %1.16b                 \n\t"
> > +           "mov                v15.16b, %2.16b                 \n\t"
> > +           "ld1                {v16.16b-v19.16b}, [%0], #64    \n\t"
> > +           "ld1                {v20.16b-v23.16b}, [%0], #64    \n\t"
> > +           "ld1                {v24.16b-v27.16b}, [%0], #64    \n\t"
> > +           "ld1                {v28.16b-v31.16b}, [%0]         \n\t"
> > +           : "=&r"(tmp)
> > +           : "w"((uint8x16_t){ // shift rows permutation vector
> > +                       0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
> > +                       0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb, }),
> > +             "w"((uint8x16_t){ // ror32 permutation vector
> > +                       0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
> > +                       0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc, })
> > +       );
> > +}
> > +#endif
> > diff --git a/crypto/aegis128-neon.c b/crypto/aegis128-neon.c
> > index c1c0a1686f67..72f9d48e4963 100644
> > --- a/crypto/aegis128-neon.c
> > +++ b/crypto/aegis128-neon.c
> > @@ -14,14 +14,24 @@ void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src,
> >  void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src,
> >                                         unsigned int size);
> >
> > +void crypto_aegis128_init_neon(void);
> > +
> > +int aegis128_have_aes_insn __ro_after_init;
> > +
> >  bool crypto_aegis128_have_simd(void)
> >  {
> > -       return cpu_have_feature(cpu_feature(AES));
> > +       if (cpu_have_feature(cpu_feature(AES))) {
> > +               aegis128_have_aes_insn = 1;
> > +               return true;
>
> This could just fall through right? (if you removed the return
> statement, I assume IS_ENABLED doesn't have runtime overhead but is
> just a preprocessor check?)
>

No. This code can be built for 32-bit ARM as well, in which case we
can only use the AES instructions.

> > +       }
> > +       return IS_ENABLED(CONFIG_ARM64);
> >  }
> >
> >  void crypto_aegis128_update_simd(union aegis_block *state, const void *msg)
> >  {
> >         kernel_neon_begin();
> > +       if (IS_ENABLED(CONFIG_ARM64) && !aegis128_have_aes_insn)
> > +               crypto_aegis128_init_neon();
> >         crypto_aegis128_update_neon(state, msg);
> >         kernel_neon_end();
> >  }
> > @@ -30,6 +40,8 @@ void crypto_aegis128_encrypt_chunk_simd(union aegis_block *state, u8 *dst,
> >                                         const u8 *src, unsigned int size)
> >  {
> >         kernel_neon_begin();
> > +       if (IS_ENABLED(CONFIG_ARM64) && !aegis128_have_aes_insn)
> > +               crypto_aegis128_init_neon();
> >         crypto_aegis128_encrypt_chunk_neon(state, dst, src, size);
> >         kernel_neon_end();
> >  }
> > @@ -38,6 +50,8 @@ void crypto_aegis128_decrypt_chunk_simd(union aegis_block *state, u8 *dst,
> >                                         const u8 *src, unsigned int size)
> >  {
> >         kernel_neon_begin();
> > +       if (IS_ENABLED(CONFIG_ARM64) && !aegis128_have_aes_insn)
> > +               crypto_aegis128_init_neon();
> >         crypto_aegis128_decrypt_chunk_neon(state, dst, src, size);
> >         kernel_neon_end();
> >  }
> > --
> > 2.17.1
> >
>
>
> --
> Thanks,
> ~Nick Desaulniers

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, back to index

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-08-02 15:15 [PATCH 0/3] crypto: aegis128 followup Ard Biesheuvel
2019-08-02 15:15 ` [PATCH resend 1/3] crypto: aegis128 - add support for SIMD acceleration Ard Biesheuvel
2019-08-02 15:15 ` [PATCH resend 2/3] crypto: aegis128 - provide a SIMD implementation based on NEON intrinsics Ard Biesheuvel
2019-08-02 15:15 ` [PATCH RFC 3/3] crypto: arm64/aegis128 - implement plain NEON version Ard Biesheuvel
2019-08-08 22:31   ` Nick Desaulniers
2019-08-09 17:20     ` Ard Biesheuvel

Linux-Crypto Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-crypto/0 linux-crypto/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-crypto linux-crypto/ https://lore.kernel.org/linux-crypto \
		linux-crypto@vger.kernel.org linux-crypto@archiver.kernel.org
	public-inbox-index linux-crypto


Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-crypto


AGPL code for this site: git clone https://public-inbox.org/ public-inbox