All of lore.kernel.org
 help / color / mirror / Atom feed
From: Eric Biggers <ebiggers@kernel.org>
To: linux-crypto@vger.kernel.org
Cc: linux-fscrypt@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org,
	Herbert Xu <herbert@gondor.apana.org.au>,
	Paul Crowley <paulcrowley@google.com>,
	Greg Kaiser <gkaiser@google.com>,
	"Jason A . Donenfeld" <Jason@zx2c4.com>,
	Samuel Neves <samuel.c.p.neves@gmail.com>,
	Tomer Ashur <tomer.ashur@esat.kuleuven.be>
Subject: [RFC PATCH v3 08/15] crypto: arm/chacha20 - refactor to allow varying number of rounds
Date: Mon,  5 Nov 2018 15:25:19 -0800	[thread overview]
Message-ID: <20181105232526.173947-9-ebiggers@kernel.org> (raw)
In-Reply-To: <20181105232526.173947-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

In preparation for adding XChaCha12 support, rename/refactor the NEON
implementation of ChaCha20 to support different numbers of rounds.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/arm/crypto/Makefile                      |  4 +-
 ...hacha20-neon-core.S => chacha-neon-core.S} | 44 ++++++++-------
 ...hacha20-neon-glue.c => chacha-neon-glue.c} | 56 ++++++++++---------
 3 files changed, 56 insertions(+), 48 deletions(-)
 rename arch/arm/crypto/{chacha20-neon-core.S => chacha-neon-core.S} (94%)
 rename arch/arm/crypto/{chacha20-neon-glue.c => chacha-neon-glue.c} (72%)

diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index bd5bceef0605..005482ff9504 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -9,7 +9,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
 obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
-obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
+obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
 
 ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
@@ -52,7 +52,7 @@ aes-arm-ce-y	:= aes-ce-core.o aes-ce-glue.o
 ghash-arm-ce-y	:= ghash-ce-core.o ghash-ce-glue.o
 crct10dif-arm-ce-y	:= crct10dif-ce-core.o crct10dif-ce-glue.o
 crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
-chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
+chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
 
 ifdef REGENERATE_ARM_CRYPTO
 quiet_cmd_perl = PERL    $@
diff --git a/arch/arm/crypto/chacha20-neon-core.S b/arch/arm/crypto/chacha-neon-core.S
similarity index 94%
rename from arch/arm/crypto/chacha20-neon-core.S
rename to arch/arm/crypto/chacha-neon-core.S
index 2335e5055d2b..eb22926d4912 100644
--- a/arch/arm/crypto/chacha20-neon-core.S
+++ b/arch/arm/crypto/chacha-neon-core.S
@@ -1,5 +1,5 @@
 /*
- * ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions
+ * ChaCha/XChaCha NEON helper functions
  *
  * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
  *
@@ -27,9 +27,9 @@
   * (d)  vtbl.8 + vtbl.8		(multiple of 8 bits rotations only,
   *					 needs index vector)
   *
-  * ChaCha20 has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit
-  * rotations, the only choices are (a) and (b).  We use (a) since it takes
-  * two-thirds the cycles of (b) on both Cortex-A7 and Cortex-A53.
+  * ChaCha has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit rotations,
+  * the only choices are (a) and (b).  We use (a) since it takes two-thirds the
+  * cycles of (b) on both Cortex-A7 and Cortex-A53.
   *
   * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
   * and doesn't need a temporary register.
@@ -53,18 +53,19 @@
 	.align		5
 
 /*
- * chacha20_permute - permute one block
+ * chacha_permute - permute one block
  *
  * Permute one 64-byte block where the state matrix is stored in the four NEON
  * registers q0-q3.  It performs matrix operations on four words in parallel,
  * but requires shuffling to rearrange the words after each round.
  *
+ * The round count is given in r3.
+ *
  * Clobbers: r3, ip, q4-q5
  */
-chacha20_permute:
+chacha_permute:
 
 	adr		ip, .Lrol8_table
-	mov		r3, #10
 	vld1.8		{d10}, [ip, :64]
 
 .Ldoubleround:
@@ -128,16 +129,17 @@ chacha20_permute:
 	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
 	vext.8		q3, q3, q3, #4
 
-	subs		r3, r3, #1
+	subs		r3, r3, #2
 	bne		.Ldoubleround
 
 	bx		lr
-ENDPROC(chacha20_permute)
+ENDPROC(chacha_permute)
 
-ENTRY(chacha20_block_xor_neon)
+ENTRY(chacha_block_xor_neon)
 	// r0: Input state matrix, s
 	// r1: 1 data block output, o
 	// r2: 1 data block input, i
+	// r3: nrounds
 	push		{lr}
 
 	// x0..3 = s0..3
@@ -150,7 +152,7 @@ ENTRY(chacha20_block_xor_neon)
 	vmov		q10, q2
 	vmov		q11, q3
 
-	bl		chacha20_permute
+	bl		chacha_permute
 
 	add		ip, r2, #0x20
 	vld1.8		{q4-q5}, [r2]
@@ -177,30 +179,32 @@ ENTRY(chacha20_block_xor_neon)
 	vst1.8		{q2-q3}, [ip]
 
 	pop		{pc}
-ENDPROC(chacha20_block_xor_neon)
+ENDPROC(chacha_block_xor_neon)
 
-ENTRY(hchacha20_block_neon)
+ENTRY(hchacha_block_neon)
 	// r0: Input state matrix, s
 	// r1: output (8 32-bit words)
+	// r2: nrounds
 	push		{lr}
 
 	vld1.32		{q0-q1}, [r0]!
 	vld1.32		{q2-q3}, [r0]
 
-	bl		chacha20_permute
+	mov		r3, r2
+	bl		chacha_permute
 
 	vst1.32		{q0}, [r1]!
 	vst1.32		{q3}, [r1]
 
 	pop		{pc}
-ENDPROC(hchacha20_block_neon)
+ENDPROC(hchacha_block_neon)
 
 	.align		4
 .Lctrinc:	.word	0, 1, 2, 3
 .Lrol8_table:	.byte	3, 0, 1, 2, 7, 4, 5, 6
 
 	.align		5
-ENTRY(chacha20_4block_xor_neon)
+ENTRY(chacha_4block_xor_neon)
 	push		{r4-r5}
 	mov		r4, sp			// preserve the stack pointer
 	sub		ip, sp, #0x20		// allocate a 32 byte buffer
@@ -210,9 +214,10 @@ ENTRY(chacha20_4block_xor_neon)
 	// r0: Input state matrix, s
 	// r1: 4 data blocks output, o
 	// r2: 4 data blocks input, i
+	// r3: nrounds
 
 	//
-	// This function encrypts four consecutive ChaCha20 blocks by loading
+	// This function encrypts four consecutive ChaCha blocks by loading
 	// the state matrix in NEON registers four times. The algorithm performs
 	// each operation on the corresponding word of each state matrix, hence
 	// requires no word shuffling. The words are re-interleaved before the
@@ -245,7 +250,6 @@ ENTRY(chacha20_4block_xor_neon)
 	vdup.32		q0, d0[0]
 
 	adr		ip, .Lrol8_table
-	mov		r3, #10
 	b		1f
 
 .Ldoubleround4:
@@ -443,7 +447,7 @@ ENTRY(chacha20_4block_xor_neon)
 	vsri.u32	q5, q8, #25
 	vsri.u32	q6, q9, #25
 
-	subs		r3, r3, #1
+	subs		r3, r3, #2
 	bne		.Ldoubleround4
 
 	// x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
@@ -553,4 +557,4 @@ ENTRY(chacha20_4block_xor_neon)
 
 	pop		{r4-r5}
 	bx		lr
-ENDPROC(chacha20_4block_xor_neon)
+ENDPROC(chacha_4block_xor_neon)
diff --git a/arch/arm/crypto/chacha20-neon-glue.c b/arch/arm/crypto/chacha-neon-glue.c
similarity index 72%
rename from arch/arm/crypto/chacha20-neon-glue.c
rename to arch/arm/crypto/chacha-neon-glue.c
index f2d3b0f70a8d..385557d38634 100644
--- a/arch/arm/crypto/chacha20-neon-glue.c
+++ b/arch/arm/crypto/chacha-neon-glue.c
@@ -28,24 +28,26 @@
 #include <asm/neon.h>
 #include <asm/simd.h>
 
-asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
-asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
-asmlinkage void hchacha20_block_neon(const u32 *state, u32 *out);
-
-static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
-			    unsigned int bytes)
+asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
+				      int nrounds);
+asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
+				       int nrounds);
+asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
+
+static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
+			  unsigned int bytes, int nrounds)
 {
 	u8 buf[CHACHA_BLOCK_SIZE];
 
 	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
-		chacha20_4block_xor_neon(state, dst, src);
+		chacha_4block_xor_neon(state, dst, src, nrounds);
 		bytes -= CHACHA_BLOCK_SIZE * 4;
 		src += CHACHA_BLOCK_SIZE * 4;
 		dst += CHACHA_BLOCK_SIZE * 4;
 		state[12] += 4;
 	}
 	while (bytes >= CHACHA_BLOCK_SIZE) {
-		chacha20_block_xor_neon(state, dst, src);
+		chacha_block_xor_neon(state, dst, src, nrounds);
 		bytes -= CHACHA_BLOCK_SIZE;
 		src += CHACHA_BLOCK_SIZE;
 		dst += CHACHA_BLOCK_SIZE;
@@ -53,13 +55,13 @@ static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
 	}
 	if (bytes) {
 		memcpy(buf, src, bytes);
-		chacha20_block_xor_neon(state, buf, buf);
+		chacha_block_xor_neon(state, buf, buf, nrounds);
 		memcpy(dst, buf, bytes);
 	}
 }
 
-static int chacha20_neon_stream_xor(struct skcipher_request *req,
-				    struct chacha_ctx *ctx, u8 *iv)
+static int chacha_neon_stream_xor(struct skcipher_request *req,
+				  struct chacha_ctx *ctx, u8 *iv)
 {
 	struct skcipher_walk walk;
 	u32 state[16];
@@ -76,8 +78,8 @@ static int chacha20_neon_stream_xor(struct skcipher_request *req,
 			nbytes = round_down(nbytes, walk.stride);
 
 		kernel_neon_begin();
-		chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
-				nbytes);
+		chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
+			      nbytes, ctx->nrounds);
 		kernel_neon_end();
 		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
 	}
@@ -85,7 +87,7 @@ static int chacha20_neon_stream_xor(struct skcipher_request *req,
 	return err;
 }
 
-static int chacha20_neon(struct skcipher_request *req)
+static int chacha_neon(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
@@ -93,10 +95,10 @@ static int chacha20_neon(struct skcipher_request *req)
 	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
 		return crypto_chacha_crypt(req);
 
-	return chacha20_neon_stream_xor(req, ctx, req->iv);
+	return chacha_neon_stream_xor(req, ctx, req->iv);
 }
 
-static int xchacha20_neon(struct skcipher_request *req)
+static int xchacha_neon(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
@@ -110,12 +112,13 @@ static int xchacha20_neon(struct skcipher_request *req)
 	crypto_chacha_init(state, ctx, req->iv);
 
 	kernel_neon_begin();
-	hchacha20_block_neon(state, subctx.key);
+	hchacha_block_neon(state, subctx.key, ctx->nrounds);
 	kernel_neon_end();
+	subctx.nrounds = ctx->nrounds;
 
 	memcpy(&real_iv[0], req->iv + 24, 8);
 	memcpy(&real_iv[8], req->iv + 16, 8);
-	return chacha20_neon_stream_xor(req, &subctx, real_iv);
+	return chacha_neon_stream_xor(req, &subctx, real_iv);
 }
 
 static struct skcipher_alg algs[] = {
@@ -133,8 +136,8 @@ static struct skcipher_alg algs[] = {
 		.chunksize		= CHACHA_BLOCK_SIZE,
 		.walksize		= 4 * CHACHA_BLOCK_SIZE,
 		.setkey			= crypto_chacha20_setkey,
-		.encrypt		= chacha20_neon,
-		.decrypt		= chacha20_neon,
+		.encrypt		= chacha_neon,
+		.decrypt		= chacha_neon,
 	}, {
 		.base.cra_name		= "xchacha20",
 		.base.cra_driver_name	= "xchacha20-neon",
@@ -149,12 +152,12 @@ static struct skcipher_alg algs[] = {
 		.chunksize		= CHACHA_BLOCK_SIZE,
 		.walksize		= 4 * CHACHA_BLOCK_SIZE,
 		.setkey			= crypto_chacha20_setkey,
-		.encrypt		= xchacha20_neon,
-		.decrypt		= xchacha20_neon,
+		.encrypt		= xchacha_neon,
+		.decrypt		= xchacha_neon,
 	}
 };
 
-static int __init chacha20_simd_mod_init(void)
+static int __init chacha_simd_mod_init(void)
 {
 	if (!(elf_hwcap & HWCAP_NEON))
 		return -ENODEV;
@@ -162,14 +165,15 @@ static int __init chacha20_simd_mod_init(void)
 	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
 }
 
-static void __exit chacha20_simd_mod_fini(void)
+static void __exit chacha_simd_mod_fini(void)
 {
 	crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
 }
 
-module_init(chacha20_simd_mod_init);
-module_exit(chacha20_simd_mod_fini);
+module_init(chacha_simd_mod_init);
+module_exit(chacha_simd_mod_fini);
 
+MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (NEON accelerated)");
 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
 MODULE_LICENSE("GPL v2");
 MODULE_ALIAS_CRYPTO("chacha20");
-- 
2.19.1.930.g4563a0d9d0-goog

WARNING: multiple messages have this Message-ID (diff)
From: ebiggers@kernel.org (Eric Biggers)
To: linux-arm-kernel@lists.infradead.org
Subject: [RFC PATCH v3 08/15] crypto: arm/chacha20 - refactor to allow varying number of rounds
Date: Mon,  5 Nov 2018 15:25:19 -0800	[thread overview]
Message-ID: <20181105232526.173947-9-ebiggers@kernel.org> (raw)
In-Reply-To: <20181105232526.173947-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

In preparation for adding XChaCha12 support, rename/refactor the NEON
implementation of ChaCha20 to support different numbers of rounds.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/arm/crypto/Makefile                      |  4 +-
 ...hacha20-neon-core.S => chacha-neon-core.S} | 44 ++++++++-------
 ...hacha20-neon-glue.c => chacha-neon-glue.c} | 56 ++++++++++---------
 3 files changed, 56 insertions(+), 48 deletions(-)
 rename arch/arm/crypto/{chacha20-neon-core.S => chacha-neon-core.S} (94%)
 rename arch/arm/crypto/{chacha20-neon-glue.c => chacha-neon-glue.c} (72%)

diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index bd5bceef0605..005482ff9504 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -9,7 +9,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
 obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
-obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
+obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
 
 ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
@@ -52,7 +52,7 @@ aes-arm-ce-y	:= aes-ce-core.o aes-ce-glue.o
 ghash-arm-ce-y	:= ghash-ce-core.o ghash-ce-glue.o
 crct10dif-arm-ce-y	:= crct10dif-ce-core.o crct10dif-ce-glue.o
 crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
-chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
+chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
 
 ifdef REGENERATE_ARM_CRYPTO
 quiet_cmd_perl = PERL    $@
diff --git a/arch/arm/crypto/chacha20-neon-core.S b/arch/arm/crypto/chacha-neon-core.S
similarity index 94%
rename from arch/arm/crypto/chacha20-neon-core.S
rename to arch/arm/crypto/chacha-neon-core.S
index 2335e5055d2b..eb22926d4912 100644
--- a/arch/arm/crypto/chacha20-neon-core.S
+++ b/arch/arm/crypto/chacha-neon-core.S
@@ -1,5 +1,5 @@
 /*
- * ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions
+ * ChaCha/XChaCha NEON helper functions
  *
  * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
  *
@@ -27,9 +27,9 @@
   * (d)  vtbl.8 + vtbl.8		(multiple of 8 bits rotations only,
   *					 needs index vector)
   *
-  * ChaCha20 has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit
-  * rotations, the only choices are (a) and (b).  We use (a) since it takes
-  * two-thirds the cycles of (b) on both Cortex-A7 and Cortex-A53.
+  * ChaCha has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit rotations,
+  * the only choices are (a) and (b).  We use (a) since it takes two-thirds the
+  * cycles of (b) on both Cortex-A7 and Cortex-A53.
   *
   * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
   * and doesn't need a temporary register.
@@ -53,18 +53,19 @@
 	.align		5
 
 /*
- * chacha20_permute - permute one block
+ * chacha_permute - permute one block
  *
  * Permute one 64-byte block where the state matrix is stored in the four NEON
  * registers q0-q3.  It performs matrix operations on four words in parallel,
  * but requires shuffling to rearrange the words after each round.
  *
+ * The round count is given in r3.
+ *
  * Clobbers: r3, ip, q4-q5
  */
-chacha20_permute:
+chacha_permute:
 
 	adr		ip, .Lrol8_table
-	mov		r3, #10
 	vld1.8		{d10}, [ip, :64]
 
 .Ldoubleround:
@@ -128,16 +129,17 @@ chacha20_permute:
 	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
 	vext.8		q3, q3, q3, #4
 
-	subs		r3, r3, #1
+	subs		r3, r3, #2
 	bne		.Ldoubleround
 
 	bx		lr
-ENDPROC(chacha20_permute)
+ENDPROC(chacha_permute)
 
-ENTRY(chacha20_block_xor_neon)
+ENTRY(chacha_block_xor_neon)
 	// r0: Input state matrix, s
 	// r1: 1 data block output, o
 	// r2: 1 data block input, i
+	// r3: nrounds
 	push		{lr}
 
 	// x0..3 = s0..3
@@ -150,7 +152,7 @@ ENTRY(chacha20_block_xor_neon)
 	vmov		q10, q2
 	vmov		q11, q3
 
-	bl		chacha20_permute
+	bl		chacha_permute
 
 	add		ip, r2, #0x20
 	vld1.8		{q4-q5}, [r2]
@@ -177,30 +179,32 @@ ENTRY(chacha20_block_xor_neon)
 	vst1.8		{q2-q3}, [ip]
 
 	pop		{pc}
-ENDPROC(chacha20_block_xor_neon)
+ENDPROC(chacha_block_xor_neon)
 
-ENTRY(hchacha20_block_neon)
+ENTRY(hchacha_block_neon)
 	// r0: Input state matrix, s
 	// r1: output (8 32-bit words)
+	// r2: nrounds
 	push		{lr}
 
 	vld1.32		{q0-q1}, [r0]!
 	vld1.32		{q2-q3}, [r0]
 
-	bl		chacha20_permute
+	mov		r3, r2
+	bl		chacha_permute
 
 	vst1.32		{q0}, [r1]!
 	vst1.32		{q3}, [r1]
 
 	pop		{pc}
-ENDPROC(hchacha20_block_neon)
+ENDPROC(hchacha_block_neon)
 
 	.align		4
 .Lctrinc:	.word	0, 1, 2, 3
 .Lrol8_table:	.byte	3, 0, 1, 2, 7, 4, 5, 6
 
 	.align		5
-ENTRY(chacha20_4block_xor_neon)
+ENTRY(chacha_4block_xor_neon)
 	push		{r4-r5}
 	mov		r4, sp			// preserve the stack pointer
 	sub		ip, sp, #0x20		// allocate a 32 byte buffer
@@ -210,9 +214,10 @@ ENTRY(chacha20_4block_xor_neon)
 	// r0: Input state matrix, s
 	// r1: 4 data blocks output, o
 	// r2: 4 data blocks input, i
+	// r3: nrounds
 
 	//
-	// This function encrypts four consecutive ChaCha20 blocks by loading
+	// This function encrypts four consecutive ChaCha blocks by loading
 	// the state matrix in NEON registers four times. The algorithm performs
 	// each operation on the corresponding word of each state matrix, hence
 	// requires no word shuffling. The words are re-interleaved before the
@@ -245,7 +250,6 @@ ENTRY(chacha20_4block_xor_neon)
 	vdup.32		q0, d0[0]
 
 	adr		ip, .Lrol8_table
-	mov		r3, #10
 	b		1f
 
 .Ldoubleround4:
@@ -443,7 +447,7 @@ ENTRY(chacha20_4block_xor_neon)
 	vsri.u32	q5, q8, #25
 	vsri.u32	q6, q9, #25
 
-	subs		r3, r3, #1
+	subs		r3, r3, #2
 	bne		.Ldoubleround4
 
 	// x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
@@ -553,4 +557,4 @@ ENTRY(chacha20_4block_xor_neon)
 
 	pop		{r4-r5}
 	bx		lr
-ENDPROC(chacha20_4block_xor_neon)
+ENDPROC(chacha_4block_xor_neon)
diff --git a/arch/arm/crypto/chacha20-neon-glue.c b/arch/arm/crypto/chacha-neon-glue.c
similarity index 72%
rename from arch/arm/crypto/chacha20-neon-glue.c
rename to arch/arm/crypto/chacha-neon-glue.c
index f2d3b0f70a8d..385557d38634 100644
--- a/arch/arm/crypto/chacha20-neon-glue.c
+++ b/arch/arm/crypto/chacha-neon-glue.c
@@ -28,24 +28,26 @@
 #include <asm/neon.h>
 #include <asm/simd.h>
 
-asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
-asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
-asmlinkage void hchacha20_block_neon(const u32 *state, u32 *out);
-
-static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
-			    unsigned int bytes)
+asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
+				      int nrounds);
+asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
+				       int nrounds);
+asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
+
+static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
+			  unsigned int bytes, int nrounds)
 {
 	u8 buf[CHACHA_BLOCK_SIZE];
 
 	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
-		chacha20_4block_xor_neon(state, dst, src);
+		chacha_4block_xor_neon(state, dst, src, nrounds);
 		bytes -= CHACHA_BLOCK_SIZE * 4;
 		src += CHACHA_BLOCK_SIZE * 4;
 		dst += CHACHA_BLOCK_SIZE * 4;
 		state[12] += 4;
 	}
 	while (bytes >= CHACHA_BLOCK_SIZE) {
-		chacha20_block_xor_neon(state, dst, src);
+		chacha_block_xor_neon(state, dst, src, nrounds);
 		bytes -= CHACHA_BLOCK_SIZE;
 		src += CHACHA_BLOCK_SIZE;
 		dst += CHACHA_BLOCK_SIZE;
@@ -53,13 +55,13 @@ static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
 	}
 	if (bytes) {
 		memcpy(buf, src, bytes);
-		chacha20_block_xor_neon(state, buf, buf);
+		chacha_block_xor_neon(state, buf, buf, nrounds);
 		memcpy(dst, buf, bytes);
 	}
 }
 
-static int chacha20_neon_stream_xor(struct skcipher_request *req,
-				    struct chacha_ctx *ctx, u8 *iv)
+static int chacha_neon_stream_xor(struct skcipher_request *req,
+				  struct chacha_ctx *ctx, u8 *iv)
 {
 	struct skcipher_walk walk;
 	u32 state[16];
@@ -76,8 +78,8 @@ static int chacha20_neon_stream_xor(struct skcipher_request *req,
 			nbytes = round_down(nbytes, walk.stride);
 
 		kernel_neon_begin();
-		chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
-				nbytes);
+		chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
+			      nbytes, ctx->nrounds);
 		kernel_neon_end();
 		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
 	}
@@ -85,7 +87,7 @@ static int chacha20_neon_stream_xor(struct skcipher_request *req,
 	return err;
 }
 
-static int chacha20_neon(struct skcipher_request *req)
+static int chacha_neon(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
@@ -93,10 +95,10 @@ static int chacha20_neon(struct skcipher_request *req)
 	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
 		return crypto_chacha_crypt(req);
 
-	return chacha20_neon_stream_xor(req, ctx, req->iv);
+	return chacha_neon_stream_xor(req, ctx, req->iv);
 }
 
-static int xchacha20_neon(struct skcipher_request *req)
+static int xchacha_neon(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
@@ -110,12 +112,13 @@ static int xchacha20_neon(struct skcipher_request *req)
 	crypto_chacha_init(state, ctx, req->iv);
 
 	kernel_neon_begin();
-	hchacha20_block_neon(state, subctx.key);
+	hchacha_block_neon(state, subctx.key, ctx->nrounds);
 	kernel_neon_end();
+	subctx.nrounds = ctx->nrounds;
 
 	memcpy(&real_iv[0], req->iv + 24, 8);
 	memcpy(&real_iv[8], req->iv + 16, 8);
-	return chacha20_neon_stream_xor(req, &subctx, real_iv);
+	return chacha_neon_stream_xor(req, &subctx, real_iv);
 }
 
 static struct skcipher_alg algs[] = {
@@ -133,8 +136,8 @@ static struct skcipher_alg algs[] = {
 		.chunksize		= CHACHA_BLOCK_SIZE,
 		.walksize		= 4 * CHACHA_BLOCK_SIZE,
 		.setkey			= crypto_chacha20_setkey,
-		.encrypt		= chacha20_neon,
-		.decrypt		= chacha20_neon,
+		.encrypt		= chacha_neon,
+		.decrypt		= chacha_neon,
 	}, {
 		.base.cra_name		= "xchacha20",
 		.base.cra_driver_name	= "xchacha20-neon",
@@ -149,12 +152,12 @@ static struct skcipher_alg algs[] = {
 		.chunksize		= CHACHA_BLOCK_SIZE,
 		.walksize		= 4 * CHACHA_BLOCK_SIZE,
 		.setkey			= crypto_chacha20_setkey,
-		.encrypt		= xchacha20_neon,
-		.decrypt		= xchacha20_neon,
+		.encrypt		= xchacha_neon,
+		.decrypt		= xchacha_neon,
 	}
 };
 
-static int __init chacha20_simd_mod_init(void)
+static int __init chacha_simd_mod_init(void)
 {
 	if (!(elf_hwcap & HWCAP_NEON))
 		return -ENODEV;
@@ -162,14 +165,15 @@ static int __init chacha20_simd_mod_init(void)
 	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
 }
 
-static void __exit chacha20_simd_mod_fini(void)
+static void __exit chacha_simd_mod_fini(void)
 {
 	crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
 }
 
-module_init(chacha20_simd_mod_init);
-module_exit(chacha20_simd_mod_fini);
+module_init(chacha_simd_mod_init);
+module_exit(chacha_simd_mod_fini);
 
+MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (NEON accelerated)");
 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
 MODULE_LICENSE("GPL v2");
 MODULE_ALIAS_CRYPTO("chacha20");
-- 
2.19.1.930.g4563a0d9d0-goog

  parent reply	other threads:[~2018-11-06  8:49 UTC|newest]

Thread overview: 98+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-11-05 23:25 [RFC PATCH v3 00/15] crypto: Adiantum support Eric Biggers
2018-11-05 23:25 ` Eric Biggers
2018-11-05 23:25 ` [RFC PATCH v3 01/15] crypto: chacha20-generic - add HChaCha20 library function Eric Biggers
2018-11-05 23:25   ` Eric Biggers
2018-11-05 23:25 ` [RFC PATCH v3 02/15] crypto: chacha20-generic - don't unnecessarily use atomic walk Eric Biggers
2018-11-05 23:25   ` Eric Biggers
2018-11-05 23:25 ` [RFC PATCH v3 03/15] crypto: chacha20-generic - add XChaCha20 support Eric Biggers
2018-11-05 23:25   ` Eric Biggers
2018-11-05 23:25 ` [RFC PATCH v3 04/15] crypto: chacha20-generic - refactor to allow varying number of rounds Eric Biggers
2018-11-05 23:25   ` Eric Biggers
2018-11-05 23:25 ` [RFC PATCH v3 05/15] crypto: chacha - add XChaCha12 support Eric Biggers
2018-11-05 23:25   ` Eric Biggers
2018-11-05 23:25 ` [RFC PATCH v3 06/15] crypto: arm/chacha20 - limit the preemption-disabled section Eric Biggers
2018-11-05 23:25   ` Eric Biggers
2018-11-05 23:25 ` [RFC PATCH v3 07/15] crypto: arm/chacha20 - add XChaCha20 support Eric Biggers
2018-11-05 23:25   ` Eric Biggers
2018-11-06 12:41   ` Ard Biesheuvel
2018-11-06 12:41     ` Ard Biesheuvel
2018-11-06 12:41     ` Ard Biesheuvel
2018-11-05 23:25 ` Eric Biggers [this message]
2018-11-05 23:25   ` [RFC PATCH v3 08/15] crypto: arm/chacha20 - refactor to allow varying number of rounds Eric Biggers
2018-11-06 12:46   ` Ard Biesheuvel
2018-11-06 12:46     ` Ard Biesheuvel
2018-11-06 12:46     ` Ard Biesheuvel
2018-11-05 23:25 ` [RFC PATCH v3 09/15] crypto: arm/chacha - add XChaCha12 support Eric Biggers
2018-11-05 23:25   ` Eric Biggers
2018-11-05 23:25 ` [RFC PATCH v3 10/15] crypto: poly1305 - use structures for key and accumulator Eric Biggers
2018-11-05 23:25   ` Eric Biggers
2018-11-06 14:28   ` Ard Biesheuvel
2018-11-06 14:28     ` Ard Biesheuvel
2018-11-06 14:28     ` Ard Biesheuvel
2018-11-12 18:58     ` Eric Biggers
2018-11-12 18:58       ` Eric Biggers
2018-11-12 18:58       ` Eric Biggers
2018-11-16  6:02       ` Herbert Xu
2018-11-16  6:02         ` Herbert Xu
2018-11-16  6:02         ` Herbert Xu
2018-11-17  0:17         ` Eric Biggers
2018-11-17  0:17           ` Eric Biggers
2018-11-17  0:17           ` Eric Biggers
2018-11-17  0:30           ` Ard Biesheuvel
2018-11-17  0:30             ` Ard Biesheuvel
2018-11-17  0:30             ` Ard Biesheuvel
2018-11-18 13:46           ` Jason A. Donenfeld
2018-11-18 13:46             ` Jason A. Donenfeld
2018-11-19  5:24             ` [RFC PATCH] zinc chacha20 generic implementation using crypto API code Herbert Xu
2018-11-19  6:13               ` Jason A. Donenfeld
2018-11-19  6:13                 ` Jason A. Donenfeld
2018-11-19  6:22                 ` Herbert Xu
2018-11-19  6:22                   ` Herbert Xu
2018-11-19 22:54                 ` Eric Biggers
2018-11-19 22:54                   ` Eric Biggers
2018-11-19 23:15                   ` Jason A. Donenfeld
2018-11-19 23:15                     ` Jason A. Donenfeld
2018-11-19 23:23                     ` Eric Biggers
2018-11-19 23:23                       ` Eric Biggers
2018-11-19 23:31                       ` Jason A. Donenfeld
2018-11-19 23:31                         ` Jason A. Donenfeld
2018-11-20  3:06                   ` Herbert Xu
2018-11-20  3:06                     ` Herbert Xu
2018-11-20  3:08                     ` Jason A. Donenfeld
2018-11-20  3:08                       ` Jason A. Donenfeld
2018-11-20  6:02               ` [RFC PATCH v2 0/4] Exporting existing crypto API code through zinc Herbert Xu
2018-11-20  6:02                 ` Herbert Xu
2018-11-20  6:04                 ` [v2 PATCH 1/4] crypto: chacha20 - Export chacha20 functions without crypto API Herbert Xu
2018-11-20  6:04                   ` Herbert Xu
2018-11-20  6:04                 ` [v2 PATCH 2/4] zinc: ChaCha20 generic C implementation and selftest Herbert Xu
2018-11-20  6:04                 ` [v2 PATCH 3/4] zinc: Add x86 accelerated ChaCha20 Herbert Xu
2018-11-20  6:04                   ` Herbert Xu
2018-11-20  6:04                 ` [v2 PATCH 4/4] zinc: ChaCha20 x86_64 implementation Herbert Xu
2018-11-20 10:32                 ` [RFC PATCH v2 0/4] Exporting existing crypto API code through zinc Ard Biesheuvel
2018-11-20 10:32                   ` Ard Biesheuvel
2018-11-20 10:32                   ` Ard Biesheuvel
2018-11-20 14:18                   ` Herbert Xu
2018-11-20 14:18                     ` Herbert Xu
2018-11-20 14:18                     ` Herbert Xu
2018-11-20 16:24                     ` Jason A. Donenfeld
2018-11-20 16:24                       ` Jason A. Donenfeld
2018-11-20 18:51                       ` Theodore Y. Ts'o
2018-11-20 18:51                         ` Theodore Y. Ts'o
2018-11-21  7:55                       ` Herbert Xu
2018-11-21  7:55                         ` Herbert Xu
2018-11-20 16:18                 ` Jason A. Donenfeld
2018-11-20 16:18                   ` Jason A. Donenfeld
2018-11-21  6:01                   ` Herbert Xu
2018-11-21  6:01                     ` Herbert Xu
2018-11-05 23:25 ` [RFC PATCH v3 11/15] crypto: poly1305 - add Poly1305 core API Eric Biggers
2018-11-05 23:25   ` Eric Biggers
2018-11-05 23:25 ` [RFC PATCH v3 12/15] crypto: nhpoly1305 - add NHPoly1305 support Eric Biggers
2018-11-05 23:25   ` Eric Biggers
2018-11-05 23:25 ` [RFC PATCH v3 13/15] crypto: arm/nhpoly1305 - add NEON-accelerated NHPoly1305 Eric Biggers
2018-11-05 23:25   ` Eric Biggers
2018-11-05 23:25 ` [RFC PATCH v3 14/15] crypto: adiantum - add Adiantum support Eric Biggers
2018-11-05 23:25   ` Eric Biggers
2018-11-05 23:25 ` [RFC PATCH v3 15/15] fscrypt: " Eric Biggers
2018-11-05 23:25   ` Eric Biggers
2018-11-08  6:47 ` [RFC PATCH v3 00/15] crypto: " Martin Willi
2018-11-08  6:47   ` Martin Willi

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20181105232526.173947-9-ebiggers@kernel.org \
    --to=ebiggers@kernel.org \
    --cc=Jason@zx2c4.com \
    --cc=gkaiser@google.com \
    --cc=herbert@gondor.apana.org.au \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=linux-crypto@vger.kernel.org \
    --cc=linux-fscrypt@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=paulcrowley@google.com \
    --cc=samuel.c.p.neves@gmail.com \
    --cc=tomer.ashur@esat.kuleuven.be \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.