All of lore.kernel.org
 help / color / mirror / Atom feed
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
To: linux-crypto@vger.kernel.org
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>,
	Herbert Xu <herbert@gondor.apana.org.au>,
	David Miller <davem@davemloft.net>,
	"Jason A . Donenfeld" <Jason@zx2c4.com>,
	Samuel Neves <sneves@dei.uc.pt>, Arnd Bergmann <arnd@arndb.de>,
	Eric Biggers <ebiggers@google.com>,
	Andy Lutomirski <luto@kernel.org>,
	Martin Willi <martin@strongswan.org>,
	Rene van Dorst <opensource@vdorst.com>
Subject: [PATCH v3 09/29] crypto: mips/chacha - import 32r2 ChaCha code from Zinc
Date: Mon,  7 Oct 2019 18:45:50 +0200	[thread overview]
Message-ID: <20191007164610.6881-10-ard.biesheuvel@linaro.org> (raw)
In-Reply-To: <20191007164610.6881-1-ard.biesheuvel@linaro.org>

This imports the accelerated MIPS 32r2 ChaCha20 implementation from the
Zinc patch set.

Co-developed-by: René van Dorst <opensource@vdorst.com>
Signed-off-by: René van Dorst <opensource@vdorst.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 arch/mips/crypto/chacha-core.S | 424 ++++++++++++++++++++
 1 file changed, 424 insertions(+)

diff --git a/arch/mips/crypto/chacha-core.S b/arch/mips/crypto/chacha-core.S
new file mode 100644
index 000000000000..a81e02db95e7
--- /dev/null
+++ b/arch/mips/crypto/chacha-core.S
@@ -0,0 +1,424 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#define MASK_U32		0x3c
+#define CHACHA20_BLOCK_SIZE	64
+#define STACK_SIZE		32
+
+#define X0	$t0
+#define X1	$t1
+#define X2	$t2
+#define X3	$t3
+#define X4	$t4
+#define X5	$t5
+#define X6	$t6
+#define X7	$t7
+#define X8	$t8
+#define X9	$t9
+#define X10	$v1
+#define X11	$s6
+#define X12	$s5
+#define X13	$s4
+#define X14	$s3
+#define X15	$s2
+/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
+#define T0	$s1
+#define T1	$s0
+#define T(n)	T ## n
+#define X(n)	X ## n
+
+/* Input arguments */
+#define STATE		$a0
+#define OUT		$a1
+#define IN		$a2
+#define BYTES		$a3
+
+/* Output argument */
+/* NONCE[0] is kept in a register and not in memory.
+ * We don't want to touch original value in memory.
+ * Must be incremented every loop iteration.
+ */
+#define NONCE_0		$v0
+
+/* SAVED_X and SAVED_CA are set in the jump table.
+ * Use regs which are overwritten on exit else we don't leak clear data.
+ * They are used to handling the last bytes which are not multiple of 4.
+ */
+#define SAVED_X		X15
+#define SAVED_CA	$s7
+
+#define IS_UNALIGNED	$s7
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define MSB 0
+#define LSB 3
+#define ROTx rotl
+#define ROTR(n) rotr n, 24
+#define	CPU_TO_LE32(n) \
+	wsbh	n; \
+	rotr	n, 16;
+#else
+#define MSB 3
+#define LSB 0
+#define ROTx rotr
+#define CPU_TO_LE32(n)
+#define ROTR(n)
+#endif
+
+#define FOR_EACH_WORD(x) \
+	x( 0); \
+	x( 1); \
+	x( 2); \
+	x( 3); \
+	x( 4); \
+	x( 5); \
+	x( 6); \
+	x( 7); \
+	x( 8); \
+	x( 9); \
+	x(10); \
+	x(11); \
+	x(12); \
+	x(13); \
+	x(14); \
+	x(15);
+
+#define FOR_EACH_WORD_REV(x) \
+	x(15); \
+	x(14); \
+	x(13); \
+	x(12); \
+	x(11); \
+	x(10); \
+	x( 9); \
+	x( 8); \
+	x( 7); \
+	x( 6); \
+	x( 5); \
+	x( 4); \
+	x( 3); \
+	x( 2); \
+	x( 1); \
+	x( 0);
+
+#define PLUS_ONE_0	 1
+#define PLUS_ONE_1	 2
+#define PLUS_ONE_2	 3
+#define PLUS_ONE_3	 4
+#define PLUS_ONE_4	 5
+#define PLUS_ONE_5	 6
+#define PLUS_ONE_6	 7
+#define PLUS_ONE_7	 8
+#define PLUS_ONE_8	 9
+#define PLUS_ONE_9	10
+#define PLUS_ONE_10	11
+#define PLUS_ONE_11	12
+#define PLUS_ONE_12	13
+#define PLUS_ONE_13	14
+#define PLUS_ONE_14	15
+#define PLUS_ONE_15	16
+#define PLUS_ONE(x)	PLUS_ONE_ ## x
+#define _CONCAT3(a,b,c)	a ## b ## c
+#define CONCAT3(a,b,c)	_CONCAT3(a,b,c)
+
+#define STORE_UNALIGNED(x) \
+CONCAT3(.Lchacha20_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
+	.if (x != 12); \
+		lw	T0, (x*4)(STATE); \
+	.endif; \
+	lwl	T1, (x*4)+MSB ## (IN); \
+	lwr	T1, (x*4)+LSB ## (IN); \
+	.if (x == 12); \
+		addu	X ## x, NONCE_0; \
+	.else; \
+		addu	X ## x, T0; \
+	.endif; \
+	CPU_TO_LE32(X ## x); \
+	xor	X ## x, T1; \
+	swl	X ## x, (x*4)+MSB ## (OUT); \
+	swr	X ## x, (x*4)+LSB ## (OUT);
+
+#define STORE_ALIGNED(x) \
+CONCAT3(.Lchacha20_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
+	.if (x != 12); \
+		lw	T0, (x*4)(STATE); \
+	.endif; \
+	lw	T1, (x*4) ## (IN); \
+	.if (x == 12); \
+		addu	X ## x, NONCE_0; \
+	.else; \
+		addu	X ## x, T0; \
+	.endif; \
+	CPU_TO_LE32(X ## x); \
+	xor	X ## x, T1; \
+	sw	X ## x, (x*4) ## (OUT);
+
+/* Jump table macro.
+ * Used for setup and handling the last bytes, which are not multiple of 4.
+ * X15 is free to store Xn
+ * Every jumptable entry must be equal in size.
+ */
+#define JMPTBL_ALIGNED(x) \
+.Lchacha20_mips_jmptbl_aligned_ ## x: ; \
+	.set	noreorder; \
+	b	.Lchacha20_mips_xor_aligned_ ## x ## _b; \
+	.if (x == 12); \
+		addu	SAVED_X, X ## x, NONCE_0; \
+	.else; \
+		addu	SAVED_X, X ## x, SAVED_CA; \
+	.endif; \
+	.set	reorder
+
+#define JMPTBL_UNALIGNED(x) \
+.Lchacha20_mips_jmptbl_unaligned_ ## x: ; \
+	.set	noreorder; \
+	b	.Lchacha20_mips_xor_unaligned_ ## x ## _b; \
+	.if (x == 12); \
+		addu	SAVED_X, X ## x, NONCE_0; \
+	.else; \
+		addu	SAVED_X, X ## x, SAVED_CA; \
+	.endif; \
+	.set	reorder
+
+#define AXR(A, B, C, D,  K, L, M, N,  V, W, Y, Z,  S) \
+	addu	X(A), X(K); \
+	addu	X(B), X(L); \
+	addu	X(C), X(M); \
+	addu	X(D), X(N); \
+	xor	X(V), X(A); \
+	xor	X(W), X(B); \
+	xor	X(Y), X(C); \
+	xor	X(Z), X(D); \
+	rotl	X(V), S;    \
+	rotl	X(W), S;    \
+	rotl	X(Y), S;    \
+	rotl	X(Z), S;
+
+.text
+.set	reorder
+.set	noat
+.globl	chacha20_mips
+.ent	chacha20_mips
+chacha20_mips:
+	.frame	$sp, STACK_SIZE, $ra
+
+	addiu	$sp, -STACK_SIZE
+
+	/* Return bytes = 0. */
+	beqz	BYTES, .Lchacha20_mips_end
+
+	lw	NONCE_0, 48(STATE)
+
+	/* Save s0-s7 */
+	sw	$s0,  0($sp)
+	sw	$s1,  4($sp)
+	sw	$s2,  8($sp)
+	sw	$s3, 12($sp)
+	sw	$s4, 16($sp)
+	sw	$s5, 20($sp)
+	sw	$s6, 24($sp)
+	sw	$s7, 28($sp)
+
+	/* Test IN or OUT is unaligned.
+	 * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
+	 */
+	or	IS_UNALIGNED, IN, OUT
+	andi	IS_UNALIGNED, 0x3
+
+	/* Set number of rounds */
+	li	$at, 20
+
+	b	.Lchacha20_rounds_start
+
+.align 4
+.Loop_chacha20_rounds:
+	addiu	IN,  CHACHA20_BLOCK_SIZE
+	addiu	OUT, CHACHA20_BLOCK_SIZE
+	addiu	NONCE_0, 1
+
+.Lchacha20_rounds_start:
+	lw	X0,  0(STATE)
+	lw	X1,  4(STATE)
+	lw	X2,  8(STATE)
+	lw	X3,  12(STATE)
+
+	lw	X4,  16(STATE)
+	lw	X5,  20(STATE)
+	lw	X6,  24(STATE)
+	lw	X7,  28(STATE)
+	lw	X8,  32(STATE)
+	lw	X9,  36(STATE)
+	lw	X10, 40(STATE)
+	lw	X11, 44(STATE)
+
+	move	X12, NONCE_0
+	lw	X13, 52(STATE)
+	lw	X14, 56(STATE)
+	lw	X15, 60(STATE)
+
+.Loop_chacha20_xor_rounds:
+	addiu	$at, -2
+	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
+	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
+	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
+	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
+	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
+	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
+	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
+	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
+	bnez	$at, .Loop_chacha20_xor_rounds
+
+	addiu	BYTES, -(CHACHA20_BLOCK_SIZE)
+
+	/* Is data src/dst unaligned? Jump */
+	bnez	IS_UNALIGNED, .Loop_chacha20_unaligned
+
+	/* Set number rounds here to fill delayslot. */
+	li	$at, 20
+
+	/* BYTES < 0, it has no full block. */
+	bltz	BYTES, .Lchacha20_mips_no_full_block_aligned
+
+	FOR_EACH_WORD_REV(STORE_ALIGNED)
+
+	/* BYTES > 0? Loop again. */
+	bgtz	BYTES, .Loop_chacha20_rounds
+
+	/* Place this here to fill delay slot */
+	addiu	NONCE_0, 1
+
+	/* BYTES < 0? Handle last bytes */
+	bltz	BYTES, .Lchacha20_mips_xor_bytes
+
+.Lchacha20_mips_xor_done:
+	/* Restore used registers */
+	lw	$s0,  0($sp)
+	lw	$s1,  4($sp)
+	lw	$s2,  8($sp)
+	lw	$s3, 12($sp)
+	lw	$s4, 16($sp)
+	lw	$s5, 20($sp)
+	lw	$s6, 24($sp)
+	lw	$s7, 28($sp)
+
+	/* Write NONCE_0 back to right location in state */
+	sw	NONCE_0, 48(STATE)
+
+.Lchacha20_mips_end:
+	addiu	$sp, STACK_SIZE
+	jr	$ra
+
+.Lchacha20_mips_no_full_block_aligned:
+	/* Restore the offset on BYTES */
+	addiu	BYTES, CHACHA20_BLOCK_SIZE
+
+	/* Get number of full WORDS */
+	andi	$at, BYTES, MASK_U32
+
+	/* Load upper half of jump table addr */
+	lui	T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)
+
+	/* Calculate lower half jump table offset */
+	ins	T0, $at, 1, 6
+
+	/* Add offset to STATE */
+	addu	T1, STATE, $at
+
+	/* Add lower half jump table addr */
+	addiu	T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)
+
+	/* Read value from STATE */
+	lw	SAVED_CA, 0(T1)
+
+	/* Store remaining bytecounter as negative value */
+	subu	BYTES, $at, BYTES
+
+	jr	T0
+
+	/* Jump table */
+	FOR_EACH_WORD(JMPTBL_ALIGNED)
+
+
+.Loop_chacha20_unaligned:
+	/* Set number rounds here to fill delayslot. */
+	li	$at, 20
+
+	/* BYTES > 0, it has no full block. */
+	bltz	BYTES, .Lchacha20_mips_no_full_block_unaligned
+
+	FOR_EACH_WORD_REV(STORE_UNALIGNED)
+
+	/* BYTES > 0? Loop again. */
+	bgtz	BYTES, .Loop_chacha20_rounds
+
+	/* Write NONCE_0 back to right location in state */
+	sw	NONCE_0, 48(STATE)
+
+	.set noreorder
+	/* Fall through to byte handling */
+	bgez	BYTES, .Lchacha20_mips_xor_done
+.Lchacha20_mips_xor_unaligned_0_b:
+.Lchacha20_mips_xor_aligned_0_b:
+	/* Place this here to fill delay slot */
+	addiu	NONCE_0, 1
+	.set reorder
+
+.Lchacha20_mips_xor_bytes:
+	addu	IN, $at
+	addu	OUT, $at
+	/* First byte */
+	lbu	T1, 0(IN)
+	addiu	$at, BYTES, 1
+	CPU_TO_LE32(SAVED_X)
+	ROTR(SAVED_X)
+	xor	T1, SAVED_X
+	sb	T1, 0(OUT)
+	beqz	$at, .Lchacha20_mips_xor_done
+	/* Second byte */
+	lbu	T1, 1(IN)
+	addiu	$at, BYTES, 2
+	ROTx	SAVED_X, 8
+	xor	T1, SAVED_X
+	sb	T1, 1(OUT)
+	beqz	$at, .Lchacha20_mips_xor_done
+	/* Third byte */
+	lbu	T1, 2(IN)
+	ROTx	SAVED_X, 8
+	xor	T1, SAVED_X
+	sb	T1, 2(OUT)
+	b	.Lchacha20_mips_xor_done
+
+.Lchacha20_mips_no_full_block_unaligned:
+	/* Restore the offset on BYTES */
+	addiu	BYTES, CHACHA20_BLOCK_SIZE
+
+	/* Get number of full WORDS */
+	andi	$at, BYTES, MASK_U32
+
+	/* Load upper half of jump table addr */
+	lui	T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)
+
+	/* Calculate lower half jump table offset */
+	ins	T0, $at, 1, 6
+
+	/* Add offset to STATE */
+	addu	T1, STATE, $at
+
+	/* Add lower half jump table addr */
+	addiu	T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)
+
+	/* Read value from STATE */
+	lw	SAVED_CA, 0(T1)
+
+	/* Store remaining bytecounter as negative value */
+	subu	BYTES, $at, BYTES
+
+	jr	T0
+
+	/* Jump table */
+	FOR_EACH_WORD(JMPTBL_UNALIGNED)
+.end chacha20_mips
+.set at
-- 
2.20.1


  parent reply	other threads:[~2019-10-07 16:46 UTC|newest]

Thread overview: 51+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-10-07 16:45 [PATCH v3 00/29] crypto: crypto API library interfaces for WireGuard Ard Biesheuvel
2019-10-07 16:45 ` [PATCH v3 01/29] crypto: chacha - move existing library code into lib/crypto Ard Biesheuvel
2019-10-07 16:45 ` [PATCH v3 02/29] crypto: x86/chacha - depend on generic chacha library instead of crypto driver Ard Biesheuvel
2019-10-11  6:00   ` Eric Biggers
2019-10-15 10:00   ` Martin Willi
2019-10-15 10:12     ` Ard Biesheuvel
2019-10-07 16:45 ` [PATCH v3 03/29] crypto: x86/chacha - expose SIMD ChaCha routine as library function Ard Biesheuvel
2019-10-07 16:45 ` [PATCH v3 04/29] crypto: arm64/chacha - depend on generic chacha library instead of crypto driver Ard Biesheuvel
2019-10-07 16:45 ` [PATCH v3 05/29] crypto: arm64/chacha - expose arm64 ChaCha routine as library function Ard Biesheuvel
2019-10-07 16:45 ` [PATCH v3 06/29] crypto: arm/chacha - import Eric Biggers's scalar accelerated ChaCha code Ard Biesheuvel
2019-10-07 16:45 ` [PATCH v3 07/29] crypto: arm/chacha - remove dependency on generic ChaCha driver Ard Biesheuvel
2019-10-11  6:12   ` Eric Biggers
2019-10-11  6:31   ` Eric Biggers
2019-10-07 16:45 ` [PATCH v3 08/29] crypto: arm/chacha - expose ARM ChaCha routine as library function Ard Biesheuvel
2019-10-07 16:45 ` Ard Biesheuvel [this message]
2019-10-07 16:45 ` [PATCH v3 10/29] crypto: mips/chacha - wire up accelerated 32r2 code from Zinc Ard Biesheuvel
2019-10-07 16:45 ` [PATCH v3 11/29] crypto: chacha - unexport chacha_generic routines Ard Biesheuvel
2019-10-11  6:04   ` Eric Biggers
2019-10-07 16:45 ` [PATCH v3 12/29] crypto: poly1305 - move core routines into a separate library Ard Biesheuvel
2019-10-07 16:45 ` [PATCH v3 13/29] crypto: x86/poly1305 - unify Poly1305 state struct with generic code Ard Biesheuvel
2019-10-07 16:45 ` [PATCH v3 14/29] crypto: poly1305 - expose init/update/final library interface Ard Biesheuvel
2019-10-07 16:45 ` [PATCH v3 15/29] crypto: x86/poly1305 - depend on generic library not generic shash Ard Biesheuvel
2019-10-07 16:45 ` [PATCH v3 16/29] crypto: x86/poly1305 - expose existing driver as poly1305 library Ard Biesheuvel
2019-10-07 16:45 ` [PATCH v3 17/29] crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation Ard Biesheuvel
2019-10-07 16:45 ` [PATCH v3 18/29] crypto: arm/poly1305 " Ard Biesheuvel
2019-10-07 16:46 ` [PATCH v3 19/29] crypto: mips/poly1305 - incorporate OpenSSL/CRYPTOGAMS optimized implementation Ard Biesheuvel
2019-10-07 21:02   ` René van Dorst
2019-10-08  5:55     ` Ard Biesheuvel
2019-10-08 11:38     ` Andy Polyakov
2019-10-08 17:46       ` René van Dorst
2019-10-11 14:14       ` Andy Polyakov
2019-10-11 17:21         ` René van Dorst
2019-10-11 18:49           ` Andy Polyakov
2019-10-11 21:38           ` Arnd Bergmann
2019-10-07 16:46 ` [PATCH v3 20/29] int128: move __uint128_t compiler test to Kconfig Ard Biesheuvel
2019-10-07 16:46 ` [PATCH v3 21/29] crypto: BLAKE2s - generic C library implementation and selftest Ard Biesheuvel
2019-10-11  6:02   ` Eric Biggers
2019-10-11 16:45     ` Jason A. Donenfeld
2019-10-14 12:53       ` Ard Biesheuvel
2019-10-07 16:46 ` [PATCH v3 22/29] crypto: BLAKE2s - x86_64 library implementation Ard Biesheuvel
2019-10-07 16:46 ` [PATCH v3 23/29] crypto: Curve25519 - generic C library implementations and selftest Ard Biesheuvel
2019-10-07 16:46 ` [PATCH v3 24/29] crypto: lib/curve25519 - work around Clang stack spilling issue Ard Biesheuvel
2019-10-14 14:13   ` Jason A. Donenfeld
2019-10-14 16:07     ` Ard Biesheuvel
2019-10-07 16:46 ` [PATCH v3 25/29] crypto: Curve25519 - x86_64 library implementation Ard Biesheuvel
2019-10-07 16:46 ` [PATCH v3 26/29] crypto: arm - import Bernstein and Schwabe's Curve25519 ARM implementation Ard Biesheuvel
2019-10-07 16:46 ` [PATCH v3 27/29] crypto: arm/Curve25519 - wire up NEON implementation Ard Biesheuvel
2019-10-07 16:46 ` [PATCH v3 28/29] crypto: chacha20poly1305 - import construction and selftest from Zinc Ard Biesheuvel
2019-10-11  6:14   ` Eric Biggers
2019-10-07 16:46 ` [PATCH v3 29/29] crypto: lib/chacha20poly1305 - reimplement crypt_from_sg() routine Ard Biesheuvel
2019-10-14 14:33 ` [PATCH v3 00/29] crypto: crypto API library interfaces for WireGuard Jason A. Donenfeld

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20191007164610.6881-10-ard.biesheuvel@linaro.org \
    --to=ard.biesheuvel@linaro.org \
    --cc=Jason@zx2c4.com \
    --cc=arnd@arndb.de \
    --cc=davem@davemloft.net \
    --cc=ebiggers@google.com \
    --cc=herbert@gondor.apana.org.au \
    --cc=linux-crypto@vger.kernel.org \
    --cc=luto@kernel.org \
    --cc=martin@strongswan.org \
    --cc=opensource@vdorst.com \
    --cc=sneves@dei.uc.pt \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.