All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] arm: crypto: Add NEON optimized SHA-256
@ 2015-03-16 15:48 ` Sami Tolvanen
  0 siblings, 0 replies; 66+ messages in thread
From: Sami Tolvanen @ 2015-03-16 15:48 UTC (permalink / raw)
  To: linux-arm-kernel, linux-crypto, ard.biesheuvel, herbert, davem

Add Andy Polyakov's NEON optimized SHA-256 implementation.

On Nexus 6, this implementation is ~2x faster than sha256-generic.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>

---
 arch/arm/crypto/Makefile            |    2 
 arch/arm/crypto/sha256-armv7-neon.S |  819 ++++++++++++++++++++++++++++++++++++
 arch/arm/crypto/sha256_neon_glue.c  |  201 ++++++++
 crypto/Kconfig                      |   12 
 4 files changed, 1034 insertions(+)

diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index b48fa34..316dba2 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -6,12 +6,14 @@ obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
 obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
+obj-$(CONFIG_CRYPTO_SHA256_ARM_NEON) += sha256-arm-neon.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
 
 aes-arm-y	:= aes-armv4.o aes_glue.o
 aes-arm-bs-y	:= aesbs-core.o aesbs-glue.o
 sha1-arm-y	:= sha1-armv4-large.o sha1_glue.o
 sha1-arm-neon-y	:= sha1-armv7-neon.o sha1_neon_glue.o
+sha256-arm-neon-y := sha256-armv7-neon.o sha256_neon_glue.o
 sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o
 
 quiet_cmd_perl = PERL    $@
diff --git a/arch/arm/crypto/sha256-armv7-neon.S b/arch/arm/crypto/sha256-armv7-neon.S
new file mode 100644
index 0000000..5ce04c2
--- /dev/null
+++ b/arch/arm/crypto/sha256-armv7-neon.S
@@ -0,0 +1,819 @@
+@ sha256-armv7-neon.S  -  ARM/NEON assembly implementation of SHA-256 transform
+@
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@ ====================================================================
+
+#include <linux/linkage.h>
+
+.text
+.code   32
+.fpu neon
+
+.type	K256,%object
+.align	5
+K256:
+.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size	K256,.-K256
+.word	0				@ terminator
+.word	0
+.align	5
+
+.align 5
+ENTRY(sha256_transform_neon)
+	/* Input:
+	 *	%r0: SHA256_CONTEXT
+	 *	%r1: data
+	 *	%r2: nblks
+	 */
+	sub	r3,pc,#8		@ sha256_transform_neon
+	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
+
+	stmdb	sp!,{r4-r12,lr}
+
+	mov	r12,sp
+	sub	sp,sp,#16*4+16		@ alloca
+	sub	r14,r3,#256+32	@ K256
+	bic	sp,sp,#15		@ align for 128-bit stores
+
+	vld1.8		{q0},[r1]!
+	vld1.8		{q1},[r1]!
+	vld1.8		{q2},[r1]!
+	vld1.8		{q3},[r1]!
+	vld1.32		{q8},[r14,:128]!
+	vld1.32		{q9},[r14,:128]!
+	vld1.32		{q10},[r14,:128]!
+	vld1.32		{q11},[r14,:128]!
+	vrev32.8	q0,q0		@ yes, even on
+	str		r0,[sp,#64]
+	vrev32.8	q1,q1		@ big-endian
+	str		r1,[sp,#68]
+	mov		r1,sp
+	vrev32.8	q2,q2
+	str		r2,[sp,#72]
+	vrev32.8	q3,q3
+	str		r12,[sp,#76]		@ save original sp
+	vadd.i32	q8,q8,q0
+	vadd.i32	q9,q9,q1
+	vst1.32		{q8},[r1,:128]!
+	vadd.i32	q10,q10,q2
+	vst1.32		{q9},[r1,:128]!
+	vadd.i32	q11,q11,q3
+	vst1.32		{q10},[r1,:128]!
+	vst1.32		{q11},[r1,:128]!
+
+	ldmia		r0,{r4-r11}
+	sub		r1,r1,#64
+	ldr		r2,[sp,#0]
+	eor		r12,r12,r12
+	eor		r3,r5,r6
+	b		.L_00_48
+
+.align	4
+.L_00_48:
+	vext.8	q8,q0,q1,#4
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	vext.8	q9,q2,q3,#4
+	add	r4,r4,r12
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vadd.i32	q0,q0,q9
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#4]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	veor	q9,q9,q10
+	add	r10,r10,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	vshr.u32	d24,d7,#17
+	add	r11,r11,r3
+	and	r2,r2,r7
+	veor	q9,q9,q11
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	vsli.32	d24,d7,#15
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	vshr.u32	d25,d7,#10
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	vadd.i32	q0,q0,q9
+	add	r10,r10,r2
+	ldr	r2,[sp,#8]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r6,r6,r10
+	vshr.u32	d24,d7,#19
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	vsli.32	d24,d7,#13
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	veor	d25,d25,d24
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	vadd.i32	d0,d0,d25
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	vshr.u32	d24,d0,#17
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	vsli.32	d24,d0,#15
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	vshr.u32	d25,d0,#10
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#12]
+	and	r3,r3,r12
+	vshr.u32	d24,d0,#19
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	vld1.32	{q8},[r14,:128]!
+	add	r8,r8,r2
+	vsli.32	d24,d0,#13
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	veor	d25,d25,d24
+	add	r9,r9,r3
+	and	r2,r2,r5
+	vadd.i32	d1,d1,d25
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	vadd.i32	q8,q8,q0
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#16]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	vst1.32	{q8},[r1,:128]!
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vext.8	q8,q1,q2,#4
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	vext.8	q9,q3,q0,#4
+	add	r8,r8,r12
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vadd.i32	q1,q1,q9
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#20]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	veor	q9,q9,q10
+	add	r6,r6,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	vshr.u32	d24,d1,#17
+	add	r7,r7,r3
+	and	r2,r2,r11
+	veor	q9,q9,q11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	vsli.32	d24,d1,#15
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	vshr.u32	d25,d1,#10
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	vadd.i32	q1,q1,q9
+	add	r6,r6,r2
+	ldr	r2,[sp,#24]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r10,r10,r6
+	vshr.u32	d24,d1,#19
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	vsli.32	d24,d1,#13
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	veor	d25,d25,d24
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	vadd.i32	d2,d2,d25
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	vshr.u32	d24,d2,#17
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	vsli.32	d24,d2,#15
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	vshr.u32	d25,d2,#10
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#28]
+	and	r3,r3,r12
+	vshr.u32	d24,d2,#19
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	vld1.32	{q8},[r14,:128]!
+	add	r4,r4,r2
+	vsli.32	d24,d2,#13
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	veor	d25,d25,d24
+	add	r5,r5,r3
+	and	r2,r2,r9
+	vadd.i32	d3,d3,d25
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	vadd.i32	q8,q8,q1
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[sp,#32]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	vst1.32	{q8},[r1,:128]!
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	vext.8	q8,q2,q3,#4
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	vext.8	q9,q0,q1,#4
+	add	r4,r4,r12
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vadd.i32	q2,q2,q9
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#36]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	veor	q9,q9,q10
+	add	r10,r10,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	vshr.u32	d24,d3,#17
+	add	r11,r11,r3
+	and	r2,r2,r7
+	veor	q9,q9,q11
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	vsli.32	d24,d3,#15
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	vshr.u32	d25,d3,#10
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	vadd.i32	q2,q2,q9
+	add	r10,r10,r2
+	ldr	r2,[sp,#40]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r6,r6,r10
+	vshr.u32	d24,d3,#19
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	vsli.32	d24,d3,#13
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	veor	d25,d25,d24
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	vadd.i32	d4,d4,d25
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	vshr.u32	d24,d4,#17
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	vsli.32	d24,d4,#15
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	vshr.u32	d25,d4,#10
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#44]
+	and	r3,r3,r12
+	vshr.u32	d24,d4,#19
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	vld1.32	{q8},[r14,:128]!
+	add	r8,r8,r2
+	vsli.32	d24,d4,#13
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	veor	d25,d25,d24
+	add	r9,r9,r3
+	and	r2,r2,r5
+	vadd.i32	d5,d5,d25
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	vadd.i32	q8,q8,q2
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#48]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	vst1.32	{q8},[r1,:128]!
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vext.8	q8,q3,q0,#4
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	vext.8	q9,q1,q2,#4
+	add	r8,r8,r12
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vadd.i32	q3,q3,q9
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#52]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	veor	q9,q9,q10
+	add	r6,r6,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	vshr.u32	d24,d5,#17
+	add	r7,r7,r3
+	and	r2,r2,r11
+	veor	q9,q9,q11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	vsli.32	d24,d5,#15
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	vshr.u32	d25,d5,#10
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	vadd.i32	q3,q3,q9
+	add	r6,r6,r2
+	ldr	r2,[sp,#56]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r10,r10,r6
+	vshr.u32	d24,d5,#19
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	vsli.32	d24,d5,#13
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	veor	d25,d25,d24
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	vadd.i32	d6,d6,d25
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	vshr.u32	d24,d6,#17
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	vsli.32	d24,d6,#15
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	vshr.u32	d25,d6,#10
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#60]
+	and	r3,r3,r12
+	vshr.u32	d24,d6,#19
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	vld1.32	{q8},[r14,:128]!
+	add	r4,r4,r2
+	vsli.32	d24,d6,#13
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	veor	d25,d25,d24
+	add	r5,r5,r3
+	and	r2,r2,r9
+	vadd.i32	d7,d7,d25
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	vadd.i32	q8,q8,q3
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[r14]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	vst1.32	{q8},[r1,:128]!
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	teq	r2,#0				@ check for K256 terminator
+	ldr	r2,[sp,#0]
+	sub	r1,r1,#64
+	bne	.L_00_48
+
+	ldr		r1,[sp,#68]
+	ldr		r0,[sp,#72]
+	sub		r14,r14,#256	@ rewind r14
+	teq		r1,r0
+	subeq		r1,r1,#64		@ avoid SEGV
+	vld1.8		{q0},[r1]!		@ load next input block
+	vld1.8		{q1},[r1]!
+	vld1.8		{q2},[r1]!
+	vld1.8		{q3},[r1]!
+	strne		r1,[sp,#68]
+	mov		r1,sp
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vrev32.8	q0,q0
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vadd.i32	q8,q8,q0
+	ldr	r2,[sp,#4]
+	and	r3,r3,r12
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	add	r10,r10,r2
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3
+	and	r2,r2,r7
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	add	r10,r10,r2
+	ldr	r2,[sp,#8]
+	and	r12,r12,r3
+	add	r6,r6,r10
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	ldr	r2,[sp,#12]
+	and	r3,r3,r12
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	add	r8,r8,r2
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3
+	and	r2,r2,r5
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#16]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vst1.32	{q8},[r1,:128]!
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vrev32.8	q1,q1
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vadd.i32	q8,q8,q1
+	ldr	r2,[sp,#20]
+	and	r3,r3,r12
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	add	r6,r6,r2
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3
+	and	r2,r2,r11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	add	r6,r6,r2
+	ldr	r2,[sp,#24]
+	and	r12,r12,r3
+	add	r10,r10,r6
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	ldr	r2,[sp,#28]
+	and	r3,r3,r12
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	add	r4,r4,r2
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3
+	and	r2,r2,r9
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[sp,#32]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	vst1.32	{q8},[r1,:128]!
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vrev32.8	q2,q2
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vadd.i32	q8,q8,q2
+	ldr	r2,[sp,#36]
+	and	r3,r3,r12
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	add	r10,r10,r2
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3
+	and	r2,r2,r7
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	add	r10,r10,r2
+	ldr	r2,[sp,#40]
+	and	r12,r12,r3
+	add	r6,r6,r10
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	ldr	r2,[sp,#44]
+	and	r3,r3,r12
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	add	r8,r8,r2
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3
+	and	r2,r2,r5
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#48]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vst1.32	{q8},[r1,:128]!
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vrev32.8	q3,q3
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vadd.i32	q8,q8,q3
+	ldr	r2,[sp,#52]
+	and	r3,r3,r12
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	add	r6,r6,r2
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3
+	and	r2,r2,r11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	add	r6,r6,r2
+	ldr	r2,[sp,#56]
+	and	r12,r12,r3
+	add	r10,r10,r6
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	ldr	r2,[sp,#60]
+	and	r3,r3,r12
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	add	r4,r4,r2
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3
+	and	r2,r2,r9
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[sp,#64]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	vst1.32	{q8},[r1,:128]!
+	ldr	r0,[r2,#0]
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	ldr	r12,[r2,#4]
+	ldr	r3,[r2,#8]
+	ldr	r1,[r2,#12]
+	add	r4,r4,r0			@ accumulate
+	ldr	r0,[r2,#16]
+	add	r5,r5,r12
+	ldr	r12,[r2,#20]
+	add	r6,r6,r3
+	ldr	r3,[r2,#24]
+	add	r7,r7,r1
+	ldr	r1,[r2,#28]
+	add	r8,r8,r0
+	str	r4,[r2],#4
+	add	r9,r9,r12
+	str	r5,[r2],#4
+	add	r10,r10,r3
+	str	r6,[r2],#4
+	add	r11,r11,r1
+	str	r7,[r2],#4
+	stmia	r2,{r8-r11}
+
+	movne	r1,sp
+	ldrne	r2,[sp,#0]
+	eorne	r12,r12,r12
+	ldreq	sp,[sp,#76]			@ restore original sp
+	eorne	r3,r5,r6
+	bne	.L_00_48
+
+	ldmia	sp!,{r4-r12,pc}
+ENDPROC(sha256_transform_neon)
diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c
new file mode 100644
index 0000000..698a498
--- /dev/null
+++ b/arch/arm/crypto/sha256_neon_glue.c
@@ -0,0 +1,201 @@
+/*
+ * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
+ * using NEON instructions.
+ *
+ * Copyright © 2015 Google Inc.
+ *
+ * This file is based on sha512_neon_glue.c:
+ *   Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/simd.h>
+#include <asm/neon.h>
+
+asmlinkage void sha256_transform_neon(u32 *digest, const void *data,
+				      unsigned int num_blks);
+
+
+static int sha256_neon_init(struct shash_desc *desc)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	sctx->state[0] = SHA256_H0;
+	sctx->state[1] = SHA256_H1;
+	sctx->state[2] = SHA256_H2;
+	sctx->state[3] = SHA256_H3;
+	sctx->state[4] = SHA256_H4;
+	sctx->state[5] = SHA256_H5;
+	sctx->state[6] = SHA256_H6;
+	sctx->state[7] = SHA256_H7;
+	sctx->count = 0;
+
+	return 0;
+}
+
+static int __sha256_neon_update(struct shash_desc *desc, const u8 *data,
+				unsigned int len, unsigned int partial)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int done = 0;
+
+	sctx->count += len;
+
+	if (partial) {
+		done = SHA256_BLOCK_SIZE - partial;
+		memcpy(sctx->buf + partial, data, done);
+		sha256_transform_neon(sctx->state, sctx->buf, 1);
+	}
+
+	if (len - done >= SHA256_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+		sha256_transform_neon(sctx->state, data + done, rounds);
+		done += rounds * SHA256_BLOCK_SIZE;
+	}
+
+	memcpy(sctx->buf, data + done, len - done);
+
+	return 0;
+}
+
+static int sha256_neon_update(struct shash_desc *desc, const u8 *data,
+			     unsigned int len)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+	int res;
+
+	/* Handle the fast case right here */
+	if (partial + len < SHA256_BLOCK_SIZE) {
+		sctx->count += len;
+		memcpy(sctx->buf + partial, data, len);
+
+		return 0;
+	}
+
+	if (!may_use_simd()) {
+		res = crypto_sha256_update(desc, data, len);
+	} else {
+		kernel_neon_begin();
+		res = __sha256_neon_update(desc, data, len, partial);
+		kernel_neon_end();
+	}
+
+	return res;
+}
+
+/* Add padding and return the message digest. */
+static int sha256_neon_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int i, index, padlen;
+	__be32 *dst = (__be32 *)out;
+	__be64 bits;
+	static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+	/* save number of bits */
+	bits = cpu_to_be64(sctx->count << 3);
+
+	/* Pad out to 56 mod 64 and append length */
+	index = sctx->count % SHA256_BLOCK_SIZE;
+	padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
+
+	if (!may_use_simd()) {
+		crypto_sha256_update(desc, padding, padlen);
+		crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits));
+	} else {
+		kernel_neon_begin();
+		/* We need to fill a whole block for __sha256_neon_update() */
+		if (padlen <= 56) {
+			sctx->count += padlen;
+			memcpy(sctx->buf + index, padding, padlen);
+		} else {
+			__sha256_neon_update(desc, padding, padlen, index);
+		}
+		__sha256_neon_update(desc, (const u8 *)&bits,
+					sizeof(bits), 56);
+		kernel_neon_end();
+	}
+
+	/* Store state in digest */
+	for (i = 0; i < 8; i++)
+		dst[i] = cpu_to_be32(sctx->state[i]);
+
+	/* Wipe context */
+	memset(sctx, 0, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha256_neon_export(struct shash_desc *desc, void *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(out, sctx, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha256_neon_import(struct shash_desc *desc, const void *in)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(sctx, in, sizeof(*sctx));
+
+	return 0;
+}
+
+static struct shash_alg alg = {
+	.digestsize	=	SHA256_DIGEST_SIZE,
+	.init		=	sha256_neon_init,
+	.update		=	sha256_neon_update,
+	.final		=	sha256_neon_final,
+	.export		=	sha256_neon_export,
+	.import		=	sha256_neon_import,
+	.descsize	=	sizeof(struct sha256_state),
+	.statesize	=	sizeof(struct sha256_state),
+	.base		=	{
+		.cra_name	=	"sha256",
+		.cra_driver_name =	"sha256-neon",
+		.cra_priority	=	350,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA256_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+};
+
+static int __init sha256_neon_mod_init(void)
+{
+	if (!cpu_has_neon())
+		return -ENODEV;
+
+	return crypto_register_shash(&alg);
+}
+
+static void __exit sha256_neon_mod_fini(void)
+{
+	crypto_unregister_shash(&alg);
+}
+
+module_init(sha256_neon_mod_init);
+module_exit(sha256_neon_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, NEON accelerated");
+
+MODULE_ALIAS("sha256");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 50f4da4..0505523 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -610,6 +610,18 @@ config CRYPTO_SHA256
 	  This code also includes SHA-224, a 224 bit hash with 112 bits
 	  of security against collision attacks.
 
+config CRYPTO_SHA256_ARM_NEON
+	tristate "SHA256 digest algorithm (ARM NEON)"
+	depends on ARM && KERNEL_MODE_NEON && !CPU_BIG_ENDIAN
+	select CRYPTO_SHA256
+	select CRYPTO_HASH
+	help
+	  SHA-256 secure hash standard (DFIPS 180-2) implemented
+	  using ARM NEON instructions, when available.
+
+	  This version of SHA implements a 256 bit hash with 128 bits of
+	  security against collision attacks.
+
 config CRYPTO_SHA256_SPARC64
 	tristate "SHA224 and SHA256 digest algorithm (SPARC64)"
 	depends on SPARC64

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH] arm: crypto: Add NEON optimized SHA-256
@ 2015-03-16 15:48 ` Sami Tolvanen
  0 siblings, 0 replies; 66+ messages in thread
From: Sami Tolvanen @ 2015-03-16 15:48 UTC (permalink / raw)
  To: linux-arm-kernel

Add Andy Polyakov's NEON optimized SHA-256 implementation.

On Nexus 6, this implementation is ~2x faster than sha256-generic.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>

---
 arch/arm/crypto/Makefile            |    2 
 arch/arm/crypto/sha256-armv7-neon.S |  819 ++++++++++++++++++++++++++++++++++++
 arch/arm/crypto/sha256_neon_glue.c  |  201 ++++++++
 crypto/Kconfig                      |   12 
 4 files changed, 1034 insertions(+)

diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index b48fa34..316dba2 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -6,12 +6,14 @@ obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
 obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
+obj-$(CONFIG_CRYPTO_SHA256_ARM_NEON) += sha256-arm-neon.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
 
 aes-arm-y	:= aes-armv4.o aes_glue.o
 aes-arm-bs-y	:= aesbs-core.o aesbs-glue.o
 sha1-arm-y	:= sha1-armv4-large.o sha1_glue.o
 sha1-arm-neon-y	:= sha1-armv7-neon.o sha1_neon_glue.o
+sha256-arm-neon-y := sha256-armv7-neon.o sha256_neon_glue.o
 sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o
 
 quiet_cmd_perl = PERL    $@
diff --git a/arch/arm/crypto/sha256-armv7-neon.S b/arch/arm/crypto/sha256-armv7-neon.S
new file mode 100644
index 0000000..5ce04c2
--- /dev/null
+++ b/arch/arm/crypto/sha256-armv7-neon.S
@@ -0,0 +1,819 @@
+@ sha256-armv7-neon.S  -  ARM/NEON assembly implementation of SHA-256 transform
+@
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@ ====================================================================
+
+#include <linux/linkage.h>
+
+.text
+.code   32
+.fpu neon
+
+.type	K256,%object
+.align	5
+K256:
+.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size	K256,.-K256
+.word	0				@ terminator
+.word	0
+.align	5
+
+.align 5
+ENTRY(sha256_transform_neon)
+	/* Input:
+	 *	%r0: SHA256_CONTEXT
+	 *	%r1: data
+	 *	%r2: nblks
+	 */
+	sub	r3,pc,#8		@ sha256_transform_neon
+	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
+
+	stmdb	sp!,{r4-r12,lr}
+
+	mov	r12,sp
+	sub	sp,sp,#16*4+16		@ alloca
+	sub	r14,r3,#256+32	@ K256
+	bic	sp,sp,#15		@ align for 128-bit stores
+
+	vld1.8		{q0},[r1]!
+	vld1.8		{q1},[r1]!
+	vld1.8		{q2},[r1]!
+	vld1.8		{q3},[r1]!
+	vld1.32		{q8},[r14,:128]!
+	vld1.32		{q9},[r14,:128]!
+	vld1.32		{q10},[r14,:128]!
+	vld1.32		{q11},[r14,:128]!
+	vrev32.8	q0,q0		@ yes, even on
+	str		r0,[sp,#64]
+	vrev32.8	q1,q1		@ big-endian
+	str		r1,[sp,#68]
+	mov		r1,sp
+	vrev32.8	q2,q2
+	str		r2,[sp,#72]
+	vrev32.8	q3,q3
+	str		r12,[sp,#76]		@ save original sp
+	vadd.i32	q8,q8,q0
+	vadd.i32	q9,q9,q1
+	vst1.32		{q8},[r1,:128]!
+	vadd.i32	q10,q10,q2
+	vst1.32		{q9},[r1,:128]!
+	vadd.i32	q11,q11,q3
+	vst1.32		{q10},[r1,:128]!
+	vst1.32		{q11},[r1,:128]!
+
+	ldmia		r0,{r4-r11}
+	sub		r1,r1,#64
+	ldr		r2,[sp,#0]
+	eor		r12,r12,r12
+	eor		r3,r5,r6
+	b		.L_00_48
+
+.align	4
+.L_00_48:
+	vext.8	q8,q0,q1,#4
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	vext.8	q9,q2,q3,#4
+	add	r4,r4,r12
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vadd.i32	q0,q0,q9
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#4]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	veor	q9,q9,q10
+	add	r10,r10,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	vshr.u32	d24,d7,#17
+	add	r11,r11,r3
+	and	r2,r2,r7
+	veor	q9,q9,q11
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	vsli.32	d24,d7,#15
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	vshr.u32	d25,d7,#10
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	vadd.i32	q0,q0,q9
+	add	r10,r10,r2
+	ldr	r2,[sp,#8]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r6,r6,r10
+	vshr.u32	d24,d7,#19
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	vsli.32	d24,d7,#13
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	veor	d25,d25,d24
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	vadd.i32	d0,d0,d25
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	vshr.u32	d24,d0,#17
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	vsli.32	d24,d0,#15
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	vshr.u32	d25,d0,#10
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#12]
+	and	r3,r3,r12
+	vshr.u32	d24,d0,#19
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	vld1.32	{q8},[r14,:128]!
+	add	r8,r8,r2
+	vsli.32	d24,d0,#13
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	veor	d25,d25,d24
+	add	r9,r9,r3
+	and	r2,r2,r5
+	vadd.i32	d1,d1,d25
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	vadd.i32	q8,q8,q0
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#16]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	vst1.32	{q8},[r1,:128]!
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vext.8	q8,q1,q2,#4
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	vext.8	q9,q3,q0,#4
+	add	r8,r8,r12
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vadd.i32	q1,q1,q9
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#20]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	veor	q9,q9,q10
+	add	r6,r6,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	vshr.u32	d24,d1,#17
+	add	r7,r7,r3
+	and	r2,r2,r11
+	veor	q9,q9,q11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	vsli.32	d24,d1,#15
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	vshr.u32	d25,d1,#10
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	vadd.i32	q1,q1,q9
+	add	r6,r6,r2
+	ldr	r2,[sp,#24]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r10,r10,r6
+	vshr.u32	d24,d1,#19
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	vsli.32	d24,d1,#13
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	veor	d25,d25,d24
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	vadd.i32	d2,d2,d25
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	vshr.u32	d24,d2,#17
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	vsli.32	d24,d2,#15
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	vshr.u32	d25,d2,#10
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#28]
+	and	r3,r3,r12
+	vshr.u32	d24,d2,#19
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	vld1.32	{q8},[r14,:128]!
+	add	r4,r4,r2
+	vsli.32	d24,d2,#13
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	veor	d25,d25,d24
+	add	r5,r5,r3
+	and	r2,r2,r9
+	vadd.i32	d3,d3,d25
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	vadd.i32	q8,q8,q1
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[sp,#32]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	vst1.32	{q8},[r1,:128]!
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	vext.8	q8,q2,q3,#4
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	vext.8	q9,q0,q1,#4
+	add	r4,r4,r12
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vadd.i32	q2,q2,q9
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#36]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	veor	q9,q9,q10
+	add	r10,r10,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	vshr.u32	d24,d3,#17
+	add	r11,r11,r3
+	and	r2,r2,r7
+	veor	q9,q9,q11
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	vsli.32	d24,d3,#15
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	vshr.u32	d25,d3,#10
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	vadd.i32	q2,q2,q9
+	add	r10,r10,r2
+	ldr	r2,[sp,#40]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r6,r6,r10
+	vshr.u32	d24,d3,#19
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	vsli.32	d24,d3,#13
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	veor	d25,d25,d24
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	vadd.i32	d4,d4,d25
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	vshr.u32	d24,d4,#17
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	vsli.32	d24,d4,#15
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	vshr.u32	d25,d4,#10
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#44]
+	and	r3,r3,r12
+	vshr.u32	d24,d4,#19
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	vld1.32	{q8},[r14,:128]!
+	add	r8,r8,r2
+	vsli.32	d24,d4,#13
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	veor	d25,d25,d24
+	add	r9,r9,r3
+	and	r2,r2,r5
+	vadd.i32	d5,d5,d25
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	vadd.i32	q8,q8,q2
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#48]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	vst1.32	{q8},[r1,:128]!
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vext.8	q8,q3,q0,#4
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	vext.8	q9,q1,q2,#4
+	add	r8,r8,r12
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vadd.i32	q3,q3,q9
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#52]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	veor	q9,q9,q10
+	add	r6,r6,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	vshr.u32	d24,d5,#17
+	add	r7,r7,r3
+	and	r2,r2,r11
+	veor	q9,q9,q11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	vsli.32	d24,d5,#15
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	vshr.u32	d25,d5,#10
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	vadd.i32	q3,q3,q9
+	add	r6,r6,r2
+	ldr	r2,[sp,#56]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r10,r10,r6
+	vshr.u32	d24,d5,#19
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	vsli.32	d24,d5,#13
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	veor	d25,d25,d24
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	vadd.i32	d6,d6,d25
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	vshr.u32	d24,d6,#17
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	vsli.32	d24,d6,#15
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	vshr.u32	d25,d6,#10
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#60]
+	and	r3,r3,r12
+	vshr.u32	d24,d6,#19
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	vld1.32	{q8},[r14,:128]!
+	add	r4,r4,r2
+	vsli.32	d24,d6,#13
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	veor	d25,d25,d24
+	add	r5,r5,r3
+	and	r2,r2,r9
+	vadd.i32	d7,d7,d25
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	vadd.i32	q8,q8,q3
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[r14]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	vst1.32	{q8},[r1,:128]!
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	teq	r2,#0				@ check for K256 terminator
+	ldr	r2,[sp,#0]
+	sub	r1,r1,#64
+	bne	.L_00_48
+
+	ldr		r1,[sp,#68]
+	ldr		r0,[sp,#72]
+	sub		r14,r14,#256	@ rewind r14
+	teq		r1,r0
+	subeq		r1,r1,#64		@ avoid SEGV
+	vld1.8		{q0},[r1]!		@ load next input block
+	vld1.8		{q1},[r1]!
+	vld1.8		{q2},[r1]!
+	vld1.8		{q3},[r1]!
+	strne		r1,[sp,#68]
+	mov		r1,sp
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vrev32.8	q0,q0
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vadd.i32	q8,q8,q0
+	ldr	r2,[sp,#4]
+	and	r3,r3,r12
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	add	r10,r10,r2
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3
+	and	r2,r2,r7
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	add	r10,r10,r2
+	ldr	r2,[sp,#8]
+	and	r12,r12,r3
+	add	r6,r6,r10
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	ldr	r2,[sp,#12]
+	and	r3,r3,r12
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	add	r8,r8,r2
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3
+	and	r2,r2,r5
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#16]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vst1.32	{q8},[r1,:128]!
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vrev32.8	q1,q1
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vadd.i32	q8,q8,q1
+	ldr	r2,[sp,#20]
+	and	r3,r3,r12
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	add	r6,r6,r2
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3
+	and	r2,r2,r11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	add	r6,r6,r2
+	ldr	r2,[sp,#24]
+	and	r12,r12,r3
+	add	r10,r10,r6
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	ldr	r2,[sp,#28]
+	and	r3,r3,r12
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	add	r4,r4,r2
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3
+	and	r2,r2,r9
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[sp,#32]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	vst1.32	{q8},[r1,:128]!
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vrev32.8	q2,q2
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vadd.i32	q8,q8,q2
+	ldr	r2,[sp,#36]
+	and	r3,r3,r12
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	add	r10,r10,r2
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3
+	and	r2,r2,r7
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	add	r10,r10,r2
+	ldr	r2,[sp,#40]
+	and	r12,r12,r3
+	add	r6,r6,r10
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	ldr	r2,[sp,#44]
+	and	r3,r3,r12
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	add	r8,r8,r2
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3
+	and	r2,r2,r5
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#48]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vst1.32	{q8},[r1,:128]!
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vrev32.8	q3,q3
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vadd.i32	q8,q8,q3
+	ldr	r2,[sp,#52]
+	and	r3,r3,r12
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	add	r6,r6,r2
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3
+	and	r2,r2,r11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	add	r6,r6,r2
+	ldr	r2,[sp,#56]
+	and	r12,r12,r3
+	add	r10,r10,r6
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	ldr	r2,[sp,#60]
+	and	r3,r3,r12
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	add	r4,r4,r2
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3
+	and	r2,r2,r9
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[sp,#64]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	vst1.32	{q8},[r1,:128]!
+	ldr	r0,[r2,#0]
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	ldr	r12,[r2,#4]
+	ldr	r3,[r2,#8]
+	ldr	r1,[r2,#12]
+	add	r4,r4,r0			@ accumulate
+	ldr	r0,[r2,#16]
+	add	r5,r5,r12
+	ldr	r12,[r2,#20]
+	add	r6,r6,r3
+	ldr	r3,[r2,#24]
+	add	r7,r7,r1
+	ldr	r1,[r2,#28]
+	add	r8,r8,r0
+	str	r4,[r2],#4
+	add	r9,r9,r12
+	str	r5,[r2],#4
+	add	r10,r10,r3
+	str	r6,[r2],#4
+	add	r11,r11,r1
+	str	r7,[r2],#4
+	stmia	r2,{r8-r11}
+
+	movne	r1,sp
+	ldrne	r2,[sp,#0]
+	eorne	r12,r12,r12
+	ldreq	sp,[sp,#76]			@ restore original sp
+	eorne	r3,r5,r6
+	bne	.L_00_48
+
+	ldmia	sp!,{r4-r12,pc}
+ENDPROC(sha256_transform_neon)
diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c
new file mode 100644
index 0000000..698a498
--- /dev/null
+++ b/arch/arm/crypto/sha256_neon_glue.c
@@ -0,0 +1,201 @@
+/*
+ * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
+ * using NEON instructions.
+ *
+ * Copyright ? 2015 Google Inc.
+ *
+ * This file is based on sha512_neon_glue.c:
+ *   Copyright ? 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/simd.h>
+#include <asm/neon.h>
+
+asmlinkage void sha256_transform_neon(u32 *digest, const void *data,
+				      unsigned int num_blks);
+
+
+static int sha256_neon_init(struct shash_desc *desc)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	sctx->state[0] = SHA256_H0;
+	sctx->state[1] = SHA256_H1;
+	sctx->state[2] = SHA256_H2;
+	sctx->state[3] = SHA256_H3;
+	sctx->state[4] = SHA256_H4;
+	sctx->state[5] = SHA256_H5;
+	sctx->state[6] = SHA256_H6;
+	sctx->state[7] = SHA256_H7;
+	sctx->count = 0;
+
+	return 0;
+}
+
+static int __sha256_neon_update(struct shash_desc *desc, const u8 *data,
+				unsigned int len, unsigned int partial)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int done = 0;
+
+	sctx->count += len;
+
+	if (partial) {
+		done = SHA256_BLOCK_SIZE - partial;
+		memcpy(sctx->buf + partial, data, done);
+		sha256_transform_neon(sctx->state, sctx->buf, 1);
+	}
+
+	if (len - done >= SHA256_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+		sha256_transform_neon(sctx->state, data + done, rounds);
+		done += rounds * SHA256_BLOCK_SIZE;
+	}
+
+	memcpy(sctx->buf, data + done, len - done);
+
+	return 0;
+}
+
+static int sha256_neon_update(struct shash_desc *desc, const u8 *data,
+			     unsigned int len)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+	int res;
+
+	/* Handle the fast case right here */
+	if (partial + len < SHA256_BLOCK_SIZE) {
+		sctx->count += len;
+		memcpy(sctx->buf + partial, data, len);
+
+		return 0;
+	}
+
+	if (!may_use_simd()) {
+		res = crypto_sha256_update(desc, data, len);
+	} else {
+		kernel_neon_begin();
+		res = __sha256_neon_update(desc, data, len, partial);
+		kernel_neon_end();
+	}
+
+	return res;
+}
+
+/* Add padding and return the message digest. */
+static int sha256_neon_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int i, index, padlen;
+	__be32 *dst = (__be32 *)out;
+	__be64 bits;
+	static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+	/* save number of bits */
+	bits = cpu_to_be64(sctx->count << 3);
+
+	/* Pad out to 56 mod 64 and append length */
+	index = sctx->count % SHA256_BLOCK_SIZE;
+	padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
+
+	if (!may_use_simd()) {
+		crypto_sha256_update(desc, padding, padlen);
+		crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits));
+	} else {
+		kernel_neon_begin();
+		/* We need to fill a whole block for __sha256_neon_update() */
+		if (padlen <= 56) {
+			sctx->count += padlen;
+			memcpy(sctx->buf + index, padding, padlen);
+		} else {
+			__sha256_neon_update(desc, padding, padlen, index);
+		}
+		__sha256_neon_update(desc, (const u8 *)&bits,
+					sizeof(bits), 56);
+		kernel_neon_end();
+	}
+
+	/* Store state in digest */
+	for (i = 0; i < 8; i++)
+		dst[i] = cpu_to_be32(sctx->state[i]);
+
+	/* Wipe context */
+	memset(sctx, 0, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha256_neon_export(struct shash_desc *desc, void *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(out, sctx, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha256_neon_import(struct shash_desc *desc, const void *in)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(sctx, in, sizeof(*sctx));
+
+	return 0;
+}
+
+static struct shash_alg alg = {
+	.digestsize	=	SHA256_DIGEST_SIZE,
+	.init		=	sha256_neon_init,
+	.update		=	sha256_neon_update,
+	.final		=	sha256_neon_final,
+	.export		=	sha256_neon_export,
+	.import		=	sha256_neon_import,
+	.descsize	=	sizeof(struct sha256_state),
+	.statesize	=	sizeof(struct sha256_state),
+	.base		=	{
+		.cra_name	=	"sha256",
+		.cra_driver_name =	"sha256-neon",
+		.cra_priority	=	350,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA256_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+};
+
+static int __init sha256_neon_mod_init(void)
+{
+	if (!cpu_has_neon())
+		return -ENODEV;
+
+	return crypto_register_shash(&alg);
+}
+
+static void __exit sha256_neon_mod_fini(void)
+{
+	crypto_unregister_shash(&alg);
+}
+
+module_init(sha256_neon_mod_init);
+module_exit(sha256_neon_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, NEON accelerated");
+
+MODULE_ALIAS("sha256");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 50f4da4..0505523 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -610,6 +610,18 @@ config CRYPTO_SHA256
 	  This code also includes SHA-224, a 224 bit hash with 112 bits
 	  of security against collision attacks.
 
+config CRYPTO_SHA256_ARM_NEON
+	tristate "SHA256 digest algorithm (ARM NEON)"
+	depends on ARM && KERNEL_MODE_NEON && !CPU_BIG_ENDIAN
+	select CRYPTO_SHA256
+	select CRYPTO_HASH
+	help
+	  SHA-256 secure hash standard (DFIPS 180-2) implemented
+	  using ARM NEON instructions, when available.
+
+	  This version of SHA implements a 256 bit hash with 128 bits of
+	  security against collision attacks.
+
 config CRYPTO_SHA256_SPARC64
 	tristate "SHA224 and SHA256 digest algorithm (SPARC64)"
 	depends on SPARC64

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* Re: [PATCH] arm: crypto: Add NEON optimized SHA-256
  2015-03-16 15:48 ` Sami Tolvanen
@ 2015-03-16 16:08   ` Ard Biesheuvel
  -1 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-16 16:08 UTC (permalink / raw)
  To: Sami Tolvanen, Andy Polyakov
  Cc: linux-arm-kernel, linux-crypto, Herbert Xu, David S. Miller

Hello Sami,

On 16 March 2015 at 16:48, Sami Tolvanen <samitolvanen@google.com> wrote:
> Add Andy Polyakov's NEON optimized SHA-256 implementation.
>
> On Nexus 6, this implementation is ~2x faster than sha256-generic.
>
> Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
>

Have you tested this code with the tcrypt.ko module?

Some more comments below

> ---
>  arch/arm/crypto/Makefile            |    2
>  arch/arm/crypto/sha256-armv7-neon.S |  819 ++++++++++++++++++++++++++++++++++++
>  arch/arm/crypto/sha256_neon_glue.c  |  201 ++++++++
>  crypto/Kconfig                      |   12
>  4 files changed, 1034 insertions(+)
>
> diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
> index b48fa34..316dba2 100644
> --- a/arch/arm/crypto/Makefile
> +++ b/arch/arm/crypto/Makefile
> @@ -6,12 +6,14 @@ obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
>  obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
>  obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
>  obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
> +obj-$(CONFIG_CRYPTO_SHA256_ARM_NEON) += sha256-arm-neon.o
>  obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
>
>  aes-arm-y      := aes-armv4.o aes_glue.o
>  aes-arm-bs-y   := aesbs-core.o aesbs-glue.o
>  sha1-arm-y     := sha1-armv4-large.o sha1_glue.o
>  sha1-arm-neon-y        := sha1-armv7-neon.o sha1_neon_glue.o
> +sha256-arm-neon-y := sha256-armv7-neon.o sha256_neon_glue.o
>  sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o
>
>  quiet_cmd_perl = PERL    $@
> diff --git a/arch/arm/crypto/sha256-armv7-neon.S b/arch/arm/crypto/sha256-armv7-neon.S
> new file mode 100644
> index 0000000..5ce04c2
> --- /dev/null
> +++ b/arch/arm/crypto/sha256-armv7-neon.S
> @@ -0,0 +1,819 @@
> +@ sha256-armv7-neon.S  -  ARM/NEON assembly implementation of SHA-256 transform
> +@
> +@ ====================================================================
> +@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
> +@ project. The module is, however, dual licensed under OpenSSL and
> +@ CRYPTOGAMS licenses depending on where you obtain it. For further
> +@ details see http://www.openssl.org/~appro/cryptogams/.
> +@ ====================================================================
> +

Did you talk to Andy about the license? I don't think this is
permissible for the kernel as-is.


> +#include <linux/linkage.h>
> +
> +.text
> +.code   32
> +.fpu neon
> +
> +.type  K256,%object
> +.align 5
> +K256:
> +.word  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
> +.word  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
> +.word  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
> +.word  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
> +.word  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
> +.word  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
> +.word  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
> +.word  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
> +.word  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
> +.word  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
> +.word  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
> +.word  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
> +.word  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
> +.word  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
> +.word  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
> +.word  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
> +.size  K256,.-K256
> +.word  0                               @ terminator
> +.word  0
> +.align 5
> +
> +.align 5
> +ENTRY(sha256_transform_neon)
> +       /* Input:
> +        *      %r0: SHA256_CONTEXT
> +        *      %r1: data
> +        *      %r2: nblks
> +        */
> +       sub     r3,pc,#8                @ sha256_transform_neon

This is broken on thumb-2, use adr instead


> +       add     r2,r1,r2,lsl#6  @ len to point at the end of inp
> +
> +       stmdb   sp!,{r4-r12,lr}
> +
> +       mov     r12,sp
> +       sub     sp,sp,#16*4+16          @ alloca
> +       sub     r14,r3,#256+32  @ K256
> +       bic     sp,sp,#15               @ align for 128-bit stores
> +
> +       vld1.8          {q0},[r1]!
> +       vld1.8          {q1},[r1]!
> +       vld1.8          {q2},[r1]!
> +       vld1.8          {q3},[r1]!
> +       vld1.32         {q8},[r14,:128]!
> +       vld1.32         {q9},[r14,:128]!
> +       vld1.32         {q10},[r14,:128]!
> +       vld1.32         {q11},[r14,:128]!
> +       vrev32.8        q0,q0           @ yes, even on
> +       str             r0,[sp,#64]
> +       vrev32.8        q1,q1           @ big-endian
> +       str             r1,[sp,#68]
> +       mov             r1,sp
> +       vrev32.8        q2,q2
> +       str             r2,[sp,#72]
> +       vrev32.8        q3,q3
> +       str             r12,[sp,#76]            @ save original sp
> +       vadd.i32        q8,q8,q0
> +       vadd.i32        q9,q9,q1
> +       vst1.32         {q8},[r1,:128]!
> +       vadd.i32        q10,q10,q2
> +       vst1.32         {q9},[r1,:128]!
> +       vadd.i32        q11,q11,q3
> +       vst1.32         {q10},[r1,:128]!
> +       vst1.32         {q11},[r1,:128]!
> +
> +       ldmia           r0,{r4-r11}
> +       sub             r1,r1,#64
> +       ldr             r2,[sp,#0]
> +       eor             r12,r12,r12
> +       eor             r3,r5,r6
> +       b               .L_00_48
> +
> +.align 4
> +.L_00_48:
> +       vext.8  q8,q0,q1,#4
> +       add     r11,r11,r2
> +       eor     r2,r9,r10
> +       eor     r0,r8,r8,ror#5
> +       vext.8  q9,q2,q3,#4
> +       add     r4,r4,r12
> +       and     r2,r2,r8
> +       eor     r12,r0,r8,ror#19
> +       vshr.u32        q10,q8,#7
> +       eor     r0,r4,r4,ror#11
> +       eor     r2,r2,r10
> +       vadd.i32        q0,q0,q9
> +       add     r11,r11,r12,ror#6
> +       eor     r12,r4,r5
> +       vshr.u32        q9,q8,#3
> +       eor     r0,r0,r4,ror#20
> +       add     r11,r11,r2
> +       vsli.32 q10,q8,#25
> +       ldr     r2,[sp,#4]
> +       and     r3,r3,r12
> +       vshr.u32        q11,q8,#18
> +       add     r7,r7,r11
> +       add     r11,r11,r0,ror#2
> +       eor     r3,r3,r5
> +       veor    q9,q9,q10
> +       add     r10,r10,r2
> +       vsli.32 q11,q8,#14
> +       eor     r2,r8,r9
> +       eor     r0,r7,r7,ror#5
> +       vshr.u32        d24,d7,#17
> +       add     r11,r11,r3
> +       and     r2,r2,r7
> +       veor    q9,q9,q11
> +       eor     r3,r0,r7,ror#19
> +       eor     r0,r11,r11,ror#11
> +       vsli.32 d24,d7,#15
> +       eor     r2,r2,r9
> +       add     r10,r10,r3,ror#6
> +       vshr.u32        d25,d7,#10
> +       eor     r3,r11,r4
> +       eor     r0,r0,r11,ror#20
> +       vadd.i32        q0,q0,q9
> +       add     r10,r10,r2
> +       ldr     r2,[sp,#8]
> +       veor    d25,d25,d24
> +       and     r12,r12,r3
> +       add     r6,r6,r10
> +       vshr.u32        d24,d7,#19
> +       add     r10,r10,r0,ror#2
> +       eor     r12,r12,r4
> +       vsli.32 d24,d7,#13
> +       add     r9,r9,r2
> +       eor     r2,r7,r8
> +       veor    d25,d25,d24
> +       eor     r0,r6,r6,ror#5
> +       add     r10,r10,r12
> +       vadd.i32        d0,d0,d25
> +       and     r2,r2,r6
> +       eor     r12,r0,r6,ror#19
> +       vshr.u32        d24,d0,#17
> +       eor     r0,r10,r10,ror#11
> +       eor     r2,r2,r8
> +       vsli.32 d24,d0,#15
> +       add     r9,r9,r12,ror#6
> +       eor     r12,r10,r11
> +       vshr.u32        d25,d0,#10
> +       eor     r0,r0,r10,ror#20
> +       add     r9,r9,r2
> +       veor    d25,d25,d24
> +       ldr     r2,[sp,#12]
> +       and     r3,r3,r12
> +       vshr.u32        d24,d0,#19
> +       add     r5,r5,r9
> +       add     r9,r9,r0,ror#2
> +       eor     r3,r3,r11
> +       vld1.32 {q8},[r14,:128]!
> +       add     r8,r8,r2
> +       vsli.32 d24,d0,#13
> +       eor     r2,r6,r7
> +       eor     r0,r5,r5,ror#5
> +       veor    d25,d25,d24
> +       add     r9,r9,r3
> +       and     r2,r2,r5
> +       vadd.i32        d1,d1,d25
> +       eor     r3,r0,r5,ror#19
> +       eor     r0,r9,r9,ror#11
> +       vadd.i32        q8,q8,q0
> +       eor     r2,r2,r7
> +       add     r8,r8,r3,ror#6
> +       eor     r3,r9,r10
> +       eor     r0,r0,r9,ror#20
> +       add     r8,r8,r2
> +       ldr     r2,[sp,#16]
> +       and     r12,r12,r3
> +       add     r4,r4,r8
> +       vst1.32 {q8},[r1,:128]!
> +       add     r8,r8,r0,ror#2
> +       eor     r12,r12,r10
> +       vext.8  q8,q1,q2,#4
> +       add     r7,r7,r2
> +       eor     r2,r5,r6
> +       eor     r0,r4,r4,ror#5
> +       vext.8  q9,q3,q0,#4
> +       add     r8,r8,r12
> +       and     r2,r2,r4
> +       eor     r12,r0,r4,ror#19
> +       vshr.u32        q10,q8,#7
> +       eor     r0,r8,r8,ror#11
> +       eor     r2,r2,r6
> +       vadd.i32        q1,q1,q9
> +       add     r7,r7,r12,ror#6
> +       eor     r12,r8,r9
> +       vshr.u32        q9,q8,#3
> +       eor     r0,r0,r8,ror#20
> +       add     r7,r7,r2
> +       vsli.32 q10,q8,#25
> +       ldr     r2,[sp,#20]
> +       and     r3,r3,r12
> +       vshr.u32        q11,q8,#18
> +       add     r11,r11,r7
> +       add     r7,r7,r0,ror#2
> +       eor     r3,r3,r9
> +       veor    q9,q9,q10
> +       add     r6,r6,r2
> +       vsli.32 q11,q8,#14
> +       eor     r2,r4,r5
> +       eor     r0,r11,r11,ror#5
> +       vshr.u32        d24,d1,#17
> +       add     r7,r7,r3
> +       and     r2,r2,r11
> +       veor    q9,q9,q11
> +       eor     r3,r0,r11,ror#19
> +       eor     r0,r7,r7,ror#11
> +       vsli.32 d24,d1,#15
> +       eor     r2,r2,r5
> +       add     r6,r6,r3,ror#6
> +       vshr.u32        d25,d1,#10
> +       eor     r3,r7,r8
> +       eor     r0,r0,r7,ror#20
> +       vadd.i32        q1,q1,q9
> +       add     r6,r6,r2
> +       ldr     r2,[sp,#24]
> +       veor    d25,d25,d24
> +       and     r12,r12,r3
> +       add     r10,r10,r6
> +       vshr.u32        d24,d1,#19
> +       add     r6,r6,r0,ror#2
> +       eor     r12,r12,r8
> +       vsli.32 d24,d1,#13
> +       add     r5,r5,r2
> +       eor     r2,r11,r4
> +       veor    d25,d25,d24
> +       eor     r0,r10,r10,ror#5
> +       add     r6,r6,r12
> +       vadd.i32        d2,d2,d25
> +       and     r2,r2,r10
> +       eor     r12,r0,r10,ror#19
> +       vshr.u32        d24,d2,#17
> +       eor     r0,r6,r6,ror#11
> +       eor     r2,r2,r4
> +       vsli.32 d24,d2,#15
> +       add     r5,r5,r12,ror#6
> +       eor     r12,r6,r7
> +       vshr.u32        d25,d2,#10
> +       eor     r0,r0,r6,ror#20
> +       add     r5,r5,r2
> +       veor    d25,d25,d24
> +       ldr     r2,[sp,#28]
> +       and     r3,r3,r12
> +       vshr.u32        d24,d2,#19
> +       add     r9,r9,r5
> +       add     r5,r5,r0,ror#2
> +       eor     r3,r3,r7
> +       vld1.32 {q8},[r14,:128]!
> +       add     r4,r4,r2
> +       vsli.32 d24,d2,#13
> +       eor     r2,r10,r11
> +       eor     r0,r9,r9,ror#5
> +       veor    d25,d25,d24
> +       add     r5,r5,r3
> +       and     r2,r2,r9
> +       vadd.i32        d3,d3,d25
> +       eor     r3,r0,r9,ror#19
> +       eor     r0,r5,r5,ror#11
> +       vadd.i32        q8,q8,q1
> +       eor     r2,r2,r11
> +       add     r4,r4,r3,ror#6
> +       eor     r3,r5,r6
> +       eor     r0,r0,r5,ror#20
> +       add     r4,r4,r2
> +       ldr     r2,[sp,#32]
> +       and     r12,r12,r3
> +       add     r8,r8,r4
> +       vst1.32 {q8},[r1,:128]!
> +       add     r4,r4,r0,ror#2
> +       eor     r12,r12,r6
> +       vext.8  q8,q2,q3,#4
> +       add     r11,r11,r2
> +       eor     r2,r9,r10
> +       eor     r0,r8,r8,ror#5
> +       vext.8  q9,q0,q1,#4
> +       add     r4,r4,r12
> +       and     r2,r2,r8
> +       eor     r12,r0,r8,ror#19
> +       vshr.u32        q10,q8,#7
> +       eor     r0,r4,r4,ror#11
> +       eor     r2,r2,r10
> +       vadd.i32        q2,q2,q9
> +       add     r11,r11,r12,ror#6
> +       eor     r12,r4,r5
> +       vshr.u32        q9,q8,#3
> +       eor     r0,r0,r4,ror#20
> +       add     r11,r11,r2
> +       vsli.32 q10,q8,#25
> +       ldr     r2,[sp,#36]
> +       and     r3,r3,r12
> +       vshr.u32        q11,q8,#18
> +       add     r7,r7,r11
> +       add     r11,r11,r0,ror#2
> +       eor     r3,r3,r5
> +       veor    q9,q9,q10
> +       add     r10,r10,r2
> +       vsli.32 q11,q8,#14
> +       eor     r2,r8,r9
> +       eor     r0,r7,r7,ror#5
> +       vshr.u32        d24,d3,#17
> +       add     r11,r11,r3
> +       and     r2,r2,r7
> +       veor    q9,q9,q11
> +       eor     r3,r0,r7,ror#19
> +       eor     r0,r11,r11,ror#11
> +       vsli.32 d24,d3,#15
> +       eor     r2,r2,r9
> +       add     r10,r10,r3,ror#6
> +       vshr.u32        d25,d3,#10
> +       eor     r3,r11,r4
> +       eor     r0,r0,r11,ror#20
> +       vadd.i32        q2,q2,q9
> +       add     r10,r10,r2
> +       ldr     r2,[sp,#40]
> +       veor    d25,d25,d24
> +       and     r12,r12,r3
> +       add     r6,r6,r10
> +       vshr.u32        d24,d3,#19
> +       add     r10,r10,r0,ror#2
> +       eor     r12,r12,r4
> +       vsli.32 d24,d3,#13
> +       add     r9,r9,r2
> +       eor     r2,r7,r8
> +       veor    d25,d25,d24
> +       eor     r0,r6,r6,ror#5
> +       add     r10,r10,r12
> +       vadd.i32        d4,d4,d25
> +       and     r2,r2,r6
> +       eor     r12,r0,r6,ror#19
> +       vshr.u32        d24,d4,#17
> +       eor     r0,r10,r10,ror#11
> +       eor     r2,r2,r8
> +       vsli.32 d24,d4,#15
> +       add     r9,r9,r12,ror#6
> +       eor     r12,r10,r11
> +       vshr.u32        d25,d4,#10
> +       eor     r0,r0,r10,ror#20
> +       add     r9,r9,r2
> +       veor    d25,d25,d24
> +       ldr     r2,[sp,#44]
> +       and     r3,r3,r12
> +       vshr.u32        d24,d4,#19
> +       add     r5,r5,r9
> +       add     r9,r9,r0,ror#2
> +       eor     r3,r3,r11
> +       vld1.32 {q8},[r14,:128]!
> +       add     r8,r8,r2
> +       vsli.32 d24,d4,#13
> +       eor     r2,r6,r7
> +       eor     r0,r5,r5,ror#5
> +       veor    d25,d25,d24
> +       add     r9,r9,r3
> +       and     r2,r2,r5
> +       vadd.i32        d5,d5,d25
> +       eor     r3,r0,r5,ror#19
> +       eor     r0,r9,r9,ror#11
> +       vadd.i32        q8,q8,q2
> +       eor     r2,r2,r7
> +       add     r8,r8,r3,ror#6
> +       eor     r3,r9,r10
> +       eor     r0,r0,r9,ror#20
> +       add     r8,r8,r2
> +       ldr     r2,[sp,#48]
> +       and     r12,r12,r3
> +       add     r4,r4,r8
> +       vst1.32 {q8},[r1,:128]!
> +       add     r8,r8,r0,ror#2
> +       eor     r12,r12,r10
> +       vext.8  q8,q3,q0,#4
> +       add     r7,r7,r2
> +       eor     r2,r5,r6
> +       eor     r0,r4,r4,ror#5
> +       vext.8  q9,q1,q2,#4
> +       add     r8,r8,r12
> +       and     r2,r2,r4
> +       eor     r12,r0,r4,ror#19
> +       vshr.u32        q10,q8,#7
> +       eor     r0,r8,r8,ror#11
> +       eor     r2,r2,r6
> +       vadd.i32        q3,q3,q9
> +       add     r7,r7,r12,ror#6
> +       eor     r12,r8,r9
> +       vshr.u32        q9,q8,#3
> +       eor     r0,r0,r8,ror#20
> +       add     r7,r7,r2
> +       vsli.32 q10,q8,#25
> +       ldr     r2,[sp,#52]
> +       and     r3,r3,r12
> +       vshr.u32        q11,q8,#18
> +       add     r11,r11,r7
> +       add     r7,r7,r0,ror#2
> +       eor     r3,r3,r9
> +       veor    q9,q9,q10
> +       add     r6,r6,r2
> +       vsli.32 q11,q8,#14
> +       eor     r2,r4,r5
> +       eor     r0,r11,r11,ror#5
> +       vshr.u32        d24,d5,#17
> +       add     r7,r7,r3
> +       and     r2,r2,r11
> +       veor    q9,q9,q11
> +       eor     r3,r0,r11,ror#19
> +       eor     r0,r7,r7,ror#11
> +       vsli.32 d24,d5,#15
> +       eor     r2,r2,r5
> +       add     r6,r6,r3,ror#6
> +       vshr.u32        d25,d5,#10
> +       eor     r3,r7,r8
> +       eor     r0,r0,r7,ror#20
> +       vadd.i32        q3,q3,q9
> +       add     r6,r6,r2
> +       ldr     r2,[sp,#56]
> +       veor    d25,d25,d24
> +       and     r12,r12,r3
> +       add     r10,r10,r6
> +       vshr.u32        d24,d5,#19
> +       add     r6,r6,r0,ror#2
> +       eor     r12,r12,r8
> +       vsli.32 d24,d5,#13
> +       add     r5,r5,r2
> +       eor     r2,r11,r4
> +       veor    d25,d25,d24
> +       eor     r0,r10,r10,ror#5
> +       add     r6,r6,r12
> +       vadd.i32        d6,d6,d25
> +       and     r2,r2,r10
> +       eor     r12,r0,r10,ror#19
> +       vshr.u32        d24,d6,#17
> +       eor     r0,r6,r6,ror#11
> +       eor     r2,r2,r4
> +       vsli.32 d24,d6,#15
> +       add     r5,r5,r12,ror#6
> +       eor     r12,r6,r7
> +       vshr.u32        d25,d6,#10
> +       eor     r0,r0,r6,ror#20
> +       add     r5,r5,r2
> +       veor    d25,d25,d24
> +       ldr     r2,[sp,#60]
> +       and     r3,r3,r12
> +       vshr.u32        d24,d6,#19
> +       add     r9,r9,r5
> +       add     r5,r5,r0,ror#2
> +       eor     r3,r3,r7
> +       vld1.32 {q8},[r14,:128]!
> +       add     r4,r4,r2
> +       vsli.32 d24,d6,#13
> +       eor     r2,r10,r11
> +       eor     r0,r9,r9,ror#5
> +       veor    d25,d25,d24
> +       add     r5,r5,r3
> +       and     r2,r2,r9
> +       vadd.i32        d7,d7,d25
> +       eor     r3,r0,r9,ror#19
> +       eor     r0,r5,r5,ror#11
> +       vadd.i32        q8,q8,q3
> +       eor     r2,r2,r11
> +       add     r4,r4,r3,ror#6
> +       eor     r3,r5,r6
> +       eor     r0,r0,r5,ror#20
> +       add     r4,r4,r2
> +       ldr     r2,[r14]
> +       and     r12,r12,r3
> +       add     r8,r8,r4
> +       vst1.32 {q8},[r1,:128]!
> +       add     r4,r4,r0,ror#2
> +       eor     r12,r12,r6
> +       teq     r2,#0                           @ check for K256 terminator
> +       ldr     r2,[sp,#0]
> +       sub     r1,r1,#64
> +       bne     .L_00_48
> +
> +       ldr             r1,[sp,#68]
> +       ldr             r0,[sp,#72]
> +       sub             r14,r14,#256    @ rewind r14
> +       teq             r1,r0
> +       subeq           r1,r1,#64               @ avoid SEGV
> +       vld1.8          {q0},[r1]!              @ load next input block
> +       vld1.8          {q1},[r1]!
> +       vld1.8          {q2},[r1]!
> +       vld1.8          {q3},[r1]!
> +       strne           r1,[sp,#68]
> +       mov             r1,sp
> +       add     r11,r11,r2
> +       eor     r2,r9,r10
> +       eor     r0,r8,r8,ror#5
> +       add     r4,r4,r12
> +       vld1.32 {q8},[r14,:128]!
> +       and     r2,r2,r8
> +       eor     r12,r0,r8,ror#19
> +       eor     r0,r4,r4,ror#11
> +       eor     r2,r2,r10
> +       vrev32.8        q0,q0
> +       add     r11,r11,r12,ror#6
> +       eor     r12,r4,r5
> +       eor     r0,r0,r4,ror#20
> +       add     r11,r11,r2
> +       vadd.i32        q8,q8,q0
> +       ldr     r2,[sp,#4]
> +       and     r3,r3,r12
> +       add     r7,r7,r11
> +       add     r11,r11,r0,ror#2
> +       eor     r3,r3,r5
> +       add     r10,r10,r2
> +       eor     r2,r8,r9
> +       eor     r0,r7,r7,ror#5
> +       add     r11,r11,r3
> +       and     r2,r2,r7
> +       eor     r3,r0,r7,ror#19
> +       eor     r0,r11,r11,ror#11
> +       eor     r2,r2,r9
> +       add     r10,r10,r3,ror#6
> +       eor     r3,r11,r4
> +       eor     r0,r0,r11,ror#20
> +       add     r10,r10,r2
> +       ldr     r2,[sp,#8]
> +       and     r12,r12,r3
> +       add     r6,r6,r10
> +       add     r10,r10,r0,ror#2
> +       eor     r12,r12,r4
> +       add     r9,r9,r2
> +       eor     r2,r7,r8
> +       eor     r0,r6,r6,ror#5
> +       add     r10,r10,r12
> +       and     r2,r2,r6
> +       eor     r12,r0,r6,ror#19
> +       eor     r0,r10,r10,ror#11
> +       eor     r2,r2,r8
> +       add     r9,r9,r12,ror#6
> +       eor     r12,r10,r11
> +       eor     r0,r0,r10,ror#20
> +       add     r9,r9,r2
> +       ldr     r2,[sp,#12]
> +       and     r3,r3,r12
> +       add     r5,r5,r9
> +       add     r9,r9,r0,ror#2
> +       eor     r3,r3,r11
> +       add     r8,r8,r2
> +       eor     r2,r6,r7
> +       eor     r0,r5,r5,ror#5
> +       add     r9,r9,r3
> +       and     r2,r2,r5
> +       eor     r3,r0,r5,ror#19
> +       eor     r0,r9,r9,ror#11
> +       eor     r2,r2,r7
> +       add     r8,r8,r3,ror#6
> +       eor     r3,r9,r10
> +       eor     r0,r0,r9,ror#20
> +       add     r8,r8,r2
> +       ldr     r2,[sp,#16]
> +       and     r12,r12,r3
> +       add     r4,r4,r8
> +       add     r8,r8,r0,ror#2
> +       eor     r12,r12,r10
> +       vst1.32 {q8},[r1,:128]!
> +       add     r7,r7,r2
> +       eor     r2,r5,r6
> +       eor     r0,r4,r4,ror#5
> +       add     r8,r8,r12
> +       vld1.32 {q8},[r14,:128]!
> +       and     r2,r2,r4
> +       eor     r12,r0,r4,ror#19
> +       eor     r0,r8,r8,ror#11
> +       eor     r2,r2,r6
> +       vrev32.8        q1,q1
> +       add     r7,r7,r12,ror#6
> +       eor     r12,r8,r9
> +       eor     r0,r0,r8,ror#20
> +       add     r7,r7,r2
> +       vadd.i32        q8,q8,q1
> +       ldr     r2,[sp,#20]
> +       and     r3,r3,r12
> +       add     r11,r11,r7
> +       add     r7,r7,r0,ror#2
> +       eor     r3,r3,r9
> +       add     r6,r6,r2
> +       eor     r2,r4,r5
> +       eor     r0,r11,r11,ror#5
> +       add     r7,r7,r3
> +       and     r2,r2,r11
> +       eor     r3,r0,r11,ror#19
> +       eor     r0,r7,r7,ror#11
> +       eor     r2,r2,r5
> +       add     r6,r6,r3,ror#6
> +       eor     r3,r7,r8
> +       eor     r0,r0,r7,ror#20
> +       add     r6,r6,r2
> +       ldr     r2,[sp,#24]
> +       and     r12,r12,r3
> +       add     r10,r10,r6
> +       add     r6,r6,r0,ror#2
> +       eor     r12,r12,r8
> +       add     r5,r5,r2
> +       eor     r2,r11,r4
> +       eor     r0,r10,r10,ror#5
> +       add     r6,r6,r12
> +       and     r2,r2,r10
> +       eor     r12,r0,r10,ror#19
> +       eor     r0,r6,r6,ror#11
> +       eor     r2,r2,r4
> +       add     r5,r5,r12,ror#6
> +       eor     r12,r6,r7
> +       eor     r0,r0,r6,ror#20
> +       add     r5,r5,r2
> +       ldr     r2,[sp,#28]
> +       and     r3,r3,r12
> +       add     r9,r9,r5
> +       add     r5,r5,r0,ror#2
> +       eor     r3,r3,r7
> +       add     r4,r4,r2
> +       eor     r2,r10,r11
> +       eor     r0,r9,r9,ror#5
> +       add     r5,r5,r3
> +       and     r2,r2,r9
> +       eor     r3,r0,r9,ror#19
> +       eor     r0,r5,r5,ror#11
> +       eor     r2,r2,r11
> +       add     r4,r4,r3,ror#6
> +       eor     r3,r5,r6
> +       eor     r0,r0,r5,ror#20
> +       add     r4,r4,r2
> +       ldr     r2,[sp,#32]
> +       and     r12,r12,r3
> +       add     r8,r8,r4
> +       add     r4,r4,r0,ror#2
> +       eor     r12,r12,r6
> +       vst1.32 {q8},[r1,:128]!
> +       add     r11,r11,r2
> +       eor     r2,r9,r10
> +       eor     r0,r8,r8,ror#5
> +       add     r4,r4,r12
> +       vld1.32 {q8},[r14,:128]!
> +       and     r2,r2,r8
> +       eor     r12,r0,r8,ror#19
> +       eor     r0,r4,r4,ror#11
> +       eor     r2,r2,r10
> +       vrev32.8        q2,q2
> +       add     r11,r11,r12,ror#6
> +       eor     r12,r4,r5
> +       eor     r0,r0,r4,ror#20
> +       add     r11,r11,r2
> +       vadd.i32        q8,q8,q2
> +       ldr     r2,[sp,#36]
> +       and     r3,r3,r12
> +       add     r7,r7,r11
> +       add     r11,r11,r0,ror#2
> +       eor     r3,r3,r5
> +       add     r10,r10,r2
> +       eor     r2,r8,r9
> +       eor     r0,r7,r7,ror#5
> +       add     r11,r11,r3
> +       and     r2,r2,r7
> +       eor     r3,r0,r7,ror#19
> +       eor     r0,r11,r11,ror#11
> +       eor     r2,r2,r9
> +       add     r10,r10,r3,ror#6
> +       eor     r3,r11,r4
> +       eor     r0,r0,r11,ror#20
> +       add     r10,r10,r2
> +       ldr     r2,[sp,#40]
> +       and     r12,r12,r3
> +       add     r6,r6,r10
> +       add     r10,r10,r0,ror#2
> +       eor     r12,r12,r4
> +       add     r9,r9,r2
> +       eor     r2,r7,r8
> +       eor     r0,r6,r6,ror#5
> +       add     r10,r10,r12
> +       and     r2,r2,r6
> +       eor     r12,r0,r6,ror#19
> +       eor     r0,r10,r10,ror#11
> +       eor     r2,r2,r8
> +       add     r9,r9,r12,ror#6
> +       eor     r12,r10,r11
> +       eor     r0,r0,r10,ror#20
> +       add     r9,r9,r2
> +       ldr     r2,[sp,#44]
> +       and     r3,r3,r12
> +       add     r5,r5,r9
> +       add     r9,r9,r0,ror#2
> +       eor     r3,r3,r11
> +       add     r8,r8,r2
> +       eor     r2,r6,r7
> +       eor     r0,r5,r5,ror#5
> +       add     r9,r9,r3
> +       and     r2,r2,r5
> +       eor     r3,r0,r5,ror#19
> +       eor     r0,r9,r9,ror#11
> +       eor     r2,r2,r7
> +       add     r8,r8,r3,ror#6
> +       eor     r3,r9,r10
> +       eor     r0,r0,r9,ror#20
> +       add     r8,r8,r2
> +       ldr     r2,[sp,#48]
> +       and     r12,r12,r3
> +       add     r4,r4,r8
> +       add     r8,r8,r0,ror#2
> +       eor     r12,r12,r10
> +       vst1.32 {q8},[r1,:128]!
> +       add     r7,r7,r2
> +       eor     r2,r5,r6
> +       eor     r0,r4,r4,ror#5
> +       add     r8,r8,r12
> +       vld1.32 {q8},[r14,:128]!
> +       and     r2,r2,r4
> +       eor     r12,r0,r4,ror#19
> +       eor     r0,r8,r8,ror#11
> +       eor     r2,r2,r6
> +       vrev32.8        q3,q3
> +       add     r7,r7,r12,ror#6
> +       eor     r12,r8,r9
> +       eor     r0,r0,r8,ror#20
> +       add     r7,r7,r2
> +       vadd.i32        q8,q8,q3
> +       ldr     r2,[sp,#52]
> +       and     r3,r3,r12
> +       add     r11,r11,r7
> +       add     r7,r7,r0,ror#2
> +       eor     r3,r3,r9
> +       add     r6,r6,r2
> +       eor     r2,r4,r5
> +       eor     r0,r11,r11,ror#5
> +       add     r7,r7,r3
> +       and     r2,r2,r11
> +       eor     r3,r0,r11,ror#19
> +       eor     r0,r7,r7,ror#11
> +       eor     r2,r2,r5
> +       add     r6,r6,r3,ror#6
> +       eor     r3,r7,r8
> +       eor     r0,r0,r7,ror#20
> +       add     r6,r6,r2
> +       ldr     r2,[sp,#56]
> +       and     r12,r12,r3
> +       add     r10,r10,r6
> +       add     r6,r6,r0,ror#2
> +       eor     r12,r12,r8
> +       add     r5,r5,r2
> +       eor     r2,r11,r4
> +       eor     r0,r10,r10,ror#5
> +       add     r6,r6,r12
> +       and     r2,r2,r10
> +       eor     r12,r0,r10,ror#19
> +       eor     r0,r6,r6,ror#11
> +       eor     r2,r2,r4
> +       add     r5,r5,r12,ror#6
> +       eor     r12,r6,r7
> +       eor     r0,r0,r6,ror#20
> +       add     r5,r5,r2
> +       ldr     r2,[sp,#60]
> +       and     r3,r3,r12
> +       add     r9,r9,r5
> +       add     r5,r5,r0,ror#2
> +       eor     r3,r3,r7
> +       add     r4,r4,r2
> +       eor     r2,r10,r11
> +       eor     r0,r9,r9,ror#5
> +       add     r5,r5,r3
> +       and     r2,r2,r9
> +       eor     r3,r0,r9,ror#19
> +       eor     r0,r5,r5,ror#11
> +       eor     r2,r2,r11
> +       add     r4,r4,r3,ror#6
> +       eor     r3,r5,r6
> +       eor     r0,r0,r5,ror#20
> +       add     r4,r4,r2
> +       ldr     r2,[sp,#64]
> +       and     r12,r12,r3
> +       add     r8,r8,r4
> +       add     r4,r4,r0,ror#2
> +       eor     r12,r12,r6
> +       vst1.32 {q8},[r1,:128]!
> +       ldr     r0,[r2,#0]
> +       add     r4,r4,r12                       @ h+=Maj(a,b,c) from the past
> +       ldr     r12,[r2,#4]
> +       ldr     r3,[r2,#8]
> +       ldr     r1,[r2,#12]
> +       add     r4,r4,r0                        @ accumulate
> +       ldr     r0,[r2,#16]
> +       add     r5,r5,r12
> +       ldr     r12,[r2,#20]
> +       add     r6,r6,r3
> +       ldr     r3,[r2,#24]
> +       add     r7,r7,r1
> +       ldr     r1,[r2,#28]
> +       add     r8,r8,r0
> +       str     r4,[r2],#4
> +       add     r9,r9,r12
> +       str     r5,[r2],#4
> +       add     r10,r10,r3
> +       str     r6,[r2],#4
> +       add     r11,r11,r1
> +       str     r7,[r2],#4
> +       stmia   r2,{r8-r11}
> +
> +       movne   r1,sp
> +       ldrne   r2,[sp,#0]
> +       eorne   r12,r12,r12
> +       ldreq   sp,[sp,#76]                     @ restore original sp
> +       eorne   r3,r5,r6
> +       bne     .L_00_48
> +
> +       ldmia   sp!,{r4-r12,pc}
> +ENDPROC(sha256_transform_neon)
> diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c
> new file mode 100644
> index 0000000..698a498
> --- /dev/null
> +++ b/arch/arm/crypto/sha256_neon_glue.c
> @@ -0,0 +1,201 @@
> +/*
> + * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
> + * using NEON instructions.
> + *
> + * Copyright © 2015 Google Inc.
> + *
> + * This file is based on sha512_neon_glue.c:
> + *   Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms of the GNU General Public License as published by the Free
> + * Software Foundation; either version 2 of the License, or (at your option)
> + * any later version.
> + *
> + */
> +
> +#include <crypto/internal/hash.h>
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/mm.h>
> +#include <linux/cryptohash.h>
> +#include <linux/types.h>
> +#include <linux/string.h>
> +#include <crypto/sha.h>
> +#include <asm/byteorder.h>
> +#include <asm/simd.h>
> +#include <asm/neon.h>
> +
> +asmlinkage void sha256_transform_neon(u32 *digest, const void *data,
> +                                     unsigned int num_blks);
> +
> +
> +static int sha256_neon_init(struct shash_desc *desc)
> +{
> +       struct sha256_state *sctx = shash_desc_ctx(desc);
> +
> +       sctx->state[0] = SHA256_H0;
> +       sctx->state[1] = SHA256_H1;
> +       sctx->state[2] = SHA256_H2;
> +       sctx->state[3] = SHA256_H3;
> +       sctx->state[4] = SHA256_H4;
> +       sctx->state[5] = SHA256_H5;
> +       sctx->state[6] = SHA256_H6;
> +       sctx->state[7] = SHA256_H7;
> +       sctx->count = 0;
> +
> +       return 0;
> +}
> +
> +static int __sha256_neon_update(struct shash_desc *desc, const u8 *data,
> +                               unsigned int len, unsigned int partial)
> +{
> +       struct sha256_state *sctx = shash_desc_ctx(desc);
> +       unsigned int done = 0;
> +
> +       sctx->count += len;
> +
> +       if (partial) {
> +               done = SHA256_BLOCK_SIZE - partial;
> +               memcpy(sctx->buf + partial, data, done);
> +               sha256_transform_neon(sctx->state, sctx->buf, 1);
> +       }
> +
> +       if (len - done >= SHA256_BLOCK_SIZE) {
> +               const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
> +
> +               sha256_transform_neon(sctx->state, data + done, rounds);
> +               done += rounds * SHA256_BLOCK_SIZE;
> +       }
> +
> +       memcpy(sctx->buf, data + done, len - done);
> +
> +       return 0;
> +}
> +
> +static int sha256_neon_update(struct shash_desc *desc, const u8 *data,
> +                            unsigned int len)
> +{
> +       struct sha256_state *sctx = shash_desc_ctx(desc);
> +       unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
> +       int res;
> +
> +       /* Handle the fast case right here */
> +       if (partial + len < SHA256_BLOCK_SIZE) {
> +               sctx->count += len;
> +               memcpy(sctx->buf + partial, data, len);
> +
> +               return 0;
> +       }
> +
> +       if (!may_use_simd()) {
> +               res = crypto_sha256_update(desc, data, len);
> +       } else {
> +               kernel_neon_begin();
> +               res = __sha256_neon_update(desc, data, len, partial);
> +               kernel_neon_end();
> +       }
> +
> +       return res;
> +}
> +
> +/* Add padding and return the message digest. */
> +static int sha256_neon_final(struct shash_desc *desc, u8 *out)
> +{
> +       struct sha256_state *sctx = shash_desc_ctx(desc);
> +       unsigned int i, index, padlen;
> +       __be32 *dst = (__be32 *)out;
> +       __be64 bits;
> +       static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
> +
> +       /* save number of bits */
> +       bits = cpu_to_be64(sctx->count << 3);
> +
> +       /* Pad out to 56 mod 64 and append length */
> +       index = sctx->count % SHA256_BLOCK_SIZE;
> +       padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
> +
> +       if (!may_use_simd()) {
> +               crypto_sha256_update(desc, padding, padlen);
> +               crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits));
> +       } else {
> +               kernel_neon_begin();
> +               /* We need to fill a whole block for __sha256_neon_update() */
> +               if (padlen <= 56) {
> +                       sctx->count += padlen;
> +                       memcpy(sctx->buf + index, padding, padlen);
> +               } else {
> +                       __sha256_neon_update(desc, padding, padlen, index);
> +               }
> +               __sha256_neon_update(desc, (const u8 *)&bits,
> +                                       sizeof(bits), 56);
> +               kernel_neon_end();
> +       }
> +
> +       /* Store state in digest */
> +       for (i = 0; i < 8; i++)
> +               dst[i] = cpu_to_be32(sctx->state[i]);
> +
> +       /* Wipe context */
> +       memset(sctx, 0, sizeof(*sctx));
> +
> +       return 0;
> +}
> +
> +static int sha256_neon_export(struct shash_desc *desc, void *out)
> +{
> +       struct sha256_state *sctx = shash_desc_ctx(desc);
> +
> +       memcpy(out, sctx, sizeof(*sctx));
> +
> +       return 0;
> +}
> +
> +static int sha256_neon_import(struct shash_desc *desc, const void *in)
> +{
> +       struct sha256_state *sctx = shash_desc_ctx(desc);
> +
> +       memcpy(sctx, in, sizeof(*sctx));
> +
> +       return 0;
> +}
> +
> +static struct shash_alg alg = {
> +       .digestsize     =       SHA256_DIGEST_SIZE,
> +       .init           =       sha256_neon_init,
> +       .update         =       sha256_neon_update,
> +       .final          =       sha256_neon_final,
> +       .export         =       sha256_neon_export,
> +       .import         =       sha256_neon_import,
> +       .descsize       =       sizeof(struct sha256_state),
> +       .statesize      =       sizeof(struct sha256_state),
> +       .base           =       {
> +               .cra_name       =       "sha256",
> +               .cra_driver_name =      "sha256-neon",
> +               .cra_priority   =       350,
> +               .cra_flags      =       CRYPTO_ALG_TYPE_SHASH,
> +               .cra_blocksize  =       SHA256_BLOCK_SIZE,
> +               .cra_module     =       THIS_MODULE,
> +       }
> +};
> +

You can also implement SHA-224 using the same core transform, it's
just some trivial glue code.

> +static int __init sha256_neon_mod_init(void)
> +{
> +       if (!cpu_has_neon())
> +               return -ENODEV;
> +
> +       return crypto_register_shash(&alg);
> +}
> +
> +static void __exit sha256_neon_mod_fini(void)
> +{
> +       crypto_unregister_shash(&alg);
> +}
> +
> +module_init(sha256_neon_mod_init);
> +module_exit(sha256_neon_mod_fini);
> +
> +MODULE_LICENSE("GPL");
> +MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, NEON accelerated");
> +
> +MODULE_ALIAS("sha256");
> diff --git a/crypto/Kconfig b/crypto/Kconfig
> index 50f4da4..0505523 100644
> --- a/crypto/Kconfig
> +++ b/crypto/Kconfig
> @@ -610,6 +610,18 @@ config CRYPTO_SHA256
>           This code also includes SHA-224, a 224 bit hash with 112 bits
>           of security against collision attacks.
>
> +config CRYPTO_SHA256_ARM_NEON
> +       tristate "SHA256 digest algorithm (ARM NEON)"
> +       depends on ARM && KERNEL_MODE_NEON && !CPU_BIG_ENDIAN
> +       select CRYPTO_SHA256
> +       select CRYPTO_HASH
> +       help
> +         SHA-256 secure hash standard (DFIPS 180-2) implemented
> +         using ARM NEON instructions, when available.
> +
> +         This version of SHA implements a 256 bit hash with 128 bits of
> +         security against collision attacks.
> +

Could you please rebase this onto Herbert's cryptodev tree and move
this to arch/arm/crypto/Kconfig?


>  config CRYPTO_SHA256_SPARC64
>         tristate "SHA224 and SHA256 digest algorithm (SPARC64)"
>         depends on SPARC64

Regards,
Ard.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCH] arm: crypto: Add NEON optimized SHA-256
@ 2015-03-16 16:08   ` Ard Biesheuvel
  0 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-16 16:08 UTC (permalink / raw)
  To: linux-arm-kernel

Hello Sami,

On 16 March 2015 at 16:48, Sami Tolvanen <samitolvanen@google.com> wrote:
> Add Andy Polyakov's NEON optimized SHA-256 implementation.
>
> On Nexus 6, this implementation is ~2x faster than sha256-generic.
>
> Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
>

Have you tested this code with the tcrypt.ko module?

Some more comments below

> ---
>  arch/arm/crypto/Makefile            |    2
>  arch/arm/crypto/sha256-armv7-neon.S |  819 ++++++++++++++++++++++++++++++++++++
>  arch/arm/crypto/sha256_neon_glue.c  |  201 ++++++++
>  crypto/Kconfig                      |   12
>  4 files changed, 1034 insertions(+)
>
> diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
> index b48fa34..316dba2 100644
> --- a/arch/arm/crypto/Makefile
> +++ b/arch/arm/crypto/Makefile
> @@ -6,12 +6,14 @@ obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
>  obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
>  obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
>  obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
> +obj-$(CONFIG_CRYPTO_SHA256_ARM_NEON) += sha256-arm-neon.o
>  obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
>
>  aes-arm-y      := aes-armv4.o aes_glue.o
>  aes-arm-bs-y   := aesbs-core.o aesbs-glue.o
>  sha1-arm-y     := sha1-armv4-large.o sha1_glue.o
>  sha1-arm-neon-y        := sha1-armv7-neon.o sha1_neon_glue.o
> +sha256-arm-neon-y := sha256-armv7-neon.o sha256_neon_glue.o
>  sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o
>
>  quiet_cmd_perl = PERL    $@
> diff --git a/arch/arm/crypto/sha256-armv7-neon.S b/arch/arm/crypto/sha256-armv7-neon.S
> new file mode 100644
> index 0000000..5ce04c2
> --- /dev/null
> +++ b/arch/arm/crypto/sha256-armv7-neon.S
> @@ -0,0 +1,819 @@
> +@ sha256-armv7-neon.S  -  ARM/NEON assembly implementation of SHA-256 transform
> +@
> +@ ====================================================================
> +@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
> +@ project. The module is, however, dual licensed under OpenSSL and
> +@ CRYPTOGAMS licenses depending on where you obtain it. For further
> +@ details see http://www.openssl.org/~appro/cryptogams/.
> +@ ====================================================================
> +

Did you talk to Andy about the license? I don't think this is
permissible for the kernel as-is.


> +#include <linux/linkage.h>
> +
> +.text
> +.code   32
> +.fpu neon
> +
> +.type  K256,%object
> +.align 5
> +K256:
> +.word  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
> +.word  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
> +.word  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
> +.word  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
> +.word  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
> +.word  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
> +.word  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
> +.word  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
> +.word  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
> +.word  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
> +.word  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
> +.word  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
> +.word  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
> +.word  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
> +.word  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
> +.word  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
> +.size  K256,.-K256
> +.word  0                               @ terminator
> +.word  0
> +.align 5
> +
> +.align 5
> +ENTRY(sha256_transform_neon)
> +       /* Input:
> +        *      %r0: SHA256_CONTEXT
> +        *      %r1: data
> +        *      %r2: nblks
> +        */
> +       sub     r3,pc,#8                @ sha256_transform_neon

This is broken on thumb-2, use adr instead


> +       add     r2,r1,r2,lsl#6  @ len to point at the end of inp
> +
> +       stmdb   sp!,{r4-r12,lr}
> +
> +       mov     r12,sp
> +       sub     sp,sp,#16*4+16          @ alloca
> +       sub     r14,r3,#256+32  @ K256
> +       bic     sp,sp,#15               @ align for 128-bit stores
> +
> +       vld1.8          {q0},[r1]!
> +       vld1.8          {q1},[r1]!
> +       vld1.8          {q2},[r1]!
> +       vld1.8          {q3},[r1]!
> +       vld1.32         {q8},[r14,:128]!
> +       vld1.32         {q9},[r14,:128]!
> +       vld1.32         {q10},[r14,:128]!
> +       vld1.32         {q11},[r14,:128]!
> +       vrev32.8        q0,q0           @ yes, even on
> +       str             r0,[sp,#64]
> +       vrev32.8        q1,q1           @ big-endian
> +       str             r1,[sp,#68]
> +       mov             r1,sp
> +       vrev32.8        q2,q2
> +       str             r2,[sp,#72]
> +       vrev32.8        q3,q3
> +       str             r12,[sp,#76]            @ save original sp
> +       vadd.i32        q8,q8,q0
> +       vadd.i32        q9,q9,q1
> +       vst1.32         {q8},[r1,:128]!
> +       vadd.i32        q10,q10,q2
> +       vst1.32         {q9},[r1,:128]!
> +       vadd.i32        q11,q11,q3
> +       vst1.32         {q10},[r1,:128]!
> +       vst1.32         {q11},[r1,:128]!
> +
> +       ldmia           r0,{r4-r11}
> +       sub             r1,r1,#64
> +       ldr             r2,[sp,#0]
> +       eor             r12,r12,r12
> +       eor             r3,r5,r6
> +       b               .L_00_48
> +
> +.align 4
> +.L_00_48:
> +       vext.8  q8,q0,q1,#4
> +       add     r11,r11,r2
> +       eor     r2,r9,r10
> +       eor     r0,r8,r8,ror#5
> +       vext.8  q9,q2,q3,#4
> +       add     r4,r4,r12
> +       and     r2,r2,r8
> +       eor     r12,r0,r8,ror#19
> +       vshr.u32        q10,q8,#7
> +       eor     r0,r4,r4,ror#11
> +       eor     r2,r2,r10
> +       vadd.i32        q0,q0,q9
> +       add     r11,r11,r12,ror#6
> +       eor     r12,r4,r5
> +       vshr.u32        q9,q8,#3
> +       eor     r0,r0,r4,ror#20
> +       add     r11,r11,r2
> +       vsli.32 q10,q8,#25
> +       ldr     r2,[sp,#4]
> +       and     r3,r3,r12
> +       vshr.u32        q11,q8,#18
> +       add     r7,r7,r11
> +       add     r11,r11,r0,ror#2
> +       eor     r3,r3,r5
> +       veor    q9,q9,q10
> +       add     r10,r10,r2
> +       vsli.32 q11,q8,#14
> +       eor     r2,r8,r9
> +       eor     r0,r7,r7,ror#5
> +       vshr.u32        d24,d7,#17
> +       add     r11,r11,r3
> +       and     r2,r2,r7
> +       veor    q9,q9,q11
> +       eor     r3,r0,r7,ror#19
> +       eor     r0,r11,r11,ror#11
> +       vsli.32 d24,d7,#15
> +       eor     r2,r2,r9
> +       add     r10,r10,r3,ror#6
> +       vshr.u32        d25,d7,#10
> +       eor     r3,r11,r4
> +       eor     r0,r0,r11,ror#20
> +       vadd.i32        q0,q0,q9
> +       add     r10,r10,r2
> +       ldr     r2,[sp,#8]
> +       veor    d25,d25,d24
> +       and     r12,r12,r3
> +       add     r6,r6,r10
> +       vshr.u32        d24,d7,#19
> +       add     r10,r10,r0,ror#2
> +       eor     r12,r12,r4
> +       vsli.32 d24,d7,#13
> +       add     r9,r9,r2
> +       eor     r2,r7,r8
> +       veor    d25,d25,d24
> +       eor     r0,r6,r6,ror#5
> +       add     r10,r10,r12
> +       vadd.i32        d0,d0,d25
> +       and     r2,r2,r6
> +       eor     r12,r0,r6,ror#19
> +       vshr.u32        d24,d0,#17
> +       eor     r0,r10,r10,ror#11
> +       eor     r2,r2,r8
> +       vsli.32 d24,d0,#15
> +       add     r9,r9,r12,ror#6
> +       eor     r12,r10,r11
> +       vshr.u32        d25,d0,#10
> +       eor     r0,r0,r10,ror#20
> +       add     r9,r9,r2
> +       veor    d25,d25,d24
> +       ldr     r2,[sp,#12]
> +       and     r3,r3,r12
> +       vshr.u32        d24,d0,#19
> +       add     r5,r5,r9
> +       add     r9,r9,r0,ror#2
> +       eor     r3,r3,r11
> +       vld1.32 {q8},[r14,:128]!
> +       add     r8,r8,r2
> +       vsli.32 d24,d0,#13
> +       eor     r2,r6,r7
> +       eor     r0,r5,r5,ror#5
> +       veor    d25,d25,d24
> +       add     r9,r9,r3
> +       and     r2,r2,r5
> +       vadd.i32        d1,d1,d25
> +       eor     r3,r0,r5,ror#19
> +       eor     r0,r9,r9,ror#11
> +       vadd.i32        q8,q8,q0
> +       eor     r2,r2,r7
> +       add     r8,r8,r3,ror#6
> +       eor     r3,r9,r10
> +       eor     r0,r0,r9,ror#20
> +       add     r8,r8,r2
> +       ldr     r2,[sp,#16]
> +       and     r12,r12,r3
> +       add     r4,r4,r8
> +       vst1.32 {q8},[r1,:128]!
> +       add     r8,r8,r0,ror#2
> +       eor     r12,r12,r10
> +       vext.8  q8,q1,q2,#4
> +       add     r7,r7,r2
> +       eor     r2,r5,r6
> +       eor     r0,r4,r4,ror#5
> +       vext.8  q9,q3,q0,#4
> +       add     r8,r8,r12
> +       and     r2,r2,r4
> +       eor     r12,r0,r4,ror#19
> +       vshr.u32        q10,q8,#7
> +       eor     r0,r8,r8,ror#11
> +       eor     r2,r2,r6
> +       vadd.i32        q1,q1,q9
> +       add     r7,r7,r12,ror#6
> +       eor     r12,r8,r9
> +       vshr.u32        q9,q8,#3
> +       eor     r0,r0,r8,ror#20
> +       add     r7,r7,r2
> +       vsli.32 q10,q8,#25
> +       ldr     r2,[sp,#20]
> +       and     r3,r3,r12
> +       vshr.u32        q11,q8,#18
> +       add     r11,r11,r7
> +       add     r7,r7,r0,ror#2
> +       eor     r3,r3,r9
> +       veor    q9,q9,q10
> +       add     r6,r6,r2
> +       vsli.32 q11,q8,#14
> +       eor     r2,r4,r5
> +       eor     r0,r11,r11,ror#5
> +       vshr.u32        d24,d1,#17
> +       add     r7,r7,r3
> +       and     r2,r2,r11
> +       veor    q9,q9,q11
> +       eor     r3,r0,r11,ror#19
> +       eor     r0,r7,r7,ror#11
> +       vsli.32 d24,d1,#15
> +       eor     r2,r2,r5
> +       add     r6,r6,r3,ror#6
> +       vshr.u32        d25,d1,#10
> +       eor     r3,r7,r8
> +       eor     r0,r0,r7,ror#20
> +       vadd.i32        q1,q1,q9
> +       add     r6,r6,r2
> +       ldr     r2,[sp,#24]
> +       veor    d25,d25,d24
> +       and     r12,r12,r3
> +       add     r10,r10,r6
> +       vshr.u32        d24,d1,#19
> +       add     r6,r6,r0,ror#2
> +       eor     r12,r12,r8
> +       vsli.32 d24,d1,#13
> +       add     r5,r5,r2
> +       eor     r2,r11,r4
> +       veor    d25,d25,d24
> +       eor     r0,r10,r10,ror#5
> +       add     r6,r6,r12
> +       vadd.i32        d2,d2,d25
> +       and     r2,r2,r10
> +       eor     r12,r0,r10,ror#19
> +       vshr.u32        d24,d2,#17
> +       eor     r0,r6,r6,ror#11
> +       eor     r2,r2,r4
> +       vsli.32 d24,d2,#15
> +       add     r5,r5,r12,ror#6
> +       eor     r12,r6,r7
> +       vshr.u32        d25,d2,#10
> +       eor     r0,r0,r6,ror#20
> +       add     r5,r5,r2
> +       veor    d25,d25,d24
> +       ldr     r2,[sp,#28]
> +       and     r3,r3,r12
> +       vshr.u32        d24,d2,#19
> +       add     r9,r9,r5
> +       add     r5,r5,r0,ror#2
> +       eor     r3,r3,r7
> +       vld1.32 {q8},[r14,:128]!
> +       add     r4,r4,r2
> +       vsli.32 d24,d2,#13
> +       eor     r2,r10,r11
> +       eor     r0,r9,r9,ror#5
> +       veor    d25,d25,d24
> +       add     r5,r5,r3
> +       and     r2,r2,r9
> +       vadd.i32        d3,d3,d25
> +       eor     r3,r0,r9,ror#19
> +       eor     r0,r5,r5,ror#11
> +       vadd.i32        q8,q8,q1
> +       eor     r2,r2,r11
> +       add     r4,r4,r3,ror#6
> +       eor     r3,r5,r6
> +       eor     r0,r0,r5,ror#20
> +       add     r4,r4,r2
> +       ldr     r2,[sp,#32]
> +       and     r12,r12,r3
> +       add     r8,r8,r4
> +       vst1.32 {q8},[r1,:128]!
> +       add     r4,r4,r0,ror#2
> +       eor     r12,r12,r6
> +       vext.8  q8,q2,q3,#4
> +       add     r11,r11,r2
> +       eor     r2,r9,r10
> +       eor     r0,r8,r8,ror#5
> +       vext.8  q9,q0,q1,#4
> +       add     r4,r4,r12
> +       and     r2,r2,r8
> +       eor     r12,r0,r8,ror#19
> +       vshr.u32        q10,q8,#7
> +       eor     r0,r4,r4,ror#11
> +       eor     r2,r2,r10
> +       vadd.i32        q2,q2,q9
> +       add     r11,r11,r12,ror#6
> +       eor     r12,r4,r5
> +       vshr.u32        q9,q8,#3
> +       eor     r0,r0,r4,ror#20
> +       add     r11,r11,r2
> +       vsli.32 q10,q8,#25
> +       ldr     r2,[sp,#36]
> +       and     r3,r3,r12
> +       vshr.u32        q11,q8,#18
> +       add     r7,r7,r11
> +       add     r11,r11,r0,ror#2
> +       eor     r3,r3,r5
> +       veor    q9,q9,q10
> +       add     r10,r10,r2
> +       vsli.32 q11,q8,#14
> +       eor     r2,r8,r9
> +       eor     r0,r7,r7,ror#5
> +       vshr.u32        d24,d3,#17
> +       add     r11,r11,r3
> +       and     r2,r2,r7
> +       veor    q9,q9,q11
> +       eor     r3,r0,r7,ror#19
> +       eor     r0,r11,r11,ror#11
> +       vsli.32 d24,d3,#15
> +       eor     r2,r2,r9
> +       add     r10,r10,r3,ror#6
> +       vshr.u32        d25,d3,#10
> +       eor     r3,r11,r4
> +       eor     r0,r0,r11,ror#20
> +       vadd.i32        q2,q2,q9
> +       add     r10,r10,r2
> +       ldr     r2,[sp,#40]
> +       veor    d25,d25,d24
> +       and     r12,r12,r3
> +       add     r6,r6,r10
> +       vshr.u32        d24,d3,#19
> +       add     r10,r10,r0,ror#2
> +       eor     r12,r12,r4
> +       vsli.32 d24,d3,#13
> +       add     r9,r9,r2
> +       eor     r2,r7,r8
> +       veor    d25,d25,d24
> +       eor     r0,r6,r6,ror#5
> +       add     r10,r10,r12
> +       vadd.i32        d4,d4,d25
> +       and     r2,r2,r6
> +       eor     r12,r0,r6,ror#19
> +       vshr.u32        d24,d4,#17
> +       eor     r0,r10,r10,ror#11
> +       eor     r2,r2,r8
> +       vsli.32 d24,d4,#15
> +       add     r9,r9,r12,ror#6
> +       eor     r12,r10,r11
> +       vshr.u32        d25,d4,#10
> +       eor     r0,r0,r10,ror#20
> +       add     r9,r9,r2
> +       veor    d25,d25,d24
> +       ldr     r2,[sp,#44]
> +       and     r3,r3,r12
> +       vshr.u32        d24,d4,#19
> +       add     r5,r5,r9
> +       add     r9,r9,r0,ror#2
> +       eor     r3,r3,r11
> +       vld1.32 {q8},[r14,:128]!
> +       add     r8,r8,r2
> +       vsli.32 d24,d4,#13
> +       eor     r2,r6,r7
> +       eor     r0,r5,r5,ror#5
> +       veor    d25,d25,d24
> +       add     r9,r9,r3
> +       and     r2,r2,r5
> +       vadd.i32        d5,d5,d25
> +       eor     r3,r0,r5,ror#19
> +       eor     r0,r9,r9,ror#11
> +       vadd.i32        q8,q8,q2
> +       eor     r2,r2,r7
> +       add     r8,r8,r3,ror#6
> +       eor     r3,r9,r10
> +       eor     r0,r0,r9,ror#20
> +       add     r8,r8,r2
> +       ldr     r2,[sp,#48]
> +       and     r12,r12,r3
> +       add     r4,r4,r8
> +       vst1.32 {q8},[r1,:128]!
> +       add     r8,r8,r0,ror#2
> +       eor     r12,r12,r10
> +       vext.8  q8,q3,q0,#4
> +       add     r7,r7,r2
> +       eor     r2,r5,r6
> +       eor     r0,r4,r4,ror#5
> +       vext.8  q9,q1,q2,#4
> +       add     r8,r8,r12
> +       and     r2,r2,r4
> +       eor     r12,r0,r4,ror#19
> +       vshr.u32        q10,q8,#7
> +       eor     r0,r8,r8,ror#11
> +       eor     r2,r2,r6
> +       vadd.i32        q3,q3,q9
> +       add     r7,r7,r12,ror#6
> +       eor     r12,r8,r9
> +       vshr.u32        q9,q8,#3
> +       eor     r0,r0,r8,ror#20
> +       add     r7,r7,r2
> +       vsli.32 q10,q8,#25
> +       ldr     r2,[sp,#52]
> +       and     r3,r3,r12
> +       vshr.u32        q11,q8,#18
> +       add     r11,r11,r7
> +       add     r7,r7,r0,ror#2
> +       eor     r3,r3,r9
> +       veor    q9,q9,q10
> +       add     r6,r6,r2
> +       vsli.32 q11,q8,#14
> +       eor     r2,r4,r5
> +       eor     r0,r11,r11,ror#5
> +       vshr.u32        d24,d5,#17
> +       add     r7,r7,r3
> +       and     r2,r2,r11
> +       veor    q9,q9,q11
> +       eor     r3,r0,r11,ror#19
> +       eor     r0,r7,r7,ror#11
> +       vsli.32 d24,d5,#15
> +       eor     r2,r2,r5
> +       add     r6,r6,r3,ror#6
> +       vshr.u32        d25,d5,#10
> +       eor     r3,r7,r8
> +       eor     r0,r0,r7,ror#20
> +       vadd.i32        q3,q3,q9
> +       add     r6,r6,r2
> +       ldr     r2,[sp,#56]
> +       veor    d25,d25,d24
> +       and     r12,r12,r3
> +       add     r10,r10,r6
> +       vshr.u32        d24,d5,#19
> +       add     r6,r6,r0,ror#2
> +       eor     r12,r12,r8
> +       vsli.32 d24,d5,#13
> +       add     r5,r5,r2
> +       eor     r2,r11,r4
> +       veor    d25,d25,d24
> +       eor     r0,r10,r10,ror#5
> +       add     r6,r6,r12
> +       vadd.i32        d6,d6,d25
> +       and     r2,r2,r10
> +       eor     r12,r0,r10,ror#19
> +       vshr.u32        d24,d6,#17
> +       eor     r0,r6,r6,ror#11
> +       eor     r2,r2,r4
> +       vsli.32 d24,d6,#15
> +       add     r5,r5,r12,ror#6
> +       eor     r12,r6,r7
> +       vshr.u32        d25,d6,#10
> +       eor     r0,r0,r6,ror#20
> +       add     r5,r5,r2
> +       veor    d25,d25,d24
> +       ldr     r2,[sp,#60]
> +       and     r3,r3,r12
> +       vshr.u32        d24,d6,#19
> +       add     r9,r9,r5
> +       add     r5,r5,r0,ror#2
> +       eor     r3,r3,r7
> +       vld1.32 {q8},[r14,:128]!
> +       add     r4,r4,r2
> +       vsli.32 d24,d6,#13
> +       eor     r2,r10,r11
> +       eor     r0,r9,r9,ror#5
> +       veor    d25,d25,d24
> +       add     r5,r5,r3
> +       and     r2,r2,r9
> +       vadd.i32        d7,d7,d25
> +       eor     r3,r0,r9,ror#19
> +       eor     r0,r5,r5,ror#11
> +       vadd.i32        q8,q8,q3
> +       eor     r2,r2,r11
> +       add     r4,r4,r3,ror#6
> +       eor     r3,r5,r6
> +       eor     r0,r0,r5,ror#20
> +       add     r4,r4,r2
> +       ldr     r2,[r14]
> +       and     r12,r12,r3
> +       add     r8,r8,r4
> +       vst1.32 {q8},[r1,:128]!
> +       add     r4,r4,r0,ror#2
> +       eor     r12,r12,r6
> +       teq     r2,#0                           @ check for K256 terminator
> +       ldr     r2,[sp,#0]
> +       sub     r1,r1,#64
> +       bne     .L_00_48
> +
> +       ldr             r1,[sp,#68]
> +       ldr             r0,[sp,#72]
> +       sub             r14,r14,#256    @ rewind r14
> +       teq             r1,r0
> +       subeq           r1,r1,#64               @ avoid SEGV
> +       vld1.8          {q0},[r1]!              @ load next input block
> +       vld1.8          {q1},[r1]!
> +       vld1.8          {q2},[r1]!
> +       vld1.8          {q3},[r1]!
> +       strne           r1,[sp,#68]
> +       mov             r1,sp
> +       add     r11,r11,r2
> +       eor     r2,r9,r10
> +       eor     r0,r8,r8,ror#5
> +       add     r4,r4,r12
> +       vld1.32 {q8},[r14,:128]!
> +       and     r2,r2,r8
> +       eor     r12,r0,r8,ror#19
> +       eor     r0,r4,r4,ror#11
> +       eor     r2,r2,r10
> +       vrev32.8        q0,q0
> +       add     r11,r11,r12,ror#6
> +       eor     r12,r4,r5
> +       eor     r0,r0,r4,ror#20
> +       add     r11,r11,r2
> +       vadd.i32        q8,q8,q0
> +       ldr     r2,[sp,#4]
> +       and     r3,r3,r12
> +       add     r7,r7,r11
> +       add     r11,r11,r0,ror#2
> +       eor     r3,r3,r5
> +       add     r10,r10,r2
> +       eor     r2,r8,r9
> +       eor     r0,r7,r7,ror#5
> +       add     r11,r11,r3
> +       and     r2,r2,r7
> +       eor     r3,r0,r7,ror#19
> +       eor     r0,r11,r11,ror#11
> +       eor     r2,r2,r9
> +       add     r10,r10,r3,ror#6
> +       eor     r3,r11,r4
> +       eor     r0,r0,r11,ror#20
> +       add     r10,r10,r2
> +       ldr     r2,[sp,#8]
> +       and     r12,r12,r3
> +       add     r6,r6,r10
> +       add     r10,r10,r0,ror#2
> +       eor     r12,r12,r4
> +       add     r9,r9,r2
> +       eor     r2,r7,r8
> +       eor     r0,r6,r6,ror#5
> +       add     r10,r10,r12
> +       and     r2,r2,r6
> +       eor     r12,r0,r6,ror#19
> +       eor     r0,r10,r10,ror#11
> +       eor     r2,r2,r8
> +       add     r9,r9,r12,ror#6
> +       eor     r12,r10,r11
> +       eor     r0,r0,r10,ror#20
> +       add     r9,r9,r2
> +       ldr     r2,[sp,#12]
> +       and     r3,r3,r12
> +       add     r5,r5,r9
> +       add     r9,r9,r0,ror#2
> +       eor     r3,r3,r11
> +       add     r8,r8,r2
> +       eor     r2,r6,r7
> +       eor     r0,r5,r5,ror#5
> +       add     r9,r9,r3
> +       and     r2,r2,r5
> +       eor     r3,r0,r5,ror#19
> +       eor     r0,r9,r9,ror#11
> +       eor     r2,r2,r7
> +       add     r8,r8,r3,ror#6
> +       eor     r3,r9,r10
> +       eor     r0,r0,r9,ror#20
> +       add     r8,r8,r2
> +       ldr     r2,[sp,#16]
> +       and     r12,r12,r3
> +       add     r4,r4,r8
> +       add     r8,r8,r0,ror#2
> +       eor     r12,r12,r10
> +       vst1.32 {q8},[r1,:128]!
> +       add     r7,r7,r2
> +       eor     r2,r5,r6
> +       eor     r0,r4,r4,ror#5
> +       add     r8,r8,r12
> +       vld1.32 {q8},[r14,:128]!
> +       and     r2,r2,r4
> +       eor     r12,r0,r4,ror#19
> +       eor     r0,r8,r8,ror#11
> +       eor     r2,r2,r6
> +       vrev32.8        q1,q1
> +       add     r7,r7,r12,ror#6
> +       eor     r12,r8,r9
> +       eor     r0,r0,r8,ror#20
> +       add     r7,r7,r2
> +       vadd.i32        q8,q8,q1
> +       ldr     r2,[sp,#20]
> +       and     r3,r3,r12
> +       add     r11,r11,r7
> +       add     r7,r7,r0,ror#2
> +       eor     r3,r3,r9
> +       add     r6,r6,r2
> +       eor     r2,r4,r5
> +       eor     r0,r11,r11,ror#5
> +       add     r7,r7,r3
> +       and     r2,r2,r11
> +       eor     r3,r0,r11,ror#19
> +       eor     r0,r7,r7,ror#11
> +       eor     r2,r2,r5
> +       add     r6,r6,r3,ror#6
> +       eor     r3,r7,r8
> +       eor     r0,r0,r7,ror#20
> +       add     r6,r6,r2
> +       ldr     r2,[sp,#24]
> +       and     r12,r12,r3
> +       add     r10,r10,r6
> +       add     r6,r6,r0,ror#2
> +       eor     r12,r12,r8
> +       add     r5,r5,r2
> +       eor     r2,r11,r4
> +       eor     r0,r10,r10,ror#5
> +       add     r6,r6,r12
> +       and     r2,r2,r10
> +       eor     r12,r0,r10,ror#19
> +       eor     r0,r6,r6,ror#11
> +       eor     r2,r2,r4
> +       add     r5,r5,r12,ror#6
> +       eor     r12,r6,r7
> +       eor     r0,r0,r6,ror#20
> +       add     r5,r5,r2
> +       ldr     r2,[sp,#28]
> +       and     r3,r3,r12
> +       add     r9,r9,r5
> +       add     r5,r5,r0,ror#2
> +       eor     r3,r3,r7
> +       add     r4,r4,r2
> +       eor     r2,r10,r11
> +       eor     r0,r9,r9,ror#5
> +       add     r5,r5,r3
> +       and     r2,r2,r9
> +       eor     r3,r0,r9,ror#19
> +       eor     r0,r5,r5,ror#11
> +       eor     r2,r2,r11
> +       add     r4,r4,r3,ror#6
> +       eor     r3,r5,r6
> +       eor     r0,r0,r5,ror#20
> +       add     r4,r4,r2
> +       ldr     r2,[sp,#32]
> +       and     r12,r12,r3
> +       add     r8,r8,r4
> +       add     r4,r4,r0,ror#2
> +       eor     r12,r12,r6
> +       vst1.32 {q8},[r1,:128]!
> +       add     r11,r11,r2
> +       eor     r2,r9,r10
> +       eor     r0,r8,r8,ror#5
> +       add     r4,r4,r12
> +       vld1.32 {q8},[r14,:128]!
> +       and     r2,r2,r8
> +       eor     r12,r0,r8,ror#19
> +       eor     r0,r4,r4,ror#11
> +       eor     r2,r2,r10
> +       vrev32.8        q2,q2
> +       add     r11,r11,r12,ror#6
> +       eor     r12,r4,r5
> +       eor     r0,r0,r4,ror#20
> +       add     r11,r11,r2
> +       vadd.i32        q8,q8,q2
> +       ldr     r2,[sp,#36]
> +       and     r3,r3,r12
> +       add     r7,r7,r11
> +       add     r11,r11,r0,ror#2
> +       eor     r3,r3,r5
> +       add     r10,r10,r2
> +       eor     r2,r8,r9
> +       eor     r0,r7,r7,ror#5
> +       add     r11,r11,r3
> +       and     r2,r2,r7
> +       eor     r3,r0,r7,ror#19
> +       eor     r0,r11,r11,ror#11
> +       eor     r2,r2,r9
> +       add     r10,r10,r3,ror#6
> +       eor     r3,r11,r4
> +       eor     r0,r0,r11,ror#20
> +       add     r10,r10,r2
> +       ldr     r2,[sp,#40]
> +       and     r12,r12,r3
> +       add     r6,r6,r10
> +       add     r10,r10,r0,ror#2
> +       eor     r12,r12,r4
> +       add     r9,r9,r2
> +       eor     r2,r7,r8
> +       eor     r0,r6,r6,ror#5
> +       add     r10,r10,r12
> +       and     r2,r2,r6
> +       eor     r12,r0,r6,ror#19
> +       eor     r0,r10,r10,ror#11
> +       eor     r2,r2,r8
> +       add     r9,r9,r12,ror#6
> +       eor     r12,r10,r11
> +       eor     r0,r0,r10,ror#20
> +       add     r9,r9,r2
> +       ldr     r2,[sp,#44]
> +       and     r3,r3,r12
> +       add     r5,r5,r9
> +       add     r9,r9,r0,ror#2
> +       eor     r3,r3,r11
> +       add     r8,r8,r2
> +       eor     r2,r6,r7
> +       eor     r0,r5,r5,ror#5
> +       add     r9,r9,r3
> +       and     r2,r2,r5
> +       eor     r3,r0,r5,ror#19
> +       eor     r0,r9,r9,ror#11
> +       eor     r2,r2,r7
> +       add     r8,r8,r3,ror#6
> +       eor     r3,r9,r10
> +       eor     r0,r0,r9,ror#20
> +       add     r8,r8,r2
> +       ldr     r2,[sp,#48]
> +       and     r12,r12,r3
> +       add     r4,r4,r8
> +       add     r8,r8,r0,ror#2
> +       eor     r12,r12,r10
> +       vst1.32 {q8},[r1,:128]!
> +       add     r7,r7,r2
> +       eor     r2,r5,r6
> +       eor     r0,r4,r4,ror#5
> +       add     r8,r8,r12
> +       vld1.32 {q8},[r14,:128]!
> +       and     r2,r2,r4
> +       eor     r12,r0,r4,ror#19
> +       eor     r0,r8,r8,ror#11
> +       eor     r2,r2,r6
> +       vrev32.8        q3,q3
> +       add     r7,r7,r12,ror#6
> +       eor     r12,r8,r9
> +       eor     r0,r0,r8,ror#20
> +       add     r7,r7,r2
> +       vadd.i32        q8,q8,q3
> +       ldr     r2,[sp,#52]
> +       and     r3,r3,r12
> +       add     r11,r11,r7
> +       add     r7,r7,r0,ror#2
> +       eor     r3,r3,r9
> +       add     r6,r6,r2
> +       eor     r2,r4,r5
> +       eor     r0,r11,r11,ror#5
> +       add     r7,r7,r3
> +       and     r2,r2,r11
> +       eor     r3,r0,r11,ror#19
> +       eor     r0,r7,r7,ror#11
> +       eor     r2,r2,r5
> +       add     r6,r6,r3,ror#6
> +       eor     r3,r7,r8
> +       eor     r0,r0,r7,ror#20
> +       add     r6,r6,r2
> +       ldr     r2,[sp,#56]
> +       and     r12,r12,r3
> +       add     r10,r10,r6
> +       add     r6,r6,r0,ror#2
> +       eor     r12,r12,r8
> +       add     r5,r5,r2
> +       eor     r2,r11,r4
> +       eor     r0,r10,r10,ror#5
> +       add     r6,r6,r12
> +       and     r2,r2,r10
> +       eor     r12,r0,r10,ror#19
> +       eor     r0,r6,r6,ror#11
> +       eor     r2,r2,r4
> +       add     r5,r5,r12,ror#6
> +       eor     r12,r6,r7
> +       eor     r0,r0,r6,ror#20
> +       add     r5,r5,r2
> +       ldr     r2,[sp,#60]
> +       and     r3,r3,r12
> +       add     r9,r9,r5
> +       add     r5,r5,r0,ror#2
> +       eor     r3,r3,r7
> +       add     r4,r4,r2
> +       eor     r2,r10,r11
> +       eor     r0,r9,r9,ror#5
> +       add     r5,r5,r3
> +       and     r2,r2,r9
> +       eor     r3,r0,r9,ror#19
> +       eor     r0,r5,r5,ror#11
> +       eor     r2,r2,r11
> +       add     r4,r4,r3,ror#6
> +       eor     r3,r5,r6
> +       eor     r0,r0,r5,ror#20
> +       add     r4,r4,r2
> +       ldr     r2,[sp,#64]
> +       and     r12,r12,r3
> +       add     r8,r8,r4
> +       add     r4,r4,r0,ror#2
> +       eor     r12,r12,r6
> +       vst1.32 {q8},[r1,:128]!
> +       ldr     r0,[r2,#0]
> +       add     r4,r4,r12                       @ h+=Maj(a,b,c) from the past
> +       ldr     r12,[r2,#4]
> +       ldr     r3,[r2,#8]
> +       ldr     r1,[r2,#12]
> +       add     r4,r4,r0                        @ accumulate
> +       ldr     r0,[r2,#16]
> +       add     r5,r5,r12
> +       ldr     r12,[r2,#20]
> +       add     r6,r6,r3
> +       ldr     r3,[r2,#24]
> +       add     r7,r7,r1
> +       ldr     r1,[r2,#28]
> +       add     r8,r8,r0
> +       str     r4,[r2],#4
> +       add     r9,r9,r12
> +       str     r5,[r2],#4
> +       add     r10,r10,r3
> +       str     r6,[r2],#4
> +       add     r11,r11,r1
> +       str     r7,[r2],#4
> +       stmia   r2,{r8-r11}
> +
> +       movne   r1,sp
> +       ldrne   r2,[sp,#0]
> +       eorne   r12,r12,r12
> +       ldreq   sp,[sp,#76]                     @ restore original sp
> +       eorne   r3,r5,r6
> +       bne     .L_00_48
> +
> +       ldmia   sp!,{r4-r12,pc}
> +ENDPROC(sha256_transform_neon)
> diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c
> new file mode 100644
> index 0000000..698a498
> --- /dev/null
> +++ b/arch/arm/crypto/sha256_neon_glue.c
> @@ -0,0 +1,201 @@
> +/*
> + * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
> + * using NEON instructions.
> + *
> + * Copyright ? 2015 Google Inc.
> + *
> + * This file is based on sha512_neon_glue.c:
> + *   Copyright ? 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms of the GNU General Public License as published by the Free
> + * Software Foundation; either version 2 of the License, or (at your option)
> + * any later version.
> + *
> + */
> +
> +#include <crypto/internal/hash.h>
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/mm.h>
> +#include <linux/cryptohash.h>
> +#include <linux/types.h>
> +#include <linux/string.h>
> +#include <crypto/sha.h>
> +#include <asm/byteorder.h>
> +#include <asm/simd.h>
> +#include <asm/neon.h>
> +
> +asmlinkage void sha256_transform_neon(u32 *digest, const void *data,
> +                                     unsigned int num_blks);
> +
> +
> +static int sha256_neon_init(struct shash_desc *desc)
> +{
> +       struct sha256_state *sctx = shash_desc_ctx(desc);
> +
> +       sctx->state[0] = SHA256_H0;
> +       sctx->state[1] = SHA256_H1;
> +       sctx->state[2] = SHA256_H2;
> +       sctx->state[3] = SHA256_H3;
> +       sctx->state[4] = SHA256_H4;
> +       sctx->state[5] = SHA256_H5;
> +       sctx->state[6] = SHA256_H6;
> +       sctx->state[7] = SHA256_H7;
> +       sctx->count = 0;
> +
> +       return 0;
> +}
> +
> +static int __sha256_neon_update(struct shash_desc *desc, const u8 *data,
> +                               unsigned int len, unsigned int partial)
> +{
> +       struct sha256_state *sctx = shash_desc_ctx(desc);
> +       unsigned int done = 0;
> +
> +       sctx->count += len;
> +
> +       if (partial) {
> +               done = SHA256_BLOCK_SIZE - partial;
> +               memcpy(sctx->buf + partial, data, done);
> +               sha256_transform_neon(sctx->state, sctx->buf, 1);
> +       }
> +
> +       if (len - done >= SHA256_BLOCK_SIZE) {
> +               const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
> +
> +               sha256_transform_neon(sctx->state, data + done, rounds);
> +               done += rounds * SHA256_BLOCK_SIZE;
> +       }
> +
> +       memcpy(sctx->buf, data + done, len - done);
> +
> +       return 0;
> +}
> +
> +static int sha256_neon_update(struct shash_desc *desc, const u8 *data,
> +                            unsigned int len)
> +{
> +       struct sha256_state *sctx = shash_desc_ctx(desc);
> +       unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
> +       int res;
> +
> +       /* Handle the fast case right here */
> +       if (partial + len < SHA256_BLOCK_SIZE) {
> +               sctx->count += len;
> +               memcpy(sctx->buf + partial, data, len);
> +
> +               return 0;
> +       }
> +
> +       if (!may_use_simd()) {
> +               res = crypto_sha256_update(desc, data, len);
> +       } else {
> +               kernel_neon_begin();
> +               res = __sha256_neon_update(desc, data, len, partial);
> +               kernel_neon_end();
> +       }
> +
> +       return res;
> +}
> +
> +/* Add padding and return the message digest. */
> +static int sha256_neon_final(struct shash_desc *desc, u8 *out)
> +{
> +       struct sha256_state *sctx = shash_desc_ctx(desc);
> +       unsigned int i, index, padlen;
> +       __be32 *dst = (__be32 *)out;
> +       __be64 bits;
> +       static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
> +
> +       /* save number of bits */
> +       bits = cpu_to_be64(sctx->count << 3);
> +
> +       /* Pad out to 56 mod 64 and append length */
> +       index = sctx->count % SHA256_BLOCK_SIZE;
> +       padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
> +
> +       if (!may_use_simd()) {
> +               crypto_sha256_update(desc, padding, padlen);
> +               crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits));
> +       } else {
> +               kernel_neon_begin();
> +               /* We need to fill a whole block for __sha256_neon_update() */
> +               if (padlen <= 56) {
> +                       sctx->count += padlen;
> +                       memcpy(sctx->buf + index, padding, padlen);
> +               } else {
> +                       __sha256_neon_update(desc, padding, padlen, index);
> +               }
> +               __sha256_neon_update(desc, (const u8 *)&bits,
> +                                       sizeof(bits), 56);
> +               kernel_neon_end();
> +       }
> +
> +       /* Store state in digest */
> +       for (i = 0; i < 8; i++)
> +               dst[i] = cpu_to_be32(sctx->state[i]);
> +
> +       /* Wipe context */
> +       memset(sctx, 0, sizeof(*sctx));
> +
> +       return 0;
> +}
> +
> +static int sha256_neon_export(struct shash_desc *desc, void *out)
> +{
> +       struct sha256_state *sctx = shash_desc_ctx(desc);
> +
> +       memcpy(out, sctx, sizeof(*sctx));
> +
> +       return 0;
> +}
> +
> +static int sha256_neon_import(struct shash_desc *desc, const void *in)
> +{
> +       struct sha256_state *sctx = shash_desc_ctx(desc);
> +
> +       memcpy(sctx, in, sizeof(*sctx));
> +
> +       return 0;
> +}
> +
> +static struct shash_alg alg = {
> +       .digestsize     =       SHA256_DIGEST_SIZE,
> +       .init           =       sha256_neon_init,
> +       .update         =       sha256_neon_update,
> +       .final          =       sha256_neon_final,
> +       .export         =       sha256_neon_export,
> +       .import         =       sha256_neon_import,
> +       .descsize       =       sizeof(struct sha256_state),
> +       .statesize      =       sizeof(struct sha256_state),
> +       .base           =       {
> +               .cra_name       =       "sha256",
> +               .cra_driver_name =      "sha256-neon",
> +               .cra_priority   =       350,
> +               .cra_flags      =       CRYPTO_ALG_TYPE_SHASH,
> +               .cra_blocksize  =       SHA256_BLOCK_SIZE,
> +               .cra_module     =       THIS_MODULE,
> +       }
> +};
> +

You can also implement SHA-224 using the same core transform, it's
just some trivial glue code.

> +static int __init sha256_neon_mod_init(void)
> +{
> +       if (!cpu_has_neon())
> +               return -ENODEV;
> +
> +       return crypto_register_shash(&alg);
> +}
> +
> +static void __exit sha256_neon_mod_fini(void)
> +{
> +       crypto_unregister_shash(&alg);
> +}
> +
> +module_init(sha256_neon_mod_init);
> +module_exit(sha256_neon_mod_fini);
> +
> +MODULE_LICENSE("GPL");
> +MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, NEON accelerated");
> +
> +MODULE_ALIAS("sha256");
> diff --git a/crypto/Kconfig b/crypto/Kconfig
> index 50f4da4..0505523 100644
> --- a/crypto/Kconfig
> +++ b/crypto/Kconfig
> @@ -610,6 +610,18 @@ config CRYPTO_SHA256
>           This code also includes SHA-224, a 224 bit hash with 112 bits
>           of security against collision attacks.
>
> +config CRYPTO_SHA256_ARM_NEON
> +       tristate "SHA256 digest algorithm (ARM NEON)"
> +       depends on ARM && KERNEL_MODE_NEON && !CPU_BIG_ENDIAN
> +       select CRYPTO_SHA256
> +       select CRYPTO_HASH
> +       help
> +         SHA-256 secure hash standard (DFIPS 180-2) implemented
> +         using ARM NEON instructions, when available.
> +
> +         This version of SHA implements a 256 bit hash with 128 bits of
> +         security against collision attacks.
> +

Could you please rebase this onto Herbert's cryptodev tree and move
this to arch/arm/crypto/Kconfig?


>  config CRYPTO_SHA256_SPARC64
>         tristate "SHA224 and SHA256 digest algorithm (SPARC64)"
>         depends on SPARC64

Regards,
Ard.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] arm: crypto: Add NEON optimized SHA-256
  2015-03-16 16:08   ` Ard Biesheuvel
@ 2015-03-16 16:23     ` Sami Tolvanen
  -1 siblings, 0 replies; 66+ messages in thread
From: Sami Tolvanen @ 2015-03-16 16:23 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Andy Polyakov, linux-arm-kernel, linux-crypto, Herbert Xu,
	David S. Miller

On Mon, Mar 16, 2015 at 05:08:03PM +0100, Ard Biesheuvel wrote:
> Have you tested this code with the tcrypt.ko module?

I have not, but I can look into it.

> Did you talk to Andy about the license? I don't think this is
> permissible for the kernel as-is.

Unless I have misunderstood something, the license at the Cryptogams
website includes an option to license the code under the GNU GPL.

However, I can certainly contact Andy to clarify his intentions.

> This is broken on thumb-2, use adr instead

> You can also implement SHA-224 using the same core transform, it's
> just some trivial glue code.

> Could you please rebase this onto Herbert's cryptodev tree and move
> this to arch/arm/crypto/Kconfig?

Thanks for the comments, I will submit a second version once we have
a clarification on the license.

Sami

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCH] arm: crypto: Add NEON optimized SHA-256
@ 2015-03-16 16:23     ` Sami Tolvanen
  0 siblings, 0 replies; 66+ messages in thread
From: Sami Tolvanen @ 2015-03-16 16:23 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Mar 16, 2015 at 05:08:03PM +0100, Ard Biesheuvel wrote:
> Have you tested this code with the tcrypt.ko module?

I have not, but I can look into it.

> Did you talk to Andy about the license? I don't think this is
> permissible for the kernel as-is.

Unless I have misunderstood something, the license at the Cryptogams
website includes an option to license the code under the GNU GPL.

However, I can certainly contact Andy to clarify his intentions.

> This is broken on thumb-2, use adr instead

> You can also implement SHA-224 using the same core transform, it's
> just some trivial glue code.

> Could you please rebase this onto Herbert's cryptodev tree and move
> this to arch/arm/crypto/Kconfig?

Thanks for the comments, I will submit a second version once we have
a clarification on the license.

Sami

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] arm: crypto: Add NEON optimized SHA-256
  2015-03-16 16:23     ` Sami Tolvanen
@ 2015-03-17  6:56       ` Ard Biesheuvel
  -1 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-17  6:56 UTC (permalink / raw)
  To: Sami Tolvanen
  Cc: Andy Polyakov, linux-arm-kernel, linux-crypto, Herbert Xu,
	David S. Miller

On 16 March 2015 at 17:23, Sami Tolvanen <samitolvanen@google.com> wrote:
> On Mon, Mar 16, 2015 at 05:08:03PM +0100, Ard Biesheuvel wrote:
>> Have you tested this code with the tcrypt.ko module?
>
> I have not, but I can look into it.
>
>> Did you talk to Andy about the license? I don't think this is
>> permissible for the kernel as-is.
>
> Unless I have misunderstood something, the license at the Cryptogams
> website includes an option to license the code under the GNU GPL.
>
> However, I can certainly contact Andy to clarify his intentions.
>
>> This is broken on thumb-2, use adr instead
>
>> You can also implement SHA-224 using the same core transform, it's
>> just some trivial glue code.
>
>> Could you please rebase this onto Herbert's cryptodev tree and move
>> this to arch/arm/crypto/Kconfig?
>
> Thanks for the comments, I will submit a second version once we have
> a clarification on the license.
>

I have tested your code with a Thumb2 kernel. With this patch folded,
it builds and passes the tcrypt.ko test for me

--- a/arch/arm/crypto/sha256-armv7-neon.S
+++ b/arch/arm/crypto/sha256-armv7-neon.S
@@ -10,7 +10,6 @@
 #include <linux/linkage.h>

 .text
-.code   32
 .fpu neon

 .type  K256,%object
@@ -44,15 +43,15 @@ ENTRY(sha256_transform_neon)
         *      %r1: data
         *      %r2: nblks
         */
-       sub     r3,pc,#8                @ sha256_transform_neon
        add     r2,r1,r2,lsl#6  @ len to point at the end of inp

        stmdb   sp!,{r4-r12,lr}

        mov     r12,sp
-       sub     sp,sp,#16*4+16          @ alloca
-       sub     r14,r3,#256+32  @ K256
-       bic     sp,sp,#15               @ align for 128-bit stores
+       sub     r14,sp,#16*4+16         @ alloca
+       bic     r14,r14,#15             @ align for 128-bit stores
+       mov     sp,r14
+       adr     r14,K256

        vld1.8          {q0},[r1]!
        vld1.8          {q1},[r1]!

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCH] arm: crypto: Add NEON optimized SHA-256
@ 2015-03-17  6:56       ` Ard Biesheuvel
  0 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-17  6:56 UTC (permalink / raw)
  To: linux-arm-kernel

On 16 March 2015 at 17:23, Sami Tolvanen <samitolvanen@google.com> wrote:
> On Mon, Mar 16, 2015 at 05:08:03PM +0100, Ard Biesheuvel wrote:
>> Have you tested this code with the tcrypt.ko module?
>
> I have not, but I can look into it.
>
>> Did you talk to Andy about the license? I don't think this is
>> permissible for the kernel as-is.
>
> Unless I have misunderstood something, the license at the Cryptogams
> website includes an option to license the code under the GNU GPL.
>
> However, I can certainly contact Andy to clarify his intentions.
>
>> This is broken on thumb-2, use adr instead
>
>> You can also implement SHA-224 using the same core transform, it's
>> just some trivial glue code.
>
>> Could you please rebase this onto Herbert's cryptodev tree and move
>> this to arch/arm/crypto/Kconfig?
>
> Thanks for the comments, I will submit a second version once we have
> a clarification on the license.
>

I have tested your code with a Thumb2 kernel. With this patch folded,
it builds and passes the tcrypt.ko test for me

--- a/arch/arm/crypto/sha256-armv7-neon.S
+++ b/arch/arm/crypto/sha256-armv7-neon.S
@@ -10,7 +10,6 @@
 #include <linux/linkage.h>

 .text
-.code   32
 .fpu neon

 .type  K256,%object
@@ -44,15 +43,15 @@ ENTRY(sha256_transform_neon)
         *      %r1: data
         *      %r2: nblks
         */
-       sub     r3,pc,#8                @ sha256_transform_neon
        add     r2,r1,r2,lsl#6  @ len to point at the end of inp

        stmdb   sp!,{r4-r12,lr}

        mov     r12,sp
-       sub     sp,sp,#16*4+16          @ alloca
-       sub     r14,r3,#256+32  @ K256
-       bic     sp,sp,#15               @ align for 128-bit stores
+       sub     r14,sp,#16*4+16         @ alloca
+       bic     r14,r14,#15             @ align for 128-bit stores
+       mov     sp,r14
+       adr     r14,K256

        vld1.8          {q0},[r1]!
        vld1.8          {q1},[r1]!

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] arm: crypto: Add NEON optimized SHA-256
  2015-03-17  6:56       ` Ard Biesheuvel
@ 2015-03-17 15:09         ` Andy Polyakov
  -1 siblings, 0 replies; 66+ messages in thread
From: Andy Polyakov @ 2015-03-17 15:09 UTC (permalink / raw)
  To: Ard Biesheuvel, Sami Tolvanen
  Cc: David S. Miller, linux-crypto, linux-arm-kernel, Herbert Xu

Hi,

>>> Have you tested this code with the tcrypt.ko module?
>>
>> I have not, but I can look into it.
>>
>>> Did you talk to Andy about the license? I don't think this is
>>> permissible for the kernel as-is.
>>
>> Unless I have misunderstood something, the license at the Cryptogams
>> website includes an option to license the code under the GNU GPL.
>>
>> However, I can certainly contact Andy to clarify his intentions.

I have no problems with reusing assembly modules in kernel context. The
whole idea behind cryptogams initiative was exactly to reuse code in
different contexts. But I'd prefer if it's more of cooperative effort,
when we help each other to improve code, test on and tune for more
platforms single developer might have access to, cross-report problems,
etc. For this reason I'd prefer if it can be arranged in way similar to
bsaes-armv7 module, i.e. we work together on shared copy of module that
generates assembly that can be then compiled for OpenSSL or kernel. Is
it sensible? BTW, why stop at SHA256? There is SHA512 and NEON SHA1...

As for practicalities. In order to spare brain capacity for list
subscribers, it would probably be appropriate to work out details such
as naming of entry points in smaller group and present result when it's
ready to go and tests. I'll start by looking at Thumb-ification...

Cheers.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCH] arm: crypto: Add NEON optimized SHA-256
@ 2015-03-17 15:09         ` Andy Polyakov
  0 siblings, 0 replies; 66+ messages in thread
From: Andy Polyakov @ 2015-03-17 15:09 UTC (permalink / raw)
  To: linux-arm-kernel

Hi,

>>> Have you tested this code with the tcrypt.ko module?
>>
>> I have not, but I can look into it.
>>
>>> Did you talk to Andy about the license? I don't think this is
>>> permissible for the kernel as-is.
>>
>> Unless I have misunderstood something, the license at the Cryptogams
>> website includes an option to license the code under the GNU GPL.
>>
>> However, I can certainly contact Andy to clarify his intentions.

I have no problems with reusing assembly modules in kernel context. The
whole idea behind cryptogams initiative was exactly to reuse code in
different contexts. But I'd prefer if it's more of cooperative effort,
when we help each other to improve code, test on and tune for more
platforms single developer might have access to, cross-report problems,
etc. For this reason I'd prefer if it can be arranged in way similar to
bsaes-armv7 module, i.e. we work together on shared copy of module that
generates assembly that can be then compiled for OpenSSL or kernel. Is
it sensible? BTW, why stop at SHA256? There is SHA512 and NEON SHA1...

As for practicalities. In order to spare brain capacity for list
subscribers, it would probably be appropriate to work out details such
as naming of entry points in smaller group and present result when it's
ready to go and tests. I'll start by looking at Thumb-ification...

Cheers.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] arm: crypto: Add NEON optimized SHA-256
  2015-03-17 15:09         ` Andy Polyakov
@ 2015-03-17 15:21           ` Sami Tolvanen
  -1 siblings, 0 replies; 66+ messages in thread
From: Sami Tolvanen @ 2015-03-17 15:21 UTC (permalink / raw)
  To: Andy Polyakov
  Cc: Ard Biesheuvel, linux-arm-kernel, linux-crypto, Herbert Xu,
	David S. Miller

On Tue, Mar 17, 2015 at 04:09:40PM +0100, Andy Polyakov wrote:
> I have no problems with reusing assembly modules in kernel context.

Awesome, thank you for clarifying this.

> I'd prefer if it can be arranged in way similar to bsaes-armv7 module,
> i.e. we work together on shared copy of module that generates assembly
> that can be then compiled for OpenSSL or kernel. Is it sensible?

Sure, that sounds good to me.

> BTW, why stop at SHA256? There is SHA512 and NEON SHA1...

The kernel already has NEON SHA-1 and SHA-512, but for some reason is
lacking SHA-256. I have not tested how they compare to yours though.

Sami

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCH] arm: crypto: Add NEON optimized SHA-256
@ 2015-03-17 15:21           ` Sami Tolvanen
  0 siblings, 0 replies; 66+ messages in thread
From: Sami Tolvanen @ 2015-03-17 15:21 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Mar 17, 2015 at 04:09:40PM +0100, Andy Polyakov wrote:
> I have no problems with reusing assembly modules in kernel context.

Awesome, thank you for clarifying this.

> I'd prefer if it can be arranged in way similar to bsaes-armv7 module,
> i.e. we work together on shared copy of module that generates assembly
> that can be then compiled for OpenSSL or kernel. Is it sensible?

Sure, that sounds good to me.

> BTW, why stop at SHA256? There is SHA512 and NEON SHA1...

The kernel already has NEON SHA-1 and SHA-512, but for some reason is
lacking SHA-256. I have not tested how they compare to yours though.

Sami

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] arm: crypto: Add NEON optimized SHA-256
  2015-03-17 15:09         ` Andy Polyakov
@ 2015-03-17 15:51           ` Ard Biesheuvel
  -1 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-17 15:51 UTC (permalink / raw)
  To: Andy Polyakov
  Cc: Sami Tolvanen, linux-arm-kernel, linux-crypto, Herbert Xu,
	David S. Miller

On 17 March 2015 at 16:09, Andy Polyakov <appro@openssl.org> wrote:
> Hi,
>
>>>> Have you tested this code with the tcrypt.ko module?
>>>
>>> I have not, but I can look into it.
>>>
>>>> Did you talk to Andy about the license? I don't think this is
>>>> permissible for the kernel as-is.
>>>
>>> Unless I have misunderstood something, the license at the Cryptogams
>>> website includes an option to license the code under the GNU GPL.
>>>
>>> However, I can certainly contact Andy to clarify his intentions.
>
> I have no problems with reusing assembly modules in kernel context. The
> whole idea behind cryptogams initiative was exactly to reuse code in
> different contexts. But I'd prefer if it's more of cooperative effort,
> when we help each other to improve code, test on and tune for more
> platforms single developer might have access to, cross-report problems,
> etc. For this reason I'd prefer if it can be arranged in way similar to
> bsaes-armv7 module, i.e. we work together on shared copy of module that
> generates assembly that can be then compiled for OpenSSL or kernel. Is

Yes, that is what I thought.

@Sami, this also means we should integrate the .pl file and the
generated .S. Please have a look at how I integrated Andy's
bsaes-armv7.pl module.

> it sensible? BTW, why stop at SHA256? There is SHA512 and NEON SHA1...
>
> As for practicalities. In order to spare brain capacity for list
> subscribers, it would probably be appropriate to work out details such
> as naming of entry points in smaller group and present result when it's
> ready to go and tests. I'll start by looking at Thumb-ification...
>

@Andy: the core code builds without problems, but for Thumb2 there are
some trivial changes to apply:
- '.code 32' needs to be made conditional
- use adr instead of pc arithmetic for the address of K256
- bic cannot use r13 as input/output, so it needs to go via another register
- some conditional instructions need 'it' instructions added (not
strictly necessary for the kernel, because it passes
-mimplicit-it-thumb on the assembler command line, but nice for
completeness)
(see patch in other reply)

I am happy to test and/or review stuff as well.

Thanks,
Ard.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCH] arm: crypto: Add NEON optimized SHA-256
@ 2015-03-17 15:51           ` Ard Biesheuvel
  0 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-17 15:51 UTC (permalink / raw)
  To: linux-arm-kernel

On 17 March 2015 at 16:09, Andy Polyakov <appro@openssl.org> wrote:
> Hi,
>
>>>> Have you tested this code with the tcrypt.ko module?
>>>
>>> I have not, but I can look into it.
>>>
>>>> Did you talk to Andy about the license? I don't think this is
>>>> permissible for the kernel as-is.
>>>
>>> Unless I have misunderstood something, the license at the Cryptogams
>>> website includes an option to license the code under the GNU GPL.
>>>
>>> However, I can certainly contact Andy to clarify his intentions.
>
> I have no problems with reusing assembly modules in kernel context. The
> whole idea behind cryptogams initiative was exactly to reuse code in
> different contexts. But I'd prefer if it's more of cooperative effort,
> when we help each other to improve code, test on and tune for more
> platforms single developer might have access to, cross-report problems,
> etc. For this reason I'd prefer if it can be arranged in way similar to
> bsaes-armv7 module, i.e. we work together on shared copy of module that
> generates assembly that can be then compiled for OpenSSL or kernel. Is

Yes, that is what I thought.

@Sami, this also means we should integrate the .pl file and the
generated .S. Please have a look at how I integrated Andy's
bsaes-armv7.pl module.

> it sensible? BTW, why stop at SHA256? There is SHA512 and NEON SHA1...
>
> As for practicalities. In order to spare brain capacity for list
> subscribers, it would probably be appropriate to work out details such
> as naming of entry points in smaller group and present result when it's
> ready to go and tests. I'll start by looking at Thumb-ification...
>

@Andy: the core code builds without problems, but for Thumb2 there are
some trivial changes to apply:
- '.code 32' needs to be made conditional
- use adr instead of pc arithmetic for the address of K256
- bic cannot use r13 as input/output, so it needs to go via another register
- some conditional instructions need 'it' instructions added (not
strictly necessary for the kernel, because it passes
-mimplicit-it-thumb on the assembler command line, but nice for
completeness)
(see patch in other reply)

I am happy to test and/or review stuff as well.

Thanks,
Ard.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCHv2] arm: crypto: Add optimized SHA-256/224
  2015-03-16 15:48 ` Sami Tolvanen
@ 2015-03-23 13:50   ` Sami Tolvanen
  -1 siblings, 0 replies; 66+ messages in thread
From: Sami Tolvanen @ 2015-03-23 13:50 UTC (permalink / raw)
  To: linux-arm-kernel, linux-crypto, ard.biesheuvel, herbert, davem
  Cc: Andy Polyakov

Add Andy Polyakov's optimized assembly and NEON implementations for
SHA-256/224.

The sha256-armv4.pl script for generating the assembly code is from
OpenSSL commit 2ecd32a1f8f0643ae7b38f59bbaf9f0d6ef326fe.

Compared to sha256-generic these implementations have the following
tcrypt speed improvements on Motorola Nexus 6 (Snapdragon 805):

  bs    b/u      sha256-neon  sha256-asm
  16    16       x1.32        x1.19
  64    16       x1.27        x1.15
  64    64       x1.36        x1.20
  256   16       x1.22        x1.11
  256   64       x1.36        x1.19
  256   256      x1.59        x1.23
  1024  16       x1.21        x1.10
  1024  256      x1.65        x1.23
  1024  1024     x1.76        x1.25
  2048  16       x1.21        x1.10
  2048  256      x1.66        x1.23
  2048  1024     x1.78        x1.25
  2048  2048     x1.79        x1.25
  4096  16       x1.20        x1.09
  4096  256      x1.66        x1.23
  4096  1024     x1.79        x1.26
  4096  4096     x1.82        x1.26
  8192  16       x1.20        x1.09
  8192  256      x1.67        x1.23
  8192  1024     x1.80        x1.26
  8192  4096     x1.85        x1.28
  8192  8192     x1.85        x1.27

Where bs refers to block size and b/u to bytes per update.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Cc: Andy Polyakov <appro@openssl.org>

---
Changes since v1:
  Rebased to Herbert's cryptodev tree
  Include sha256-armv4.pl and use it to generate sha256-core.S
  Add integer-only assembly version as sha256-asm
  Add support for SHA-224 to the glue code
  Change priority for sha256/224-ce to 300

---
 arch/arm/crypto/Kconfig               |    7 
 arch/arm/crypto/Makefile              |    8 
 arch/arm/crypto/sha2-ce-glue.c        |    4 
 arch/arm/crypto/sha256-armv4.pl       |  713 ++++++
 arch/arm/crypto/sha256-core.S_shipped | 2775 ++++++++++++++++++++++++
 arch/arm/crypto/sha256_glue.c         |  246 ++
 arch/arm/crypto/sha256_glue.h         |   23 
 arch/arm/crypto/sha256_neon_glue.c    |  172 +
 8 files changed, 3945 insertions(+), 3 deletions(-)

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index d63f319..458729d 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -46,6 +46,13 @@ config CRYPTO_SHA2_ARM_CE
 	  SHA-256 secure hash standard (DFIPS 180-2) implemented
 	  using special ARMv8 Crypto Extensions.
 
+config CRYPTO_SHA256_ARM
+	tristate "SHA-224/256 digest algorithm (ARM-asm and NEON)"
+	select CRYPTO_HASH
+	help
+	  SHA-256 secure hash standard (DFIPS 180-2) implemented
+	  using optimized ARM assembler and NEON, when available.
+
 config CRYPTO_SHA512_ARM_NEON
 	tristate "SHA384 and SHA512 digest algorithm (ARM NEON)"
 	depends on KERNEL_MODE_NEON
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index 9a273bd..ef46e89 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
 obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
+obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
 obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o
@@ -16,6 +17,8 @@ aes-arm-y	:= aes-armv4.o aes_glue.o
 aes-arm-bs-y	:= aesbs-core.o aesbs-glue.o
 sha1-arm-y	:= sha1-armv4-large.o sha1_glue.o
 sha1-arm-neon-y	:= sha1-armv7-neon.o sha1_neon_glue.o
+sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
+sha256-arm-y	:= sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
 sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o
 sha1-arm-ce-y	:= sha1-ce-core.o sha1-ce-glue.o
 sha2-arm-ce-y	:= sha2-ce-core.o sha2-ce-glue.o
@@ -28,4 +31,7 @@ quiet_cmd_perl = PERL    $@
 $(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
 	$(call cmd,perl)
 
-.PRECIOUS: $(obj)/aesbs-core.S
+$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
+	$(call cmd,perl)
+
+.PRECIOUS: $(obj)/aesbs-core.S $(obj)/sha256-core.S
diff --git a/arch/arm/crypto/sha2-ce-glue.c b/arch/arm/crypto/sha2-ce-glue.c
index 9ffe8ad..0449eca 100644
--- a/arch/arm/crypto/sha2-ce-glue.c
+++ b/arch/arm/crypto/sha2-ce-glue.c
@@ -163,7 +163,7 @@ static struct shash_alg algs[] = { {
 	.base			= {
 		.cra_name		= "sha224",
 		.cra_driver_name	= "sha224-ce",
-		.cra_priority		= 200,
+		.cra_priority		= 300,
 		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
 		.cra_blocksize		= SHA256_BLOCK_SIZE,
 		.cra_module		= THIS_MODULE,
@@ -180,7 +180,7 @@ static struct shash_alg algs[] = { {
 	.base			= {
 		.cra_name		= "sha256",
 		.cra_driver_name	= "sha256-ce",
-		.cra_priority		= 200,
+		.cra_priority		= 300,
 		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
 		.cra_blocksize		= SHA256_BLOCK_SIZE,
 		.cra_module		= THIS_MODULE,
diff --git a/arch/arm/crypto/sha256-armv4.pl b/arch/arm/crypto/sha256-armv4.pl
new file mode 100644
index 0000000..4fee74d
--- /dev/null
+++ b/arch/arm/crypto/sha256-armv4.pl
@@ -0,0 +1,713 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Permission to use under GPL terms is granted.
+# ====================================================================
+
+# SHA256 block procedure for ARMv4. May 2007.
+
+# Performance is ~2x better than gcc 3.4 generated code and in "abso-
+# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+# byte [on single-issue Xscale PXA250 core].
+
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 22% improvement on
+# Cortex A8 core and ~20 cycles per processed byte.
+
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+# September 2013.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process one
+# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+# code (meaning that latter performs sub-optimally, nothing was done
+# about it).
+
+# May 2014.
+#
+# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$ctx="r0";	$t0="r0";
+$inp="r1";	$t4="r1";
+$len="r2";	$t1="r2";
+$T1="r3";	$t3="r3";
+$A="r4";
+$B="r5";
+$C="r6";
+$D="r7";
+$E="r8";
+$F="r9";
+$G="r10";
+$H="r11";
+@V=($A,$B,$C,$D,$E,$F,$G,$H);
+$t2="r12";
+$Ktbl="r14";
+
+@Sigma0=( 2,13,22);
+@Sigma1=( 6,11,25);
+@sigma0=( 7,18, 3);
+@sigma1=(17,19,10);
+
+sub BODY_00_15 {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___ if ($i<16);
+#if __ARM_ARCH__>=7
+	@ ldr	$t1,[$inp],#4			@ $i
+# if $i==15
+	str	$inp,[sp,#17*4]			@ make room for $t4
+# endif
+	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
+	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
+	rev	$t1,$t1
+#else
+	@ ldrb	$t1,[$inp,#3]			@ $i
+	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
+	ldrb	$t2,[$inp,#2]
+	ldrb	$t0,[$inp,#1]
+	orr	$t1,$t1,$t2,lsl#8
+	ldrb	$t2,[$inp],#4
+	orr	$t1,$t1,$t0,lsl#16
+# if $i==15
+	str	$inp,[sp,#17*4]			@ make room for $t4
+# endif
+	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+	orr	$t1,$t1,$t2,lsl#24
+	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
+#endif
+___
+$code.=<<___;
+	ldr	$t2,[$Ktbl],#4			@ *K256++
+	add	$h,$h,$t1			@ h+=X[i]
+	str	$t1,[sp,#`$i%16`*4]
+	eor	$t1,$f,$g
+	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
+	and	$t1,$t1,$e
+	add	$h,$h,$t2			@ h+=K256[i]
+	eor	$t1,$t1,$g			@ Ch(e,f,g)
+	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
+	add	$h,$h,$t1			@ h+=Ch(e,f,g)
+#if $i==31
+	and	$t2,$t2,#0xff
+	cmp	$t2,#0xf2			@ done?
+#endif
+#if $i<15
+# if __ARM_ARCH__>=7
+	ldr	$t1,[$inp],#4			@ prefetch
+# else
+	ldrb	$t1,[$inp,#3]
+# endif
+	eor	$t2,$a,$b			@ a^b, b^c in next round
+#else
+	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
+	eor	$t2,$a,$b			@ a^b, b^c in next round
+	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
+#endif
+	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
+	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
+	add	$d,$d,$h			@ d+=h
+	eor	$t3,$t3,$b			@ Maj(a,b,c)
+	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
+	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
+___
+	($t2,$t3)=($t3,$t2);
+}
+
+sub BODY_16_XX {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___;
+	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
+	@ ldr	$t4,[sp,#`($i+14)%16`*4]
+	mov	$t0,$t1,ror#$sigma0[0]
+	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
+	mov	$t2,$t4,ror#$sigma1[0]
+	eor	$t0,$t0,$t1,ror#$sigma0[1]
+	eor	$t2,$t2,$t4,ror#$sigma1[1]
+	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
+	ldr	$t1,[sp,#`($i+0)%16`*4]
+	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
+	ldr	$t4,[sp,#`($i+9)%16`*4]
+
+	add	$t2,$t2,$t0
+	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
+	add	$t1,$t1,$t2
+	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
+	add	$t1,$t1,$t4			@ X[i]
+___
+	&BODY_00_15(@_);
+}
+
+$code=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code	32
+#else
+.syntax unified
+# ifdef __thumb2__
+.thumb
+# else
+.code   32
+# endif
+#endif
+
+.type	K256,%object
+.align	5
+K256:
+.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size	K256,.-K256
+.word	0				@ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-sha256_block_data_order
+#endif
+.align	5
+
+.global	sha256_block_data_order
+.type	sha256_block_data_order,%function
+sha256_block_data_order:
+#if __ARM_ARCH__<7
+	sub	r3,pc,#8		@ sha256_block_data_order
+#else
+	adr	r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+	ldr	r12,.LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+	tst	r12,#ARMV8_SHA256
+	bne	.LARMv8
+	tst	r12,#ARMV7_NEON
+	bne	.LNEON
+#endif
+	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
+	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
+	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
+	sub	$Ktbl,r3,#256+32	@ K256
+	sub	sp,sp,#16*4		@ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+	ldr	$t1,[$inp],#4
+# else
+	ldrb	$t1,[$inp,#3]
+# endif
+	eor	$t3,$B,$C		@ magic
+	eor	$t2,$t2,$t2
+___
+for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=".Lrounds_16_xx:\n";
+for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+#if __ARM_ARCH__>=7
+	ite	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	ldreq	$t3,[sp,#16*4]		@ pull ctx
+	bne	.Lrounds_16_xx
+
+	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
+	ldr	$t0,[$t3,#0]
+	ldr	$t1,[$t3,#4]
+	ldr	$t2,[$t3,#8]
+	add	$A,$A,$t0
+	ldr	$t0,[$t3,#12]
+	add	$B,$B,$t1
+	ldr	$t1,[$t3,#16]
+	add	$C,$C,$t2
+	ldr	$t2,[$t3,#20]
+	add	$D,$D,$t0
+	ldr	$t0,[$t3,#24]
+	add	$E,$E,$t1
+	ldr	$t1,[$t3,#28]
+	add	$F,$F,$t2
+	ldr	$inp,[sp,#17*4]		@ pull inp
+	ldr	$t2,[sp,#18*4]		@ pull inp+len
+	add	$G,$G,$t0
+	add	$H,$H,$t1
+	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
+	cmp	$inp,$t2
+	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
+	bne	.Loop
+
+	add	sp,sp,#`16+3`*4	@ destroy frame
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r11,pc}
+#else
+	ldmia	sp!,{r4-r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	sha256_block_data_order,.-sha256_block_data_order
+___
+######################################################################
+# NEON stuff
+#
+{{{
+my @X=map("q$_",(0..3));
+my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
+my $Xfer=$t4;
+my $j=0;
+
+sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
+sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
+
+sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+  my $arg = pop;
+    $arg = "#$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+sub Xupdate()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vshr_u32	($T2,$T0,$sigma0[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vshr_u32	($T1,$T0,$sigma0[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vsli_32	($T2,$T0,32-$sigma0[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vshr_u32	($T3,$T0,$sigma0[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&veor		($T1,$T1,$T2);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vsli_32	($T3,$T0,32-$sigma0[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &veor		($T5,$T5,$T4);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &veor		($T5,$T5,$T4);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vld1_32	("{$T0}","[$Ktbl,:128]!");
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	($T0,$T0,@X[0]);
+	 while($#insns>=2) { eval(shift(@insns)); }
+	&vst1_32	("{$T0}","[$Xfer,:128]!");
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	push(@X,shift(@X));		# "rotate" X[]
+}
+
+sub Xpreload()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vld1_32	("{$T0}","[$Ktbl,:128]!");
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vrev32_8	(@X[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	($T0,$T0,@X[0]);
+	 foreach (@insns) { eval; }	# remaining instructions
+	&vst1_32	("{$T0}","[$Xfer,:128]!");
+
+	push(@X,shift(@X));		# "rotate" X[]
+}
+
+sub body_00_15 () {
+	(
+	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
+	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
+	'&eor	($t1,$f,$g)',
+	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
+	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
+	'&and	($t1,$t1,$e)',
+	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
+	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
+	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
+	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
+	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
+	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
+	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
+	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
+	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
+	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
+	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
+	'&add	($d,$d,$h)',			# d+=h
+	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
+	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
+	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
+	)
+}
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.global	sha256_block_data_order_neon
+.type	sha256_block_data_order_neon,%function
+.align	4
+sha256_block_data_order_neon:
+.LNEON:
+	stmdb	sp!,{r4-r12,lr}
+
+	sub	$H,sp,#16*4+16
+	adr	$Ktbl,K256
+	bic	$H,$H,#15		@ align for 128-bit stores
+	mov	$t2,sp
+	mov	sp,$H			@ alloca
+	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
+
+	vld1.8		{@X[0]},[$inp]!
+	vld1.8		{@X[1]},[$inp]!
+	vld1.8		{@X[2]},[$inp]!
+	vld1.8		{@X[3]},[$inp]!
+	vld1.32		{$T0},[$Ktbl,:128]!
+	vld1.32		{$T1},[$Ktbl,:128]!
+	vld1.32		{$T2},[$Ktbl,:128]!
+	vld1.32		{$T3},[$Ktbl,:128]!
+	vrev32.8	@X[0],@X[0]		@ yes, even on
+	str		$ctx,[sp,#64]
+	vrev32.8	@X[1],@X[1]		@ big-endian
+	str		$inp,[sp,#68]
+	mov		$Xfer,sp
+	vrev32.8	@X[2],@X[2]
+	str		$len,[sp,#72]
+	vrev32.8	@X[3],@X[3]
+	str		$t2,[sp,#76]		@ save original sp
+	vadd.i32	$T0,$T0,@X[0]
+	vadd.i32	$T1,$T1,@X[1]
+	vst1.32		{$T0},[$Xfer,:128]!
+	vadd.i32	$T2,$T2,@X[2]
+	vst1.32		{$T1},[$Xfer,:128]!
+	vadd.i32	$T3,$T3,@X[3]
+	vst1.32		{$T2},[$Xfer,:128]!
+	vst1.32		{$T3},[$Xfer,:128]!
+
+	ldmia		$ctx,{$A-$H}
+	sub		$Xfer,$Xfer,#64
+	ldr		$t1,[sp,#0]
+	eor		$t2,$t2,$t2
+	eor		$t3,$B,$C
+	b		.L_00_48
+
+.align	4
+.L_00_48:
+___
+	&Xupdate(\&body_00_15);
+	&Xupdate(\&body_00_15);
+	&Xupdate(\&body_00_15);
+	&Xupdate(\&body_00_15);
+$code.=<<___;
+	teq	$t1,#0				@ check for K256 terminator
+	ldr	$t1,[sp,#0]
+	sub	$Xfer,$Xfer,#64
+	bne	.L_00_48
+
+	ldr		$inp,[sp,#68]
+	ldr		$t0,[sp,#72]
+	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
+	teq		$inp,$t0
+	it		eq
+	subeq		$inp,$inp,#64		@ avoid SEGV
+	vld1.8		{@X[0]},[$inp]!		@ load next input block
+	vld1.8		{@X[1]},[$inp]!
+	vld1.8		{@X[2]},[$inp]!
+	vld1.8		{@X[3]},[$inp]!
+	it		ne
+	strne		$inp,[sp,#68]
+	mov		$Xfer,sp
+___
+	&Xpreload(\&body_00_15);
+	&Xpreload(\&body_00_15);
+	&Xpreload(\&body_00_15);
+	&Xpreload(\&body_00_15);
+$code.=<<___;
+	ldr	$t0,[$t1,#0]
+	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
+	ldr	$t2,[$t1,#4]
+	ldr	$t3,[$t1,#8]
+	ldr	$t4,[$t1,#12]
+	add	$A,$A,$t0			@ accumulate
+	ldr	$t0,[$t1,#16]
+	add	$B,$B,$t2
+	ldr	$t2,[$t1,#20]
+	add	$C,$C,$t3
+	ldr	$t3,[$t1,#24]
+	add	$D,$D,$t4
+	ldr	$t4,[$t1,#28]
+	add	$E,$E,$t0
+	str	$A,[$t1],#4
+	add	$F,$F,$t2
+	str	$B,[$t1],#4
+	add	$G,$G,$t3
+	str	$C,[$t1],#4
+	add	$H,$H,$t4
+	str	$D,[$t1],#4
+	stmia	$t1,{$E-$H}
+
+	ittte	ne
+	movne	$Xfer,sp
+	ldrne	$t1,[sp,#0]
+	eorne	$t2,$t2,$t2
+	ldreq	sp,[sp,#76]			@ restore original sp
+	itt	ne
+	eorne	$t3,$B,$C
+	bne	.L_00_48
+
+	ldmia	sp!,{r4-r12,pc}
+.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+___
+}}}
+######################################################################
+# ARMv8 stuff
+#
+{{{
+my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
+my @MSG=map("q$_",(8..11));
+my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
+my $Ktbl="r3";
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
+# else
+#  define INST(a,b,c,d)	.byte	a,b,c,d
+# endif
+
+.type	sha256_block_data_order_armv8,%function
+.align	5
+sha256_block_data_order_armv8:
+.LARMv8:
+	vld1.32	{$ABCD,$EFGH},[$ctx]
+# ifdef __thumb2__
+	adr	$Ktbl,.LARMv8
+	sub	$Ktbl,$Ktbl,#.LARMv8-K256
+# else
+	adrl	$Ktbl,K256
+# endif
+	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
+
+.Loop_v8:
+	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
+	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
+	vld1.32		{$W0},[$Ktbl]!
+	vrev32.8	@MSG[0],@MSG[0]
+	vrev32.8	@MSG[1],@MSG[1]
+	vrev32.8	@MSG[2],@MSG[2]
+	vrev32.8	@MSG[3],@MSG[3]
+	vmov		$ABCD_SAVE,$ABCD	@ offload
+	vmov		$EFGH_SAVE,$EFGH
+	teq		$inp,$len
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+	vld1.32		{$W1},[$Ktbl]!
+	vadd.i32	$W0,$W0,@MSG[0]
+	sha256su0	@MSG[0],@MSG[1]
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+	sha256su1	@MSG[0],@MSG[2],@MSG[3]
+___
+	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+	vld1.32		{$W1},[$Ktbl]!
+	vadd.i32	$W0,$W0,@MSG[0]
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+
+	vld1.32		{$W0},[$Ktbl]!
+	vadd.i32	$W1,$W1,@MSG[1]
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W1
+	sha256h2	$EFGH,$abcd,$W1
+
+	vld1.32		{$W1},[$Ktbl]
+	vadd.i32	$W0,$W0,@MSG[2]
+	sub		$Ktbl,$Ktbl,#256-16	@ rewind
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+
+	vadd.i32	$W1,$W1,@MSG[3]
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W1
+	sha256h2	$EFGH,$abcd,$W1
+
+	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
+	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
+	it		ne
+	bne		.Loop_v8
+
+	vst1.32		{$ABCD,$EFGH},[$ctx]
+
+	ret		@ bx lr
+.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+___
+}}}
+$code.=<<___;
+.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align	2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm   OPENSSL_armcap_P,4,4
+#endif
+___
+
+open SELF,$0;
+while(<SELF>) {
+	next if (/^#!/);
+	last if (!s/^#/@/ and !/^$/);
+	print;
+}
+close SELF;
+
+{   my  %opcode = (
+	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
+	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);
+
+    sub unsha256 {
+	my ($mnemonic,$arg)=@_;
+
+	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+					 |(($2&7)<<17)|(($2&8)<<4)
+					 |(($3&7)<<1) |(($3&8)<<2);
+	    # since ARMv7 instructions are always encoded little-endian.
+	    # correct solution is to use .inst directive, but older
+	    # assemblers don't implement it:-(
+	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
+			$word&0xff,($word>>8)&0xff,
+			($word>>16)&0xff,($word>>24)&0xff,
+			$mnemonic,$arg;
+	}
+    }
+}
+
+foreach (split($/,$code)) {
+
+	s/\`([^\`]*)\`/eval $1/geo;
+
+	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
+
+	s/\bret\b/bx	lr/go		or
+	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4
+
+	print $_,"\n";
+}
+
+close STDOUT; # enforce flush
diff --git a/arch/arm/crypto/sha256-core.S_shipped b/arch/arm/crypto/sha256-core.S_shipped
new file mode 100644
index 0000000..683f1cc
--- /dev/null
+++ b/arch/arm/crypto/sha256-core.S_shipped
@@ -0,0 +1,2775 @@
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ SHA256 block procedure for ARMv4. May 2007.
+
+@ Performance is ~2x better than gcc 3.4 generated code and in "abso-
+@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+@ byte [on single-issue Xscale PXA250 core].
+
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
+@ Cortex A8 core and ~20 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 16%
+@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+@ September 2013.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process one
+@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+@ code (meaning that latter performs sub-optimally, nothing was done
+@ about it).
+
+@ May 2014.
+@
+@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code	32
+#else
+.syntax unified
+# ifdef __thumb2__
+.thumb
+# else
+.code   32
+# endif
+#endif
+
+.type	K256,%object
+.align	5
+K256:
+.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size	K256,.-K256
+.word	0				@ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-sha256_block_data_order
+#endif
+.align	5
+
+.global	sha256_block_data_order
+.type	sha256_block_data_order,%function
+sha256_block_data_order:
+#if __ARM_ARCH__<7
+	sub	r3,pc,#8		@ sha256_block_data_order
+#else
+	adr	r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+	ldr	r12,.LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+	tst	r12,#ARMV8_SHA256
+	bne	.LARMv8
+	tst	r12,#ARMV7_NEON
+	bne	.LNEON
+#endif
+	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
+	stmdb	sp!,{r0,r1,r2,r4-r11,lr}
+	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11}
+	sub	r14,r3,#256+32	@ K256
+	sub	sp,sp,#16*4		@ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6		@ magic
+	eor	r12,r12,r12
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 0
+# if 0==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 0
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 0==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#0*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 0==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 0<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#2*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#15*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 1
+# if 1==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 1
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 1==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#1*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 1==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 1<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#3*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#0*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 2
+# if 2==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 2
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 2==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#2*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 2==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 2<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#4*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#1*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 3
+# if 3==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 3
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 3==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#3*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 3==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 3<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#5*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#2*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 4
+# if 4==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 4
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 4==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#4*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 4==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 4<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#6*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#3*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 5
+# if 5==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 5
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 5==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#5*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 5==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 5<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#7*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#4*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 6
+# if 6==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 6
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 6==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#6*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 6==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 6<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#8*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#5*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 7
+# if 7==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 7
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 7==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#7*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 7==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 7<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#9*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#6*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 8
+# if 8==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 8
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 8==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#8*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 8==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 8<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#10*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#7*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 9
+# if 9==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 9
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 9==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#9*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 9==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 9<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#11*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#8*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 10
+# if 10==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 10
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 10==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#10*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 10==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 10<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#12*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#9*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 11
+# if 11==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 11
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 11==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#11*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 11==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 11<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#13*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#10*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 12
+# if 12==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 12
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 12==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#12*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 12==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 12<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#14*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#11*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 13
+# if 13==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 13
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 13==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#13*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 13==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 13<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#15*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#12*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 14
+# if 14==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 14
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 14==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#14*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 14==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 14<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#0*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#13*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 15
+# if 15==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 15
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 15==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#15*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 15==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 15<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#1*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#14*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+.Lrounds_16_xx:
+	@ ldr	r2,[sp,#1*4]		@ 16
+	@ ldr	r1,[sp,#14*4]
+	mov	r0,r2,ror#7
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#0*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#9*4]
+
+	add	r12,r12,r0
+	eor	r0,r8,r8,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#0*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 16==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 16<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#2*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#15*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#2*4]		@ 17
+	@ ldr	r1,[sp,#15*4]
+	mov	r0,r2,ror#7
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#1*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#10*4]
+
+	add	r3,r3,r0
+	eor	r0,r7,r7,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#1*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 17==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 17<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#3*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#0*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#3*4]		@ 18
+	@ ldr	r1,[sp,#0*4]
+	mov	r0,r2,ror#7
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#2*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#11*4]
+
+	add	r12,r12,r0
+	eor	r0,r6,r6,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#2*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 18==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 18<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#4*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#1*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#4*4]		@ 19
+	@ ldr	r1,[sp,#1*4]
+	mov	r0,r2,ror#7
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#3*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#12*4]
+
+	add	r3,r3,r0
+	eor	r0,r5,r5,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#3*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 19==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 19<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#5*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#2*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#5*4]		@ 20
+	@ ldr	r1,[sp,#2*4]
+	mov	r0,r2,ror#7
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#4*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#13*4]
+
+	add	r12,r12,r0
+	eor	r0,r4,r4,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#4*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 20==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 20<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#6*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#3*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#6*4]		@ 21
+	@ ldr	r1,[sp,#3*4]
+	mov	r0,r2,ror#7
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#5*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#14*4]
+
+	add	r3,r3,r0
+	eor	r0,r11,r11,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#5*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 21==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 21<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#7*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#4*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#7*4]		@ 22
+	@ ldr	r1,[sp,#4*4]
+	mov	r0,r2,ror#7
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#6*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#15*4]
+
+	add	r12,r12,r0
+	eor	r0,r10,r10,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#6*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 22==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 22<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#8*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#5*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#8*4]		@ 23
+	@ ldr	r1,[sp,#5*4]
+	mov	r0,r2,ror#7
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#7*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#0*4]
+
+	add	r3,r3,r0
+	eor	r0,r9,r9,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#7*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 23==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 23<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#9*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#6*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#9*4]		@ 24
+	@ ldr	r1,[sp,#6*4]
+	mov	r0,r2,ror#7
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#8*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#1*4]
+
+	add	r12,r12,r0
+	eor	r0,r8,r8,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#8*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 24==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 24<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#10*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#7*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#10*4]		@ 25
+	@ ldr	r1,[sp,#7*4]
+	mov	r0,r2,ror#7
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#9*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#2*4]
+
+	add	r3,r3,r0
+	eor	r0,r7,r7,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#9*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 25==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 25<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#11*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#8*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#11*4]		@ 26
+	@ ldr	r1,[sp,#8*4]
+	mov	r0,r2,ror#7
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#10*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#3*4]
+
+	add	r12,r12,r0
+	eor	r0,r6,r6,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#10*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 26==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 26<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#12*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#9*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#12*4]		@ 27
+	@ ldr	r1,[sp,#9*4]
+	mov	r0,r2,ror#7
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#11*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#4*4]
+
+	add	r3,r3,r0
+	eor	r0,r5,r5,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#11*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 27==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 27<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#13*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#10*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#13*4]		@ 28
+	@ ldr	r1,[sp,#10*4]
+	mov	r0,r2,ror#7
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#12*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#5*4]
+
+	add	r12,r12,r0
+	eor	r0,r4,r4,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#12*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 28==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 28<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#14*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#11*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#14*4]		@ 29
+	@ ldr	r1,[sp,#11*4]
+	mov	r0,r2,ror#7
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#13*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#6*4]
+
+	add	r3,r3,r0
+	eor	r0,r11,r11,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#13*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 29==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 29<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#15*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#12*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#15*4]		@ 30
+	@ ldr	r1,[sp,#12*4]
+	mov	r0,r2,ror#7
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#14*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#7*4]
+
+	add	r12,r12,r0
+	eor	r0,r10,r10,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#14*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 30==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 30<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#0*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#13*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#0*4]		@ 31
+	@ ldr	r1,[sp,#13*4]
+	mov	r0,r2,ror#7
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#15*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#8*4]
+
+	add	r3,r3,r0
+	eor	r0,r9,r9,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#15*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 31==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 31<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#1*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#14*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	ite	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	ldreq	r3,[sp,#16*4]		@ pull ctx
+	bne	.Lrounds_16_xx
+
+	add	r4,r4,r12		@ h+=Maj(a,b,c) from the past
+	ldr	r0,[r3,#0]
+	ldr	r2,[r3,#4]
+	ldr	r12,[r3,#8]
+	add	r4,r4,r0
+	ldr	r0,[r3,#12]
+	add	r5,r5,r2
+	ldr	r2,[r3,#16]
+	add	r6,r6,r12
+	ldr	r12,[r3,#20]
+	add	r7,r7,r0
+	ldr	r0,[r3,#24]
+	add	r8,r8,r2
+	ldr	r2,[r3,#28]
+	add	r9,r9,r12
+	ldr	r1,[sp,#17*4]		@ pull inp
+	ldr	r12,[sp,#18*4]		@ pull inp+len
+	add	r10,r10,r0
+	add	r11,r11,r2
+	stmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}
+	cmp	r1,r12
+	sub	r14,r14,#256	@ rewind Ktbl
+	bne	.Loop
+
+	add	sp,sp,#19*4	@ destroy frame
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r11,pc}
+#else
+	ldmia	sp!,{r4-r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+.size	sha256_block_data_order,.-sha256_block_data_order
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.global	sha256_block_data_order_neon
+.type	sha256_block_data_order_neon,%function
+.align	4
+sha256_block_data_order_neon:
+.LNEON:
+	stmdb	sp!,{r4-r12,lr}
+
+	sub	r11,sp,#16*4+16
+	adr	r14,K256
+	bic	r11,r11,#15		@ align for 128-bit stores
+	mov	r12,sp
+	mov	sp,r11			@ alloca
+	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
+
+	vld1.8		{q0},[r1]!
+	vld1.8		{q1},[r1]!
+	vld1.8		{q2},[r1]!
+	vld1.8		{q3},[r1]!
+	vld1.32		{q8},[r14,:128]!
+	vld1.32		{q9},[r14,:128]!
+	vld1.32		{q10},[r14,:128]!
+	vld1.32		{q11},[r14,:128]!
+	vrev32.8	q0,q0		@ yes, even on
+	str		r0,[sp,#64]
+	vrev32.8	q1,q1		@ big-endian
+	str		r1,[sp,#68]
+	mov		r1,sp
+	vrev32.8	q2,q2
+	str		r2,[sp,#72]
+	vrev32.8	q3,q3
+	str		r12,[sp,#76]		@ save original sp
+	vadd.i32	q8,q8,q0
+	vadd.i32	q9,q9,q1
+	vst1.32		{q8},[r1,:128]!
+	vadd.i32	q10,q10,q2
+	vst1.32		{q9},[r1,:128]!
+	vadd.i32	q11,q11,q3
+	vst1.32		{q10},[r1,:128]!
+	vst1.32		{q11},[r1,:128]!
+
+	ldmia		r0,{r4-r11}
+	sub		r1,r1,#64
+	ldr		r2,[sp,#0]
+	eor		r12,r12,r12
+	eor		r3,r5,r6
+	b		.L_00_48
+
+.align	4
+.L_00_48:
+	vext.8	q8,q0,q1,#4
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	vext.8	q9,q2,q3,#4
+	add	r4,r4,r12
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vadd.i32	q0,q0,q9
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#4]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	veor	q9,q9,q10
+	add	r10,r10,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	vshr.u32	d24,d7,#17
+	add	r11,r11,r3
+	and	r2,r2,r7
+	veor	q9,q9,q11
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	vsli.32	d24,d7,#15
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	vshr.u32	d25,d7,#10
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	vadd.i32	q0,q0,q9
+	add	r10,r10,r2
+	ldr	r2,[sp,#8]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r6,r6,r10
+	vshr.u32	d24,d7,#19
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	vsli.32	d24,d7,#13
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	veor	d25,d25,d24
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	vadd.i32	d0,d0,d25
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	vshr.u32	d24,d0,#17
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	vsli.32	d24,d0,#15
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	vshr.u32	d25,d0,#10
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#12]
+	and	r3,r3,r12
+	vshr.u32	d24,d0,#19
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	vld1.32	{q8},[r14,:128]!
+	add	r8,r8,r2
+	vsli.32	d24,d0,#13
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	veor	d25,d25,d24
+	add	r9,r9,r3
+	and	r2,r2,r5
+	vadd.i32	d1,d1,d25
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	vadd.i32	q8,q8,q0
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#16]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	vst1.32	{q8},[r1,:128]!
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vext.8	q8,q1,q2,#4
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	vext.8	q9,q3,q0,#4
+	add	r8,r8,r12
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vadd.i32	q1,q1,q9
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#20]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	veor	q9,q9,q10
+	add	r6,r6,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	vshr.u32	d24,d1,#17
+	add	r7,r7,r3
+	and	r2,r2,r11
+	veor	q9,q9,q11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	vsli.32	d24,d1,#15
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	vshr.u32	d25,d1,#10
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	vadd.i32	q1,q1,q9
+	add	r6,r6,r2
+	ldr	r2,[sp,#24]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r10,r10,r6
+	vshr.u32	d24,d1,#19
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	vsli.32	d24,d1,#13
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	veor	d25,d25,d24
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	vadd.i32	d2,d2,d25
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	vshr.u32	d24,d2,#17
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	vsli.32	d24,d2,#15
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	vshr.u32	d25,d2,#10
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#28]
+	and	r3,r3,r12
+	vshr.u32	d24,d2,#19
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	vld1.32	{q8},[r14,:128]!
+	add	r4,r4,r2
+	vsli.32	d24,d2,#13
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	veor	d25,d25,d24
+	add	r5,r5,r3
+	and	r2,r2,r9
+	vadd.i32	d3,d3,d25
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	vadd.i32	q8,q8,q1
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[sp,#32]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	vst1.32	{q8},[r1,:128]!
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	vext.8	q8,q2,q3,#4
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	vext.8	q9,q0,q1,#4
+	add	r4,r4,r12
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vadd.i32	q2,q2,q9
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#36]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	veor	q9,q9,q10
+	add	r10,r10,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	vshr.u32	d24,d3,#17
+	add	r11,r11,r3
+	and	r2,r2,r7
+	veor	q9,q9,q11
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	vsli.32	d24,d3,#15
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	vshr.u32	d25,d3,#10
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	vadd.i32	q2,q2,q9
+	add	r10,r10,r2
+	ldr	r2,[sp,#40]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r6,r6,r10
+	vshr.u32	d24,d3,#19
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	vsli.32	d24,d3,#13
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	veor	d25,d25,d24
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	vadd.i32	d4,d4,d25
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	vshr.u32	d24,d4,#17
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	vsli.32	d24,d4,#15
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	vshr.u32	d25,d4,#10
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#44]
+	and	r3,r3,r12
+	vshr.u32	d24,d4,#19
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	vld1.32	{q8},[r14,:128]!
+	add	r8,r8,r2
+	vsli.32	d24,d4,#13
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	veor	d25,d25,d24
+	add	r9,r9,r3
+	and	r2,r2,r5
+	vadd.i32	d5,d5,d25
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	vadd.i32	q8,q8,q2
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#48]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	vst1.32	{q8},[r1,:128]!
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vext.8	q8,q3,q0,#4
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	vext.8	q9,q1,q2,#4
+	add	r8,r8,r12
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vadd.i32	q3,q3,q9
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#52]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	veor	q9,q9,q10
+	add	r6,r6,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	vshr.u32	d24,d5,#17
+	add	r7,r7,r3
+	and	r2,r2,r11
+	veor	q9,q9,q11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	vsli.32	d24,d5,#15
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	vshr.u32	d25,d5,#10
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	vadd.i32	q3,q3,q9
+	add	r6,r6,r2
+	ldr	r2,[sp,#56]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r10,r10,r6
+	vshr.u32	d24,d5,#19
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	vsli.32	d24,d5,#13
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	veor	d25,d25,d24
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	vadd.i32	d6,d6,d25
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	vshr.u32	d24,d6,#17
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	vsli.32	d24,d6,#15
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	vshr.u32	d25,d6,#10
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#60]
+	and	r3,r3,r12
+	vshr.u32	d24,d6,#19
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	vld1.32	{q8},[r14,:128]!
+	add	r4,r4,r2
+	vsli.32	d24,d6,#13
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	veor	d25,d25,d24
+	add	r5,r5,r3
+	and	r2,r2,r9
+	vadd.i32	d7,d7,d25
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	vadd.i32	q8,q8,q3
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[r14]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	vst1.32	{q8},[r1,:128]!
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	teq	r2,#0				@ check for K256 terminator
+	ldr	r2,[sp,#0]
+	sub	r1,r1,#64
+	bne	.L_00_48
+
+	ldr		r1,[sp,#68]
+	ldr		r0,[sp,#72]
+	sub		r14,r14,#256	@ rewind r14
+	teq		r1,r0
+	it		eq
+	subeq		r1,r1,#64		@ avoid SEGV
+	vld1.8		{q0},[r1]!		@ load next input block
+	vld1.8		{q1},[r1]!
+	vld1.8		{q2},[r1]!
+	vld1.8		{q3},[r1]!
+	it		ne
+	strne		r1,[sp,#68]
+	mov		r1,sp
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vrev32.8	q0,q0
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vadd.i32	q8,q8,q0
+	ldr	r2,[sp,#4]
+	and	r3,r3,r12
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	add	r10,r10,r2
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3
+	and	r2,r2,r7
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	add	r10,r10,r2
+	ldr	r2,[sp,#8]
+	and	r12,r12,r3
+	add	r6,r6,r10
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	ldr	r2,[sp,#12]
+	and	r3,r3,r12
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	add	r8,r8,r2
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3
+	and	r2,r2,r5
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#16]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vst1.32	{q8},[r1,:128]!
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vrev32.8	q1,q1
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vadd.i32	q8,q8,q1
+	ldr	r2,[sp,#20]
+	and	r3,r3,r12
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	add	r6,r6,r2
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3
+	and	r2,r2,r11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	add	r6,r6,r2
+	ldr	r2,[sp,#24]
+	and	r12,r12,r3
+	add	r10,r10,r6
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	ldr	r2,[sp,#28]
+	and	r3,r3,r12
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	add	r4,r4,r2
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3
+	and	r2,r2,r9
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[sp,#32]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	vst1.32	{q8},[r1,:128]!
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vrev32.8	q2,q2
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vadd.i32	q8,q8,q2
+	ldr	r2,[sp,#36]
+	and	r3,r3,r12
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	add	r10,r10,r2
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3
+	and	r2,r2,r7
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	add	r10,r10,r2
+	ldr	r2,[sp,#40]
+	and	r12,r12,r3
+	add	r6,r6,r10
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	ldr	r2,[sp,#44]
+	and	r3,r3,r12
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	add	r8,r8,r2
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3
+	and	r2,r2,r5
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#48]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vst1.32	{q8},[r1,:128]!
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vrev32.8	q3,q3
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vadd.i32	q8,q8,q3
+	ldr	r2,[sp,#52]
+	and	r3,r3,r12
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	add	r6,r6,r2
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3
+	and	r2,r2,r11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	add	r6,r6,r2
+	ldr	r2,[sp,#56]
+	and	r12,r12,r3
+	add	r10,r10,r6
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	ldr	r2,[sp,#60]
+	and	r3,r3,r12
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	add	r4,r4,r2
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3
+	and	r2,r2,r9
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[sp,#64]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	vst1.32	{q8},[r1,:128]!
+	ldr	r0,[r2,#0]
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	ldr	r12,[r2,#4]
+	ldr	r3,[r2,#8]
+	ldr	r1,[r2,#12]
+	add	r4,r4,r0			@ accumulate
+	ldr	r0,[r2,#16]
+	add	r5,r5,r12
+	ldr	r12,[r2,#20]
+	add	r6,r6,r3
+	ldr	r3,[r2,#24]
+	add	r7,r7,r1
+	ldr	r1,[r2,#28]
+	add	r8,r8,r0
+	str	r4,[r2],#4
+	add	r9,r9,r12
+	str	r5,[r2],#4
+	add	r10,r10,r3
+	str	r6,[r2],#4
+	add	r11,r11,r1
+	str	r7,[r2],#4
+	stmia	r2,{r8-r11}
+
+	ittte	ne
+	movne	r1,sp
+	ldrne	r2,[sp,#0]
+	eorne	r12,r12,r12
+	ldreq	sp,[sp,#76]			@ restore original sp
+	itt	ne
+	eorne	r3,r5,r6
+	bne	.L_00_48
+
+	ldmia	sp!,{r4-r12,pc}
+.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
+# else
+#  define INST(a,b,c,d)	.byte	a,b,c,d
+# endif
+
+.type	sha256_block_data_order_armv8,%function
+.align	5
+sha256_block_data_order_armv8:
+.LARMv8:
+	vld1.32	{q0,q1},[r0]
+# ifdef __thumb2__
+	adr	r3,.LARMv8
+	sub	r3,r3,#.LARMv8-K256
+# else
+	adrl	r3,K256
+# endif
+	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
+
+.Loop_v8:
+	vld1.8		{q8-q9},[r1]!
+	vld1.8		{q10-q11},[r1]!
+	vld1.32		{q12},[r3]!
+	vrev32.8	q8,q8
+	vrev32.8	q9,q9
+	vrev32.8	q10,q10
+	vrev32.8	q11,q11
+	vmov		q14,q0	@ offload
+	vmov		q15,q1
+	teq		r1,r2
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q10
+	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q11
+	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q10
+	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q11
+	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q10
+	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q11
+	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+
+	vld1.32		{q13},[r3]
+	vadd.i32	q12,q12,q10
+	sub		r3,r3,#256-16	@ rewind
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+
+	vadd.i32	q13,q13,q11
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+
+	vadd.i32	q0,q0,q14
+	vadd.i32	q1,q1,q15
+	it		ne
+	bne		.Loop_v8
+
+	vst1.32		{q0,q1},[r0]
+
+	bx	lr		@ bx lr
+.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro@openssl.org>"
+.align	2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm   OPENSSL_armcap_P,4,4
+#endif
diff --git a/arch/arm/crypto/sha256_glue.c b/arch/arm/crypto/sha256_glue.c
new file mode 100644
index 0000000..25ceba6
--- /dev/null
+++ b/arch/arm/crypto/sha256_glue.c
@@ -0,0 +1,246 @@
+/*
+ * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
+ * using optimized ARM assembler and NEON instructions.
+ *
+ * Copyright © 2015 Google Inc.
+ *
+ * This file is based on sha256_ssse3_glue.c:
+ *   Copyright (C) 2013 Intel Corporation
+ *   Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/simd.h>
+#include <asm/neon.h>
+#include "sha256_glue.h"
+
+asmlinkage void sha256_block_data_order(u32 *digest, const void *data,
+				      unsigned int num_blks);
+
+
+int sha256_init(struct shash_desc *desc)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	sctx->state[0] = SHA256_H0;
+	sctx->state[1] = SHA256_H1;
+	sctx->state[2] = SHA256_H2;
+	sctx->state[3] = SHA256_H3;
+	sctx->state[4] = SHA256_H4;
+	sctx->state[5] = SHA256_H5;
+	sctx->state[6] = SHA256_H6;
+	sctx->state[7] = SHA256_H7;
+	sctx->count = 0;
+
+	return 0;
+}
+
+int sha224_init(struct shash_desc *desc)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	sctx->state[0] = SHA224_H0;
+	sctx->state[1] = SHA224_H1;
+	sctx->state[2] = SHA224_H2;
+	sctx->state[3] = SHA224_H3;
+	sctx->state[4] = SHA224_H4;
+	sctx->state[5] = SHA224_H5;
+	sctx->state[6] = SHA224_H6;
+	sctx->state[7] = SHA224_H7;
+	sctx->count = 0;
+
+	return 0;
+}
+
+int __sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len,
+		    unsigned int partial)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int done = 0;
+
+	sctx->count += len;
+
+	if (partial) {
+		done = SHA256_BLOCK_SIZE - partial;
+		memcpy(sctx->buf + partial, data, done);
+		sha256_block_data_order(sctx->state, sctx->buf, 1);
+	}
+
+	if (len - done >= SHA256_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+		sha256_block_data_order(sctx->state, data + done, rounds);
+		done += rounds * SHA256_BLOCK_SIZE;
+	}
+
+	memcpy(sctx->buf, data + done, len - done);
+
+	return 0;
+}
+
+int sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+
+	/* Handle the fast case right here */
+	if (partial + len < SHA256_BLOCK_SIZE) {
+		sctx->count += len;
+		memcpy(sctx->buf + partial, data, len);
+
+		return 0;
+	}
+
+	return __sha256_update(desc, data, len, partial);
+}
+
+/* Add padding and return the message digest. */
+static int sha256_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int i, index, padlen;
+	__be32 *dst = (__be32 *)out;
+	__be64 bits;
+	static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+	/* save number of bits */
+	bits = cpu_to_be64(sctx->count << 3);
+
+	/* Pad out to 56 mod 64 and append length */
+	index = sctx->count % SHA256_BLOCK_SIZE;
+	padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
+
+	/* We need to fill a whole block for __sha256_update */
+	if (padlen <= 56) {
+		sctx->count += padlen;
+		memcpy(sctx->buf + index, padding, padlen);
+	} else {
+		__sha256_update(desc, padding, padlen, index);
+	}
+	__sha256_update(desc, (const u8 *)&bits, sizeof(bits), 56);
+
+	/* Store state in digest */
+	for (i = 0; i < 8; i++)
+		dst[i] = cpu_to_be32(sctx->state[i]);
+
+	/* Wipe context */
+	memset(sctx, 0, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha224_final(struct shash_desc *desc, u8 *out)
+{
+	u8 D[SHA256_DIGEST_SIZE];
+
+	sha256_final(desc, D);
+
+	memcpy(out, D, SHA224_DIGEST_SIZE);
+	memzero_explicit(D, SHA256_DIGEST_SIZE);
+
+	return 0;
+}
+
+int sha256_export(struct shash_desc *desc, void *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(out, sctx, sizeof(*sctx));
+
+	return 0;
+}
+
+int sha256_import(struct shash_desc *desc, const void *in)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(sctx, in, sizeof(*sctx));
+
+	return 0;
+}
+
+static struct shash_alg algs[] = { {
+	.digestsize	=	SHA256_DIGEST_SIZE,
+	.init		=	sha256_init,
+	.update		=	sha256_update,
+	.final		=	sha256_final,
+	.export		=	sha256_export,
+	.import		=	sha256_import,
+	.descsize	=	sizeof(struct sha256_state),
+	.statesize	=	sizeof(struct sha256_state),
+	.base		=	{
+		.cra_name	=	"sha256",
+		.cra_driver_name =	"sha256-asm",
+		.cra_priority	=	150,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA256_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+}, {
+	.digestsize	=	SHA224_DIGEST_SIZE,
+	.init		=	sha224_init,
+	.update		=	sha256_update,
+	.final		=	sha224_final,
+	.export		=	sha256_export,
+	.import		=	sha256_import,
+	.descsize	=	sizeof(struct sha256_state),
+	.statesize	=	sizeof(struct sha256_state),
+	.base		=	{
+		.cra_name	=	"sha224",
+		.cra_driver_name =	"sha224-asm",
+		.cra_priority	=	150,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA224_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+} };
+
+static int __init sha256_mod_init(void)
+{
+	int res = crypto_register_shashes(algs, ARRAY_SIZE(algs));
+
+	if (res < 0)
+		return res;
+
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon()) {
+		res = crypto_register_shashes(sha256_neon_algs,
+					      ARRAY_SIZE(sha256_neon_algs));
+
+		if (res < 0)
+			crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+	}
+
+	return res;
+}
+
+static void __exit sha256_mod_fini(void)
+{
+	crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon())
+		crypto_unregister_shashes(sha256_neon_algs,
+					  ARRAY_SIZE(sha256_neon_algs));
+}
+
+module_init(sha256_mod_init);
+module_exit(sha256_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm (ARM), including NEON");
+
+MODULE_ALIAS_CRYPTO("sha256");
diff --git a/arch/arm/crypto/sha256_glue.h b/arch/arm/crypto/sha256_glue.h
new file mode 100644
index 0000000..0312f4f
--- /dev/null
+++ b/arch/arm/crypto/sha256_glue.h
@@ -0,0 +1,23 @@
+#ifndef _CRYPTO_SHA256_GLUE_H
+#define _CRYPTO_SHA256_GLUE_H
+
+#include <linux/crypto.h>
+#include <crypto/sha.h>
+
+extern struct shash_alg sha256_neon_algs[2];
+
+extern int sha256_init(struct shash_desc *desc);
+
+extern int sha224_init(struct shash_desc *desc);
+
+extern int __sha256_update(struct shash_desc *desc, const u8 *data,
+			   unsigned int len, unsigned int partial);
+
+extern int sha256_update(struct shash_desc *desc, const u8 *data,
+			 unsigned int len);
+
+extern int sha256_export(struct shash_desc *desc, void *out);
+
+extern int sha256_import(struct shash_desc *desc, const void *in);
+
+#endif /* _CRYPTO_SHA256_GLUE_H */
diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c
new file mode 100644
index 0000000..d0f168d
--- /dev/null
+++ b/arch/arm/crypto/sha256_neon_glue.c
@@ -0,0 +1,172 @@
+/*
+ * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
+ * using NEON instructions.
+ *
+ * Copyright © 2015 Google Inc.
+ *
+ * This file is based on sha512_neon_glue.c:
+ *   Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/simd.h>
+#include <asm/neon.h>
+#include "sha256_glue.h"
+
+asmlinkage void sha256_block_data_order_neon(u32 *digest, const void *data,
+				      unsigned int num_blks);
+
+
+static int __sha256_neon_update(struct shash_desc *desc, const u8 *data,
+				unsigned int len, unsigned int partial)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int done = 0;
+
+	sctx->count += len;
+
+	if (partial) {
+		done = SHA256_BLOCK_SIZE - partial;
+		memcpy(sctx->buf + partial, data, done);
+		sha256_block_data_order_neon(sctx->state, sctx->buf, 1);
+	}
+
+	if (len - done >= SHA256_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+		sha256_block_data_order_neon(sctx->state, data + done, rounds);
+		done += rounds * SHA256_BLOCK_SIZE;
+	}
+
+	memcpy(sctx->buf, data + done, len - done);
+
+	return 0;
+}
+
+static int sha256_neon_update(struct shash_desc *desc, const u8 *data,
+			      unsigned int len)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+	int res;
+
+	/* Handle the fast case right here */
+	if (partial + len < SHA256_BLOCK_SIZE) {
+		sctx->count += len;
+		memcpy(sctx->buf + partial, data, len);
+
+		return 0;
+	}
+
+	if (!may_use_simd()) {
+		res = __sha256_update(desc, data, len, partial);
+	} else {
+		kernel_neon_begin();
+		res = __sha256_neon_update(desc, data, len, partial);
+		kernel_neon_end();
+	}
+
+	return res;
+}
+
+/* Add padding and return the message digest. */
+static int sha256_neon_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int i, index, padlen;
+	__be32 *dst = (__be32 *)out;
+	__be64 bits;
+	static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+	/* save number of bits */
+	bits = cpu_to_be64(sctx->count << 3);
+
+	/* Pad out to 56 mod 64 and append length */
+	index = sctx->count % SHA256_BLOCK_SIZE;
+	padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
+
+	if (!may_use_simd()) {
+		sha256_update(desc, padding, padlen);
+		sha256_update(desc, (const u8 *)&bits, sizeof(bits));
+	} else {
+		kernel_neon_begin();
+		/* We need to fill a whole block for __sha256_neon_update() */
+		if (padlen <= 56) {
+			sctx->count += padlen;
+			memcpy(sctx->buf + index, padding, padlen);
+		} else {
+			__sha256_neon_update(desc, padding, padlen, index);
+		}
+		__sha256_neon_update(desc, (const u8 *)&bits,
+					sizeof(bits), 56);
+		kernel_neon_end();
+	}
+
+	/* Store state in digest */
+	for (i = 0; i < 8; i++)
+		dst[i] = cpu_to_be32(sctx->state[i]);
+
+	/* Wipe context */
+	memzero_explicit(sctx, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha224_neon_final(struct shash_desc *desc, u8 *out)
+{
+	u8 D[SHA256_DIGEST_SIZE];
+
+	sha256_neon_final(desc, D);
+
+	memcpy(out, D, SHA224_DIGEST_SIZE);
+	memzero_explicit(D, SHA256_DIGEST_SIZE);
+
+	return 0;
+}
+
+struct shash_alg sha256_neon_algs[] = { {
+	.digestsize	=	SHA256_DIGEST_SIZE,
+	.init		=	sha256_init,
+	.update		=	sha256_neon_update,
+	.final		=	sha256_neon_final,
+	.export		=	sha256_export,
+	.import		=	sha256_import,
+	.descsize	=	sizeof(struct sha256_state),
+	.statesize	=	sizeof(struct sha256_state),
+	.base		=	{
+		.cra_name	=	"sha256",
+		.cra_driver_name =	"sha256-neon",
+		.cra_priority	=	250,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA256_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+}, {
+	.digestsize	=	SHA224_DIGEST_SIZE,
+	.init		=	sha224_init,
+	.update		=	sha256_neon_update,
+	.final		=	sha224_neon_final,
+	.export		=	sha256_export,
+	.import		=	sha256_import,
+	.descsize	=	sizeof(struct sha256_state),
+	.statesize	=	sizeof(struct sha256_state),
+	.base		=	{
+		.cra_name	=	"sha224",
+		.cra_driver_name =	"sha224-neon",
+		.cra_priority	=	250,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA224_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+} };

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCHv2] arm: crypto: Add optimized SHA-256/224
@ 2015-03-23 13:50   ` Sami Tolvanen
  0 siblings, 0 replies; 66+ messages in thread
From: Sami Tolvanen @ 2015-03-23 13:50 UTC (permalink / raw)
  To: linux-arm-kernel

Add Andy Polyakov's optimized assembly and NEON implementations for
SHA-256/224.

The sha256-armv4.pl script for generating the assembly code is from
OpenSSL commit 2ecd32a1f8f0643ae7b38f59bbaf9f0d6ef326fe.

Compared to sha256-generic these implementations have the following
tcrypt speed improvements on Motorola Nexus 6 (Snapdragon 805):

  bs    b/u      sha256-neon  sha256-asm
  16    16       x1.32        x1.19
  64    16       x1.27        x1.15
  64    64       x1.36        x1.20
  256   16       x1.22        x1.11
  256   64       x1.36        x1.19
  256   256      x1.59        x1.23
  1024  16       x1.21        x1.10
  1024  256      x1.65        x1.23
  1024  1024     x1.76        x1.25
  2048  16       x1.21        x1.10
  2048  256      x1.66        x1.23
  2048  1024     x1.78        x1.25
  2048  2048     x1.79        x1.25
  4096  16       x1.20        x1.09
  4096  256      x1.66        x1.23
  4096  1024     x1.79        x1.26
  4096  4096     x1.82        x1.26
  8192  16       x1.20        x1.09
  8192  256      x1.67        x1.23
  8192  1024     x1.80        x1.26
  8192  4096     x1.85        x1.28
  8192  8192     x1.85        x1.27

Where bs refers to block size and b/u to bytes per update.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Cc: Andy Polyakov <appro@openssl.org>

---
Changes since v1:
  Rebased to Herbert's cryptodev tree
  Include sha256-armv4.pl and use it to generate sha256-core.S
  Add integer-only assembly version as sha256-asm
  Add support for SHA-224 to the glue code
  Change priority for sha256/224-ce to 300

---
 arch/arm/crypto/Kconfig               |    7 
 arch/arm/crypto/Makefile              |    8 
 arch/arm/crypto/sha2-ce-glue.c        |    4 
 arch/arm/crypto/sha256-armv4.pl       |  713 ++++++
 arch/arm/crypto/sha256-core.S_shipped | 2775 ++++++++++++++++++++++++
 arch/arm/crypto/sha256_glue.c         |  246 ++
 arch/arm/crypto/sha256_glue.h         |   23 
 arch/arm/crypto/sha256_neon_glue.c    |  172 +
 8 files changed, 3945 insertions(+), 3 deletions(-)

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index d63f319..458729d 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -46,6 +46,13 @@ config CRYPTO_SHA2_ARM_CE
 	  SHA-256 secure hash standard (DFIPS 180-2) implemented
 	  using special ARMv8 Crypto Extensions.
 
+config CRYPTO_SHA256_ARM
+	tristate "SHA-224/256 digest algorithm (ARM-asm and NEON)"
+	select CRYPTO_HASH
+	help
+	  SHA-256 secure hash standard (DFIPS 180-2) implemented
+	  using optimized ARM assembler and NEON, when available.
+
 config CRYPTO_SHA512_ARM_NEON
 	tristate "SHA384 and SHA512 digest algorithm (ARM NEON)"
 	depends on KERNEL_MODE_NEON
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index 9a273bd..ef46e89 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
 obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
+obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
 obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o
@@ -16,6 +17,8 @@ aes-arm-y	:= aes-armv4.o aes_glue.o
 aes-arm-bs-y	:= aesbs-core.o aesbs-glue.o
 sha1-arm-y	:= sha1-armv4-large.o sha1_glue.o
 sha1-arm-neon-y	:= sha1-armv7-neon.o sha1_neon_glue.o
+sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
+sha256-arm-y	:= sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
 sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o
 sha1-arm-ce-y	:= sha1-ce-core.o sha1-ce-glue.o
 sha2-arm-ce-y	:= sha2-ce-core.o sha2-ce-glue.o
@@ -28,4 +31,7 @@ quiet_cmd_perl = PERL    $@
 $(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
 	$(call cmd,perl)
 
-.PRECIOUS: $(obj)/aesbs-core.S
+$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
+	$(call cmd,perl)
+
+.PRECIOUS: $(obj)/aesbs-core.S $(obj)/sha256-core.S
diff --git a/arch/arm/crypto/sha2-ce-glue.c b/arch/arm/crypto/sha2-ce-glue.c
index 9ffe8ad..0449eca 100644
--- a/arch/arm/crypto/sha2-ce-glue.c
+++ b/arch/arm/crypto/sha2-ce-glue.c
@@ -163,7 +163,7 @@ static struct shash_alg algs[] = { {
 	.base			= {
 		.cra_name		= "sha224",
 		.cra_driver_name	= "sha224-ce",
-		.cra_priority		= 200,
+		.cra_priority		= 300,
 		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
 		.cra_blocksize		= SHA256_BLOCK_SIZE,
 		.cra_module		= THIS_MODULE,
@@ -180,7 +180,7 @@ static struct shash_alg algs[] = { {
 	.base			= {
 		.cra_name		= "sha256",
 		.cra_driver_name	= "sha256-ce",
-		.cra_priority		= 200,
+		.cra_priority		= 300,
 		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
 		.cra_blocksize		= SHA256_BLOCK_SIZE,
 		.cra_module		= THIS_MODULE,
diff --git a/arch/arm/crypto/sha256-armv4.pl b/arch/arm/crypto/sha256-armv4.pl
new file mode 100644
index 0000000..4fee74d
--- /dev/null
+++ b/arch/arm/crypto/sha256-armv4.pl
@@ -0,0 +1,713 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Permission to use under GPL terms is granted.
+# ====================================================================
+
+# SHA256 block procedure for ARMv4. May 2007.
+
+# Performance is ~2x better than gcc 3.4 generated code and in "abso-
+# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+# byte [on single-issue Xscale PXA250 core].
+
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 22% improvement on
+# Cortex A8 core and ~20 cycles per processed byte.
+
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+# September 2013.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process one
+# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+# code (meaning that latter performs sub-optimally, nothing was done
+# about it).
+
+# May 2014.
+#
+# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$ctx="r0";	$t0="r0";
+$inp="r1";	$t4="r1";
+$len="r2";	$t1="r2";
+$T1="r3";	$t3="r3";
+$A="r4";
+$B="r5";
+$C="r6";
+$D="r7";
+$E="r8";
+$F="r9";
+$G="r10";
+$H="r11";
+ at V=($A,$B,$C,$D,$E,$F,$G,$H);
+$t2="r12";
+$Ktbl="r14";
+
+ at Sigma0=( 2,13,22);
+ at Sigma1=( 6,11,25);
+ at sigma0=( 7,18, 3);
+ at sigma1=(17,19,10);
+
+sub BODY_00_15 {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___ if ($i<16);
+#if __ARM_ARCH__>=7
+	@ ldr	$t1,[$inp],#4			@ $i
+# if $i==15
+	str	$inp,[sp,#17*4]			@ make room for $t4
+# endif
+	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
+	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
+	rev	$t1,$t1
+#else
+	@ ldrb	$t1,[$inp,#3]			@ $i
+	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
+	ldrb	$t2,[$inp,#2]
+	ldrb	$t0,[$inp,#1]
+	orr	$t1,$t1,$t2,lsl#8
+	ldrb	$t2,[$inp],#4
+	orr	$t1,$t1,$t0,lsl#16
+# if $i==15
+	str	$inp,[sp,#17*4]			@ make room for $t4
+# endif
+	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+	orr	$t1,$t1,$t2,lsl#24
+	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
+#endif
+___
+$code.=<<___;
+	ldr	$t2,[$Ktbl],#4			@ *K256++
+	add	$h,$h,$t1			@ h+=X[i]
+	str	$t1,[sp,#`$i%16`*4]
+	eor	$t1,$f,$g
+	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
+	and	$t1,$t1,$e
+	add	$h,$h,$t2			@ h+=K256[i]
+	eor	$t1,$t1,$g			@ Ch(e,f,g)
+	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
+	add	$h,$h,$t1			@ h+=Ch(e,f,g)
+#if $i==31
+	and	$t2,$t2,#0xff
+	cmp	$t2,#0xf2			@ done?
+#endif
+#if $i<15
+# if __ARM_ARCH__>=7
+	ldr	$t1,[$inp],#4			@ prefetch
+# else
+	ldrb	$t1,[$inp,#3]
+# endif
+	eor	$t2,$a,$b			@ a^b, b^c in next round
+#else
+	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
+	eor	$t2,$a,$b			@ a^b, b^c in next round
+	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
+#endif
+	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
+	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
+	add	$d,$d,$h			@ d+=h
+	eor	$t3,$t3,$b			@ Maj(a,b,c)
+	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
+	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
+___
+	($t2,$t3)=($t3,$t2);
+}
+
+sub BODY_16_XX {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___;
+	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
+	@ ldr	$t4,[sp,#`($i+14)%16`*4]
+	mov	$t0,$t1,ror#$sigma0[0]
+	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
+	mov	$t2,$t4,ror#$sigma1[0]
+	eor	$t0,$t0,$t1,ror#$sigma0[1]
+	eor	$t2,$t2,$t4,ror#$sigma1[1]
+	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
+	ldr	$t1,[sp,#`($i+0)%16`*4]
+	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
+	ldr	$t4,[sp,#`($i+9)%16`*4]
+
+	add	$t2,$t2,$t0
+	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
+	add	$t1,$t1,$t2
+	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
+	add	$t1,$t1,$t4			@ X[i]
+___
+	&BODY_00_15(@_);
+}
+
+$code=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code	32
+#else
+.syntax unified
+# ifdef __thumb2__
+.thumb
+# else
+.code   32
+# endif
+#endif
+
+.type	K256,%object
+.align	5
+K256:
+.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size	K256,.-K256
+.word	0				@ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-sha256_block_data_order
+#endif
+.align	5
+
+.global	sha256_block_data_order
+.type	sha256_block_data_order,%function
+sha256_block_data_order:
+#if __ARM_ARCH__<7
+	sub	r3,pc,#8		@ sha256_block_data_order
+#else
+	adr	r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+	ldr	r12,.LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+	tst	r12,#ARMV8_SHA256
+	bne	.LARMv8
+	tst	r12,#ARMV7_NEON
+	bne	.LNEON
+#endif
+	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
+	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
+	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
+	sub	$Ktbl,r3,#256+32	@ K256
+	sub	sp,sp,#16*4		@ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+	ldr	$t1,[$inp],#4
+# else
+	ldrb	$t1,[$inp,#3]
+# endif
+	eor	$t3,$B,$C		@ magic
+	eor	$t2,$t2,$t2
+___
+for($i=0;$i<16;$i++)	{ &BODY_00_15($i, at V); unshift(@V,pop(@V)); }
+$code.=".Lrounds_16_xx:\n";
+for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+#if __ARM_ARCH__>=7
+	ite	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	ldreq	$t3,[sp,#16*4]		@ pull ctx
+	bne	.Lrounds_16_xx
+
+	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
+	ldr	$t0,[$t3,#0]
+	ldr	$t1,[$t3,#4]
+	ldr	$t2,[$t3,#8]
+	add	$A,$A,$t0
+	ldr	$t0,[$t3,#12]
+	add	$B,$B,$t1
+	ldr	$t1,[$t3,#16]
+	add	$C,$C,$t2
+	ldr	$t2,[$t3,#20]
+	add	$D,$D,$t0
+	ldr	$t0,[$t3,#24]
+	add	$E,$E,$t1
+	ldr	$t1,[$t3,#28]
+	add	$F,$F,$t2
+	ldr	$inp,[sp,#17*4]		@ pull inp
+	ldr	$t2,[sp,#18*4]		@ pull inp+len
+	add	$G,$G,$t0
+	add	$H,$H,$t1
+	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
+	cmp	$inp,$t2
+	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
+	bne	.Loop
+
+	add	sp,sp,#`16+3`*4	@ destroy frame
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r11,pc}
+#else
+	ldmia	sp!,{r4-r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	sha256_block_data_order,.-sha256_block_data_order
+___
+######################################################################
+# NEON stuff
+#
+{{{
+my @X=map("q$_",(0..3));
+my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
+my $Xfer=$t4;
+my $j=0;
+
+sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
+sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
+
+sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+  my $arg = pop;
+    $arg = "#$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',', at _,$arg)."\n";
+}
+
+sub Xupdate()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+	&vext_8		($T0, at X[0], at X[1],4);	# X[1..4]
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vext_8		($T1, at X[2], at X[3],4);	# X[9..12]
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vshr_u32	($T2,$T0,$sigma0[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	(@X[0], at X[0],$T1);	# X[0..3] += X[9..12]
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vshr_u32	($T1,$T0,$sigma0[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vsli_32	($T2,$T0,32-$sigma0[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vshr_u32	($T3,$T0,$sigma0[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&veor		($T1,$T1,$T2);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vsli_32	($T3,$T0,32-$sigma0[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	(@X[0], at X[0],$T1);	# X[0..3] += sigma0(X[1..4])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &veor		($T5,$T5,$T4);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &veor		($T5,$T5,$T4);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vld1_32	("{$T0}","[$Ktbl,:128]!");
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	($T0,$T0, at X[0]);
+	 while($#insns>=2) { eval(shift(@insns)); }
+	&vst1_32	("{$T0}","[$Xfer,:128]!");
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	push(@X,shift(@X));		# "rotate" X[]
+}
+
+sub Xpreload()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vld1_32	("{$T0}","[$Ktbl,:128]!");
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vrev32_8	(@X[0], at X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	($T0,$T0, at X[0]);
+	 foreach (@insns) { eval; }	# remaining instructions
+	&vst1_32	("{$T0}","[$Xfer,:128]!");
+
+	push(@X,shift(@X));		# "rotate" X[]
+}
+
+sub body_00_15 () {
+	(
+	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
+	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
+	'&eor	($t1,$f,$g)',
+	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
+	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
+	'&and	($t1,$t1,$e)',
+	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
+	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
+	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
+	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
+	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
+	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
+	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
+	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
+	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
+	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
+	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
+	'&add	($d,$d,$h)',			# d+=h
+	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
+	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
+	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
+	)
+}
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.global	sha256_block_data_order_neon
+.type	sha256_block_data_order_neon,%function
+.align	4
+sha256_block_data_order_neon:
+.LNEON:
+	stmdb	sp!,{r4-r12,lr}
+
+	sub	$H,sp,#16*4+16
+	adr	$Ktbl,K256
+	bic	$H,$H,#15		@ align for 128-bit stores
+	mov	$t2,sp
+	mov	sp,$H			@ alloca
+	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
+
+	vld1.8		{@X[0]},[$inp]!
+	vld1.8		{@X[1]},[$inp]!
+	vld1.8		{@X[2]},[$inp]!
+	vld1.8		{@X[3]},[$inp]!
+	vld1.32		{$T0},[$Ktbl,:128]!
+	vld1.32		{$T1},[$Ktbl,:128]!
+	vld1.32		{$T2},[$Ktbl,:128]!
+	vld1.32		{$T3},[$Ktbl,:128]!
+	vrev32.8	@X[0], at X[0]		@ yes, even on
+	str		$ctx,[sp,#64]
+	vrev32.8	@X[1], at X[1]		@ big-endian
+	str		$inp,[sp,#68]
+	mov		$Xfer,sp
+	vrev32.8	@X[2], at X[2]
+	str		$len,[sp,#72]
+	vrev32.8	@X[3], at X[3]
+	str		$t2,[sp,#76]		@ save original sp
+	vadd.i32	$T0,$T0, at X[0]
+	vadd.i32	$T1,$T1, at X[1]
+	vst1.32		{$T0},[$Xfer,:128]!
+	vadd.i32	$T2,$T2, at X[2]
+	vst1.32		{$T1},[$Xfer,:128]!
+	vadd.i32	$T3,$T3, at X[3]
+	vst1.32		{$T2},[$Xfer,:128]!
+	vst1.32		{$T3},[$Xfer,:128]!
+
+	ldmia		$ctx,{$A-$H}
+	sub		$Xfer,$Xfer,#64
+	ldr		$t1,[sp,#0]
+	eor		$t2,$t2,$t2
+	eor		$t3,$B,$C
+	b		.L_00_48
+
+.align	4
+.L_00_48:
+___
+	&Xupdate(\&body_00_15);
+	&Xupdate(\&body_00_15);
+	&Xupdate(\&body_00_15);
+	&Xupdate(\&body_00_15);
+$code.=<<___;
+	teq	$t1,#0				@ check for K256 terminator
+	ldr	$t1,[sp,#0]
+	sub	$Xfer,$Xfer,#64
+	bne	.L_00_48
+
+	ldr		$inp,[sp,#68]
+	ldr		$t0,[sp,#72]
+	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
+	teq		$inp,$t0
+	it		eq
+	subeq		$inp,$inp,#64		@ avoid SEGV
+	vld1.8		{@X[0]},[$inp]!		@ load next input block
+	vld1.8		{@X[1]},[$inp]!
+	vld1.8		{@X[2]},[$inp]!
+	vld1.8		{@X[3]},[$inp]!
+	it		ne
+	strne		$inp,[sp,#68]
+	mov		$Xfer,sp
+___
+	&Xpreload(\&body_00_15);
+	&Xpreload(\&body_00_15);
+	&Xpreload(\&body_00_15);
+	&Xpreload(\&body_00_15);
+$code.=<<___;
+	ldr	$t0,[$t1,#0]
+	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
+	ldr	$t2,[$t1,#4]
+	ldr	$t3,[$t1,#8]
+	ldr	$t4,[$t1,#12]
+	add	$A,$A,$t0			@ accumulate
+	ldr	$t0,[$t1,#16]
+	add	$B,$B,$t2
+	ldr	$t2,[$t1,#20]
+	add	$C,$C,$t3
+	ldr	$t3,[$t1,#24]
+	add	$D,$D,$t4
+	ldr	$t4,[$t1,#28]
+	add	$E,$E,$t0
+	str	$A,[$t1],#4
+	add	$F,$F,$t2
+	str	$B,[$t1],#4
+	add	$G,$G,$t3
+	str	$C,[$t1],#4
+	add	$H,$H,$t4
+	str	$D,[$t1],#4
+	stmia	$t1,{$E-$H}
+
+	ittte	ne
+	movne	$Xfer,sp
+	ldrne	$t1,[sp,#0]
+	eorne	$t2,$t2,$t2
+	ldreq	sp,[sp,#76]			@ restore original sp
+	itt	ne
+	eorne	$t3,$B,$C
+	bne	.L_00_48
+
+	ldmia	sp!,{r4-r12,pc}
+.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+___
+}}}
+######################################################################
+# ARMv8 stuff
+#
+{{{
+my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
+my @MSG=map("q$_",(8..11));
+my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
+my $Ktbl="r3";
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
+# else
+#  define INST(a,b,c,d)	.byte	a,b,c,d
+# endif
+
+.type	sha256_block_data_order_armv8,%function
+.align	5
+sha256_block_data_order_armv8:
+.LARMv8:
+	vld1.32	{$ABCD,$EFGH},[$ctx]
+# ifdef __thumb2__
+	adr	$Ktbl,.LARMv8
+	sub	$Ktbl,$Ktbl,#.LARMv8-K256
+# else
+	adrl	$Ktbl,K256
+# endif
+	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
+
+.Loop_v8:
+	vld1.8		{@MSG[0]- at MSG[1]},[$inp]!
+	vld1.8		{@MSG[2]- at MSG[3]},[$inp]!
+	vld1.32		{$W0},[$Ktbl]!
+	vrev32.8	@MSG[0], at MSG[0]
+	vrev32.8	@MSG[1], at MSG[1]
+	vrev32.8	@MSG[2], at MSG[2]
+	vrev32.8	@MSG[3], at MSG[3]
+	vmov		$ABCD_SAVE,$ABCD	@ offload
+	vmov		$EFGH_SAVE,$EFGH
+	teq		$inp,$len
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+	vld1.32		{$W1},[$Ktbl]!
+	vadd.i32	$W0,$W0, at MSG[0]
+	sha256su0	@MSG[0], at MSG[1]
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+	sha256su1	@MSG[0], at MSG[2], at MSG[3]
+___
+	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+	vld1.32		{$W1},[$Ktbl]!
+	vadd.i32	$W0,$W0, at MSG[0]
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+
+	vld1.32		{$W0},[$Ktbl]!
+	vadd.i32	$W1,$W1, at MSG[1]
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W1
+	sha256h2	$EFGH,$abcd,$W1
+
+	vld1.32		{$W1},[$Ktbl]
+	vadd.i32	$W0,$W0, at MSG[2]
+	sub		$Ktbl,$Ktbl,#256-16	@ rewind
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+
+	vadd.i32	$W1,$W1,@MSG[3]
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W1
+	sha256h2	$EFGH,$abcd,$W1
+
+	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
+	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
+	it		ne
+	bne		.Loop_v8
+
+	vst1.32		{$ABCD,$EFGH},[$ctx]
+
+	ret		@ bx lr
+.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+___
+}}}
+$code.=<<___;
+.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align	2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm   OPENSSL_armcap_P,4,4
+#endif
+___
+
+open SELF,$0;
+while(<SELF>) {
+	next if (/^#!/);
+	last if (!s/^#/@/ and !/^$/);
+	print;
+}
+close SELF;
+
+{   my  %opcode = (
+	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
+	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);
+
+    sub unsha256 {
+	my ($mnemonic,$arg)=@_;
+
+	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+					 |(($2&7)<<17)|(($2&8)<<4)
+					 |(($3&7)<<1) |(($3&8)<<2);
+	    # since ARMv7 instructions are always encoded little-endian.
+	    # correct solution is to use .inst directive, but older
+	    # assemblers don't implement it:-(
+	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
+			$word&0xff,($word>>8)&0xff,
+			($word>>16)&0xff,($word>>24)&0xff,
+			$mnemonic,$arg;
+	}
+    }
+}
+
+foreach (split($/,$code)) {
+
+	s/\`([^\`]*)\`/eval $1/geo;
+
+	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
+
+	s/\bret\b/bx	lr/go		or
+	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4
+
+	print $_,"\n";
+}
+
+close STDOUT; # enforce flush
diff --git a/arch/arm/crypto/sha256-core.S_shipped b/arch/arm/crypto/sha256-core.S_shipped
new file mode 100644
index 0000000..683f1cc
--- /dev/null
+++ b/arch/arm/crypto/sha256-core.S_shipped
@@ -0,0 +1,2775 @@
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ SHA256 block procedure for ARMv4. May 2007.
+
+@ Performance is ~2x better than gcc 3.4 generated code and in "abso-
+@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+@ byte [on single-issue Xscale PXA250 core].
+
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
+@ Cortex A8 core and ~20 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 16%
+@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+@ September 2013.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process one
+@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+@ code (meaning that latter performs sub-optimally, nothing was done
+@ about it).
+
+@ May 2014.
+@
+@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code	32
+#else
+.syntax unified
+# ifdef __thumb2__
+.thumb
+# else
+.code   32
+# endif
+#endif
+
+.type	K256,%object
+.align	5
+K256:
+.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size	K256,.-K256
+.word	0				@ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-sha256_block_data_order
+#endif
+.align	5
+
+.global	sha256_block_data_order
+.type	sha256_block_data_order,%function
+sha256_block_data_order:
+#if __ARM_ARCH__<7
+	sub	r3,pc,#8		@ sha256_block_data_order
+#else
+	adr	r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+	ldr	r12,.LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+	tst	r12,#ARMV8_SHA256
+	bne	.LARMv8
+	tst	r12,#ARMV7_NEON
+	bne	.LNEON
+#endif
+	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
+	stmdb	sp!,{r0,r1,r2,r4-r11,lr}
+	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11}
+	sub	r14,r3,#256+32	@ K256
+	sub	sp,sp,#16*4		@ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6		@ magic
+	eor	r12,r12,r12
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 0
+# if 0==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 0
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 0==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#0*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 0==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 0<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#2*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#15*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 1
+# if 1==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 1
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 1==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#1*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 1==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 1<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#3*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#0*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 2
+# if 2==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 2
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 2==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#2*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 2==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 2<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#4*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#1*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 3
+# if 3==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 3
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 3==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#3*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 3==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 3<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#5*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#2*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 4
+# if 4==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 4
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 4==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#4*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 4==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 4<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#6*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#3*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 5
+# if 5==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 5
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 5==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#5*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 5==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 5<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#7*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#4*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 6
+# if 6==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 6
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 6==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#6*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 6==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 6<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#8*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#5*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 7
+# if 7==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 7
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 7==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#7*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 7==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 7<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#9*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#6*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 8
+# if 8==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 8
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 8==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#8*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 8==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 8<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#10*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#7*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 9
+# if 9==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 9
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 9==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#9*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 9==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 9<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#11*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#8*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 10
+# if 10==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 10
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 10==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#10*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 10==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 10<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#12*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#9*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 11
+# if 11==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 11
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 11==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#11*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 11==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 11<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#13*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#10*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 12
+# if 12==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 12
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 12==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#12*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 12==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 12<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#14*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#11*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 13
+# if 13==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 13
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 13==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#13*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 13==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 13<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#15*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#12*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 14
+# if 14==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 14
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 14==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#14*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 14==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 14<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#0*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#13*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 15
+# if 15==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+	rev	r2,r2
+#else
+	@ ldrb	r2,[r1,#3]			@ 15
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 15==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#15*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 15==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 15<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#1*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#14*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+.Lrounds_16_xx:
+	@ ldr	r2,[sp,#1*4]		@ 16
+	@ ldr	r1,[sp,#14*4]
+	mov	r0,r2,ror#7
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#0*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#9*4]
+
+	add	r12,r12,r0
+	eor	r0,r8,r8,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#0*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 16==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 16<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#2*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#15*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#2*4]		@ 17
+	@ ldr	r1,[sp,#15*4]
+	mov	r0,r2,ror#7
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#1*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#10*4]
+
+	add	r3,r3,r0
+	eor	r0,r7,r7,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#1*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 17==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 17<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#3*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#0*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#3*4]		@ 18
+	@ ldr	r1,[sp,#0*4]
+	mov	r0,r2,ror#7
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#2*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#11*4]
+
+	add	r12,r12,r0
+	eor	r0,r6,r6,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#2*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 18==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 18<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#4*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#1*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#4*4]		@ 19
+	@ ldr	r1,[sp,#1*4]
+	mov	r0,r2,ror#7
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#3*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#12*4]
+
+	add	r3,r3,r0
+	eor	r0,r5,r5,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#3*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 19==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 19<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#5*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#2*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#5*4]		@ 20
+	@ ldr	r1,[sp,#2*4]
+	mov	r0,r2,ror#7
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#4*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#13*4]
+
+	add	r12,r12,r0
+	eor	r0,r4,r4,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#4*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 20==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 20<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#6*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#3*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#6*4]		@ 21
+	@ ldr	r1,[sp,#3*4]
+	mov	r0,r2,ror#7
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#5*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#14*4]
+
+	add	r3,r3,r0
+	eor	r0,r11,r11,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#5*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 21==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 21<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#7*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#4*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#7*4]		@ 22
+	@ ldr	r1,[sp,#4*4]
+	mov	r0,r2,ror#7
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#6*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#15*4]
+
+	add	r12,r12,r0
+	eor	r0,r10,r10,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#6*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 22==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 22<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#8*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#5*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#8*4]		@ 23
+	@ ldr	r1,[sp,#5*4]
+	mov	r0,r2,ror#7
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#7*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#0*4]
+
+	add	r3,r3,r0
+	eor	r0,r9,r9,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#7*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 23==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 23<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#9*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#6*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#9*4]		@ 24
+	@ ldr	r1,[sp,#6*4]
+	mov	r0,r2,ror#7
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#8*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#1*4]
+
+	add	r12,r12,r0
+	eor	r0,r8,r8,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#8*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 24==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 24<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#10*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#7*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#10*4]		@ 25
+	@ ldr	r1,[sp,#7*4]
+	mov	r0,r2,ror#7
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#9*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#2*4]
+
+	add	r3,r3,r0
+	eor	r0,r7,r7,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#9*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 25==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 25<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#11*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#8*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#11*4]		@ 26
+	@ ldr	r1,[sp,#8*4]
+	mov	r0,r2,ror#7
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#10*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#3*4]
+
+	add	r12,r12,r0
+	eor	r0,r6,r6,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#10*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 26==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 26<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#12*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#9*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#12*4]		@ 27
+	@ ldr	r1,[sp,#9*4]
+	mov	r0,r2,ror#7
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#11*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#4*4]
+
+	add	r3,r3,r0
+	eor	r0,r5,r5,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#11*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 27==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 27<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#13*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#10*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#13*4]		@ 28
+	@ ldr	r1,[sp,#10*4]
+	mov	r0,r2,ror#7
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#12*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#5*4]
+
+	add	r12,r12,r0
+	eor	r0,r4,r4,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#12*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 28==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 28<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#14*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#11*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#14*4]		@ 29
+	@ ldr	r1,[sp,#11*4]
+	mov	r0,r2,ror#7
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#13*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#6*4]
+
+	add	r3,r3,r0
+	eor	r0,r11,r11,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#13*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 29==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 29<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#15*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#12*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#15*4]		@ 30
+	@ ldr	r1,[sp,#12*4]
+	mov	r0,r2,ror#7
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#14*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#7*4]
+
+	add	r12,r12,r0
+	eor	r0,r10,r10,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#14*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 30==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 30<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#0*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#13*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#0*4]		@ 31
+	@ ldr	r1,[sp,#13*4]
+	mov	r0,r2,ror#7
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#15*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#8*4]
+
+	add	r3,r3,r0
+	eor	r0,r9,r9,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#15*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 31==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 31<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#1*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#14*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	ite	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	ldreq	r3,[sp,#16*4]		@ pull ctx
+	bne	.Lrounds_16_xx
+
+	add	r4,r4,r12		@ h+=Maj(a,b,c) from the past
+	ldr	r0,[r3,#0]
+	ldr	r2,[r3,#4]
+	ldr	r12,[r3,#8]
+	add	r4,r4,r0
+	ldr	r0,[r3,#12]
+	add	r5,r5,r2
+	ldr	r2,[r3,#16]
+	add	r6,r6,r12
+	ldr	r12,[r3,#20]
+	add	r7,r7,r0
+	ldr	r0,[r3,#24]
+	add	r8,r8,r2
+	ldr	r2,[r3,#28]
+	add	r9,r9,r12
+	ldr	r1,[sp,#17*4]		@ pull inp
+	ldr	r12,[sp,#18*4]		@ pull inp+len
+	add	r10,r10,r0
+	add	r11,r11,r2
+	stmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}
+	cmp	r1,r12
+	sub	r14,r14,#256	@ rewind Ktbl
+	bne	.Loop
+
+	add	sp,sp,#19*4	@ destroy frame
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r11,pc}
+#else
+	ldmia	sp!,{r4-r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+.size	sha256_block_data_order,.-sha256_block_data_order
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.global	sha256_block_data_order_neon
+.type	sha256_block_data_order_neon,%function
+.align	4
+sha256_block_data_order_neon:
+.LNEON:
+	stmdb	sp!,{r4-r12,lr}
+
+	sub	r11,sp,#16*4+16
+	adr	r14,K256
+	bic	r11,r11,#15		@ align for 128-bit stores
+	mov	r12,sp
+	mov	sp,r11			@ alloca
+	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
+
+	vld1.8		{q0},[r1]!
+	vld1.8		{q1},[r1]!
+	vld1.8		{q2},[r1]!
+	vld1.8		{q3},[r1]!
+	vld1.32		{q8},[r14,:128]!
+	vld1.32		{q9},[r14,:128]!
+	vld1.32		{q10},[r14,:128]!
+	vld1.32		{q11},[r14,:128]!
+	vrev32.8	q0,q0		@ yes, even on
+	str		r0,[sp,#64]
+	vrev32.8	q1,q1		@ big-endian
+	str		r1,[sp,#68]
+	mov		r1,sp
+	vrev32.8	q2,q2
+	str		r2,[sp,#72]
+	vrev32.8	q3,q3
+	str		r12,[sp,#76]		@ save original sp
+	vadd.i32	q8,q8,q0
+	vadd.i32	q9,q9,q1
+	vst1.32		{q8},[r1,:128]!
+	vadd.i32	q10,q10,q2
+	vst1.32		{q9},[r1,:128]!
+	vadd.i32	q11,q11,q3
+	vst1.32		{q10},[r1,:128]!
+	vst1.32		{q11},[r1,:128]!
+
+	ldmia		r0,{r4-r11}
+	sub		r1,r1,#64
+	ldr		r2,[sp,#0]
+	eor		r12,r12,r12
+	eor		r3,r5,r6
+	b		.L_00_48
+
+.align	4
+.L_00_48:
+	vext.8	q8,q0,q1,#4
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	vext.8	q9,q2,q3,#4
+	add	r4,r4,r12
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vadd.i32	q0,q0,q9
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#4]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	veor	q9,q9,q10
+	add	r10,r10,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	vshr.u32	d24,d7,#17
+	add	r11,r11,r3
+	and	r2,r2,r7
+	veor	q9,q9,q11
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	vsli.32	d24,d7,#15
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	vshr.u32	d25,d7,#10
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	vadd.i32	q0,q0,q9
+	add	r10,r10,r2
+	ldr	r2,[sp,#8]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r6,r6,r10
+	vshr.u32	d24,d7,#19
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	vsli.32	d24,d7,#13
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	veor	d25,d25,d24
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	vadd.i32	d0,d0,d25
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	vshr.u32	d24,d0,#17
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	vsli.32	d24,d0,#15
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	vshr.u32	d25,d0,#10
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#12]
+	and	r3,r3,r12
+	vshr.u32	d24,d0,#19
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	vld1.32	{q8},[r14,:128]!
+	add	r8,r8,r2
+	vsli.32	d24,d0,#13
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	veor	d25,d25,d24
+	add	r9,r9,r3
+	and	r2,r2,r5
+	vadd.i32	d1,d1,d25
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	vadd.i32	q8,q8,q0
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#16]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	vst1.32	{q8},[r1,:128]!
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vext.8	q8,q1,q2,#4
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	vext.8	q9,q3,q0,#4
+	add	r8,r8,r12
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vadd.i32	q1,q1,q9
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#20]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	veor	q9,q9,q10
+	add	r6,r6,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	vshr.u32	d24,d1,#17
+	add	r7,r7,r3
+	and	r2,r2,r11
+	veor	q9,q9,q11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	vsli.32	d24,d1,#15
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	vshr.u32	d25,d1,#10
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	vadd.i32	q1,q1,q9
+	add	r6,r6,r2
+	ldr	r2,[sp,#24]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r10,r10,r6
+	vshr.u32	d24,d1,#19
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	vsli.32	d24,d1,#13
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	veor	d25,d25,d24
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	vadd.i32	d2,d2,d25
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	vshr.u32	d24,d2,#17
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	vsli.32	d24,d2,#15
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	vshr.u32	d25,d2,#10
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#28]
+	and	r3,r3,r12
+	vshr.u32	d24,d2,#19
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	vld1.32	{q8},[r14,:128]!
+	add	r4,r4,r2
+	vsli.32	d24,d2,#13
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	veor	d25,d25,d24
+	add	r5,r5,r3
+	and	r2,r2,r9
+	vadd.i32	d3,d3,d25
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	vadd.i32	q8,q8,q1
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[sp,#32]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	vst1.32	{q8},[r1,:128]!
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	vext.8	q8,q2,q3,#4
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	vext.8	q9,q0,q1,#4
+	add	r4,r4,r12
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vadd.i32	q2,q2,q9
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#36]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	veor	q9,q9,q10
+	add	r10,r10,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	vshr.u32	d24,d3,#17
+	add	r11,r11,r3
+	and	r2,r2,r7
+	veor	q9,q9,q11
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	vsli.32	d24,d3,#15
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	vshr.u32	d25,d3,#10
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	vadd.i32	q2,q2,q9
+	add	r10,r10,r2
+	ldr	r2,[sp,#40]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r6,r6,r10
+	vshr.u32	d24,d3,#19
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	vsli.32	d24,d3,#13
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	veor	d25,d25,d24
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	vadd.i32	d4,d4,d25
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	vshr.u32	d24,d4,#17
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	vsli.32	d24,d4,#15
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	vshr.u32	d25,d4,#10
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#44]
+	and	r3,r3,r12
+	vshr.u32	d24,d4,#19
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	vld1.32	{q8},[r14,:128]!
+	add	r8,r8,r2
+	vsli.32	d24,d4,#13
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	veor	d25,d25,d24
+	add	r9,r9,r3
+	and	r2,r2,r5
+	vadd.i32	d5,d5,d25
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	vadd.i32	q8,q8,q2
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#48]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	vst1.32	{q8},[r1,:128]!
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vext.8	q8,q3,q0,#4
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	vext.8	q9,q1,q2,#4
+	add	r8,r8,r12
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vadd.i32	q3,q3,q9
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#52]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	veor	q9,q9,q10
+	add	r6,r6,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	vshr.u32	d24,d5,#17
+	add	r7,r7,r3
+	and	r2,r2,r11
+	veor	q9,q9,q11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	vsli.32	d24,d5,#15
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	vshr.u32	d25,d5,#10
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	vadd.i32	q3,q3,q9
+	add	r6,r6,r2
+	ldr	r2,[sp,#56]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r10,r10,r6
+	vshr.u32	d24,d5,#19
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	vsli.32	d24,d5,#13
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	veor	d25,d25,d24
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	vadd.i32	d6,d6,d25
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	vshr.u32	d24,d6,#17
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	vsli.32	d24,d6,#15
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	vshr.u32	d25,d6,#10
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#60]
+	and	r3,r3,r12
+	vshr.u32	d24,d6,#19
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	vld1.32	{q8},[r14,:128]!
+	add	r4,r4,r2
+	vsli.32	d24,d6,#13
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	veor	d25,d25,d24
+	add	r5,r5,r3
+	and	r2,r2,r9
+	vadd.i32	d7,d7,d25
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	vadd.i32	q8,q8,q3
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[r14]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	vst1.32	{q8},[r1,:128]!
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	teq	r2,#0				@ check for K256 terminator
+	ldr	r2,[sp,#0]
+	sub	r1,r1,#64
+	bne	.L_00_48
+
+	ldr		r1,[sp,#68]
+	ldr		r0,[sp,#72]
+	sub		r14,r14,#256	@ rewind r14
+	teq		r1,r0
+	it		eq
+	subeq		r1,r1,#64		@ avoid SEGV
+	vld1.8		{q0},[r1]!		@ load next input block
+	vld1.8		{q1},[r1]!
+	vld1.8		{q2},[r1]!
+	vld1.8		{q3},[r1]!
+	it		ne
+	strne		r1,[sp,#68]
+	mov		r1,sp
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vrev32.8	q0,q0
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vadd.i32	q8,q8,q0
+	ldr	r2,[sp,#4]
+	and	r3,r3,r12
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	add	r10,r10,r2
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3
+	and	r2,r2,r7
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	add	r10,r10,r2
+	ldr	r2,[sp,#8]
+	and	r12,r12,r3
+	add	r6,r6,r10
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	ldr	r2,[sp,#12]
+	and	r3,r3,r12
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	add	r8,r8,r2
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3
+	and	r2,r2,r5
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#16]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vst1.32	{q8},[r1,:128]!
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vrev32.8	q1,q1
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vadd.i32	q8,q8,q1
+	ldr	r2,[sp,#20]
+	and	r3,r3,r12
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	add	r6,r6,r2
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3
+	and	r2,r2,r11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	add	r6,r6,r2
+	ldr	r2,[sp,#24]
+	and	r12,r12,r3
+	add	r10,r10,r6
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	ldr	r2,[sp,#28]
+	and	r3,r3,r12
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	add	r4,r4,r2
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3
+	and	r2,r2,r9
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[sp,#32]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	vst1.32	{q8},[r1,:128]!
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vrev32.8	q2,q2
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vadd.i32	q8,q8,q2
+	ldr	r2,[sp,#36]
+	and	r3,r3,r12
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	add	r10,r10,r2
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3
+	and	r2,r2,r7
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	add	r10,r10,r2
+	ldr	r2,[sp,#40]
+	and	r12,r12,r3
+	add	r6,r6,r10
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	ldr	r2,[sp,#44]
+	and	r3,r3,r12
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	add	r8,r8,r2
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3
+	and	r2,r2,r5
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#48]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vst1.32	{q8},[r1,:128]!
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vrev32.8	q3,q3
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vadd.i32	q8,q8,q3
+	ldr	r2,[sp,#52]
+	and	r3,r3,r12
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	add	r6,r6,r2
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3
+	and	r2,r2,r11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	add	r6,r6,r2
+	ldr	r2,[sp,#56]
+	and	r12,r12,r3
+	add	r10,r10,r6
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	ldr	r2,[sp,#60]
+	and	r3,r3,r12
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	add	r4,r4,r2
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3
+	and	r2,r2,r9
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[sp,#64]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	vst1.32	{q8},[r1,:128]!
+	ldr	r0,[r2,#0]
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	ldr	r12,[r2,#4]
+	ldr	r3,[r2,#8]
+	ldr	r1,[r2,#12]
+	add	r4,r4,r0			@ accumulate
+	ldr	r0,[r2,#16]
+	add	r5,r5,r12
+	ldr	r12,[r2,#20]
+	add	r6,r6,r3
+	ldr	r3,[r2,#24]
+	add	r7,r7,r1
+	ldr	r1,[r2,#28]
+	add	r8,r8,r0
+	str	r4,[r2],#4
+	add	r9,r9,r12
+	str	r5,[r2],#4
+	add	r10,r10,r3
+	str	r6,[r2],#4
+	add	r11,r11,r1
+	str	r7,[r2],#4
+	stmia	r2,{r8-r11}
+
+	ittte	ne
+	movne	r1,sp
+	ldrne	r2,[sp,#0]
+	eorne	r12,r12,r12
+	ldreq	sp,[sp,#76]			@ restore original sp
+	itt	ne
+	eorne	r3,r5,r6
+	bne	.L_00_48
+
+	ldmia	sp!,{r4-r12,pc}
+.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
+# else
+#  define INST(a,b,c,d)	.byte	a,b,c,d
+# endif
+
+.type	sha256_block_data_order_armv8,%function
+.align	5
+sha256_block_data_order_armv8:
+.LARMv8:
+	vld1.32	{q0,q1},[r0]
+# ifdef __thumb2__
+	adr	r3,.LARMv8
+	sub	r3,r3,#.LARMv8-K256
+# else
+	adrl	r3,K256
+# endif
+	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
+
+.Loop_v8:
+	vld1.8		{q8-q9},[r1]!
+	vld1.8		{q10-q11},[r1]!
+	vld1.32		{q12},[r3]!
+	vrev32.8	q8,q8
+	vrev32.8	q9,q9
+	vrev32.8	q10,q10
+	vrev32.8	q11,q11
+	vmov		q14,q0	@ offload
+	vmov		q15,q1
+	teq		r1,r2
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q10
+	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q11
+	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q10
+	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q11
+	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q10
+	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q11
+	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+
+	vld1.32		{q13},[r3]
+	vadd.i32	q12,q12,q10
+	sub		r3,r3,#256-16	@ rewind
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+
+	vadd.i32	q13,q13,q11
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+
+	vadd.i32	q0,q0,q14
+	vadd.i32	q1,q1,q15
+	it		ne
+	bne		.Loop_v8
+
+	vst1.32		{q0,q1},[r0]
+
+	bx	lr		@ bx lr
+.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro@openssl.org>"
+.align	2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm   OPENSSL_armcap_P,4,4
+#endif
diff --git a/arch/arm/crypto/sha256_glue.c b/arch/arm/crypto/sha256_glue.c
new file mode 100644
index 0000000..25ceba6
--- /dev/null
+++ b/arch/arm/crypto/sha256_glue.c
@@ -0,0 +1,246 @@
+/*
+ * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
+ * using optimized ARM assembler and NEON instructions.
+ *
+ * Copyright ? 2015 Google Inc.
+ *
+ * This file is based on sha256_ssse3_glue.c:
+ *   Copyright (C) 2013 Intel Corporation
+ *   Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/simd.h>
+#include <asm/neon.h>
+#include "sha256_glue.h"
+
+asmlinkage void sha256_block_data_order(u32 *digest, const void *data,
+				      unsigned int num_blks);
+
+
+int sha256_init(struct shash_desc *desc)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	sctx->state[0] = SHA256_H0;
+	sctx->state[1] = SHA256_H1;
+	sctx->state[2] = SHA256_H2;
+	sctx->state[3] = SHA256_H3;
+	sctx->state[4] = SHA256_H4;
+	sctx->state[5] = SHA256_H5;
+	sctx->state[6] = SHA256_H6;
+	sctx->state[7] = SHA256_H7;
+	sctx->count = 0;
+
+	return 0;
+}
+
+int sha224_init(struct shash_desc *desc)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	sctx->state[0] = SHA224_H0;
+	sctx->state[1] = SHA224_H1;
+	sctx->state[2] = SHA224_H2;
+	sctx->state[3] = SHA224_H3;
+	sctx->state[4] = SHA224_H4;
+	sctx->state[5] = SHA224_H5;
+	sctx->state[6] = SHA224_H6;
+	sctx->state[7] = SHA224_H7;
+	sctx->count = 0;
+
+	return 0;
+}
+
+int __sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len,
+		    unsigned int partial)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int done = 0;
+
+	sctx->count += len;
+
+	if (partial) {
+		done = SHA256_BLOCK_SIZE - partial;
+		memcpy(sctx->buf + partial, data, done);
+		sha256_block_data_order(sctx->state, sctx->buf, 1);
+	}
+
+	if (len - done >= SHA256_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+		sha256_block_data_order(sctx->state, data + done, rounds);
+		done += rounds * SHA256_BLOCK_SIZE;
+	}
+
+	memcpy(sctx->buf, data + done, len - done);
+
+	return 0;
+}
+
+int sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+
+	/* Handle the fast case right here */
+	if (partial + len < SHA256_BLOCK_SIZE) {
+		sctx->count += len;
+		memcpy(sctx->buf + partial, data, len);
+
+		return 0;
+	}
+
+	return __sha256_update(desc, data, len, partial);
+}
+
+/* Add padding and return the message digest. */
+static int sha256_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int i, index, padlen;
+	__be32 *dst = (__be32 *)out;
+	__be64 bits;
+	static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+	/* save number of bits */
+	bits = cpu_to_be64(sctx->count << 3);
+
+	/* Pad out to 56 mod 64 and append length */
+	index = sctx->count % SHA256_BLOCK_SIZE;
+	padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
+
+	/* We need to fill a whole block for __sha256_update */
+	if (padlen <= 56) {
+		sctx->count += padlen;
+		memcpy(sctx->buf + index, padding, padlen);
+	} else {
+		__sha256_update(desc, padding, padlen, index);
+	}
+	__sha256_update(desc, (const u8 *)&bits, sizeof(bits), 56);
+
+	/* Store state in digest */
+	for (i = 0; i < 8; i++)
+		dst[i] = cpu_to_be32(sctx->state[i]);
+
+	/* Wipe context */
+	memset(sctx, 0, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha224_final(struct shash_desc *desc, u8 *out)
+{
+	u8 D[SHA256_DIGEST_SIZE];
+
+	sha256_final(desc, D);
+
+	memcpy(out, D, SHA224_DIGEST_SIZE);
+	memzero_explicit(D, SHA256_DIGEST_SIZE);
+
+	return 0;
+}
+
+int sha256_export(struct shash_desc *desc, void *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(out, sctx, sizeof(*sctx));
+
+	return 0;
+}
+
+int sha256_import(struct shash_desc *desc, const void *in)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(sctx, in, sizeof(*sctx));
+
+	return 0;
+}
+
+static struct shash_alg algs[] = { {
+	.digestsize	=	SHA256_DIGEST_SIZE,
+	.init		=	sha256_init,
+	.update		=	sha256_update,
+	.final		=	sha256_final,
+	.export		=	sha256_export,
+	.import		=	sha256_import,
+	.descsize	=	sizeof(struct sha256_state),
+	.statesize	=	sizeof(struct sha256_state),
+	.base		=	{
+		.cra_name	=	"sha256",
+		.cra_driver_name =	"sha256-asm",
+		.cra_priority	=	150,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA256_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+}, {
+	.digestsize	=	SHA224_DIGEST_SIZE,
+	.init		=	sha224_init,
+	.update		=	sha256_update,
+	.final		=	sha224_final,
+	.export		=	sha256_export,
+	.import		=	sha256_import,
+	.descsize	=	sizeof(struct sha256_state),
+	.statesize	=	sizeof(struct sha256_state),
+	.base		=	{
+		.cra_name	=	"sha224",
+		.cra_driver_name =	"sha224-asm",
+		.cra_priority	=	150,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA224_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+} };
+
+static int __init sha256_mod_init(void)
+{
+	int res = crypto_register_shashes(algs, ARRAY_SIZE(algs));
+
+	if (res < 0)
+		return res;
+
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon()) {
+		res = crypto_register_shashes(sha256_neon_algs,
+					      ARRAY_SIZE(sha256_neon_algs));
+
+		if (res < 0)
+			crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+	}
+
+	return res;
+}
+
+static void __exit sha256_mod_fini(void)
+{
+	crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon())
+		crypto_unregister_shashes(sha256_neon_algs,
+					  ARRAY_SIZE(sha256_neon_algs));
+}
+
+module_init(sha256_mod_init);
+module_exit(sha256_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm (ARM), including NEON");
+
+MODULE_ALIAS_CRYPTO("sha256");
diff --git a/arch/arm/crypto/sha256_glue.h b/arch/arm/crypto/sha256_glue.h
new file mode 100644
index 0000000..0312f4f
--- /dev/null
+++ b/arch/arm/crypto/sha256_glue.h
@@ -0,0 +1,23 @@
+#ifndef _CRYPTO_SHA256_GLUE_H
+#define _CRYPTO_SHA256_GLUE_H
+
+#include <linux/crypto.h>
+#include <crypto/sha.h>
+
+extern struct shash_alg sha256_neon_algs[2];
+
+extern int sha256_init(struct shash_desc *desc);
+
+extern int sha224_init(struct shash_desc *desc);
+
+extern int __sha256_update(struct shash_desc *desc, const u8 *data,
+			   unsigned int len, unsigned int partial);
+
+extern int sha256_update(struct shash_desc *desc, const u8 *data,
+			 unsigned int len);
+
+extern int sha256_export(struct shash_desc *desc, void *out);
+
+extern int sha256_import(struct shash_desc *desc, const void *in);
+
+#endif /* _CRYPTO_SHA256_GLUE_H */
diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c
new file mode 100644
index 0000000..d0f168d
--- /dev/null
+++ b/arch/arm/crypto/sha256_neon_glue.c
@@ -0,0 +1,172 @@
+/*
+ * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
+ * using NEON instructions.
+ *
+ * Copyright ? 2015 Google Inc.
+ *
+ * This file is based on sha512_neon_glue.c:
+ *   Copyright ? 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/simd.h>
+#include <asm/neon.h>
+#include "sha256_glue.h"
+
+asmlinkage void sha256_block_data_order_neon(u32 *digest, const void *data,
+				      unsigned int num_blks);
+
+
+static int __sha256_neon_update(struct shash_desc *desc, const u8 *data,
+				unsigned int len, unsigned int partial)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int done = 0;
+
+	sctx->count += len;
+
+	if (partial) {
+		done = SHA256_BLOCK_SIZE - partial;
+		memcpy(sctx->buf + partial, data, done);
+		sha256_block_data_order_neon(sctx->state, sctx->buf, 1);
+	}
+
+	if (len - done >= SHA256_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+		sha256_block_data_order_neon(sctx->state, data + done, rounds);
+		done += rounds * SHA256_BLOCK_SIZE;
+	}
+
+	memcpy(sctx->buf, data + done, len - done);
+
+	return 0;
+}
+
+static int sha256_neon_update(struct shash_desc *desc, const u8 *data,
+			      unsigned int len)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+	int res;
+
+	/* Handle the fast case right here */
+	if (partial + len < SHA256_BLOCK_SIZE) {
+		sctx->count += len;
+		memcpy(sctx->buf + partial, data, len);
+
+		return 0;
+	}
+
+	if (!may_use_simd()) {
+		res = __sha256_update(desc, data, len, partial);
+	} else {
+		kernel_neon_begin();
+		res = __sha256_neon_update(desc, data, len, partial);
+		kernel_neon_end();
+	}
+
+	return res;
+}
+
+/* Add padding and return the message digest. */
+static int sha256_neon_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int i, index, padlen;
+	__be32 *dst = (__be32 *)out;
+	__be64 bits;
+	static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+	/* save number of bits */
+	bits = cpu_to_be64(sctx->count << 3);
+
+	/* Pad out to 56 mod 64 and append length */
+	index = sctx->count % SHA256_BLOCK_SIZE;
+	padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
+
+	if (!may_use_simd()) {
+		sha256_update(desc, padding, padlen);
+		sha256_update(desc, (const u8 *)&bits, sizeof(bits));
+	} else {
+		kernel_neon_begin();
+		/* We need to fill a whole block for __sha256_neon_update() */
+		if (padlen <= 56) {
+			sctx->count += padlen;
+			memcpy(sctx->buf + index, padding, padlen);
+		} else {
+			__sha256_neon_update(desc, padding, padlen, index);
+		}
+		__sha256_neon_update(desc, (const u8 *)&bits,
+					sizeof(bits), 56);
+		kernel_neon_end();
+	}
+
+	/* Store state in digest */
+	for (i = 0; i < 8; i++)
+		dst[i] = cpu_to_be32(sctx->state[i]);
+
+	/* Wipe context */
+	memzero_explicit(sctx, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha224_neon_final(struct shash_desc *desc, u8 *out)
+{
+	u8 D[SHA256_DIGEST_SIZE];
+
+	sha256_neon_final(desc, D);
+
+	memcpy(out, D, SHA224_DIGEST_SIZE);
+	memzero_explicit(D, SHA256_DIGEST_SIZE);
+
+	return 0;
+}
+
+struct shash_alg sha256_neon_algs[] = { {
+	.digestsize	=	SHA256_DIGEST_SIZE,
+	.init		=	sha256_init,
+	.update		=	sha256_neon_update,
+	.final		=	sha256_neon_final,
+	.export		=	sha256_export,
+	.import		=	sha256_import,
+	.descsize	=	sizeof(struct sha256_state),
+	.statesize	=	sizeof(struct sha256_state),
+	.base		=	{
+		.cra_name	=	"sha256",
+		.cra_driver_name =	"sha256-neon",
+		.cra_priority	=	250,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA256_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+}, {
+	.digestsize	=	SHA224_DIGEST_SIZE,
+	.init		=	sha224_init,
+	.update		=	sha256_neon_update,
+	.final		=	sha224_neon_final,
+	.export		=	sha256_export,
+	.import		=	sha256_import,
+	.descsize	=	sizeof(struct sha256_state),
+	.statesize	=	sizeof(struct sha256_state),
+	.base		=	{
+		.cra_name	=	"sha224",
+		.cra_driver_name =	"sha224-neon",
+		.cra_priority	=	250,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA224_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+} };

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224
  2015-03-23 13:50   ` Sami Tolvanen
@ 2015-03-23 18:26     ` Ard Biesheuvel
  -1 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-23 18:26 UTC (permalink / raw)
  To: Sami Tolvanen
  Cc: linux-arm-kernel, linux-crypto, Herbert Xu, David S. Miller,
	Andy Polyakov

(resending due to size bounce)

On 23 March 2015 at 14:50, Sami Tolvanen <samitolvanen@google.com> wrote:
> Add Andy Polyakov's optimized assembly and NEON implementations for
> SHA-256/224.
>
> The sha256-armv4.pl script for generating the assembly code is from
> OpenSSL commit 2ecd32a1f8f0643ae7b38f59bbaf9f0d6ef326fe.
>
> Compared to sha256-generic these implementations have the following
> tcrypt speed improvements on Motorola Nexus 6 (Snapdragon 805):
>
>   bs    b/u      sha256-neon  sha256-asm
>   16    16       x1.32        x1.19
>   64    16       x1.27        x1.15
>   64    64       x1.36        x1.20
>   256   16       x1.22        x1.11
>   256   64       x1.36        x1.19
>   256   256      x1.59        x1.23
>   1024  16       x1.21        x1.10
>   1024  256      x1.65        x1.23
>   1024  1024     x1.76        x1.25
>   2048  16       x1.21        x1.10
>   2048  256      x1.66        x1.23
>   2048  1024     x1.78        x1.25
>   2048  2048     x1.79        x1.25
>   4096  16       x1.20        x1.09
>   4096  256      x1.66        x1.23
>   4096  1024     x1.79        x1.26
>   4096  4096     x1.82        x1.26
>   8192  16       x1.20        x1.09
>   8192  256      x1.67        x1.23
>   8192  1024     x1.80        x1.26
>   8192  4096     x1.85        x1.28
>   8192  8192     x1.85        x1.27
>
> Where bs refers to block size and b/u to bytes per update.
>
> Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
> Cc: Andy Polyakov <appro@openssl.org>
>

This builds fine and passes the tcrypt.ko tests in ARM and Thumb2 and
even in big-endian (ARM) mode, so

Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

Nice work!

Ard.

> ---
> Changes since v1:
>   Rebased to Herbert's cryptodev tree
>   Include sha256-armv4.pl and use it to generate sha256-core.S
>   Add integer-only assembly version as sha256-asm
>   Add support for SHA-224 to the glue code
>   Change priority for sha256/224-ce to 300
>
> ---
>  arch/arm/crypto/Kconfig               |    7
>  arch/arm/crypto/Makefile              |    8
>  arch/arm/crypto/sha2-ce-glue.c        |    4
>  arch/arm/crypto/sha256-armv4.pl       |  713 ++++++
>  arch/arm/crypto/sha256-core.S_shipped | 2775 ++++++++++++++++++++++++
>  arch/arm/crypto/sha256_glue.c         |  246 ++
>  arch/arm/crypto/sha256_glue.h         |   23
>  arch/arm/crypto/sha256_neon_glue.c    |  172 +
>  8 files changed, 3945 insertions(+), 3 deletions(-)
>

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCHv2] arm: crypto: Add optimized SHA-256/224
@ 2015-03-23 18:26     ` Ard Biesheuvel
  0 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-23 18:26 UTC (permalink / raw)
  To: linux-arm-kernel

(resending due to size bounce)

On 23 March 2015 at 14:50, Sami Tolvanen <samitolvanen@google.com> wrote:
> Add Andy Polyakov's optimized assembly and NEON implementations for
> SHA-256/224.
>
> The sha256-armv4.pl script for generating the assembly code is from
> OpenSSL commit 2ecd32a1f8f0643ae7b38f59bbaf9f0d6ef326fe.
>
> Compared to sha256-generic these implementations have the following
> tcrypt speed improvements on Motorola Nexus 6 (Snapdragon 805):
>
>   bs    b/u      sha256-neon  sha256-asm
>   16    16       x1.32        x1.19
>   64    16       x1.27        x1.15
>   64    64       x1.36        x1.20
>   256   16       x1.22        x1.11
>   256   64       x1.36        x1.19
>   256   256      x1.59        x1.23
>   1024  16       x1.21        x1.10
>   1024  256      x1.65        x1.23
>   1024  1024     x1.76        x1.25
>   2048  16       x1.21        x1.10
>   2048  256      x1.66        x1.23
>   2048  1024     x1.78        x1.25
>   2048  2048     x1.79        x1.25
>   4096  16       x1.20        x1.09
>   4096  256      x1.66        x1.23
>   4096  1024     x1.79        x1.26
>   4096  4096     x1.82        x1.26
>   8192  16       x1.20        x1.09
>   8192  256      x1.67        x1.23
>   8192  1024     x1.80        x1.26
>   8192  4096     x1.85        x1.28
>   8192  8192     x1.85        x1.27
>
> Where bs refers to block size and b/u to bytes per update.
>
> Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
> Cc: Andy Polyakov <appro@openssl.org>
>

This builds fine and passes the tcrypt.ko tests in ARM and Thumb2 and
even in big-endian (ARM) mode, so

Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>

Nice work!

Ard.

> ---
> Changes since v1:
>   Rebased to Herbert's cryptodev tree
>   Include sha256-armv4.pl and use it to generate sha256-core.S
>   Add integer-only assembly version as sha256-asm
>   Add support for SHA-224 to the glue code
>   Change priority for sha256/224-ce to 300
>
> ---
>  arch/arm/crypto/Kconfig               |    7
>  arch/arm/crypto/Makefile              |    8
>  arch/arm/crypto/sha2-ce-glue.c        |    4
>  arch/arm/crypto/sha256-armv4.pl       |  713 ++++++
>  arch/arm/crypto/sha256-core.S_shipped | 2775 ++++++++++++++++++++++++
>  arch/arm/crypto/sha256_glue.c         |  246 ++
>  arch/arm/crypto/sha256_glue.h         |   23
>  arch/arm/crypto/sha256_neon_glue.c    |  172 +
>  8 files changed, 3945 insertions(+), 3 deletions(-)
>

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224
  2015-03-23 13:50   ` Sami Tolvanen
@ 2015-03-24 11:32     ` Herbert Xu
  -1 siblings, 0 replies; 66+ messages in thread
From: Herbert Xu @ 2015-03-24 11:32 UTC (permalink / raw)
  To: Sami Tolvanen
  Cc: linux-arm-kernel, linux-crypto, ard.biesheuvel, davem, Andy Polyakov

On Mon, Mar 23, 2015 at 01:50:09PM +0000, Sami Tolvanen wrote:
> Add Andy Polyakov's optimized assembly and NEON implementations for
> SHA-256/224.
> 
> The sha256-armv4.pl script for generating the assembly code is from
> OpenSSL commit 2ecd32a1f8f0643ae7b38f59bbaf9f0d6ef326fe.
> 
> Compared to sha256-generic these implementations have the following
> tcrypt speed improvements on Motorola Nexus 6 (Snapdragon 805):
> 
>   bs    b/u      sha256-neon  sha256-asm
>   16    16       x1.32        x1.19
>   64    16       x1.27        x1.15
>   64    64       x1.36        x1.20
>   256   16       x1.22        x1.11
>   256   64       x1.36        x1.19
>   256   256      x1.59        x1.23
>   1024  16       x1.21        x1.10
>   1024  256      x1.65        x1.23
>   1024  1024     x1.76        x1.25
>   2048  16       x1.21        x1.10
>   2048  256      x1.66        x1.23
>   2048  1024     x1.78        x1.25
>   2048  2048     x1.79        x1.25
>   4096  16       x1.20        x1.09
>   4096  256      x1.66        x1.23
>   4096  1024     x1.79        x1.26
>   4096  4096     x1.82        x1.26
>   8192  16       x1.20        x1.09
>   8192  256      x1.67        x1.23
>   8192  1024     x1.80        x1.26
>   8192  4096     x1.85        x1.28
>   8192  8192     x1.85        x1.27
> 
> Where bs refers to block size and b/u to bytes per update.
> 
> Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
> Cc: Andy Polyakov <appro@openssl.org>

Your patch didn't make it to the linux-crypto list and therefore
it never got into patchwork.  Can you please find out why and
resend it?

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCHv2] arm: crypto: Add optimized SHA-256/224
@ 2015-03-24 11:32     ` Herbert Xu
  0 siblings, 0 replies; 66+ messages in thread
From: Herbert Xu @ 2015-03-24 11:32 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Mar 23, 2015 at 01:50:09PM +0000, Sami Tolvanen wrote:
> Add Andy Polyakov's optimized assembly and NEON implementations for
> SHA-256/224.
> 
> The sha256-armv4.pl script for generating the assembly code is from
> OpenSSL commit 2ecd32a1f8f0643ae7b38f59bbaf9f0d6ef326fe.
> 
> Compared to sha256-generic these implementations have the following
> tcrypt speed improvements on Motorola Nexus 6 (Snapdragon 805):
> 
>   bs    b/u      sha256-neon  sha256-asm
>   16    16       x1.32        x1.19
>   64    16       x1.27        x1.15
>   64    64       x1.36        x1.20
>   256   16       x1.22        x1.11
>   256   64       x1.36        x1.19
>   256   256      x1.59        x1.23
>   1024  16       x1.21        x1.10
>   1024  256      x1.65        x1.23
>   1024  1024     x1.76        x1.25
>   2048  16       x1.21        x1.10
>   2048  256      x1.66        x1.23
>   2048  1024     x1.78        x1.25
>   2048  2048     x1.79        x1.25
>   4096  16       x1.20        x1.09
>   4096  256      x1.66        x1.23
>   4096  1024     x1.79        x1.26
>   4096  4096     x1.82        x1.26
>   8192  16       x1.20        x1.09
>   8192  256      x1.67        x1.23
>   8192  1024     x1.80        x1.26
>   8192  4096     x1.85        x1.28
>   8192  8192     x1.85        x1.27
> 
> Where bs refers to block size and b/u to bytes per update.
> 
> Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
> Cc: Andy Polyakov <appro@openssl.org>

Your patch didn't make it to the linux-crypto list and therefore
it never got into patchwork.  Can you please find out why and
resend it?

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224
  2015-03-24 11:32     ` Herbert Xu
@ 2015-03-24 11:33       ` Ard Biesheuvel
  -1 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-24 11:33 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Sami Tolvanen, linux-arm-kernel, linux-crypto, David S. Miller,
	Andy Polyakov

On 24 March 2015 at 12:32, Herbert Xu <herbert@gondor.apana.org.au> wrote:
> On Mon, Mar 23, 2015 at 01:50:09PM +0000, Sami Tolvanen wrote:
>> Add Andy Polyakov's optimized assembly and NEON implementations for
>> SHA-256/224.
>>
>> The sha256-armv4.pl script for generating the assembly code is from
>> OpenSSL commit 2ecd32a1f8f0643ae7b38f59bbaf9f0d6ef326fe.
>>
>> Compared to sha256-generic these implementations have the following
>> tcrypt speed improvements on Motorola Nexus 6 (Snapdragon 805):
>>
>>   bs    b/u      sha256-neon  sha256-asm
>>   16    16       x1.32        x1.19
>>   64    16       x1.27        x1.15
>>   64    64       x1.36        x1.20
>>   256   16       x1.22        x1.11
>>   256   64       x1.36        x1.19
>>   256   256      x1.59        x1.23
>>   1024  16       x1.21        x1.10
>>   1024  256      x1.65        x1.23
>>   1024  1024     x1.76        x1.25
>>   2048  16       x1.21        x1.10
>>   2048  256      x1.66        x1.23
>>   2048  1024     x1.78        x1.25
>>   2048  2048     x1.79        x1.25
>>   4096  16       x1.20        x1.09
>>   4096  256      x1.66        x1.23
>>   4096  1024     x1.79        x1.26
>>   4096  4096     x1.82        x1.26
>>   8192  16       x1.20        x1.09
>>   8192  256      x1.67        x1.23
>>   8192  1024     x1.80        x1.26
>>   8192  4096     x1.85        x1.28
>>   8192  8192     x1.85        x1.27
>>
>> Where bs refers to block size and b/u to bytes per update.
>>
>> Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
>> Cc: Andy Polyakov <appro@openssl.org>
>
> Your patch didn't make it to the linux-crypto list and therefore
> it never got into patchwork.  Can you please find out why and
> resend it?
>

Most likely because it is so big ...

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCHv2] arm: crypto: Add optimized SHA-256/224
@ 2015-03-24 11:33       ` Ard Biesheuvel
  0 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-24 11:33 UTC (permalink / raw)
  To: linux-arm-kernel

On 24 March 2015 at 12:32, Herbert Xu <herbert@gondor.apana.org.au> wrote:
> On Mon, Mar 23, 2015 at 01:50:09PM +0000, Sami Tolvanen wrote:
>> Add Andy Polyakov's optimized assembly and NEON implementations for
>> SHA-256/224.
>>
>> The sha256-armv4.pl script for generating the assembly code is from
>> OpenSSL commit 2ecd32a1f8f0643ae7b38f59bbaf9f0d6ef326fe.
>>
>> Compared to sha256-generic these implementations have the following
>> tcrypt speed improvements on Motorola Nexus 6 (Snapdragon 805):
>>
>>   bs    b/u      sha256-neon  sha256-asm
>>   16    16       x1.32        x1.19
>>   64    16       x1.27        x1.15
>>   64    64       x1.36        x1.20
>>   256   16       x1.22        x1.11
>>   256   64       x1.36        x1.19
>>   256   256      x1.59        x1.23
>>   1024  16       x1.21        x1.10
>>   1024  256      x1.65        x1.23
>>   1024  1024     x1.76        x1.25
>>   2048  16       x1.21        x1.10
>>   2048  256      x1.66        x1.23
>>   2048  1024     x1.78        x1.25
>>   2048  2048     x1.79        x1.25
>>   4096  16       x1.20        x1.09
>>   4096  256      x1.66        x1.23
>>   4096  1024     x1.79        x1.26
>>   4096  4096     x1.82        x1.26
>>   8192  16       x1.20        x1.09
>>   8192  256      x1.67        x1.23
>>   8192  1024     x1.80        x1.26
>>   8192  4096     x1.85        x1.28
>>   8192  8192     x1.85        x1.27
>>
>> Where bs refers to block size and b/u to bytes per update.
>>
>> Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
>> Cc: Andy Polyakov <appro@openssl.org>
>
> Your patch didn't make it to the linux-crypto list and therefore
> it never got into patchwork.  Can you please find out why and
> resend it?
>

Most likely because it is so big ...

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224
  2015-03-23 18:26     ` Ard Biesheuvel
@ 2015-03-24 11:35       ` Herbert Xu
  -1 siblings, 0 replies; 66+ messages in thread
From: Herbert Xu @ 2015-03-24 11:35 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Sami Tolvanen, linux-arm-kernel, linux-crypto, David S. Miller,
	Andy Polyakov

On Mon, Mar 23, 2015 at 07:26:03PM +0100, Ard Biesheuvel wrote:
> (resending due to size bounce)

Aha that's why the patch didn't make it through.  Can it be split
up?

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCHv2] arm: crypto: Add optimized SHA-256/224
@ 2015-03-24 11:35       ` Herbert Xu
  0 siblings, 0 replies; 66+ messages in thread
From: Herbert Xu @ 2015-03-24 11:35 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Mar 23, 2015 at 07:26:03PM +0100, Ard Biesheuvel wrote:
> (resending due to size bounce)

Aha that's why the patch didn't make it through.  Can it be split
up?

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224
  2015-03-24 11:35       ` Herbert Xu
@ 2015-03-24 11:40         ` Ard Biesheuvel
  -1 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-24 11:40 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Sami Tolvanen, linux-arm-kernel, linux-crypto, David S. Miller,
	Andy Polyakov

On 24 March 2015 at 12:35, Herbert Xu <herbert@gondor.apana.org.au> wrote:
> On Mon, Mar 23, 2015 at 07:26:03PM +0100, Ard Biesheuvel wrote:
>> (resending due to size bounce)
>
> Aha that's why the patch didn't make it through.  Can it be split
> up?

Not so easily. It consists (among other things) of a .pl file that
generates a .S file, but to prevent introducing a build time
dependency on perl, the .S file is included as a .S_shipped file. That
is the big one.

i suppose we could add the .S_shipped file in a separate patch, but
I'd prefer to keep it as is

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCHv2] arm: crypto: Add optimized SHA-256/224
@ 2015-03-24 11:40         ` Ard Biesheuvel
  0 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-24 11:40 UTC (permalink / raw)
  To: linux-arm-kernel

On 24 March 2015 at 12:35, Herbert Xu <herbert@gondor.apana.org.au> wrote:
> On Mon, Mar 23, 2015 at 07:26:03PM +0100, Ard Biesheuvel wrote:
>> (resending due to size bounce)
>
> Aha that's why the patch didn't make it through.  Can it be split
> up?

Not so easily. It consists (among other things) of a .pl file that
generates a .S file, but to prevent introducing a build time
dependency on perl, the .S file is included as a .S_shipped file. That
is the big one.

i suppose we could add the .S_shipped file in a separate patch, but
I'd prefer to keep it as is

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224
  2015-03-24 11:40         ` Ard Biesheuvel
@ 2015-03-24 11:46           ` Herbert Xu
  -1 siblings, 0 replies; 66+ messages in thread
From: Herbert Xu @ 2015-03-24 11:46 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Sami Tolvanen, linux-arm-kernel, linux-crypto, David S. Miller,
	Andy Polyakov

On Tue, Mar 24, 2015 at 12:40:50PM +0100, Ard Biesheuvel wrote:
>
> Not so easily. It consists (among other things) of a .pl file that
> generates a .S file, but to prevent introducing a build time
> dependency on perl, the .S file is included as a .S_shipped file. That
> is the big one.
> 
> i suppose we could add the .S_shipped file in a separate patch, but
> I'd prefer to keep it as is

OK then this will have to go up on a website somewhere and then
the URL can be posted to the list for review.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCHv2] arm: crypto: Add optimized SHA-256/224
@ 2015-03-24 11:46           ` Herbert Xu
  0 siblings, 0 replies; 66+ messages in thread
From: Herbert Xu @ 2015-03-24 11:46 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Mar 24, 2015 at 12:40:50PM +0100, Ard Biesheuvel wrote:
>
> Not so easily. It consists (among other things) of a .pl file that
> generates a .S file, but to prevent introducing a build time
> dependency on perl, the .S file is included as a .S_shipped file. That
> is the big one.
> 
> i suppose we could add the .S_shipped file in a separate patch, but
> I'd prefer to keep it as is

OK then this will have to go up on a website somewhere and then
the URL can be posted to the list for review.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224
  2015-03-24 11:46           ` Herbert Xu
@ 2015-03-24 11:57             ` Ard Biesheuvel
  -1 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-24 11:57 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Sami Tolvanen, linux-arm-kernel, linux-crypto, David S. Miller,
	Andy Polyakov

On 24 March 2015 at 12:46, Herbert Xu <herbert@gondor.apana.org.au> wrote:
> On Tue, Mar 24, 2015 at 12:40:50PM +0100, Ard Biesheuvel wrote:
>>
>> Not so easily. It consists (among other things) of a .pl file that
>> generates a .S file, but to prevent introducing a build time
>> dependency on perl, the .S file is included as a .S_shipped file. That
>> is the big one.
>>
>> i suppose we could add the .S_shipped file in a separate patch, but
>> I'd prefer to keep it as is
>
> OK then this will have to go up on a website somewhere and then
> the URL can be posted to the list for review.
>

You can pull it from here if you like
https://git.linaro.org/people/ard.biesheuvel/linux-arm.git/shortlog/refs/tags/arm-sha256-neon
(rebased onto your cryptodev-2.6/master branch as of 30 mins ago)

Does that work or you?

Thanks,
Ard.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCHv2] arm: crypto: Add optimized SHA-256/224
@ 2015-03-24 11:57             ` Ard Biesheuvel
  0 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-24 11:57 UTC (permalink / raw)
  To: linux-arm-kernel

On 24 March 2015 at 12:46, Herbert Xu <herbert@gondor.apana.org.au> wrote:
> On Tue, Mar 24, 2015 at 12:40:50PM +0100, Ard Biesheuvel wrote:
>>
>> Not so easily. It consists (among other things) of a .pl file that
>> generates a .S file, but to prevent introducing a build time
>> dependency on perl, the .S file is included as a .S_shipped file. That
>> is the big one.
>>
>> i suppose we could add the .S_shipped file in a separate patch, but
>> I'd prefer to keep it as is
>
> OK then this will have to go up on a website somewhere and then
> the URL can be posted to the list for review.
>

You can pull it from here if you like
https://git.linaro.org/people/ard.biesheuvel/linux-arm.git/shortlog/refs/tags/arm-sha256-neon
(rebased onto your cryptodev-2.6/master branch as of 30 mins ago)

Does that work or you?

Thanks,
Ard.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224
  2015-03-23 13:50   ` Sami Tolvanen
@ 2015-03-24 12:27     ` Jean-Christophe PLAGNIOL-VILLARD
  -1 siblings, 0 replies; 66+ messages in thread
From: Jean-Christophe PLAGNIOL-VILLARD @ 2015-03-24 12:27 UTC (permalink / raw)
  To: Sami Tolvanen
  Cc: herbert, ard.biesheuvel, linux-crypto, Andy Polyakov, davem,
	linux-arm-kernel

On 13:50 Mon 23 Mar     , Sami Tolvanen wrote:
> Add Andy Polyakov's optimized assembly and NEON implementations for
> SHA-256/224.
> 
> The sha256-armv4.pl script for generating the assembly code is from
> OpenSSL commit 2ecd32a1f8f0643ae7b38f59bbaf9f0d6ef326fe.
> 
> Compared to sha256-generic these implementations have the following
> tcrypt speed improvements on Motorola Nexus 6 (Snapdragon 805):
> 
>   bs    b/u      sha256-neon  sha256-asm
>   16    16       x1.32        x1.19
>   64    16       x1.27        x1.15
>   64    64       x1.36        x1.20
>   256   16       x1.22        x1.11
>   256   64       x1.36        x1.19
>   256   256      x1.59        x1.23
>   1024  16       x1.21        x1.10
>   1024  256      x1.65        x1.23
>   1024  1024     x1.76        x1.25
>   2048  16       x1.21        x1.10
>   2048  256      x1.66        x1.23
>   2048  1024     x1.78        x1.25
>   2048  2048     x1.79        x1.25
>   4096  16       x1.20        x1.09
>   4096  256      x1.66        x1.23
>   4096  1024     x1.79        x1.26
>   4096  4096     x1.82        x1.26
>   8192  16       x1.20        x1.09
>   8192  256      x1.67        x1.23
>   8192  1024     x1.80        x1.26
>   8192  4096     x1.85        x1.28
>   8192  8192     x1.85        x1.27
> 
> Where bs refers to block size and b/u to bytes per update.
> 
> Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
> Cc: Andy Polyakov <appro@openssl.org>
> 
> ---
> Changes since v1:
>   Rebased to Herbert's cryptodev tree
>   Include sha256-armv4.pl and use it to generate sha256-core.S
>   Add integer-only assembly version as sha256-asm
>   Add support for SHA-224 to the glue code
>   Change priority for sha256/224-ce to 300
> 
> ---
>  arch/arm/crypto/Kconfig               |    7 
>  arch/arm/crypto/Makefile              |    8 
>  arch/arm/crypto/sha2-ce-glue.c        |    4 
>  arch/arm/crypto/sha256-armv4.pl       |  713 ++++++
>  arch/arm/crypto/sha256-core.S_shipped | 2775 ++++++++++++++++++++++++
>  arch/arm/crypto/sha256_glue.c         |  246 ++
>  arch/arm/crypto/sha256_glue.h         |   23 
>  arch/arm/crypto/sha256_neon_glue.c    |  172 +
>  8 files changed, 3945 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
> index d63f319..458729d 100644
> --- a/arch/arm/crypto/Kconfig
> +++ b/arch/arm/crypto/Kconfig
> @@ -46,6 +46,13 @@ config CRYPTO_SHA2_ARM_CE
>  	  SHA-256 secure hash standard (DFIPS 180-2) implemented
>  	  using special ARMv8 Crypto Extensions.
>  
> +config CRYPTO_SHA256_ARM
> +	tristate "SHA-224/256 digest algorithm (ARM-asm and NEON)"
> +	select CRYPTO_HASH
> +	help
> +	  SHA-256 secure hash standard (DFIPS 180-2) implemented
> +	  using optimized ARM assembler and NEON, when available.
> +
>  config CRYPTO_SHA512_ARM_NEON
>  	tristate "SHA384 and SHA512 digest algorithm (ARM NEON)"
>  	depends on KERNEL_MODE_NEON
> diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
> index 9a273bd..ef46e89 100644
> --- a/arch/arm/crypto/Makefile
> +++ b/arch/arm/crypto/Makefile
> @@ -7,6 +7,7 @@ obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
>  obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
>  obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
>  obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
> +obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
>  obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
>  obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
>  obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o
> @@ -16,6 +17,8 @@ aes-arm-y	:= aes-armv4.o aes_glue.o
>  aes-arm-bs-y	:= aesbs-core.o aesbs-glue.o
>  sha1-arm-y	:= sha1-armv4-large.o sha1_glue.o
>  sha1-arm-neon-y	:= sha1-armv7-neon.o sha1_neon_glue.o
> +sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
> +sha256-arm-y	:= sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
>  sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o
>  sha1-arm-ce-y	:= sha1-ce-core.o sha1-ce-glue.o
>  sha2-arm-ce-y	:= sha2-ce-core.o sha2-ce-glue.o
> @@ -28,4 +31,7 @@ quiet_cmd_perl = PERL    $@
>  $(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
>  	$(call cmd,perl)
>  
> -.PRECIOUS: $(obj)/aesbs-core.S
> +$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
> +	$(call cmd,perl)
> +
> +.PRECIOUS: $(obj)/aesbs-core.S $(obj)/sha256-core.S
> diff --git a/arch/arm/crypto/sha2-ce-glue.c b/arch/arm/crypto/sha2-ce-glue.c
> index 9ffe8ad..0449eca 100644
> --- a/arch/arm/crypto/sha2-ce-glue.c
> +++ b/arch/arm/crypto/sha2-ce-glue.c
> @@ -163,7 +163,7 @@ static struct shash_alg algs[] = { {
>  	.base			= {
>  		.cra_name		= "sha224",
>  		.cra_driver_name	= "sha224-ce",
> -		.cra_priority		= 200,
> +		.cra_priority		= 300,
>  		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
>  		.cra_blocksize		= SHA256_BLOCK_SIZE,
>  		.cra_module		= THIS_MODULE,
> @@ -180,7 +180,7 @@ static struct shash_alg algs[] = { {
>  	.base			= {
>  		.cra_name		= "sha256",
>  		.cra_driver_name	= "sha256-ce",
> -		.cra_priority		= 200,
> +		.cra_priority		= 300,
>  		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
>  		.cra_blocksize		= SHA256_BLOCK_SIZE,
>  		.cra_module		= THIS_MODULE,
> diff --git a/arch/arm/crypto/sha256-armv4.pl b/arch/arm/crypto/sha256-armv4.pl
> new file mode 100644
> index 0000000..4fee74d
> --- /dev/null
> +++ b/arch/arm/crypto/sha256-armv4.pl
> @@ -0,0 +1,713 @@
> +#!/usr/bin/env perl
> +
> +# ====================================================================
> +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
> +# project. The module is, however, dual licensed under OpenSSL and
> +# CRYPTOGAMS licenses depending on where you obtain it. For further
> +# details see http://www.openssl.org/~appro/cryptogams/.
> +#
> +# Permission to use under GPL terms is granted.
> +# ====================================================================
> +
> +# SHA256 block procedure for ARMv4. May 2007.
> +
> +# Performance is ~2x better than gcc 3.4 generated code and in "abso-
> +# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
> +# byte [on single-issue Xscale PXA250 core].
> +
> +# July 2010.
> +#
> +# Rescheduling for dual-issue pipeline resulted in 22% improvement on
> +# Cortex A8 core and ~20 cycles per processed byte.
> +
> +# February 2011.
> +#
> +# Profiler-assisted and platform-specific optimization resulted in 16%
> +# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
> +
> +# September 2013.
> +#
> +# Add NEON implementation. On Cortex A8 it was measured to process one
> +# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
> +# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
> +# code (meaning that latter performs sub-optimally, nothing was done
> +# about it).
> +
> +# May 2014.
> +#
> +# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
> +
> +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
> +open STDOUT,">$output";
> +
> +$ctx="r0";	$t0="r0";
> +$inp="r1";	$t4="r1";
> +$len="r2";	$t1="r2";
> +$T1="r3";	$t3="r3";
> +$A="r4";
> +$B="r5";
> +$C="r6";
> +$D="r7";
> +$E="r8";
> +$F="r9";
> +$G="r10";
> +$H="r11";
> +@V=($A,$B,$C,$D,$E,$F,$G,$H);
> +$t2="r12";
> +$Ktbl="r14";
> +
> +@Sigma0=( 2,13,22);
> +@Sigma1=( 6,11,25);
> +@sigma0=( 7,18, 3);
> +@sigma1=(17,19,10);
> +
> +sub BODY_00_15 {
> +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
> +
> +$code.=<<___ if ($i<16);
> +#if __ARM_ARCH__>=7
> +	@ ldr	$t1,[$inp],#4			@ $i
> +# if $i==15
> +	str	$inp,[sp,#17*4]			@ make room for $t4
> +# endif
> +	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
> +	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
> +	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
> +	rev	$t1,$t1
> +#else
> +	@ ldrb	$t1,[$inp,#3]			@ $i
> +	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
> +	ldrb	$t2,[$inp,#2]
> +	ldrb	$t0,[$inp,#1]
> +	orr	$t1,$t1,$t2,lsl#8
> +	ldrb	$t2,[$inp],#4
> +	orr	$t1,$t1,$t0,lsl#16
> +# if $i==15
> +	str	$inp,[sp,#17*4]			@ make room for $t4
> +# endif
> +	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
> +	orr	$t1,$t1,$t2,lsl#24
> +	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
> +#endif
> +___
> +$code.=<<___;
> +	ldr	$t2,[$Ktbl],#4			@ *K256++
> +	add	$h,$h,$t1			@ h+=X[i]
> +	str	$t1,[sp,#`$i%16`*4]
> +	eor	$t1,$f,$g
> +	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
> +	and	$t1,$t1,$e
> +	add	$h,$h,$t2			@ h+=K256[i]
> +	eor	$t1,$t1,$g			@ Ch(e,f,g)
> +	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
> +	add	$h,$h,$t1			@ h+=Ch(e,f,g)
> +#if $i==31
> +	and	$t2,$t2,#0xff
> +	cmp	$t2,#0xf2			@ done?
> +#endif
> +#if $i<15
> +# if __ARM_ARCH__>=7
> +	ldr	$t1,[$inp],#4			@ prefetch
> +# else
> +	ldrb	$t1,[$inp,#3]
> +# endif
> +	eor	$t2,$a,$b			@ a^b, b^c in next round
> +#else
> +	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
> +	eor	$t2,$a,$b			@ a^b, b^c in next round
> +	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
> +#endif
> +	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
> +	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
> +	add	$d,$d,$h			@ d+=h
> +	eor	$t3,$t3,$b			@ Maj(a,b,c)
> +	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
> +	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
> +___
> +	($t2,$t3)=($t3,$t2);
> +}
> +
> +sub BODY_16_XX {
> +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
> +
> +$code.=<<___;
> +	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
> +	@ ldr	$t4,[sp,#`($i+14)%16`*4]
> +	mov	$t0,$t1,ror#$sigma0[0]
> +	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
> +	mov	$t2,$t4,ror#$sigma1[0]
> +	eor	$t0,$t0,$t1,ror#$sigma0[1]
> +	eor	$t2,$t2,$t4,ror#$sigma1[1]
> +	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
> +	ldr	$t1,[sp,#`($i+0)%16`*4]
> +	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
> +	ldr	$t4,[sp,#`($i+9)%16`*4]
> +
> +	add	$t2,$t2,$t0
> +	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
> +	add	$t1,$t1,$t2
> +	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
> +	add	$t1,$t1,$t4			@ X[i]
> +___
> +	&BODY_00_15(@_);
> +}
> +
> +$code=<<___;
> +#ifndef __KERNEL__
> +# include "arm_arch.h"
> +#else
> +# define __ARM_ARCH__ __LINUX_ARM_ARCH__
> +# define __ARM_MAX_ARCH__ 7
I'm not sure this will work for kernel that is for older arm
> +#endif
> +
> +.text
> +#if __ARM_ARCH__<7
> +.code	32
> +#else
> +.syntax unified
> +# ifdef __thumb2__
> +.thumb
> +# else
> +.code   32
> +# endif
> +#endif
> +
> +.type	K256,%object
> +.align	5
> +K256:
> +.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
> +.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
> +.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
> +.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
> +.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
> +.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
> +.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
> +.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
> +.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
> +.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
> +.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
> +.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
> +.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
> +.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
> +.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
> +.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
> +.size	K256,.-K256
> +.word	0				@ terminator
> +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
> +.LOPENSSL_armcap:
> +.word	OPENSSL_armcap_P-sha256_block_data_order
> +#endif
> +.align	5
> +
> +.global	sha256_block_data_order
> +.type	sha256_block_data_order,%function
> +sha256_block_data_order:
> +#if __ARM_ARCH__<7
> +	sub	r3,pc,#8		@ sha256_block_data_order
> +#else
> +	adr	r3,sha256_block_data_order
> +#endif
> +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
> +	ldr	r12,.LOPENSSL_armcap
> +	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
> +	tst	r12,#ARMV8_SHA256
> +	bne	.LARMv8
> +	tst	r12,#ARMV7_NEON
> +	bne	.LNEON
> +#endif
> +	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
> +	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
> +	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
> +	sub	$Ktbl,r3,#256+32	@ K256
> +	sub	sp,sp,#16*4		@ alloca(X[16])
> +.Loop:
> +# if __ARM_ARCH__>=7
> +	ldr	$t1,[$inp],#4
> +# else
> +	ldrb	$t1,[$inp,#3]
> +# endif
> +	eor	$t3,$B,$C		@ magic
> +	eor	$t2,$t2,$t2
> +___
> +for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
> +$code.=".Lrounds_16_xx:\n";
> +for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
> +$code.=<<___;
> +#if __ARM_ARCH__>=7
> +	ite	eq			@ Thumb2 thing, sanity check in ARM
> +#endif
> +	ldreq	$t3,[sp,#16*4]		@ pull ctx
> +	bne	.Lrounds_16_xx
> +
> +	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
> +	ldr	$t0,[$t3,#0]
> +	ldr	$t1,[$t3,#4]
> +	ldr	$t2,[$t3,#8]
> +	add	$A,$A,$t0
> +	ldr	$t0,[$t3,#12]
> +	add	$B,$B,$t1
> +	ldr	$t1,[$t3,#16]
> +	add	$C,$C,$t2
> +	ldr	$t2,[$t3,#20]
> +	add	$D,$D,$t0
> +	ldr	$t0,[$t3,#24]
> +	add	$E,$E,$t1
> +	ldr	$t1,[$t3,#28]
> +	add	$F,$F,$t2
> +	ldr	$inp,[sp,#17*4]		@ pull inp
> +	ldr	$t2,[sp,#18*4]		@ pull inp+len
> +	add	$G,$G,$t0
> +	add	$H,$H,$t1
> +	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
> +	cmp	$inp,$t2
> +	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
> +	bne	.Loop
> +
> +	add	sp,sp,#`16+3`*4	@ destroy frame
> +#if __ARM_ARCH__>=5
> +	ldmia	sp!,{r4-r11,pc}
> +#else
> +	ldmia	sp!,{r4-r11,lr}
> +	tst	lr,#1
> +	moveq	pc,lr			@ be binary compatible with V4, yet
> +	bx	lr			@ interoperable with Thumb ISA:-)
> +#endif
> +.size	sha256_block_data_order,.-sha256_block_data_order
> +___
> +######################################################################
> +# NEON stuff
> +#
> +{{{
> +my @X=map("q$_",(0..3));
> +my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
> +my $Xfer=$t4;
> +my $j=0;
> +
> +sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
> +sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
> +
> +sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
> +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
> +  my $arg = pop;
> +    $arg = "#$arg" if ($arg*1 eq $arg);
> +    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
> +}
> +
> +sub Xupdate()
> +{ use integer;
> +  my $body = shift;
> +  my @insns = (&$body,&$body,&$body,&$body);
> +  my ($a,$b,$c,$d,$e,$f,$g,$h);
> +
> +	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vshr_u32	($T2,$T0,$sigma0[0]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vshr_u32	($T1,$T0,$sigma0[2]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vsli_32	($T2,$T0,32-$sigma0[0]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vshr_u32	($T3,$T0,$sigma0[1]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&veor		($T1,$T1,$T2);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vsli_32	($T3,$T0,32-$sigma0[1]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &veor		($T5,$T5,$T4);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &veor		($T5,$T5,$T4);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vld1_32	("{$T0}","[$Ktbl,:128]!");
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vadd_i32	($T0,$T0,@X[0]);
> +	 while($#insns>=2) { eval(shift(@insns)); }
> +	&vst1_32	("{$T0}","[$Xfer,:128]!");
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +
> +	push(@X,shift(@X));		# "rotate" X[]
> +}
> +
> +sub Xpreload()
> +{ use integer;
> +  my $body = shift;
> +  my @insns = (&$body,&$body,&$body,&$body);
> +  my ($a,$b,$c,$d,$e,$f,$g,$h);
> +
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vld1_32	("{$T0}","[$Ktbl,:128]!");
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vrev32_8	(@X[0],@X[0]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vadd_i32	($T0,$T0,@X[0]);
> +	 foreach (@insns) { eval; }	# remaining instructions
> +	&vst1_32	("{$T0}","[$Xfer,:128]!");
> +
> +	push(@X,shift(@X));		# "rotate" X[]
> +}
> +
> +sub body_00_15 () {
> +	(
> +	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
> +	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
> +	'&eor	($t1,$f,$g)',
> +	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
> +	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
> +	'&and	($t1,$t1,$e)',
> +	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
> +	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
> +	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
> +	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
> +	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
> +	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
> +	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
> +	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
> +	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
> +	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
> +	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
> +	'&add	($d,$d,$h)',			# d+=h
> +	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
> +	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
> +	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
> +	)
> +}
> +
> +$code.=<<___;
> +#if __ARM_MAX_ARCH__>=7
this will be compile on armv4 but gcc will not allow it

we need to drop the neon code for older non v7 build

Best Regards,
J.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCHv2] arm: crypto: Add optimized SHA-256/224
@ 2015-03-24 12:27     ` Jean-Christophe PLAGNIOL-VILLARD
  0 siblings, 0 replies; 66+ messages in thread
From: Jean-Christophe PLAGNIOL-VILLARD @ 2015-03-24 12:27 UTC (permalink / raw)
  To: linux-arm-kernel

On 13:50 Mon 23 Mar     , Sami Tolvanen wrote:
> Add Andy Polyakov's optimized assembly and NEON implementations for
> SHA-256/224.
> 
> The sha256-armv4.pl script for generating the assembly code is from
> OpenSSL commit 2ecd32a1f8f0643ae7b38f59bbaf9f0d6ef326fe.
> 
> Compared to sha256-generic these implementations have the following
> tcrypt speed improvements on Motorola Nexus 6 (Snapdragon 805):
> 
>   bs    b/u      sha256-neon  sha256-asm
>   16    16       x1.32        x1.19
>   64    16       x1.27        x1.15
>   64    64       x1.36        x1.20
>   256   16       x1.22        x1.11
>   256   64       x1.36        x1.19
>   256   256      x1.59        x1.23
>   1024  16       x1.21        x1.10
>   1024  256      x1.65        x1.23
>   1024  1024     x1.76        x1.25
>   2048  16       x1.21        x1.10
>   2048  256      x1.66        x1.23
>   2048  1024     x1.78        x1.25
>   2048  2048     x1.79        x1.25
>   4096  16       x1.20        x1.09
>   4096  256      x1.66        x1.23
>   4096  1024     x1.79        x1.26
>   4096  4096     x1.82        x1.26
>   8192  16       x1.20        x1.09
>   8192  256      x1.67        x1.23
>   8192  1024     x1.80        x1.26
>   8192  4096     x1.85        x1.28
>   8192  8192     x1.85        x1.27
> 
> Where bs refers to block size and b/u to bytes per update.
> 
> Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
> Cc: Andy Polyakov <appro@openssl.org>
> 
> ---
> Changes since v1:
>   Rebased to Herbert's cryptodev tree
>   Include sha256-armv4.pl and use it to generate sha256-core.S
>   Add integer-only assembly version as sha256-asm
>   Add support for SHA-224 to the glue code
>   Change priority for sha256/224-ce to 300
> 
> ---
>  arch/arm/crypto/Kconfig               |    7 
>  arch/arm/crypto/Makefile              |    8 
>  arch/arm/crypto/sha2-ce-glue.c        |    4 
>  arch/arm/crypto/sha256-armv4.pl       |  713 ++++++
>  arch/arm/crypto/sha256-core.S_shipped | 2775 ++++++++++++++++++++++++
>  arch/arm/crypto/sha256_glue.c         |  246 ++
>  arch/arm/crypto/sha256_glue.h         |   23 
>  arch/arm/crypto/sha256_neon_glue.c    |  172 +
>  8 files changed, 3945 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
> index d63f319..458729d 100644
> --- a/arch/arm/crypto/Kconfig
> +++ b/arch/arm/crypto/Kconfig
> @@ -46,6 +46,13 @@ config CRYPTO_SHA2_ARM_CE
>  	  SHA-256 secure hash standard (DFIPS 180-2) implemented
>  	  using special ARMv8 Crypto Extensions.
>  
> +config CRYPTO_SHA256_ARM
> +	tristate "SHA-224/256 digest algorithm (ARM-asm and NEON)"
> +	select CRYPTO_HASH
> +	help
> +	  SHA-256 secure hash standard (DFIPS 180-2) implemented
> +	  using optimized ARM assembler and NEON, when available.
> +
>  config CRYPTO_SHA512_ARM_NEON
>  	tristate "SHA384 and SHA512 digest algorithm (ARM NEON)"
>  	depends on KERNEL_MODE_NEON
> diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
> index 9a273bd..ef46e89 100644
> --- a/arch/arm/crypto/Makefile
> +++ b/arch/arm/crypto/Makefile
> @@ -7,6 +7,7 @@ obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
>  obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
>  obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
>  obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
> +obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
>  obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
>  obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
>  obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o
> @@ -16,6 +17,8 @@ aes-arm-y	:= aes-armv4.o aes_glue.o
>  aes-arm-bs-y	:= aesbs-core.o aesbs-glue.o
>  sha1-arm-y	:= sha1-armv4-large.o sha1_glue.o
>  sha1-arm-neon-y	:= sha1-armv7-neon.o sha1_neon_glue.o
> +sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
> +sha256-arm-y	:= sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
>  sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o
>  sha1-arm-ce-y	:= sha1-ce-core.o sha1-ce-glue.o
>  sha2-arm-ce-y	:= sha2-ce-core.o sha2-ce-glue.o
> @@ -28,4 +31,7 @@ quiet_cmd_perl = PERL    $@
>  $(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
>  	$(call cmd,perl)
>  
> -.PRECIOUS: $(obj)/aesbs-core.S
> +$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
> +	$(call cmd,perl)
> +
> +.PRECIOUS: $(obj)/aesbs-core.S $(obj)/sha256-core.S
> diff --git a/arch/arm/crypto/sha2-ce-glue.c b/arch/arm/crypto/sha2-ce-glue.c
> index 9ffe8ad..0449eca 100644
> --- a/arch/arm/crypto/sha2-ce-glue.c
> +++ b/arch/arm/crypto/sha2-ce-glue.c
> @@ -163,7 +163,7 @@ static struct shash_alg algs[] = { {
>  	.base			= {
>  		.cra_name		= "sha224",
>  		.cra_driver_name	= "sha224-ce",
> -		.cra_priority		= 200,
> +		.cra_priority		= 300,
>  		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
>  		.cra_blocksize		= SHA256_BLOCK_SIZE,
>  		.cra_module		= THIS_MODULE,
> @@ -180,7 +180,7 @@ static struct shash_alg algs[] = { {
>  	.base			= {
>  		.cra_name		= "sha256",
>  		.cra_driver_name	= "sha256-ce",
> -		.cra_priority		= 200,
> +		.cra_priority		= 300,
>  		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
>  		.cra_blocksize		= SHA256_BLOCK_SIZE,
>  		.cra_module		= THIS_MODULE,
> diff --git a/arch/arm/crypto/sha256-armv4.pl b/arch/arm/crypto/sha256-armv4.pl
> new file mode 100644
> index 0000000..4fee74d
> --- /dev/null
> +++ b/arch/arm/crypto/sha256-armv4.pl
> @@ -0,0 +1,713 @@
> +#!/usr/bin/env perl
> +
> +# ====================================================================
> +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
> +# project. The module is, however, dual licensed under OpenSSL and
> +# CRYPTOGAMS licenses depending on where you obtain it. For further
> +# details see http://www.openssl.org/~appro/cryptogams/.
> +#
> +# Permission to use under GPL terms is granted.
> +# ====================================================================
> +
> +# SHA256 block procedure for ARMv4. May 2007.
> +
> +# Performance is ~2x better than gcc 3.4 generated code and in "abso-
> +# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
> +# byte [on single-issue Xscale PXA250 core].
> +
> +# July 2010.
> +#
> +# Rescheduling for dual-issue pipeline resulted in 22% improvement on
> +# Cortex A8 core and ~20 cycles per processed byte.
> +
> +# February 2011.
> +#
> +# Profiler-assisted and platform-specific optimization resulted in 16%
> +# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
> +
> +# September 2013.
> +#
> +# Add NEON implementation. On Cortex A8 it was measured to process one
> +# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
> +# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
> +# code (meaning that latter performs sub-optimally, nothing was done
> +# about it).
> +
> +# May 2014.
> +#
> +# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
> +
> +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
> +open STDOUT,">$output";
> +
> +$ctx="r0";	$t0="r0";
> +$inp="r1";	$t4="r1";
> +$len="r2";	$t1="r2";
> +$T1="r3";	$t3="r3";
> +$A="r4";
> +$B="r5";
> +$C="r6";
> +$D="r7";
> +$E="r8";
> +$F="r9";
> +$G="r10";
> +$H="r11";
> + at V=($A,$B,$C,$D,$E,$F,$G,$H);
> +$t2="r12";
> +$Ktbl="r14";
> +
> + at Sigma0=( 2,13,22);
> + at Sigma1=( 6,11,25);
> + at sigma0=( 7,18, 3);
> + at sigma1=(17,19,10);
> +
> +sub BODY_00_15 {
> +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
> +
> +$code.=<<___ if ($i<16);
> +#if __ARM_ARCH__>=7
> +	@ ldr	$t1,[$inp],#4			@ $i
> +# if $i==15
> +	str	$inp,[sp,#17*4]			@ make room for $t4
> +# endif
> +	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
> +	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
> +	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
> +	rev	$t1,$t1
> +#else
> +	@ ldrb	$t1,[$inp,#3]			@ $i
> +	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
> +	ldrb	$t2,[$inp,#2]
> +	ldrb	$t0,[$inp,#1]
> +	orr	$t1,$t1,$t2,lsl#8
> +	ldrb	$t2,[$inp],#4
> +	orr	$t1,$t1,$t0,lsl#16
> +# if $i==15
> +	str	$inp,[sp,#17*4]			@ make room for $t4
> +# endif
> +	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
> +	orr	$t1,$t1,$t2,lsl#24
> +	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
> +#endif
> +___
> +$code.=<<___;
> +	ldr	$t2,[$Ktbl],#4			@ *K256++
> +	add	$h,$h,$t1			@ h+=X[i]
> +	str	$t1,[sp,#`$i%16`*4]
> +	eor	$t1,$f,$g
> +	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
> +	and	$t1,$t1,$e
> +	add	$h,$h,$t2			@ h+=K256[i]
> +	eor	$t1,$t1,$g			@ Ch(e,f,g)
> +	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
> +	add	$h,$h,$t1			@ h+=Ch(e,f,g)
> +#if $i==31
> +	and	$t2,$t2,#0xff
> +	cmp	$t2,#0xf2			@ done?
> +#endif
> +#if $i<15
> +# if __ARM_ARCH__>=7
> +	ldr	$t1,[$inp],#4			@ prefetch
> +# else
> +	ldrb	$t1,[$inp,#3]
> +# endif
> +	eor	$t2,$a,$b			@ a^b, b^c in next round
> +#else
> +	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
> +	eor	$t2,$a,$b			@ a^b, b^c in next round
> +	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
> +#endif
> +	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
> +	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
> +	add	$d,$d,$h			@ d+=h
> +	eor	$t3,$t3,$b			@ Maj(a,b,c)
> +	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
> +	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
> +___
> +	($t2,$t3)=($t3,$t2);
> +}
> +
> +sub BODY_16_XX {
> +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
> +
> +$code.=<<___;
> +	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
> +	@ ldr	$t4,[sp,#`($i+14)%16`*4]
> +	mov	$t0,$t1,ror#$sigma0[0]
> +	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
> +	mov	$t2,$t4,ror#$sigma1[0]
> +	eor	$t0,$t0,$t1,ror#$sigma0[1]
> +	eor	$t2,$t2,$t4,ror#$sigma1[1]
> +	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
> +	ldr	$t1,[sp,#`($i+0)%16`*4]
> +	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
> +	ldr	$t4,[sp,#`($i+9)%16`*4]
> +
> +	add	$t2,$t2,$t0
> +	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
> +	add	$t1,$t1,$t2
> +	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
> +	add	$t1,$t1,$t4			@ X[i]
> +___
> +	&BODY_00_15(@_);
> +}
> +
> +$code=<<___;
> +#ifndef __KERNEL__
> +# include "arm_arch.h"
> +#else
> +# define __ARM_ARCH__ __LINUX_ARM_ARCH__
> +# define __ARM_MAX_ARCH__ 7
I'm not sure this will work for kernel that is for older arm
> +#endif
> +
> +.text
> +#if __ARM_ARCH__<7
> +.code	32
> +#else
> +.syntax unified
> +# ifdef __thumb2__
> +.thumb
> +# else
> +.code   32
> +# endif
> +#endif
> +
> +.type	K256,%object
> +.align	5
> +K256:
> +.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
> +.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
> +.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
> +.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
> +.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
> +.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
> +.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
> +.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
> +.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
> +.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
> +.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
> +.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
> +.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
> +.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
> +.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
> +.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
> +.size	K256,.-K256
> +.word	0				@ terminator
> +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
> +.LOPENSSL_armcap:
> +.word	OPENSSL_armcap_P-sha256_block_data_order
> +#endif
> +.align	5
> +
> +.global	sha256_block_data_order
> +.type	sha256_block_data_order,%function
> +sha256_block_data_order:
> +#if __ARM_ARCH__<7
> +	sub	r3,pc,#8		@ sha256_block_data_order
> +#else
> +	adr	r3,sha256_block_data_order
> +#endif
> +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
> +	ldr	r12,.LOPENSSL_armcap
> +	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
> +	tst	r12,#ARMV8_SHA256
> +	bne	.LARMv8
> +	tst	r12,#ARMV7_NEON
> +	bne	.LNEON
> +#endif
> +	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
> +	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
> +	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
> +	sub	$Ktbl,r3,#256+32	@ K256
> +	sub	sp,sp,#16*4		@ alloca(X[16])
> +.Loop:
> +# if __ARM_ARCH__>=7
> +	ldr	$t1,[$inp],#4
> +# else
> +	ldrb	$t1,[$inp,#3]
> +# endif
> +	eor	$t3,$B,$C		@ magic
> +	eor	$t2,$t2,$t2
> +___
> +for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
> +$code.=".Lrounds_16_xx:\n";
> +for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
> +$code.=<<___;
> +#if __ARM_ARCH__>=7
> +	ite	eq			@ Thumb2 thing, sanity check in ARM
> +#endif
> +	ldreq	$t3,[sp,#16*4]		@ pull ctx
> +	bne	.Lrounds_16_xx
> +
> +	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
> +	ldr	$t0,[$t3,#0]
> +	ldr	$t1,[$t3,#4]
> +	ldr	$t2,[$t3,#8]
> +	add	$A,$A,$t0
> +	ldr	$t0,[$t3,#12]
> +	add	$B,$B,$t1
> +	ldr	$t1,[$t3,#16]
> +	add	$C,$C,$t2
> +	ldr	$t2,[$t3,#20]
> +	add	$D,$D,$t0
> +	ldr	$t0,[$t3,#24]
> +	add	$E,$E,$t1
> +	ldr	$t1,[$t3,#28]
> +	add	$F,$F,$t2
> +	ldr	$inp,[sp,#17*4]		@ pull inp
> +	ldr	$t2,[sp,#18*4]		@ pull inp+len
> +	add	$G,$G,$t0
> +	add	$H,$H,$t1
> +	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
> +	cmp	$inp,$t2
> +	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
> +	bne	.Loop
> +
> +	add	sp,sp,#`16+3`*4	@ destroy frame
> +#if __ARM_ARCH__>=5
> +	ldmia	sp!,{r4-r11,pc}
> +#else
> +	ldmia	sp!,{r4-r11,lr}
> +	tst	lr,#1
> +	moveq	pc,lr			@ be binary compatible with V4, yet
> +	bx	lr			@ interoperable with Thumb ISA:-)
> +#endif
> +.size	sha256_block_data_order,.-sha256_block_data_order
> +___
> +######################################################################
> +# NEON stuff
> +#
> +{{{
> +my @X=map("q$_",(0..3));
> +my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
> +my $Xfer=$t4;
> +my $j=0;
> +
> +sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
> +sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
> +
> +sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
> +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
> +  my $arg = pop;
> +    $arg = "#$arg" if ($arg*1 eq $arg);
> +    $code .= "\t$opcode\t".join(',', at _,$arg)."\n";
> +}
> +
> +sub Xupdate()
> +{ use integer;
> +  my $body = shift;
> +  my @insns = (&$body,&$body,&$body,&$body);
> +  my ($a,$b,$c,$d,$e,$f,$g,$h);
> +
> +	&vext_8		($T0, at X[0], at X[1],4);	# X[1..4]
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vext_8		($T1, at X[2], at X[3],4);	# X[9..12]
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vshr_u32	($T2,$T0,$sigma0[0]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vadd_i32	(@X[0], at X[0],$T1);	# X[0..3] += X[9..12]
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vshr_u32	($T1,$T0,$sigma0[2]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vsli_32	($T2,$T0,32-$sigma0[0]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vshr_u32	($T3,$T0,$sigma0[1]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&veor		($T1,$T1,$T2);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vsli_32	($T3,$T0,32-$sigma0[1]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vadd_i32	(@X[0], at X[0],$T1);	# X[0..3] += sigma0(X[1..4])
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &veor		($T5,$T5,$T4);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &veor		($T5,$T5,$T4);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vld1_32	("{$T0}","[$Ktbl,:128]!");
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vadd_i32	($T0,$T0, at X[0]);
> +	 while($#insns>=2) { eval(shift(@insns)); }
> +	&vst1_32	("{$T0}","[$Xfer,:128]!");
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +
> +	push(@X,shift(@X));		# "rotate" X[]
> +}
> +
> +sub Xpreload()
> +{ use integer;
> +  my $body = shift;
> +  my @insns = (&$body,&$body,&$body,&$body);
> +  my ($a,$b,$c,$d,$e,$f,$g,$h);
> +
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vld1_32	("{$T0}","[$Ktbl,:128]!");
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vrev32_8	(@X[0], at X[0]);
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	 eval(shift(@insns));
> +	&vadd_i32	($T0,$T0, at X[0]);
> +	 foreach (@insns) { eval; }	# remaining instructions
> +	&vst1_32	("{$T0}","[$Xfer,:128]!");
> +
> +	push(@X,shift(@X));		# "rotate" X[]
> +}
> +
> +sub body_00_15 () {
> +	(
> +	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
> +	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
> +	'&eor	($t1,$f,$g)',
> +	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
> +	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
> +	'&and	($t1,$t1,$e)',
> +	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
> +	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
> +	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
> +	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
> +	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
> +	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
> +	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
> +	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
> +	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
> +	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
> +	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
> +	'&add	($d,$d,$h)',			# d+=h
> +	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
> +	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
> +	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
> +	)
> +}
> +
> +$code.=<<___;
> +#if __ARM_MAX_ARCH__>=7
this will be compile on armv4 but gcc will not allow it

we need to drop the neon code for older non v7 build

Best Regards,
J.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224
  2015-03-24 12:27     ` Jean-Christophe PLAGNIOL-VILLARD
@ 2015-03-24 12:42       ` Ard Biesheuvel
  -1 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-24 12:42 UTC (permalink / raw)
  To: Jean-Christophe PLAGNIOL-VILLARD
  Cc: Sami Tolvanen, linux-arm-kernel, linux-crypto, Herbert Xu,
	David S. Miller, Andy Polyakov

On 24 March 2015 at 13:27, Jean-Christophe PLAGNIOL-VILLARD
<plagnioj@jcrosoft.com> wrote:
> On 13:50 Mon 23 Mar     , Sami Tolvanen wrote:
>> Add Andy Polyakov's optimized assembly and NEON implementations for
>> SHA-256/224.
>>
>> The sha256-armv4.pl script for generating the assembly code is from
>> OpenSSL commit 2ecd32a1f8f0643ae7b38f59bbaf9f0d6ef326fe.
>>
>> Compared to sha256-generic these implementations have the following
>> tcrypt speed improvements on Motorola Nexus 6 (Snapdragon 805):
>>
>>   bs    b/u      sha256-neon  sha256-asm
>>   16    16       x1.32        x1.19
>>   64    16       x1.27        x1.15
>>   64    64       x1.36        x1.20
>>   256   16       x1.22        x1.11
>>   256   64       x1.36        x1.19
>>   256   256      x1.59        x1.23
>>   1024  16       x1.21        x1.10
>>   1024  256      x1.65        x1.23
>>   1024  1024     x1.76        x1.25
>>   2048  16       x1.21        x1.10
>>   2048  256      x1.66        x1.23
>>   2048  1024     x1.78        x1.25
>>   2048  2048     x1.79        x1.25
>>   4096  16       x1.20        x1.09
>>   4096  256      x1.66        x1.23
>>   4096  1024     x1.79        x1.26
>>   4096  4096     x1.82        x1.26
>>   8192  16       x1.20        x1.09
>>   8192  256      x1.67        x1.23
>>   8192  1024     x1.80        x1.26
>>   8192  4096     x1.85        x1.28
>>   8192  8192     x1.85        x1.27
>>
>> Where bs refers to block size and b/u to bytes per update.
>>
>> Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
>> Cc: Andy Polyakov <appro@openssl.org>
>>
>> ---
>> Changes since v1:
>>   Rebased to Herbert's cryptodev tree
>>   Include sha256-armv4.pl and use it to generate sha256-core.S
>>   Add integer-only assembly version as sha256-asm
>>   Add support for SHA-224 to the glue code
>>   Change priority for sha256/224-ce to 300
>>
>> ---
>>  arch/arm/crypto/Kconfig               |    7
>>  arch/arm/crypto/Makefile              |    8
>>  arch/arm/crypto/sha2-ce-glue.c        |    4
>>  arch/arm/crypto/sha256-armv4.pl       |  713 ++++++
>>  arch/arm/crypto/sha256-core.S_shipped | 2775 ++++++++++++++++++++++++
>>  arch/arm/crypto/sha256_glue.c         |  246 ++
>>  arch/arm/crypto/sha256_glue.h         |   23
>>  arch/arm/crypto/sha256_neon_glue.c    |  172 +
>>  8 files changed, 3945 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
>> index d63f319..458729d 100644
>> --- a/arch/arm/crypto/Kconfig
>> +++ b/arch/arm/crypto/Kconfig
>> @@ -46,6 +46,13 @@ config CRYPTO_SHA2_ARM_CE
>>         SHA-256 secure hash standard (DFIPS 180-2) implemented
>>         using special ARMv8 Crypto Extensions.
>>
>> +config CRYPTO_SHA256_ARM
>> +     tristate "SHA-224/256 digest algorithm (ARM-asm and NEON)"
>> +     select CRYPTO_HASH
>> +     help
>> +       SHA-256 secure hash standard (DFIPS 180-2) implemented
>> +       using optimized ARM assembler and NEON, when available.
>> +
>>  config CRYPTO_SHA512_ARM_NEON
>>       tristate "SHA384 and SHA512 digest algorithm (ARM NEON)"
>>       depends on KERNEL_MODE_NEON
>> diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
>> index 9a273bd..ef46e89 100644
>> --- a/arch/arm/crypto/Makefile
>> +++ b/arch/arm/crypto/Makefile
>> @@ -7,6 +7,7 @@ obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
>>  obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
>>  obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
>>  obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
>> +obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
>>  obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
>>  obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
>>  obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o
>> @@ -16,6 +17,8 @@ aes-arm-y   := aes-armv4.o aes_glue.o
>>  aes-arm-bs-y := aesbs-core.o aesbs-glue.o
>>  sha1-arm-y   := sha1-armv4-large.o sha1_glue.o
>>  sha1-arm-neon-y      := sha1-armv7-neon.o sha1_neon_glue.o
>> +sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
>> +sha256-arm-y := sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
>>  sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o
>>  sha1-arm-ce-y        := sha1-ce-core.o sha1-ce-glue.o
>>  sha2-arm-ce-y        := sha2-ce-core.o sha2-ce-glue.o
>> @@ -28,4 +31,7 @@ quiet_cmd_perl = PERL    $@
>>  $(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
>>       $(call cmd,perl)
>>
>> -.PRECIOUS: $(obj)/aesbs-core.S
>> +$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
>> +     $(call cmd,perl)
>> +
>> +.PRECIOUS: $(obj)/aesbs-core.S $(obj)/sha256-core.S
>> diff --git a/arch/arm/crypto/sha2-ce-glue.c b/arch/arm/crypto/sha2-ce-glue.c
>> index 9ffe8ad..0449eca 100644
>> --- a/arch/arm/crypto/sha2-ce-glue.c
>> +++ b/arch/arm/crypto/sha2-ce-glue.c
>> @@ -163,7 +163,7 @@ static struct shash_alg algs[] = { {
>>       .base                   = {
>>               .cra_name               = "sha224",
>>               .cra_driver_name        = "sha224-ce",
>> -             .cra_priority           = 200,
>> +             .cra_priority           = 300,
>>               .cra_flags              = CRYPTO_ALG_TYPE_SHASH,
>>               .cra_blocksize          = SHA256_BLOCK_SIZE,
>>               .cra_module             = THIS_MODULE,
>> @@ -180,7 +180,7 @@ static struct shash_alg algs[] = { {
>>       .base                   = {
>>               .cra_name               = "sha256",
>>               .cra_driver_name        = "sha256-ce",
>> -             .cra_priority           = 200,
>> +             .cra_priority           = 300,
>>               .cra_flags              = CRYPTO_ALG_TYPE_SHASH,
>>               .cra_blocksize          = SHA256_BLOCK_SIZE,
>>               .cra_module             = THIS_MODULE,
>> diff --git a/arch/arm/crypto/sha256-armv4.pl b/arch/arm/crypto/sha256-armv4.pl
>> new file mode 100644
>> index 0000000..4fee74d
>> --- /dev/null
>> +++ b/arch/arm/crypto/sha256-armv4.pl
>> @@ -0,0 +1,713 @@
>> +#!/usr/bin/env perl
>> +
>> +# ====================================================================
>> +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
>> +# project. The module is, however, dual licensed under OpenSSL and
>> +# CRYPTOGAMS licenses depending on where you obtain it. For further
>> +# details see http://www.openssl.org/~appro/cryptogams/.
>> +#
>> +# Permission to use under GPL terms is granted.
>> +# ====================================================================
>> +
>> +# SHA256 block procedure for ARMv4. May 2007.
>> +
>> +# Performance is ~2x better than gcc 3.4 generated code and in "abso-
>> +# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
>> +# byte [on single-issue Xscale PXA250 core].
>> +
>> +# July 2010.
>> +#
>> +# Rescheduling for dual-issue pipeline resulted in 22% improvement on
>> +# Cortex A8 core and ~20 cycles per processed byte.
>> +
>> +# February 2011.
>> +#
>> +# Profiler-assisted and platform-specific optimization resulted in 16%
>> +# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
>> +
>> +# September 2013.
>> +#
>> +# Add NEON implementation. On Cortex A8 it was measured to process one
>> +# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
>> +# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
>> +# code (meaning that latter performs sub-optimally, nothing was done
>> +# about it).
>> +
>> +# May 2014.
>> +#
>> +# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
>> +
>> +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
>> +open STDOUT,">$output";
>> +
>> +$ctx="r0";   $t0="r0";
>> +$inp="r1";   $t4="r1";
>> +$len="r2";   $t1="r2";
>> +$T1="r3";    $t3="r3";
>> +$A="r4";
>> +$B="r5";
>> +$C="r6";
>> +$D="r7";
>> +$E="r8";
>> +$F="r9";
>> +$G="r10";
>> +$H="r11";
>> +@V=($A,$B,$C,$D,$E,$F,$G,$H);
>> +$t2="r12";
>> +$Ktbl="r14";
>> +
>> +@Sigma0=( 2,13,22);
>> +@Sigma1=( 6,11,25);
>> +@sigma0=( 7,18, 3);
>> +@sigma1=(17,19,10);
>> +
>> +sub BODY_00_15 {
>> +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
>> +
>> +$code.=<<___ if ($i<16);
>> +#if __ARM_ARCH__>=7
>> +     @ ldr   $t1,[$inp],#4                   @ $i
>> +# if $i==15
>> +     str     $inp,[sp,#17*4]                 @ make room for $t4
>> +# endif
>> +     eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
>> +     add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
>> +     eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
>> +     rev     $t1,$t1
>> +#else
>> +     @ ldrb  $t1,[$inp,#3]                   @ $i
>> +     add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
>> +     ldrb    $t2,[$inp,#2]
>> +     ldrb    $t0,[$inp,#1]
>> +     orr     $t1,$t1,$t2,lsl#8
>> +     ldrb    $t2,[$inp],#4
>> +     orr     $t1,$t1,$t0,lsl#16
>> +# if $i==15
>> +     str     $inp,[sp,#17*4]                 @ make room for $t4
>> +# endif
>> +     eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
>> +     orr     $t1,$t1,$t2,lsl#24
>> +     eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
>> +#endif
>> +___
>> +$code.=<<___;
>> +     ldr     $t2,[$Ktbl],#4                  @ *K256++
>> +     add     $h,$h,$t1                       @ h+=X[i]
>> +     str     $t1,[sp,#`$i%16`*4]
>> +     eor     $t1,$f,$g
>> +     add     $h,$h,$t0,ror#$Sigma1[0]        @ h+=Sigma1(e)
>> +     and     $t1,$t1,$e
>> +     add     $h,$h,$t2                       @ h+=K256[i]
>> +     eor     $t1,$t1,$g                      @ Ch(e,f,g)
>> +     eor     $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
>> +     add     $h,$h,$t1                       @ h+=Ch(e,f,g)
>> +#if $i==31
>> +     and     $t2,$t2,#0xff
>> +     cmp     $t2,#0xf2                       @ done?
>> +#endif
>> +#if $i<15
>> +# if __ARM_ARCH__>=7
>> +     ldr     $t1,[$inp],#4                   @ prefetch
>> +# else
>> +     ldrb    $t1,[$inp,#3]
>> +# endif
>> +     eor     $t2,$a,$b                       @ a^b, b^c in next round
>> +#else
>> +     ldr     $t1,[sp,#`($i+2)%16`*4]         @ from future BODY_16_xx
>> +     eor     $t2,$a,$b                       @ a^b, b^c in next round
>> +     ldr     $t4,[sp,#`($i+15)%16`*4]        @ from future BODY_16_xx
>> +#endif
>> +     eor     $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`  @ Sigma0(a)
>> +     and     $t3,$t3,$t2                     @ (b^c)&=(a^b)
>> +     add     $d,$d,$h                        @ d+=h
>> +     eor     $t3,$t3,$b                      @ Maj(a,b,c)
>> +     add     $h,$h,$t0,ror#$Sigma0[0]        @ h+=Sigma0(a)
>> +     @ add   $h,$h,$t3                       @ h+=Maj(a,b,c)
>> +___
>> +     ($t2,$t3)=($t3,$t2);
>> +}
>> +
>> +sub BODY_16_XX {
>> +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
>> +
>> +$code.=<<___;
>> +     @ ldr   $t1,[sp,#`($i+1)%16`*4]         @ $i
>> +     @ ldr   $t4,[sp,#`($i+14)%16`*4]
>> +     mov     $t0,$t1,ror#$sigma0[0]
>> +     add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
>> +     mov     $t2,$t4,ror#$sigma1[0]
>> +     eor     $t0,$t0,$t1,ror#$sigma0[1]
>> +     eor     $t2,$t2,$t4,ror#$sigma1[1]
>> +     eor     $t0,$t0,$t1,lsr#$sigma0[2]      @ sigma0(X[i+1])
>> +     ldr     $t1,[sp,#`($i+0)%16`*4]
>> +     eor     $t2,$t2,$t4,lsr#$sigma1[2]      @ sigma1(X[i+14])
>> +     ldr     $t4,[sp,#`($i+9)%16`*4]
>> +
>> +     add     $t2,$t2,$t0
>> +     eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`   @ from BODY_00_15
>> +     add     $t1,$t1,$t2
>> +     eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
>> +     add     $t1,$t1,$t4                     @ X[i]
>> +___
>> +     &BODY_00_15(@_);
>> +}
>> +
>> +$code=<<___;
>> +#ifndef __KERNEL__
>> +# include "arm_arch.h"
>> +#else
>> +# define __ARM_ARCH__ __LINUX_ARM_ARCH__
>> +# define __ARM_MAX_ARCH__ 7
> I'm not sure this will work for kernel that is for older arm

This code is only used at runtime if a NEON is detected on the current CPU

>> +#endif
>> +
>> +.text
>> +#if __ARM_ARCH__<7
>> +.code        32
>> +#else
>> +.syntax unified
>> +# ifdef __thumb2__
>> +.thumb
>> +# else
>> +.code   32
>> +# endif
>> +#endif
>> +
>> +.type        K256,%object
>> +.align       5
>> +K256:
>> +.word        0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
>> +.word        0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
>> +.word        0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
>> +.word        0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
>> +.word        0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
>> +.word        0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
>> +.word        0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
>> +.word        0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
>> +.word        0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
>> +.word        0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
>> +.word        0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
>> +.word        0xd192e819,0xd6990624,0xf40e3585,0x106aa070
>> +.word        0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
>> +.word        0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
>> +.word        0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
>> +.word        0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
>> +.size        K256,.-K256
>> +.word        0                               @ terminator
>> +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
>> +.LOPENSSL_armcap:
>> +.word        OPENSSL_armcap_P-sha256_block_data_order
>> +#endif
>> +.align       5
>> +
>> +.global      sha256_block_data_order
>> +.type        sha256_block_data_order,%function
>> +sha256_block_data_order:
>> +#if __ARM_ARCH__<7
>> +     sub     r3,pc,#8                @ sha256_block_data_order
>> +#else
>> +     adr     r3,sha256_block_data_order
>> +#endif
>> +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
>> +     ldr     r12,.LOPENSSL_armcap
>> +     ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
>> +     tst     r12,#ARMV8_SHA256
>> +     bne     .LARMv8
>> +     tst     r12,#ARMV7_NEON
>> +     bne     .LNEON
>> +#endif
>> +     add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
>> +     stmdb   sp!,{$ctx,$inp,$len,r4-r11,lr}
>> +     ldmia   $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
>> +     sub     $Ktbl,r3,#256+32        @ K256
>> +     sub     sp,sp,#16*4             @ alloca(X[16])
>> +.Loop:
>> +# if __ARM_ARCH__>=7
>> +     ldr     $t1,[$inp],#4
>> +# else
>> +     ldrb    $t1,[$inp,#3]
>> +# endif
>> +     eor     $t3,$B,$C               @ magic
>> +     eor     $t2,$t2,$t2
>> +___
>> +for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
>> +$code.=".Lrounds_16_xx:\n";
>> +for (;$i<32;$i++)    { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
>> +$code.=<<___;
>> +#if __ARM_ARCH__>=7
>> +     ite     eq                      @ Thumb2 thing, sanity check in ARM
>> +#endif
>> +     ldreq   $t3,[sp,#16*4]          @ pull ctx
>> +     bne     .Lrounds_16_xx
>> +
>> +     add     $A,$A,$t2               @ h+=Maj(a,b,c) from the past
>> +     ldr     $t0,[$t3,#0]
>> +     ldr     $t1,[$t3,#4]
>> +     ldr     $t2,[$t3,#8]
>> +     add     $A,$A,$t0
>> +     ldr     $t0,[$t3,#12]
>> +     add     $B,$B,$t1
>> +     ldr     $t1,[$t3,#16]
>> +     add     $C,$C,$t2
>> +     ldr     $t2,[$t3,#20]
>> +     add     $D,$D,$t0
>> +     ldr     $t0,[$t3,#24]
>> +     add     $E,$E,$t1
>> +     ldr     $t1,[$t3,#28]
>> +     add     $F,$F,$t2
>> +     ldr     $inp,[sp,#17*4]         @ pull inp
>> +     ldr     $t2,[sp,#18*4]          @ pull inp+len
>> +     add     $G,$G,$t0
>> +     add     $H,$H,$t1
>> +     stmia   $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
>> +     cmp     $inp,$t2
>> +     sub     $Ktbl,$Ktbl,#256        @ rewind Ktbl
>> +     bne     .Loop
>> +
>> +     add     sp,sp,#`16+3`*4 @ destroy frame
>> +#if __ARM_ARCH__>=5
>> +     ldmia   sp!,{r4-r11,pc}
>> +#else
>> +     ldmia   sp!,{r4-r11,lr}
>> +     tst     lr,#1
>> +     moveq   pc,lr                   @ be binary compatible with V4, yet
>> +     bx      lr                      @ interoperable with Thumb ISA:-)
>> +#endif
>> +.size        sha256_block_data_order,.-sha256_block_data_order
>> +___
>> +######################################################################
>> +# NEON stuff
>> +#
>> +{{{
>> +my @X=map("q$_",(0..3));
>> +my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
>> +my $Xfer=$t4;
>> +my $j=0;
>> +
>> +sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
>> +sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
>> +
>> +sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
>> +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
>> +  my $arg = pop;
>> +    $arg = "#$arg" if ($arg*1 eq $arg);
>> +    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
>> +}
>> +
>> +sub Xupdate()
>> +{ use integer;
>> +  my $body = shift;
>> +  my @insns = (&$body,&$body,&$body,&$body);
>> +  my ($a,$b,$c,$d,$e,$f,$g,$h);
>> +
>> +     &vext_8         ($T0,@X[0],@X[1],4);    # X[1..4]
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vext_8         ($T1,@X[2],@X[3],4);    # X[9..12]
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vshr_u32       ($T2,$T0,$sigma0[0]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += X[9..12]
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vshr_u32       ($T1,$T0,$sigma0[2]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vsli_32        ($T2,$T0,32-$sigma0[0]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vshr_u32       ($T3,$T0,$sigma0[1]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &veor           ($T1,$T1,$T2);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vsli_32        ($T3,$T0,32-$sigma0[1]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[0]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &veor           ($T1,$T1,$T3);          # sigma0(X[1..4])
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[0]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &vshr_u32     ($T5,&Dhi(@X[3]),$sigma1[2]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += sigma0(X[1..4])
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &veor         ($T5,$T5,$T4);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[1]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[1]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &veor         ($T5,$T5,$T4);          # sigma1(X[14..15])
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vadd_i32       (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[0]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[0]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &vshr_u32     ($T5,&Dlo(@X[0]),$sigma1[2]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &veor         ($T5,$T5,$T4);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[1]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vld1_32        ("{$T0}","[$Ktbl,:128]!");
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[1]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &veor         ($T5,$T5,$T4);          # sigma1(X[16..17])
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vadd_i32       (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vadd_i32       ($T0,$T0,@X[0]);
>> +      while($#insns>=2) { eval(shift(@insns)); }
>> +     &vst1_32        ("{$T0}","[$Xfer,:128]!");
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +
>> +     push(@X,shift(@X));             # "rotate" X[]
>> +}
>> +
>> +sub Xpreload()
>> +{ use integer;
>> +  my $body = shift;
>> +  my @insns = (&$body,&$body,&$body,&$body);
>> +  my ($a,$b,$c,$d,$e,$f,$g,$h);
>> +
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vld1_32        ("{$T0}","[$Ktbl,:128]!");
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vrev32_8       (@X[0],@X[0]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vadd_i32       ($T0,$T0,@X[0]);
>> +      foreach (@insns) { eval; }     # remaining instructions
>> +     &vst1_32        ("{$T0}","[$Xfer,:128]!");
>> +
>> +     push(@X,shift(@X));             # "rotate" X[]
>> +}
>> +
>> +sub body_00_15 () {
>> +     (
>> +     '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
>> +     '&add   ($h,$h,$t1)',                   # h+=X[i]+K[i]
>> +     '&eor   ($t1,$f,$g)',
>> +     '&eor   ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
>> +     '&add   ($a,$a,$t2)',                   # h+=Maj(a,b,c) from the past
>> +     '&and   ($t1,$t1,$e)',
>> +     '&eor   ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',   # Sigma1(e)
>> +     '&eor   ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
>> +     '&eor   ($t1,$t1,$g)',                  # Ch(e,f,g)
>> +     '&add   ($h,$h,$t2,"ror#$Sigma1[0]")',  # h+=Sigma1(e)
>> +     '&eor   ($t2,$a,$b)',                   # a^b, b^c in next round
>> +     '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
>> +     '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
>> +     '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
>> +     '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
>> +     '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
>> +     '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
>> +     '&add   ($d,$d,$h)',                    # d+=h
>> +     '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
>> +     '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
>> +     '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
>> +     )
>> +}
>> +
>> +$code.=<<___;
>> +#if __ARM_MAX_ARCH__>=7
> this will be compile on armv4 but gcc will not allow it
>
> we need to drop the neon code for older non v7 build
>

The .arch and .fpu declarations ensure that it can be built regardless
of the platform you are compiling for, unless you have a really old
toolchain.
The glue code ensures that the module can only be loaded if HWCAP_NEON is set.

Did you get errors trying to build it?

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCHv2] arm: crypto: Add optimized SHA-256/224
@ 2015-03-24 12:42       ` Ard Biesheuvel
  0 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-24 12:42 UTC (permalink / raw)
  To: linux-arm-kernel

On 24 March 2015 at 13:27, Jean-Christophe PLAGNIOL-VILLARD
<plagnioj@jcrosoft.com> wrote:
> On 13:50 Mon 23 Mar     , Sami Tolvanen wrote:
>> Add Andy Polyakov's optimized assembly and NEON implementations for
>> SHA-256/224.
>>
>> The sha256-armv4.pl script for generating the assembly code is from
>> OpenSSL commit 2ecd32a1f8f0643ae7b38f59bbaf9f0d6ef326fe.
>>
>> Compared to sha256-generic these implementations have the following
>> tcrypt speed improvements on Motorola Nexus 6 (Snapdragon 805):
>>
>>   bs    b/u      sha256-neon  sha256-asm
>>   16    16       x1.32        x1.19
>>   64    16       x1.27        x1.15
>>   64    64       x1.36        x1.20
>>   256   16       x1.22        x1.11
>>   256   64       x1.36        x1.19
>>   256   256      x1.59        x1.23
>>   1024  16       x1.21        x1.10
>>   1024  256      x1.65        x1.23
>>   1024  1024     x1.76        x1.25
>>   2048  16       x1.21        x1.10
>>   2048  256      x1.66        x1.23
>>   2048  1024     x1.78        x1.25
>>   2048  2048     x1.79        x1.25
>>   4096  16       x1.20        x1.09
>>   4096  256      x1.66        x1.23
>>   4096  1024     x1.79        x1.26
>>   4096  4096     x1.82        x1.26
>>   8192  16       x1.20        x1.09
>>   8192  256      x1.67        x1.23
>>   8192  1024     x1.80        x1.26
>>   8192  4096     x1.85        x1.28
>>   8192  8192     x1.85        x1.27
>>
>> Where bs refers to block size and b/u to bytes per update.
>>
>> Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
>> Cc: Andy Polyakov <appro@openssl.org>
>>
>> ---
>> Changes since v1:
>>   Rebased to Herbert's cryptodev tree
>>   Include sha256-armv4.pl and use it to generate sha256-core.S
>>   Add integer-only assembly version as sha256-asm
>>   Add support for SHA-224 to the glue code
>>   Change priority for sha256/224-ce to 300
>>
>> ---
>>  arch/arm/crypto/Kconfig               |    7
>>  arch/arm/crypto/Makefile              |    8
>>  arch/arm/crypto/sha2-ce-glue.c        |    4
>>  arch/arm/crypto/sha256-armv4.pl       |  713 ++++++
>>  arch/arm/crypto/sha256-core.S_shipped | 2775 ++++++++++++++++++++++++
>>  arch/arm/crypto/sha256_glue.c         |  246 ++
>>  arch/arm/crypto/sha256_glue.h         |   23
>>  arch/arm/crypto/sha256_neon_glue.c    |  172 +
>>  8 files changed, 3945 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
>> index d63f319..458729d 100644
>> --- a/arch/arm/crypto/Kconfig
>> +++ b/arch/arm/crypto/Kconfig
>> @@ -46,6 +46,13 @@ config CRYPTO_SHA2_ARM_CE
>>         SHA-256 secure hash standard (DFIPS 180-2) implemented
>>         using special ARMv8 Crypto Extensions.
>>
>> +config CRYPTO_SHA256_ARM
>> +     tristate "SHA-224/256 digest algorithm (ARM-asm and NEON)"
>> +     select CRYPTO_HASH
>> +     help
>> +       SHA-256 secure hash standard (DFIPS 180-2) implemented
>> +       using optimized ARM assembler and NEON, when available.
>> +
>>  config CRYPTO_SHA512_ARM_NEON
>>       tristate "SHA384 and SHA512 digest algorithm (ARM NEON)"
>>       depends on KERNEL_MODE_NEON
>> diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
>> index 9a273bd..ef46e89 100644
>> --- a/arch/arm/crypto/Makefile
>> +++ b/arch/arm/crypto/Makefile
>> @@ -7,6 +7,7 @@ obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
>>  obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
>>  obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
>>  obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
>> +obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
>>  obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
>>  obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
>>  obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o
>> @@ -16,6 +17,8 @@ aes-arm-y   := aes-armv4.o aes_glue.o
>>  aes-arm-bs-y := aesbs-core.o aesbs-glue.o
>>  sha1-arm-y   := sha1-armv4-large.o sha1_glue.o
>>  sha1-arm-neon-y      := sha1-armv7-neon.o sha1_neon_glue.o
>> +sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
>> +sha256-arm-y := sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
>>  sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o
>>  sha1-arm-ce-y        := sha1-ce-core.o sha1-ce-glue.o
>>  sha2-arm-ce-y        := sha2-ce-core.o sha2-ce-glue.o
>> @@ -28,4 +31,7 @@ quiet_cmd_perl = PERL    $@
>>  $(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
>>       $(call cmd,perl)
>>
>> -.PRECIOUS: $(obj)/aesbs-core.S
>> +$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
>> +     $(call cmd,perl)
>> +
>> +.PRECIOUS: $(obj)/aesbs-core.S $(obj)/sha256-core.S
>> diff --git a/arch/arm/crypto/sha2-ce-glue.c b/arch/arm/crypto/sha2-ce-glue.c
>> index 9ffe8ad..0449eca 100644
>> --- a/arch/arm/crypto/sha2-ce-glue.c
>> +++ b/arch/arm/crypto/sha2-ce-glue.c
>> @@ -163,7 +163,7 @@ static struct shash_alg algs[] = { {
>>       .base                   = {
>>               .cra_name               = "sha224",
>>               .cra_driver_name        = "sha224-ce",
>> -             .cra_priority           = 200,
>> +             .cra_priority           = 300,
>>               .cra_flags              = CRYPTO_ALG_TYPE_SHASH,
>>               .cra_blocksize          = SHA256_BLOCK_SIZE,
>>               .cra_module             = THIS_MODULE,
>> @@ -180,7 +180,7 @@ static struct shash_alg algs[] = { {
>>       .base                   = {
>>               .cra_name               = "sha256",
>>               .cra_driver_name        = "sha256-ce",
>> -             .cra_priority           = 200,
>> +             .cra_priority           = 300,
>>               .cra_flags              = CRYPTO_ALG_TYPE_SHASH,
>>               .cra_blocksize          = SHA256_BLOCK_SIZE,
>>               .cra_module             = THIS_MODULE,
>> diff --git a/arch/arm/crypto/sha256-armv4.pl b/arch/arm/crypto/sha256-armv4.pl
>> new file mode 100644
>> index 0000000..4fee74d
>> --- /dev/null
>> +++ b/arch/arm/crypto/sha256-armv4.pl
>> @@ -0,0 +1,713 @@
>> +#!/usr/bin/env perl
>> +
>> +# ====================================================================
>> +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
>> +# project. The module is, however, dual licensed under OpenSSL and
>> +# CRYPTOGAMS licenses depending on where you obtain it. For further
>> +# details see http://www.openssl.org/~appro/cryptogams/.
>> +#
>> +# Permission to use under GPL terms is granted.
>> +# ====================================================================
>> +
>> +# SHA256 block procedure for ARMv4. May 2007.
>> +
>> +# Performance is ~2x better than gcc 3.4 generated code and in "abso-
>> +# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
>> +# byte [on single-issue Xscale PXA250 core].
>> +
>> +# July 2010.
>> +#
>> +# Rescheduling for dual-issue pipeline resulted in 22% improvement on
>> +# Cortex A8 core and ~20 cycles per processed byte.
>> +
>> +# February 2011.
>> +#
>> +# Profiler-assisted and platform-specific optimization resulted in 16%
>> +# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
>> +
>> +# September 2013.
>> +#
>> +# Add NEON implementation. On Cortex A8 it was measured to process one
>> +# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
>> +# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
>> +# code (meaning that latter performs sub-optimally, nothing was done
>> +# about it).
>> +
>> +# May 2014.
>> +#
>> +# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
>> +
>> +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
>> +open STDOUT,">$output";
>> +
>> +$ctx="r0";   $t0="r0";
>> +$inp="r1";   $t4="r1";
>> +$len="r2";   $t1="r2";
>> +$T1="r3";    $t3="r3";
>> +$A="r4";
>> +$B="r5";
>> +$C="r6";
>> +$D="r7";
>> +$E="r8";
>> +$F="r9";
>> +$G="r10";
>> +$H="r11";
>> + at V=($A,$B,$C,$D,$E,$F,$G,$H);
>> +$t2="r12";
>> +$Ktbl="r14";
>> +
>> + at Sigma0=( 2,13,22);
>> + at Sigma1=( 6,11,25);
>> + at sigma0=( 7,18, 3);
>> + at sigma1=(17,19,10);
>> +
>> +sub BODY_00_15 {
>> +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
>> +
>> +$code.=<<___ if ($i<16);
>> +#if __ARM_ARCH__>=7
>> +     @ ldr   $t1,[$inp],#4                   @ $i
>> +# if $i==15
>> +     str     $inp,[sp,#17*4]                 @ make room for $t4
>> +# endif
>> +     eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
>> +     add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
>> +     eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
>> +     rev     $t1,$t1
>> +#else
>> +     @ ldrb  $t1,[$inp,#3]                   @ $i
>> +     add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
>> +     ldrb    $t2,[$inp,#2]
>> +     ldrb    $t0,[$inp,#1]
>> +     orr     $t1,$t1,$t2,lsl#8
>> +     ldrb    $t2,[$inp],#4
>> +     orr     $t1,$t1,$t0,lsl#16
>> +# if $i==15
>> +     str     $inp,[sp,#17*4]                 @ make room for $t4
>> +# endif
>> +     eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
>> +     orr     $t1,$t1,$t2,lsl#24
>> +     eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
>> +#endif
>> +___
>> +$code.=<<___;
>> +     ldr     $t2,[$Ktbl],#4                  @ *K256++
>> +     add     $h,$h,$t1                       @ h+=X[i]
>> +     str     $t1,[sp,#`$i%16`*4]
>> +     eor     $t1,$f,$g
>> +     add     $h,$h,$t0,ror#$Sigma1[0]        @ h+=Sigma1(e)
>> +     and     $t1,$t1,$e
>> +     add     $h,$h,$t2                       @ h+=K256[i]
>> +     eor     $t1,$t1,$g                      @ Ch(e,f,g)
>> +     eor     $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
>> +     add     $h,$h,$t1                       @ h+=Ch(e,f,g)
>> +#if $i==31
>> +     and     $t2,$t2,#0xff
>> +     cmp     $t2,#0xf2                       @ done?
>> +#endif
>> +#if $i<15
>> +# if __ARM_ARCH__>=7
>> +     ldr     $t1,[$inp],#4                   @ prefetch
>> +# else
>> +     ldrb    $t1,[$inp,#3]
>> +# endif
>> +     eor     $t2,$a,$b                       @ a^b, b^c in next round
>> +#else
>> +     ldr     $t1,[sp,#`($i+2)%16`*4]         @ from future BODY_16_xx
>> +     eor     $t2,$a,$b                       @ a^b, b^c in next round
>> +     ldr     $t4,[sp,#`($i+15)%16`*4]        @ from future BODY_16_xx
>> +#endif
>> +     eor     $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`  @ Sigma0(a)
>> +     and     $t3,$t3,$t2                     @ (b^c)&=(a^b)
>> +     add     $d,$d,$h                        @ d+=h
>> +     eor     $t3,$t3,$b                      @ Maj(a,b,c)
>> +     add     $h,$h,$t0,ror#$Sigma0[0]        @ h+=Sigma0(a)
>> +     @ add   $h,$h,$t3                       @ h+=Maj(a,b,c)
>> +___
>> +     ($t2,$t3)=($t3,$t2);
>> +}
>> +
>> +sub BODY_16_XX {
>> +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
>> +
>> +$code.=<<___;
>> +     @ ldr   $t1,[sp,#`($i+1)%16`*4]         @ $i
>> +     @ ldr   $t4,[sp,#`($i+14)%16`*4]
>> +     mov     $t0,$t1,ror#$sigma0[0]
>> +     add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
>> +     mov     $t2,$t4,ror#$sigma1[0]
>> +     eor     $t0,$t0,$t1,ror#$sigma0[1]
>> +     eor     $t2,$t2,$t4,ror#$sigma1[1]
>> +     eor     $t0,$t0,$t1,lsr#$sigma0[2]      @ sigma0(X[i+1])
>> +     ldr     $t1,[sp,#`($i+0)%16`*4]
>> +     eor     $t2,$t2,$t4,lsr#$sigma1[2]      @ sigma1(X[i+14])
>> +     ldr     $t4,[sp,#`($i+9)%16`*4]
>> +
>> +     add     $t2,$t2,$t0
>> +     eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`   @ from BODY_00_15
>> +     add     $t1,$t1,$t2
>> +     eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
>> +     add     $t1,$t1,$t4                     @ X[i]
>> +___
>> +     &BODY_00_15(@_);
>> +}
>> +
>> +$code=<<___;
>> +#ifndef __KERNEL__
>> +# include "arm_arch.h"
>> +#else
>> +# define __ARM_ARCH__ __LINUX_ARM_ARCH__
>> +# define __ARM_MAX_ARCH__ 7
> I'm not sure this will work for kernel that is for older arm

This code is only used at runtime if a NEON is detected on the current CPU

>> +#endif
>> +
>> +.text
>> +#if __ARM_ARCH__<7
>> +.code        32
>> +#else
>> +.syntax unified
>> +# ifdef __thumb2__
>> +.thumb
>> +# else
>> +.code   32
>> +# endif
>> +#endif
>> +
>> +.type        K256,%object
>> +.align       5
>> +K256:
>> +.word        0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
>> +.word        0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
>> +.word        0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
>> +.word        0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
>> +.word        0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
>> +.word        0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
>> +.word        0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
>> +.word        0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
>> +.word        0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
>> +.word        0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
>> +.word        0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
>> +.word        0xd192e819,0xd6990624,0xf40e3585,0x106aa070
>> +.word        0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
>> +.word        0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
>> +.word        0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
>> +.word        0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
>> +.size        K256,.-K256
>> +.word        0                               @ terminator
>> +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
>> +.LOPENSSL_armcap:
>> +.word        OPENSSL_armcap_P-sha256_block_data_order
>> +#endif
>> +.align       5
>> +
>> +.global      sha256_block_data_order
>> +.type        sha256_block_data_order,%function
>> +sha256_block_data_order:
>> +#if __ARM_ARCH__<7
>> +     sub     r3,pc,#8                @ sha256_block_data_order
>> +#else
>> +     adr     r3,sha256_block_data_order
>> +#endif
>> +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
>> +     ldr     r12,.LOPENSSL_armcap
>> +     ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
>> +     tst     r12,#ARMV8_SHA256
>> +     bne     .LARMv8
>> +     tst     r12,#ARMV7_NEON
>> +     bne     .LNEON
>> +#endif
>> +     add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
>> +     stmdb   sp!,{$ctx,$inp,$len,r4-r11,lr}
>> +     ldmia   $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
>> +     sub     $Ktbl,r3,#256+32        @ K256
>> +     sub     sp,sp,#16*4             @ alloca(X[16])
>> +.Loop:
>> +# if __ARM_ARCH__>=7
>> +     ldr     $t1,[$inp],#4
>> +# else
>> +     ldrb    $t1,[$inp,#3]
>> +# endif
>> +     eor     $t3,$B,$C               @ magic
>> +     eor     $t2,$t2,$t2
>> +___
>> +for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
>> +$code.=".Lrounds_16_xx:\n";
>> +for (;$i<32;$i++)    { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
>> +$code.=<<___;
>> +#if __ARM_ARCH__>=7
>> +     ite     eq                      @ Thumb2 thing, sanity check in ARM
>> +#endif
>> +     ldreq   $t3,[sp,#16*4]          @ pull ctx
>> +     bne     .Lrounds_16_xx
>> +
>> +     add     $A,$A,$t2               @ h+=Maj(a,b,c) from the past
>> +     ldr     $t0,[$t3,#0]
>> +     ldr     $t1,[$t3,#4]
>> +     ldr     $t2,[$t3,#8]
>> +     add     $A,$A,$t0
>> +     ldr     $t0,[$t3,#12]
>> +     add     $B,$B,$t1
>> +     ldr     $t1,[$t3,#16]
>> +     add     $C,$C,$t2
>> +     ldr     $t2,[$t3,#20]
>> +     add     $D,$D,$t0
>> +     ldr     $t0,[$t3,#24]
>> +     add     $E,$E,$t1
>> +     ldr     $t1,[$t3,#28]
>> +     add     $F,$F,$t2
>> +     ldr     $inp,[sp,#17*4]         @ pull inp
>> +     ldr     $t2,[sp,#18*4]          @ pull inp+len
>> +     add     $G,$G,$t0
>> +     add     $H,$H,$t1
>> +     stmia   $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
>> +     cmp     $inp,$t2
>> +     sub     $Ktbl,$Ktbl,#256        @ rewind Ktbl
>> +     bne     .Loop
>> +
>> +     add     sp,sp,#`16+3`*4 @ destroy frame
>> +#if __ARM_ARCH__>=5
>> +     ldmia   sp!,{r4-r11,pc}
>> +#else
>> +     ldmia   sp!,{r4-r11,lr}
>> +     tst     lr,#1
>> +     moveq   pc,lr                   @ be binary compatible with V4, yet
>> +     bx      lr                      @ interoperable with Thumb ISA:-)
>> +#endif
>> +.size        sha256_block_data_order,.-sha256_block_data_order
>> +___
>> +######################################################################
>> +# NEON stuff
>> +#
>> +{{{
>> +my @X=map("q$_",(0..3));
>> +my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
>> +my $Xfer=$t4;
>> +my $j=0;
>> +
>> +sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
>> +sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
>> +
>> +sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
>> +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
>> +  my $arg = pop;
>> +    $arg = "#$arg" if ($arg*1 eq $arg);
>> +    $code .= "\t$opcode\t".join(',', at _,$arg)."\n";
>> +}
>> +
>> +sub Xupdate()
>> +{ use integer;
>> +  my $body = shift;
>> +  my @insns = (&$body,&$body,&$body,&$body);
>> +  my ($a,$b,$c,$d,$e,$f,$g,$h);
>> +
>> +     &vext_8         ($T0, at X[0], at X[1],4);    # X[1..4]
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vext_8         ($T1, at X[2], at X[3],4);    # X[9..12]
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vshr_u32       ($T2,$T0,$sigma0[0]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vadd_i32       (@X[0], at X[0],$T1);      # X[0..3] += X[9..12]
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vshr_u32       ($T1,$T0,$sigma0[2]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vsli_32        ($T2,$T0,32-$sigma0[0]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vshr_u32       ($T3,$T0,$sigma0[1]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &veor           ($T1,$T1,$T2);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vsli_32        ($T3,$T0,32-$sigma0[1]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[0]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &veor           ($T1,$T1,$T3);          # sigma0(X[1..4])
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[0]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &vshr_u32     ($T5,&Dhi(@X[3]),$sigma1[2]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vadd_i32       (@X[0], at X[0],$T1);      # X[0..3] += sigma0(X[1..4])
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &veor         ($T5,$T5,$T4);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[1]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[1]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &veor         ($T5,$T5,$T4);          # sigma1(X[14..15])
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vadd_i32       (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[0]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[0]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &vshr_u32     ($T5,&Dlo(@X[0]),$sigma1[2]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &veor         ($T5,$T5,$T4);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[1]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vld1_32        ("{$T0}","[$Ktbl,:128]!");
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[1]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +       &veor         ($T5,$T5,$T4);          # sigma1(X[16..17])
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vadd_i32       (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vadd_i32       ($T0,$T0, at X[0]);
>> +      while($#insns>=2) { eval(shift(@insns)); }
>> +     &vst1_32        ("{$T0}","[$Xfer,:128]!");
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +
>> +     push(@X,shift(@X));             # "rotate" X[]
>> +}
>> +
>> +sub Xpreload()
>> +{ use integer;
>> +  my $body = shift;
>> +  my @insns = (&$body,&$body,&$body,&$body);
>> +  my ($a,$b,$c,$d,$e,$f,$g,$h);
>> +
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vld1_32        ("{$T0}","[$Ktbl,:128]!");
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vrev32_8       (@X[0], at X[0]);
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +      eval(shift(@insns));
>> +     &vadd_i32       ($T0,$T0, at X[0]);
>> +      foreach (@insns) { eval; }     # remaining instructions
>> +     &vst1_32        ("{$T0}","[$Xfer,:128]!");
>> +
>> +     push(@X,shift(@X));             # "rotate" X[]
>> +}
>> +
>> +sub body_00_15 () {
>> +     (
>> +     '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
>> +     '&add   ($h,$h,$t1)',                   # h+=X[i]+K[i]
>> +     '&eor   ($t1,$f,$g)',
>> +     '&eor   ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
>> +     '&add   ($a,$a,$t2)',                   # h+=Maj(a,b,c) from the past
>> +     '&and   ($t1,$t1,$e)',
>> +     '&eor   ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',   # Sigma1(e)
>> +     '&eor   ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
>> +     '&eor   ($t1,$t1,$g)',                  # Ch(e,f,g)
>> +     '&add   ($h,$h,$t2,"ror#$Sigma1[0]")',  # h+=Sigma1(e)
>> +     '&eor   ($t2,$a,$b)',                   # a^b, b^c in next round
>> +     '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
>> +     '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
>> +     '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
>> +     '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
>> +     '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
>> +     '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
>> +     '&add   ($d,$d,$h)',                    # d+=h
>> +     '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
>> +     '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
>> +     '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
>> +     )
>> +}
>> +
>> +$code.=<<___;
>> +#if __ARM_MAX_ARCH__>=7
> this will be compile on armv4 but gcc will not allow it
>
> we need to drop the neon code for older non v7 build
>

The .arch and .fpu declarations ensure that it can be built regardless
of the platform you are compiling for, unless you have a really old
toolchain.
The glue code ensures that the module can only be loaded if HWCAP_NEON is set.

Did you get errors trying to build it?

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224
  2015-03-24 12:42       ` Ard Biesheuvel
@ 2015-03-24 13:05         ` Jean-Christophe PLAGNIOL-VILLARD
  -1 siblings, 0 replies; 66+ messages in thread
From: Jean-Christophe PLAGNIOL-VILLARD @ 2015-03-24 13:05 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Herbert Xu, linux-crypto, Sami Tolvanen, Andy Polyakov,
	David S. Miller, linux-arm-kernel

 >> +     '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
> >> +     '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
> >> +     '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
> >> +     '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
> >> +     '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
> >> +     '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
> >> +     '&add   ($d,$d,$h)',                    # d+=h
> >> +     '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
> >> +     '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
> >> +     '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
> >> +     )
> >> +}
> >> +
> >> +$code.=<<___;
> >> +#if __ARM_MAX_ARCH__>=7
> > this will be compile on armv4 but gcc will not allow it
> >
> > we need to drop the neon code for older non v7 build
> >
> 
> The .arch and .fpu declarations ensure that it can be built regardless
> of the platform you are compiling for, unless you have a really old
> toolchain.
I known but does not work for me
> The glue code ensures that the module can only be loaded if HWCAP_NEON is set.
> 
> Did you get errors trying to build it?

yes I do

I use 

arm-none-linux-gnueabi-gcc (Sourcery CodeBench Lite 2014.05-29) 4.8.3 20140320
(prerelease)
Copyright (C) 2013 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

so it's not that old

Best Regards,
J.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCHv2] arm: crypto: Add optimized SHA-256/224
@ 2015-03-24 13:05         ` Jean-Christophe PLAGNIOL-VILLARD
  0 siblings, 0 replies; 66+ messages in thread
From: Jean-Christophe PLAGNIOL-VILLARD @ 2015-03-24 13:05 UTC (permalink / raw)
  To: linux-arm-kernel

 >> +     '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
> >> +     '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
> >> +     '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
> >> +     '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
> >> +     '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
> >> +     '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
> >> +     '&add   ($d,$d,$h)',                    # d+=h
> >> +     '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
> >> +     '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
> >> +     '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
> >> +     )
> >> +}
> >> +
> >> +$code.=<<___;
> >> +#if __ARM_MAX_ARCH__>=7
> > this will be compile on armv4 but gcc will not allow it
> >
> > we need to drop the neon code for older non v7 build
> >
> 
> The .arch and .fpu declarations ensure that it can be built regardless
> of the platform you are compiling for, unless you have a really old
> toolchain.
I known but does not work for me
> The glue code ensures that the module can only be loaded if HWCAP_NEON is set.
> 
> Did you get errors trying to build it?

yes I do

I use 

arm-none-linux-gnueabi-gcc (Sourcery CodeBench Lite 2014.05-29) 4.8.3 20140320
(prerelease)
Copyright (C) 2013 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

so it's not that old

Best Regards,
J.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224
  2015-03-24 13:05         ` Jean-Christophe PLAGNIOL-VILLARD
@ 2015-03-24 13:06           ` Ard Biesheuvel
  -1 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-24 13:06 UTC (permalink / raw)
  To: Jean-Christophe PLAGNIOL-VILLARD
  Cc: Sami Tolvanen, linux-arm-kernel, linux-crypto, Herbert Xu,
	David S. Miller, Andy Polyakov

On 24 March 2015 at 14:05, Jean-Christophe PLAGNIOL-VILLARD
<plagnioj@jcrosoft.com> wrote:
>  >> +     '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
>> >> +     '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
>> >> +     '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
>> >> +     '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
>> >> +     '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
>> >> +     '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
>> >> +     '&add   ($d,$d,$h)',                    # d+=h
>> >> +     '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
>> >> +     '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
>> >> +     '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
>> >> +     )
>> >> +}
>> >> +
>> >> +$code.=<<___;
>> >> +#if __ARM_MAX_ARCH__>=7
>> > this will be compile on armv4 but gcc will not allow it
>> >
>> > we need to drop the neon code for older non v7 build
>> >
>>
>> The .arch and .fpu declarations ensure that it can be built regardless
>> of the platform you are compiling for, unless you have a really old
>> toolchain.
> I known but does not work for me
>> The glue code ensures that the module can only be loaded if HWCAP_NEON is set.
>>
>> Did you get errors trying to build it?
>
> yes I do
>
> I use
>
> arm-none-linux-gnueabi-gcc (Sourcery CodeBench Lite 2014.05-29) 4.8.3 20140320
> (prerelease)
> Copyright (C) 2013 Free Software Foundation, Inc.
> This is free software; see the source for copying conditions.  There is NO
> warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
>
> so it's not that old
>

Could you share the error log please?

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCHv2] arm: crypto: Add optimized SHA-256/224
@ 2015-03-24 13:06           ` Ard Biesheuvel
  0 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-24 13:06 UTC (permalink / raw)
  To: linux-arm-kernel

On 24 March 2015 at 14:05, Jean-Christophe PLAGNIOL-VILLARD
<plagnioj@jcrosoft.com> wrote:
>  >> +     '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
>> >> +     '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
>> >> +     '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
>> >> +     '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
>> >> +     '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
>> >> +     '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
>> >> +     '&add   ($d,$d,$h)',                    # d+=h
>> >> +     '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
>> >> +     '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
>> >> +     '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
>> >> +     )
>> >> +}
>> >> +
>> >> +$code.=<<___;
>> >> +#if __ARM_MAX_ARCH__>=7
>> > this will be compile on armv4 but gcc will not allow it
>> >
>> > we need to drop the neon code for older non v7 build
>> >
>>
>> The .arch and .fpu declarations ensure that it can be built regardless
>> of the platform you are compiling for, unless you have a really old
>> toolchain.
> I known but does not work for me
>> The glue code ensures that the module can only be loaded if HWCAP_NEON is set.
>>
>> Did you get errors trying to build it?
>
> yes I do
>
> I use
>
> arm-none-linux-gnueabi-gcc (Sourcery CodeBench Lite 2014.05-29) 4.8.3 20140320
> (prerelease)
> Copyright (C) 2013 Free Software Foundation, Inc.
> This is free software; see the source for copying conditions.  There is NO
> warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
>
> so it's not that old
>

Could you share the error log please?

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224
  2015-03-24 13:06           ` Ard Biesheuvel
@ 2015-03-24 14:46             ` Ard Biesheuvel
  -1 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-24 14:46 UTC (permalink / raw)
  To: Jean-Christophe PLAGNIOL-VILLARD, Andy Polyakov
  Cc: Sami Tolvanen, linux-arm-kernel, linux-crypto, Herbert Xu,
	David S. Miller

On 24 March 2015 at 14:06, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> On 24 March 2015 at 14:05, Jean-Christophe PLAGNIOL-VILLARD
> <plagnioj@jcrosoft.com> wrote:
>>  >> +     '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
>>> >> +     '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
>>> >> +     '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
>>> >> +     '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
>>> >> +     '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
>>> >> +     '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
>>> >> +     '&add   ($d,$d,$h)',                    # d+=h
>>> >> +     '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
>>> >> +     '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
>>> >> +     '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
>>> >> +     )
>>> >> +}
>>> >> +
>>> >> +$code.=<<___;
>>> >> +#if __ARM_MAX_ARCH__>=7
>>> > this will be compile on armv4 but gcc will not allow it
>>> >
>>> > we need to drop the neon code for older non v7 build
>>> >
>>>
>>> The .arch and .fpu declarations ensure that it can be built regardless
>>> of the platform you are compiling for, unless you have a really old
>>> toolchain.
>> I known but does not work for me
>>> The glue code ensures that the module can only be loaded if HWCAP_NEON is set.
>>>
>>> Did you get errors trying to build it?
>>
>> yes I do
>>
>> I use
>>
>> arm-none-linux-gnueabi-gcc (Sourcery CodeBench Lite 2014.05-29) 4.8.3 20140320
>> (prerelease)
>> Copyright (C) 2013 Free Software Foundation, Inc.
>> This is free software; see the source for copying conditions.  There is NO
>> warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
>>
>> so it's not that old
>>
>
> Could you share the error log please?

OK, I spotted one issue with this code:

arch/arm/crypto/sha256-core.S: Assembler messages:
arch/arm/crypto/sha256-core.S:1847: Error: invalid constant (ffffefb0)
after fixup

This is caused by the fact that, when building the integer-only code
for an older architecture, the conditional compilation produces a
slightly bigger preceding function, and the symbol K256 is out of
range for the adr instruction.

@Jean-Christophe: is that the same problem that you hit?

@Andy: I propose we do something similar as in the bsaes code:

#ifdef __thumb__
#define adrl adr
#endif

and replace the offending line with

adrl r14,K256

@Herbert: we will need to respin this, so please don't pull it yet.

Regards,
-- 
Ard.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCHv2] arm: crypto: Add optimized SHA-256/224
@ 2015-03-24 14:46             ` Ard Biesheuvel
  0 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-24 14:46 UTC (permalink / raw)
  To: linux-arm-kernel

On 24 March 2015 at 14:06, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> On 24 March 2015 at 14:05, Jean-Christophe PLAGNIOL-VILLARD
> <plagnioj@jcrosoft.com> wrote:
>>  >> +     '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
>>> >> +     '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
>>> >> +     '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
>>> >> +     '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
>>> >> +     '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
>>> >> +     '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
>>> >> +     '&add   ($d,$d,$h)',                    # d+=h
>>> >> +     '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
>>> >> +     '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
>>> >> +     '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
>>> >> +     )
>>> >> +}
>>> >> +
>>> >> +$code.=<<___;
>>> >> +#if __ARM_MAX_ARCH__>=7
>>> > this will be compile on armv4 but gcc will not allow it
>>> >
>>> > we need to drop the neon code for older non v7 build
>>> >
>>>
>>> The .arch and .fpu declarations ensure that it can be built regardless
>>> of the platform you are compiling for, unless you have a really old
>>> toolchain.
>> I known but does not work for me
>>> The glue code ensures that the module can only be loaded if HWCAP_NEON is set.
>>>
>>> Did you get errors trying to build it?
>>
>> yes I do
>>
>> I use
>>
>> arm-none-linux-gnueabi-gcc (Sourcery CodeBench Lite 2014.05-29) 4.8.3 20140320
>> (prerelease)
>> Copyright (C) 2013 Free Software Foundation, Inc.
>> This is free software; see the source for copying conditions.  There is NO
>> warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
>>
>> so it's not that old
>>
>
> Could you share the error log please?

OK, I spotted one issue with this code:

arch/arm/crypto/sha256-core.S: Assembler messages:
arch/arm/crypto/sha256-core.S:1847: Error: invalid constant (ffffefb0)
after fixup

This is caused by the fact that, when building the integer-only code
for an older architecture, the conditional compilation produces a
slightly bigger preceding function, and the symbol K256 is out of
range for the adr instruction.

@Jean-Christophe: is that the same problem that you hit?

@Andy: I propose we do something similar as in the bsaes code:

#ifdef __thumb__
#define adrl adr
#endif

and replace the offending line with

adrl r14,K256

@Herbert: we will need to respin this, so please don't pull it yet.

Regards,
-- 
Ard.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224
  2015-03-24 14:46             ` Ard Biesheuvel
@ 2015-03-24 17:05               ` Jean-Christophe PLAGNIOL-VILLARD
  -1 siblings, 0 replies; 66+ messages in thread
From: Jean-Christophe PLAGNIOL-VILLARD @ 2015-03-24 17:05 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Herbert Xu, linux-crypto, Sami Tolvanen, Andy Polyakov,
	David S. Miller, linux-arm-kernel

On 15:46 Tue 24 Mar     , Ard Biesheuvel wrote:
> On 24 March 2015 at 14:06, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> > On 24 March 2015 at 14:05, Jean-Christophe PLAGNIOL-VILLARD
> > <plagnioj@jcrosoft.com> wrote:
> >>  >> +     '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
> >>> >> +     '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
> >>> >> +     '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
> >>> >> +     '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
> >>> >> +     '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
> >>> >> +     '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
> >>> >> +     '&add   ($d,$d,$h)',                    # d+=h
> >>> >> +     '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
> >>> >> +     '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
> >>> >> +     '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
> >>> >> +     )
> >>> >> +}
> >>> >> +
> >>> >> +$code.=<<___;
> >>> >> +#if __ARM_MAX_ARCH__>=7
> >>> > this will be compile on armv4 but gcc will not allow it
> >>> >
> >>> > we need to drop the neon code for older non v7 build
> >>> >
> >>>
> >>> The .arch and .fpu declarations ensure that it can be built regardless
> >>> of the platform you are compiling for, unless you have a really old
> >>> toolchain.
> >> I known but does not work for me
> >>> The glue code ensures that the module can only be loaded if HWCAP_NEON is set.
> >>>
> >>> Did you get errors trying to build it?
> >>
> >> yes I do
> >>
> >> I use
> >>
> >> arm-none-linux-gnueabi-gcc (Sourcery CodeBench Lite 2014.05-29) 4.8.3 20140320
> >> (prerelease)
> >> Copyright (C) 2013 Free Software Foundation, Inc.
> >> This is free software; see the source for copying conditions.  There is NO
> >> warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
> >>
> >> so it's not that old
> >>
> >
> > Could you share the error log please?
> 
> OK, I spotted one issue with this code:
> 
> arch/arm/crypto/sha256-core.S: Assembler messages:
> arch/arm/crypto/sha256-core.S:1847: Error: invalid constant (ffffefb0)
> after fixup

yes exactly
> 
> This is caused by the fact that, when building the integer-only code
> for an older architecture, the conditional compilation produces a
> slightly bigger preceding function, and the symbol K256 is out of
> range for the adr instruction.

Yeap I see that too when debuging
> 
> @Jean-Christophe: is that the same problem that you hit?
> 
> @Andy: I propose we do something similar as in the bsaes code:
> 
> #ifdef __thumb__
> #define adrl adr
> #endif
> 
> and replace the offending line with
> 
> adrl r14,K256

Acked-by: Jean-Christophe PLAGNIOL-VILLARD <plagnioj@jcrosoft.com>
Tested-by: Jean-Christophe PLAGNIOL-VILLARD <plagnioj@jcrosoft.com>

on arm926ejs

Best Regards,
J.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCHv2] arm: crypto: Add optimized SHA-256/224
@ 2015-03-24 17:05               ` Jean-Christophe PLAGNIOL-VILLARD
  0 siblings, 0 replies; 66+ messages in thread
From: Jean-Christophe PLAGNIOL-VILLARD @ 2015-03-24 17:05 UTC (permalink / raw)
  To: linux-arm-kernel

On 15:46 Tue 24 Mar     , Ard Biesheuvel wrote:
> On 24 March 2015 at 14:06, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> > On 24 March 2015 at 14:05, Jean-Christophe PLAGNIOL-VILLARD
> > <plagnioj@jcrosoft.com> wrote:
> >>  >> +     '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
> >>> >> +     '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
> >>> >> +     '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
> >>> >> +     '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
> >>> >> +     '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
> >>> >> +     '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
> >>> >> +     '&add   ($d,$d,$h)',                    # d+=h
> >>> >> +     '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
> >>> >> +     '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
> >>> >> +     '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
> >>> >> +     )
> >>> >> +}
> >>> >> +
> >>> >> +$code.=<<___;
> >>> >> +#if __ARM_MAX_ARCH__>=7
> >>> > this will be compile on armv4 but gcc will not allow it
> >>> >
> >>> > we need to drop the neon code for older non v7 build
> >>> >
> >>>
> >>> The .arch and .fpu declarations ensure that it can be built regardless
> >>> of the platform you are compiling for, unless you have a really old
> >>> toolchain.
> >> I known but does not work for me
> >>> The glue code ensures that the module can only be loaded if HWCAP_NEON is set.
> >>>
> >>> Did you get errors trying to build it?
> >>
> >> yes I do
> >>
> >> I use
> >>
> >> arm-none-linux-gnueabi-gcc (Sourcery CodeBench Lite 2014.05-29) 4.8.3 20140320
> >> (prerelease)
> >> Copyright (C) 2013 Free Software Foundation, Inc.
> >> This is free software; see the source for copying conditions.  There is NO
> >> warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
> >>
> >> so it's not that old
> >>
> >
> > Could you share the error log please?
> 
> OK, I spotted one issue with this code:
> 
> arch/arm/crypto/sha256-core.S: Assembler messages:
> arch/arm/crypto/sha256-core.S:1847: Error: invalid constant (ffffefb0)
> after fixup

yes exactly
> 
> This is caused by the fact that, when building the integer-only code
> for an older architecture, the conditional compilation produces a
> slightly bigger preceding function, and the symbol K256 is out of
> range for the adr instruction.

Yeap I see that too when debuging
> 
> @Jean-Christophe: is that the same problem that you hit?
> 
> @Andy: I propose we do something similar as in the bsaes code:
> 
> #ifdef __thumb__
> #define adrl adr
> #endif
> 
> and replace the offending line with
> 
> adrl r14,K256

Acked-by: Jean-Christophe PLAGNIOL-VILLARD <plagnioj@jcrosoft.com>
Tested-by: Jean-Christophe PLAGNIOL-VILLARD <plagnioj@jcrosoft.com>

on arm926ejs

Best Regards,
J.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224
  2015-03-24 17:05               ` Jean-Christophe PLAGNIOL-VILLARD
@ 2015-03-24 17:40                 ` Ard Biesheuvel
  -1 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-24 17:40 UTC (permalink / raw)
  To: Jean-Christophe PLAGNIOL-VILLARD
  Cc: Andy Polyakov, Sami Tolvanen, linux-arm-kernel, linux-crypto,
	Herbert Xu, David S. Miller

On 24 March 2015 at 18:05, Jean-Christophe PLAGNIOL-VILLARD
<plagnioj@jcrosoft.com> wrote:
> On 15:46 Tue 24 Mar     , Ard Biesheuvel wrote:
>> On 24 March 2015 at 14:06, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
>> > On 24 March 2015 at 14:05, Jean-Christophe PLAGNIOL-VILLARD
>> > <plagnioj@jcrosoft.com> wrote:
>> >>  >> +     '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
>> >>> >> +     '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
>> >>> >> +     '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
>> >>> >> +     '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
>> >>> >> +     '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
>> >>> >> +     '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
>> >>> >> +     '&add   ($d,$d,$h)',                    # d+=h
>> >>> >> +     '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
>> >>> >> +     '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
>> >>> >> +     '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
>> >>> >> +     )
>> >>> >> +}
>> >>> >> +
>> >>> >> +$code.=<<___;
>> >>> >> +#if __ARM_MAX_ARCH__>=7
>> >>> > this will be compile on armv4 but gcc will not allow it
>> >>> >
>> >>> > we need to drop the neon code for older non v7 build
>> >>> >
>> >>>
>> >>> The .arch and .fpu declarations ensure that it can be built regardless
>> >>> of the platform you are compiling for, unless you have a really old
>> >>> toolchain.
>> >> I known but does not work for me
>> >>> The glue code ensures that the module can only be loaded if HWCAP_NEON is set.
>> >>>
>> >>> Did you get errors trying to build it?
>> >>
>> >> yes I do
>> >>
>> >> I use
>> >>
>> >> arm-none-linux-gnueabi-gcc (Sourcery CodeBench Lite 2014.05-29) 4.8.3 20140320
>> >> (prerelease)
>> >> Copyright (C) 2013 Free Software Foundation, Inc.
>> >> This is free software; see the source for copying conditions.  There is NO
>> >> warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
>> >>
>> >> so it's not that old
>> >>
>> >
>> > Could you share the error log please?
>>
>> OK, I spotted one issue with this code:
>>
>> arch/arm/crypto/sha256-core.S: Assembler messages:
>> arch/arm/crypto/sha256-core.S:1847: Error: invalid constant (ffffefb0)
>> after fixup
>
> yes exactly
>>
>> This is caused by the fact that, when building the integer-only code
>> for an older architecture, the conditional compilation produces a
>> slightly bigger preceding function, and the symbol K256 is out of
>> range for the adr instruction.
>
> Yeap I see that too when debuging
>>
>> @Jean-Christophe: is that the same problem that you hit?
>>
>> @Andy: I propose we do something similar as in the bsaes code:
>>
>> #ifdef __thumb__
>> #define adrl adr
>> #endif
>>
>> and replace the offending line with
>>
>> adrl r14,K256
>
> Acked-by: Jean-Christophe PLAGNIOL-VILLARD <plagnioj@jcrosoft.com>
> Tested-by: Jean-Christophe PLAGNIOL-VILLARD <plagnioj@jcrosoft.com>
>

Thanks!

@Sami, Andy: we need to respin the whole patch, including updated
OpenSSL upstream commit id :-(

Regards,
Ard.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCHv2] arm: crypto: Add optimized SHA-256/224
@ 2015-03-24 17:40                 ` Ard Biesheuvel
  0 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-24 17:40 UTC (permalink / raw)
  To: linux-arm-kernel

On 24 March 2015 at 18:05, Jean-Christophe PLAGNIOL-VILLARD
<plagnioj@jcrosoft.com> wrote:
> On 15:46 Tue 24 Mar     , Ard Biesheuvel wrote:
>> On 24 March 2015 at 14:06, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
>> > On 24 March 2015 at 14:05, Jean-Christophe PLAGNIOL-VILLARD
>> > <plagnioj@jcrosoft.com> wrote:
>> >>  >> +     '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
>> >>> >> +     '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
>> >>> >> +     '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
>> >>> >> +     '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
>> >>> >> +     '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
>> >>> >> +     '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
>> >>> >> +     '&add   ($d,$d,$h)',                    # d+=h
>> >>> >> +     '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
>> >>> >> +     '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
>> >>> >> +     '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
>> >>> >> +     )
>> >>> >> +}
>> >>> >> +
>> >>> >> +$code.=<<___;
>> >>> >> +#if __ARM_MAX_ARCH__>=7
>> >>> > this will be compile on armv4 but gcc will not allow it
>> >>> >
>> >>> > we need to drop the neon code for older non v7 build
>> >>> >
>> >>>
>> >>> The .arch and .fpu declarations ensure that it can be built regardless
>> >>> of the platform you are compiling for, unless you have a really old
>> >>> toolchain.
>> >> I known but does not work for me
>> >>> The glue code ensures that the module can only be loaded if HWCAP_NEON is set.
>> >>>
>> >>> Did you get errors trying to build it?
>> >>
>> >> yes I do
>> >>
>> >> I use
>> >>
>> >> arm-none-linux-gnueabi-gcc (Sourcery CodeBench Lite 2014.05-29) 4.8.3 20140320
>> >> (prerelease)
>> >> Copyright (C) 2013 Free Software Foundation, Inc.
>> >> This is free software; see the source for copying conditions.  There is NO
>> >> warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
>> >>
>> >> so it's not that old
>> >>
>> >
>> > Could you share the error log please?
>>
>> OK, I spotted one issue with this code:
>>
>> arch/arm/crypto/sha256-core.S: Assembler messages:
>> arch/arm/crypto/sha256-core.S:1847: Error: invalid constant (ffffefb0)
>> after fixup
>
> yes exactly
>>
>> This is caused by the fact that, when building the integer-only code
>> for an older architecture, the conditional compilation produces a
>> slightly bigger preceding function, and the symbol K256 is out of
>> range for the adr instruction.
>
> Yeap I see that too when debuging
>>
>> @Jean-Christophe: is that the same problem that you hit?
>>
>> @Andy: I propose we do something similar as in the bsaes code:
>>
>> #ifdef __thumb__
>> #define adrl adr
>> #endif
>>
>> and replace the offending line with
>>
>> adrl r14,K256
>
> Acked-by: Jean-Christophe PLAGNIOL-VILLARD <plagnioj@jcrosoft.com>
> Tested-by: Jean-Christophe PLAGNIOL-VILLARD <plagnioj@jcrosoft.com>
>

Thanks!

@Sami, Andy: we need to respin the whole patch, including updated
OpenSSL upstream commit id :-(

Regards,
Ard.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224
  2015-03-24 17:40                 ` Ard Biesheuvel
@ 2015-03-24 18:17                   ` Sami Tolvanen
  -1 siblings, 0 replies; 66+ messages in thread
From: Sami Tolvanen @ 2015-03-24 18:17 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Jean-Christophe PLAGNIOL-VILLARD, Andy Polyakov,
	linux-arm-kernel, linux-crypto, Herbert Xu, David S. Miller

On Tue, Mar 24, 2015 at 06:40:29PM +0100, Ard Biesheuvel wrote:
> @Sami, Andy: we need to respin the whole patch, including updated
> OpenSSL upstream commit id :-(

Sure, I will send v3 once the changes are in OpenSSL. Thanks for testing!
Do you still prefer everything in one patch?

Sami

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCHv2] arm: crypto: Add optimized SHA-256/224
@ 2015-03-24 18:17                   ` Sami Tolvanen
  0 siblings, 0 replies; 66+ messages in thread
From: Sami Tolvanen @ 2015-03-24 18:17 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Mar 24, 2015 at 06:40:29PM +0100, Ard Biesheuvel wrote:
> @Sami, Andy: we need to respin the whole patch, including updated
> OpenSSL upstream commit id :-(

Sure, I will send v3 once the changes are in OpenSSL. Thanks for testing!
Do you still prefer everything in one patch?

Sami

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224
  2015-03-24 18:17                   ` Sami Tolvanen
@ 2015-03-25  6:49                     ` Ard Biesheuvel
  -1 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-25  6:49 UTC (permalink / raw)
  To: Sami Tolvanen
  Cc: Jean-Christophe PLAGNIOL-VILLARD, Andy Polyakov,
	linux-arm-kernel, linux-crypto, Herbert Xu, David S. Miller

On 24 March 2015 at 19:17, Sami Tolvanen <samitolvanen@google.com> wrote:
> On Tue, Mar 24, 2015 at 06:40:29PM +0100, Ard Biesheuvel wrote:
>> @Sami, Andy: we need to respin the whole patch, including updated
>> OpenSSL upstream commit id :-(
>
> Sure, I will send v3 once the changes are in OpenSSL. Thanks for testing!
> Do you still prefer everything in one patch?
>

If it is easily done, you could add the .S_swapped file and the
related Makefile rule in a separate followup patch, yes.
But then, the second one might still be too big ...

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCHv2] arm: crypto: Add optimized SHA-256/224
@ 2015-03-25  6:49                     ` Ard Biesheuvel
  0 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-25  6:49 UTC (permalink / raw)
  To: linux-arm-kernel

On 24 March 2015 at 19:17, Sami Tolvanen <samitolvanen@google.com> wrote:
> On Tue, Mar 24, 2015 at 06:40:29PM +0100, Ard Biesheuvel wrote:
>> @Sami, Andy: we need to respin the whole patch, including updated
>> OpenSSL upstream commit id :-(
>
> Sure, I will send v3 once the changes are in OpenSSL. Thanks for testing!
> Do you still prefer everything in one patch?
>

If it is easily done, you could add the .S_swapped file and the
related Makefile rule in a separate followup patch, yes.
But then, the second one might still be too big ...

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH] arm: crypto: Add NEON optimized SHA-256
  2015-03-16 15:48 ` Sami Tolvanen
@ 2015-03-25 20:00   ` Jean-Christophe PLAGNIOL-VILLARD
  -1 siblings, 0 replies; 66+ messages in thread
From: Jean-Christophe PLAGNIOL-VILLARD @ 2015-03-25 20:00 UTC (permalink / raw)
  To: Sami Tolvanen
  Cc: herbert, davem, linux-crypto, linux-arm-kernel, ard.biesheuvel

On 15:48 Mon 16 Mar     , Sami Tolvanen wrote:
> Add Andy Polyakov's NEON optimized SHA-256 implementation.
> 
> On Nexus 6, this implementation is ~2x faster than sha256-generic.

do you plan to add the sha512 from openssl too?

Whould be nice so armv4 can get faster implementatio too

Best Regards,
J.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCH] arm: crypto: Add NEON optimized SHA-256
@ 2015-03-25 20:00   ` Jean-Christophe PLAGNIOL-VILLARD
  0 siblings, 0 replies; 66+ messages in thread
From: Jean-Christophe PLAGNIOL-VILLARD @ 2015-03-25 20:00 UTC (permalink / raw)
  To: linux-arm-kernel

On 15:48 Mon 16 Mar     , Sami Tolvanen wrote:
> Add Andy Polyakov's NEON optimized SHA-256 implementation.
> 
> On Nexus 6, this implementation is ~2x faster than sha256-generic.

do you plan to add the sha512 from openssl too?

Whould be nice so armv4 can get faster implementatio too

Best Regards,
J.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224
  2015-03-24 14:46             ` Ard Biesheuvel
@ 2015-03-27 10:42               ` Andy Polyakov
  -1 siblings, 0 replies; 66+ messages in thread
From: Andy Polyakov @ 2015-03-27 10:42 UTC (permalink / raw)
  To: Ard Biesheuvel, Jean-Christophe PLAGNIOL-VILLARD
  Cc: Herbert Xu, David S. Miller, linux-arm-kernel, Sami Tolvanen,
	linux-crypto

[-- Attachment #1: Type: text/plain, Size: 1488 bytes --]

>> Could you share the error log please?
> 
> OK, I spotted one issue with this code:
> 
> arch/arm/crypto/sha256-core.S: Assembler messages:
> arch/arm/crypto/sha256-core.S:1847: Error: invalid constant (ffffefb0)
> after fixup
> 
> This is caused by the fact that, when building the integer-only code
> for an older architecture, the conditional compilation produces a
> slightly bigger preceding function, and the symbol K256 is out of
> range for the adr instruction.
> 
> @Jean-Christophe: is that the same problem that you hit?
> 
> @Andy: I propose we do something similar as in the bsaes code:
> 
> #ifdef __thumb__
> #define adrl adr
> #endif
> 
> and replace the offending line with
> 
> adrl r14,K256

Sorry about delay. Yes, that would do. I did test all combinations, but
all "my" combinations, i.e. without __KERNEL__ defined :-( And without
__KERNEL__ there are few extra instructions in integer-only subroutine
that "push" instruction in question code toward higher address, so that
constant was ffffefc0, which can be encoded. Anyway, I've chosen to add
that #define next to .thumb directive. See attached.

Ard, you have mentioned that you've verified it on big-endian, but I've
spotted little-endian dependency (see #ifndef __ARMEB__ in attached). I
guess that it worked for you either because it was NEON that was tested
(it does work as is) or __LINUX_ARM_ARCH__ was less than 7 (in which
case it uses endian-neutral byte-by-byte data load). Can you confirm either?


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: sha256-armv4.diff --]
[-- Type: text/x-patch; name="sha256-armv4.diff", Size: 869 bytes --]

diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl
index 4fee74d..fac0533 100644
--- a/crypto/sha/asm/sha256-armv4.pl
+++ b/crypto/sha/asm/sha256-armv4.pl
@@ -73,7 +73,9 @@ $code.=<<___ if ($i<16);
 	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
 	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
 	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
+# ifndef __ARMEB__
 	rev	$t1,$t1
+# endif
 #else
 	@ ldrb	$t1,[$inp,#3]			@ $i
 	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
@@ -166,6 +168,7 @@ $code=<<___;
 #else
 .syntax unified
 # ifdef __thumb2__
+#  define adrl adr
 .thumb
 # else
 .code   32
@@ -460,7 +463,7 @@ sha256_block_data_order_neon:
 	stmdb	sp!,{r4-r12,lr}
 
 	sub	$H,sp,#16*4+16
-	adr	$Ktbl,K256
+	adrl	$Ktbl,K256
 	bic	$H,$H,#15		@ align for 128-bit stores
 	mov	$t2,sp
 	mov	sp,$H			@ alloca

[-- Attachment #3: Type: text/plain, Size: 176 bytes --]

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCHv2] arm: crypto: Add optimized SHA-256/224
@ 2015-03-27 10:42               ` Andy Polyakov
  0 siblings, 0 replies; 66+ messages in thread
From: Andy Polyakov @ 2015-03-27 10:42 UTC (permalink / raw)
  To: linux-arm-kernel

>> Could you share the error log please?
> 
> OK, I spotted one issue with this code:
> 
> arch/arm/crypto/sha256-core.S: Assembler messages:
> arch/arm/crypto/sha256-core.S:1847: Error: invalid constant (ffffefb0)
> after fixup
> 
> This is caused by the fact that, when building the integer-only code
> for an older architecture, the conditional compilation produces a
> slightly bigger preceding function, and the symbol K256 is out of
> range for the adr instruction.
> 
> @Jean-Christophe: is that the same problem that you hit?
> 
> @Andy: I propose we do something similar as in the bsaes code:
> 
> #ifdef __thumb__
> #define adrl adr
> #endif
> 
> and replace the offending line with
> 
> adrl r14,K256

Sorry about delay. Yes, that would do. I did test all combinations, but
all "my" combinations, i.e. without __KERNEL__ defined :-( And without
__KERNEL__ there are few extra instructions in integer-only subroutine
that "push" instruction in question code toward higher address, so that
constant was ffffefc0, which can be encoded. Anyway, I've chosen to add
that #define next to .thumb directive. See attached.

Ard, you have mentioned that you've verified it on big-endian, but I've
spotted little-endian dependency (see #ifndef __ARMEB__ in attached). I
guess that it worked for you either because it was NEON that was tested
(it does work as is) or __LINUX_ARM_ARCH__ was less than 7 (in which
case it uses endian-neutral byte-by-byte data load). Can you confirm either?

-------------- next part --------------
A non-text attachment was scrubbed...
Name: sha256-armv4.diff
Type: text/x-patch
Size: 838 bytes
Desc: not available
URL: <http://lists.infradead.org/pipermail/linux-arm-kernel/attachments/20150327/6d108844/attachment.bin>

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224
  2015-03-27 10:42               ` Andy Polyakov
@ 2015-03-27 10:44                 ` Ard Biesheuvel
  -1 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-27 10:44 UTC (permalink / raw)
  To: Andy Polyakov
  Cc: Jean-Christophe PLAGNIOL-VILLARD, Sami Tolvanen,
	linux-arm-kernel, linux-crypto, Herbert Xu, David S. Miller

On 27 March 2015 at 11:42, Andy Polyakov <appro@openssl.org> wrote:
>>> Could you share the error log please?
>>
>> OK, I spotted one issue with this code:
>>
>> arch/arm/crypto/sha256-core.S: Assembler messages:
>> arch/arm/crypto/sha256-core.S:1847: Error: invalid constant (ffffefb0)
>> after fixup
>>
>> This is caused by the fact that, when building the integer-only code
>> for an older architecture, the conditional compilation produces a
>> slightly bigger preceding function, and the symbol K256 is out of
>> range for the adr instruction.
>>
>> @Jean-Christophe: is that the same problem that you hit?
>>
>> @Andy: I propose we do something similar as in the bsaes code:
>>
>> #ifdef __thumb__
>> #define adrl adr
>> #endif
>>
>> and replace the offending line with
>>
>> adrl r14,K256
>
> Sorry about delay. Yes, that would do. I did test all combinations, but
> all "my" combinations, i.e. without __KERNEL__ defined :-( And without
> __KERNEL__ there are few extra instructions in integer-only subroutine
> that "push" instruction in question code toward higher address, so that
> constant was ffffefc0, which can be encoded. Anyway, I've chosen to add
> that #define next to .thumb directive. See attached.
>
> Ard, you have mentioned that you've verified it on big-endian, but I've
> spotted little-endian dependency (see #ifndef __ARMEB__ in attached). I
> guess that it worked for you either because it was NEON that was tested
> (it does work as is) or __LINUX_ARM_ARCH__ was less than 7 (in which
> case it uses endian-neutral byte-by-byte data load). Can you confirm either?
>

I need to double check that, but my suspicion is that it was the latter.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCHv2] arm: crypto: Add optimized SHA-256/224
@ 2015-03-27 10:44                 ` Ard Biesheuvel
  0 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-27 10:44 UTC (permalink / raw)
  To: linux-arm-kernel

On 27 March 2015 at 11:42, Andy Polyakov <appro@openssl.org> wrote:
>>> Could you share the error log please?
>>
>> OK, I spotted one issue with this code:
>>
>> arch/arm/crypto/sha256-core.S: Assembler messages:
>> arch/arm/crypto/sha256-core.S:1847: Error: invalid constant (ffffefb0)
>> after fixup
>>
>> This is caused by the fact that, when building the integer-only code
>> for an older architecture, the conditional compilation produces a
>> slightly bigger preceding function, and the symbol K256 is out of
>> range for the adr instruction.
>>
>> @Jean-Christophe: is that the same problem that you hit?
>>
>> @Andy: I propose we do something similar as in the bsaes code:
>>
>> #ifdef __thumb__
>> #define adrl adr
>> #endif
>>
>> and replace the offending line with
>>
>> adrl r14,K256
>
> Sorry about delay. Yes, that would do. I did test all combinations, but
> all "my" combinations, i.e. without __KERNEL__ defined :-( And without
> __KERNEL__ there are few extra instructions in integer-only subroutine
> that "push" instruction in question code toward higher address, so that
> constant was ffffefc0, which can be encoded. Anyway, I've chosen to add
> that #define next to .thumb directive. See attached.
>
> Ard, you have mentioned that you've verified it on big-endian, but I've
> spotted little-endian dependency (see #ifndef __ARMEB__ in attached). I
> guess that it worked for you either because it was NEON that was tested
> (it does work as is) or __LINUX_ARM_ARCH__ was less than 7 (in which
> case it uses endian-neutral byte-by-byte data load). Can you confirm either?
>

I need to double check that, but my suspicion is that it was the latter.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224
  2015-03-27 10:44                 ` Ard Biesheuvel
@ 2015-03-27 18:07                   ` Ard Biesheuvel
  -1 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-27 18:07 UTC (permalink / raw)
  To: Andy Polyakov
  Cc: Jean-Christophe PLAGNIOL-VILLARD, Sami Tolvanen,
	linux-arm-kernel, linux-crypto, Herbert Xu, David S. Miller

On 27 March 2015 at 11:44, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> On 27 March 2015 at 11:42, Andy Polyakov <appro@openssl.org> wrote:
>>>> Could you share the error log please?
>>>
>>> OK, I spotted one issue with this code:
>>>
>>> arch/arm/crypto/sha256-core.S: Assembler messages:
>>> arch/arm/crypto/sha256-core.S:1847: Error: invalid constant (ffffefb0)
>>> after fixup
>>>
>>> This is caused by the fact that, when building the integer-only code
>>> for an older architecture, the conditional compilation produces a
>>> slightly bigger preceding function, and the symbol K256 is out of
>>> range for the adr instruction.
>>>
>>> @Jean-Christophe: is that the same problem that you hit?
>>>
>>> @Andy: I propose we do something similar as in the bsaes code:
>>>
>>> #ifdef __thumb__
>>> #define adrl adr
>>> #endif
>>>
>>> and replace the offending line with
>>>
>>> adrl r14,K256
>>
>> Sorry about delay. Yes, that would do. I did test all combinations, but
>> all "my" combinations, i.e. without __KERNEL__ defined :-( And without
>> __KERNEL__ there are few extra instructions in integer-only subroutine
>> that "push" instruction in question code toward higher address, so that
>> constant was ffffefc0, which can be encoded. Anyway, I've chosen to add
>> that #define next to .thumb directive. See attached.
>>
>> Ard, you have mentioned that you've verified it on big-endian, but I've
>> spotted little-endian dependency (see #ifndef __ARMEB__ in attached). I
>> guess that it worked for you either because it was NEON that was tested
>> (it does work as is) or __LINUX_ARM_ARCH__ was less than 7 (in which
>> case it uses endian-neutral byte-by-byte data load). Can you confirm either?
>>
>
> I need to double check that, but my suspicion is that it was the latter.

Indeed, if I build for v7 I get

[    0.269418] 00000000: 4e a5 c5 08 a6 56 6e 76 24 05 43 f8 fe b0 6f d4
[    0.275261] 00000010: 57 77 7b e3 95 49 c4 01 64 36 af da 65 d2 33 0e
[    0.281031] alg: hash: Test 1 failed for sha224-asm
[    0.285315] 00000000: 9d 6a 5d e9 e1 6c 39 99 c7 14 84 0f 47 77 1f 36
[    0.290912] 00000010: dc c2 97 a7 bd ef aa c3 6c 95 15 ae

which is indeed the integer code failing, and your attached patch fixes it.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCHv2] arm: crypto: Add optimized SHA-256/224
@ 2015-03-27 18:07                   ` Ard Biesheuvel
  0 siblings, 0 replies; 66+ messages in thread
From: Ard Biesheuvel @ 2015-03-27 18:07 UTC (permalink / raw)
  To: linux-arm-kernel

On 27 March 2015 at 11:44, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
> On 27 March 2015 at 11:42, Andy Polyakov <appro@openssl.org> wrote:
>>>> Could you share the error log please?
>>>
>>> OK, I spotted one issue with this code:
>>>
>>> arch/arm/crypto/sha256-core.S: Assembler messages:
>>> arch/arm/crypto/sha256-core.S:1847: Error: invalid constant (ffffefb0)
>>> after fixup
>>>
>>> This is caused by the fact that, when building the integer-only code
>>> for an older architecture, the conditional compilation produces a
>>> slightly bigger preceding function, and the symbol K256 is out of
>>> range for the adr instruction.
>>>
>>> @Jean-Christophe: is that the same problem that you hit?
>>>
>>> @Andy: I propose we do something similar as in the bsaes code:
>>>
>>> #ifdef __thumb__
>>> #define adrl adr
>>> #endif
>>>
>>> and replace the offending line with
>>>
>>> adrl r14,K256
>>
>> Sorry about delay. Yes, that would do. I did test all combinations, but
>> all "my" combinations, i.e. without __KERNEL__ defined :-( And without
>> __KERNEL__ there are few extra instructions in integer-only subroutine
>> that "push" instruction in question code toward higher address, so that
>> constant was ffffefc0, which can be encoded. Anyway, I've chosen to add
>> that #define next to .thumb directive. See attached.
>>
>> Ard, you have mentioned that you've verified it on big-endian, but I've
>> spotted little-endian dependency (see #ifndef __ARMEB__ in attached). I
>> guess that it worked for you either because it was NEON that was tested
>> (it does work as is) or __LINUX_ARM_ARCH__ was less than 7 (in which
>> case it uses endian-neutral byte-by-byte data load). Can you confirm either?
>>
>
> I need to double check that, but my suspicion is that it was the latter.

Indeed, if I build for v7 I get

[    0.269418] 00000000: 4e a5 c5 08 a6 56 6e 76 24 05 43 f8 fe b0 6f d4
[    0.275261] 00000010: 57 77 7b e3 95 49 c4 01 64 36 af da 65 d2 33 0e
[    0.281031] alg: hash: Test 1 failed for sha224-asm
[    0.285315] 00000000: 9d 6a 5d e9 e1 6c 39 99 c7 14 84 0f 47 77 1f 36
[    0.290912] 00000010: dc c2 97 a7 bd ef aa c3 6c 95 15 ae

which is indeed the integer code failing, and your attached patch fixes it.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224
  2015-03-27 18:07                   ` Ard Biesheuvel
@ 2015-03-29 13:27                     ` Andy Polyakov
  -1 siblings, 0 replies; 66+ messages in thread
From: Andy Polyakov @ 2015-03-29 13:27 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Jean-Christophe PLAGNIOL-VILLARD, Sami Tolvanen,
	linux-arm-kernel, linux-crypto, Herbert Xu, David S. Miller

>>>> arch/arm/crypto/sha256-core.S: Assembler messages:
>>>> arch/arm/crypto/sha256-core.S:1847: Error: invalid constant (ffffefb0)
>>>> after fixup
>>>>
>>>> This is caused by the fact that, when building the integer-only code
>>>> for an older architecture, the conditional compilation produces a
>>>> slightly bigger preceding function, and the symbol K256 is out of
>>>> range for the adr instruction.
>>>>
>>>> ... and replace the offending line with
>>>>
>>>> adrl r14,K256
>>>
>>> Ard, you have mentioned that you've verified it on big-endian, but I've
>>> spotted little-endian dependency (see #ifndef __ARMEB__ in attached). I
>>> guess that it worked for you either because it was NEON that was tested
>>> (it does work as is) or __LINUX_ARM_ARCH__ was less than 7 (in which
>>> case it uses endian-neutral byte-by-byte data load). Can you confirm either?
> 
> Indeed, if I build for v7 I get
> 
> [    0.269418] 00000000: 4e a5 c5 08 a6 56 6e 76 24 05 43 f8 fe b0 6f d4
> [    0.275261] 00000010: 57 77 7b e3 95 49 c4 01 64 36 af da 65 d2 33 0e
> [    0.281031] alg: hash: Test 1 failed for sha224-asm
> [    0.285315] 00000000: 9d 6a 5d e9 e1 6c 39 99 c7 14 84 0f 47 77 1f 36
> [    0.290912] 00000010: dc c2 97 a7 bd ef aa c3 6c 95 15 ae
> 
> which is indeed the integer code failing, and your attached patch fixes it.

Committed as
http://git.openssl.org/gitweb/?p=openssl.git;a=commitdiff;h=51f8d095562f36cdaa6893597b5c609e943b0565.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCHv2] arm: crypto: Add optimized SHA-256/224
@ 2015-03-29 13:27                     ` Andy Polyakov
  0 siblings, 0 replies; 66+ messages in thread
From: Andy Polyakov @ 2015-03-29 13:27 UTC (permalink / raw)
  To: linux-arm-kernel

>>>> arch/arm/crypto/sha256-core.S: Assembler messages:
>>>> arch/arm/crypto/sha256-core.S:1847: Error: invalid constant (ffffefb0)
>>>> after fixup
>>>>
>>>> This is caused by the fact that, when building the integer-only code
>>>> for an older architecture, the conditional compilation produces a
>>>> slightly bigger preceding function, and the symbol K256 is out of
>>>> range for the adr instruction.
>>>>
>>>> ... and replace the offending line with
>>>>
>>>> adrl r14,K256
>>>
>>> Ard, you have mentioned that you've verified it on big-endian, but I've
>>> spotted little-endian dependency (see #ifndef __ARMEB__ in attached). I
>>> guess that it worked for you either because it was NEON that was tested
>>> (it does work as is) or __LINUX_ARM_ARCH__ was less than 7 (in which
>>> case it uses endian-neutral byte-by-byte data load). Can you confirm either?
> 
> Indeed, if I build for v7 I get
> 
> [    0.269418] 00000000: 4e a5 c5 08 a6 56 6e 76 24 05 43 f8 fe b0 6f d4
> [    0.275261] 00000010: 57 77 7b e3 95 49 c4 01 64 36 af da 65 d2 33 0e
> [    0.281031] alg: hash: Test 1 failed for sha224-asm
> [    0.285315] 00000000: 9d 6a 5d e9 e1 6c 39 99 c7 14 84 0f 47 77 1f 36
> [    0.290912] 00000010: dc c2 97 a7 bd ef aa c3 6c 95 15 ae
> 
> which is indeed the integer code failing, and your attached patch fixes it.

Committed as
http://git.openssl.org/gitweb/?p=openssl.git;a=commitdiff;h=51f8d095562f36cdaa6893597b5c609e943b0565.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCHv3] arm: crypto: Add optimized SHA-256/224
  2015-03-23 13:50   ` Sami Tolvanen
@ 2015-03-30  8:37     ` Sami Tolvanen
  -1 siblings, 0 replies; 66+ messages in thread
From: Sami Tolvanen @ 2015-03-30  8:37 UTC (permalink / raw)
  To: linux-arm-kernel, linux-crypto, ard.biesheuvel, herbert, davem
  Cc: Andy Polyakov

Add Andy Polyakov's optimized assembly and NEON implementations for
SHA-256/224.

The sha256-armv4.pl script for generating the assembly code is from
OpenSSL commit 51f8d095562f36cdaa6893597b5c609e943b0565.

Compared to sha256-generic these implementations have the following
tcrypt speed improvements on Motorola Nexus 6 (Snapdragon 805):

  bs    b/u      sha256-neon  sha256-asm
  16    16       x1.32        x1.19
  64    16       x1.27        x1.15
  64    64       x1.36        x1.20
  256   16       x1.22        x1.11
  256   64       x1.36        x1.19
  256   256      x1.59        x1.23
  1024  16       x1.21        x1.10
  1024  256      x1.65        x1.23
  1024  1024     x1.76        x1.25
  2048  16       x1.21        x1.10
  2048  256      x1.66        x1.23
  2048  1024     x1.78        x1.25
  2048  2048     x1.79        x1.25
  4096  16       x1.20        x1.09
  4096  256      x1.66        x1.23
  4096  1024     x1.79        x1.26
  4096  4096     x1.82        x1.26
  8192  16       x1.20        x1.09
  8192  256      x1.67        x1.23
  8192  1024     x1.80        x1.26
  8192  4096     x1.85        x1.28
  8192  8192     x1.85        x1.27

Where bs refers to block size and b/u to bytes per update.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Cc: Andy Polyakov <appro@openssl.org>

---
Changes since v2:
  Updated sha256-armv4.pl and sha256-core.S_shipped

Changes since v1:
  Rebased to Herbert's cryptodev tree
  Include sha256-armv4.pl and use it to generate sha256-core.S
  Add integer-only assembly version as sha256-asm
  Add support for SHA-224 to the glue code
  Change priority for sha256/224-ce to 300

---
 arch/arm/crypto/Kconfig               |    7 
 arch/arm/crypto/Makefile              |    8 
 arch/arm/crypto/sha2-ce-glue.c        |    4 
 arch/arm/crypto/sha256-armv4.pl       |  713 ++++++
 arch/arm/crypto/sha256-core.S_shipped | 2775 ++++++++++++++++++++++++
 arch/arm/crypto/sha256_glue.c         |  246 ++
 arch/arm/crypto/sha256_glue.h         |   23 
 arch/arm/crypto/sha256_neon_glue.c    |  172 +
 8 files changed, 3945 insertions(+), 3 deletions(-)

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index d63f319..458729d 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -46,6 +46,13 @@ config CRYPTO_SHA2_ARM_CE
 	  SHA-256 secure hash standard (DFIPS 180-2) implemented
 	  using special ARMv8 Crypto Extensions.
 
+config CRYPTO_SHA256_ARM
+	tristate "SHA-224/256 digest algorithm (ARM-asm and NEON)"
+	select CRYPTO_HASH
+	help
+	  SHA-256 secure hash standard (DFIPS 180-2) implemented
+	  using optimized ARM assembler and NEON, when available.
+
 config CRYPTO_SHA512_ARM_NEON
 	tristate "SHA384 and SHA512 digest algorithm (ARM NEON)"
 	depends on KERNEL_MODE_NEON
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index 9a273bd..ef46e89 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
 obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
+obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
 obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o
@@ -16,6 +17,8 @@ aes-arm-y	:= aes-armv4.o aes_glue.o
 aes-arm-bs-y	:= aesbs-core.o aesbs-glue.o
 sha1-arm-y	:= sha1-armv4-large.o sha1_glue.o
 sha1-arm-neon-y	:= sha1-armv7-neon.o sha1_neon_glue.o
+sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
+sha256-arm-y	:= sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
 sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o
 sha1-arm-ce-y	:= sha1-ce-core.o sha1-ce-glue.o
 sha2-arm-ce-y	:= sha2-ce-core.o sha2-ce-glue.o
@@ -28,4 +31,7 @@ quiet_cmd_perl = PERL    $@
 $(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
 	$(call cmd,perl)
 
-.PRECIOUS: $(obj)/aesbs-core.S
+$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
+	$(call cmd,perl)
+
+.PRECIOUS: $(obj)/aesbs-core.S $(obj)/sha256-core.S
diff --git a/arch/arm/crypto/sha2-ce-glue.c b/arch/arm/crypto/sha2-ce-glue.c
index 9ffe8ad..0449eca 100644
--- a/arch/arm/crypto/sha2-ce-glue.c
+++ b/arch/arm/crypto/sha2-ce-glue.c
@@ -163,7 +163,7 @@ static struct shash_alg algs[] = { {
 	.base			= {
 		.cra_name		= "sha224",
 		.cra_driver_name	= "sha224-ce",
-		.cra_priority		= 200,
+		.cra_priority		= 300,
 		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
 		.cra_blocksize		= SHA256_BLOCK_SIZE,
 		.cra_module		= THIS_MODULE,
@@ -180,7 +180,7 @@ static struct shash_alg algs[] = { {
 	.base			= {
 		.cra_name		= "sha256",
 		.cra_driver_name	= "sha256-ce",
-		.cra_priority		= 200,
+		.cra_priority		= 300,
 		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
 		.cra_blocksize		= SHA256_BLOCK_SIZE,
 		.cra_module		= THIS_MODULE,
diff --git a/arch/arm/crypto/sha256-armv4.pl b/arch/arm/crypto/sha256-armv4.pl
new file mode 100644
index 0000000..fac0533
--- /dev/null
+++ b/arch/arm/crypto/sha256-armv4.pl
@@ -0,0 +1,716 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Permission to use under GPL terms is granted.
+# ====================================================================
+
+# SHA256 block procedure for ARMv4. May 2007.
+
+# Performance is ~2x better than gcc 3.4 generated code and in "abso-
+# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+# byte [on single-issue Xscale PXA250 core].
+
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 22% improvement on
+# Cortex A8 core and ~20 cycles per processed byte.
+
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+# September 2013.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process one
+# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+# code (meaning that latter performs sub-optimally, nothing was done
+# about it).
+
+# May 2014.
+#
+# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$ctx="r0";	$t0="r0";
+$inp="r1";	$t4="r1";
+$len="r2";	$t1="r2";
+$T1="r3";	$t3="r3";
+$A="r4";
+$B="r5";
+$C="r6";
+$D="r7";
+$E="r8";
+$F="r9";
+$G="r10";
+$H="r11";
+@V=($A,$B,$C,$D,$E,$F,$G,$H);
+$t2="r12";
+$Ktbl="r14";
+
+@Sigma0=( 2,13,22);
+@Sigma1=( 6,11,25);
+@sigma0=( 7,18, 3);
+@sigma1=(17,19,10);
+
+sub BODY_00_15 {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___ if ($i<16);
+#if __ARM_ARCH__>=7
+	@ ldr	$t1,[$inp],#4			@ $i
+# if $i==15
+	str	$inp,[sp,#17*4]			@ make room for $t4
+# endif
+	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
+	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	$t1,$t1
+# endif
+#else
+	@ ldrb	$t1,[$inp,#3]			@ $i
+	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
+	ldrb	$t2,[$inp,#2]
+	ldrb	$t0,[$inp,#1]
+	orr	$t1,$t1,$t2,lsl#8
+	ldrb	$t2,[$inp],#4
+	orr	$t1,$t1,$t0,lsl#16
+# if $i==15
+	str	$inp,[sp,#17*4]			@ make room for $t4
+# endif
+	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+	orr	$t1,$t1,$t2,lsl#24
+	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
+#endif
+___
+$code.=<<___;
+	ldr	$t2,[$Ktbl],#4			@ *K256++
+	add	$h,$h,$t1			@ h+=X[i]
+	str	$t1,[sp,#`$i%16`*4]
+	eor	$t1,$f,$g
+	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
+	and	$t1,$t1,$e
+	add	$h,$h,$t2			@ h+=K256[i]
+	eor	$t1,$t1,$g			@ Ch(e,f,g)
+	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
+	add	$h,$h,$t1			@ h+=Ch(e,f,g)
+#if $i==31
+	and	$t2,$t2,#0xff
+	cmp	$t2,#0xf2			@ done?
+#endif
+#if $i<15
+# if __ARM_ARCH__>=7
+	ldr	$t1,[$inp],#4			@ prefetch
+# else
+	ldrb	$t1,[$inp,#3]
+# endif
+	eor	$t2,$a,$b			@ a^b, b^c in next round
+#else
+	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
+	eor	$t2,$a,$b			@ a^b, b^c in next round
+	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
+#endif
+	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
+	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
+	add	$d,$d,$h			@ d+=h
+	eor	$t3,$t3,$b			@ Maj(a,b,c)
+	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
+	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
+___
+	($t2,$t3)=($t3,$t2);
+}
+
+sub BODY_16_XX {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___;
+	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
+	@ ldr	$t4,[sp,#`($i+14)%16`*4]
+	mov	$t0,$t1,ror#$sigma0[0]
+	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
+	mov	$t2,$t4,ror#$sigma1[0]
+	eor	$t0,$t0,$t1,ror#$sigma0[1]
+	eor	$t2,$t2,$t4,ror#$sigma1[1]
+	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
+	ldr	$t1,[sp,#`($i+0)%16`*4]
+	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
+	ldr	$t4,[sp,#`($i+9)%16`*4]
+
+	add	$t2,$t2,$t0
+	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
+	add	$t1,$t1,$t2
+	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
+	add	$t1,$t1,$t4			@ X[i]
+___
+	&BODY_00_15(@_);
+}
+
+$code=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code	32
+#else
+.syntax unified
+# ifdef __thumb2__
+#  define adrl adr
+.thumb
+# else
+.code   32
+# endif
+#endif
+
+.type	K256,%object
+.align	5
+K256:
+.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size	K256,.-K256
+.word	0				@ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-sha256_block_data_order
+#endif
+.align	5
+
+.global	sha256_block_data_order
+.type	sha256_block_data_order,%function
+sha256_block_data_order:
+#if __ARM_ARCH__<7
+	sub	r3,pc,#8		@ sha256_block_data_order
+#else
+	adr	r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+	ldr	r12,.LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+	tst	r12,#ARMV8_SHA256
+	bne	.LARMv8
+	tst	r12,#ARMV7_NEON
+	bne	.LNEON
+#endif
+	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
+	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
+	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
+	sub	$Ktbl,r3,#256+32	@ K256
+	sub	sp,sp,#16*4		@ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+	ldr	$t1,[$inp],#4
+# else
+	ldrb	$t1,[$inp,#3]
+# endif
+	eor	$t3,$B,$C		@ magic
+	eor	$t2,$t2,$t2
+___
+for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=".Lrounds_16_xx:\n";
+for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+#if __ARM_ARCH__>=7
+	ite	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	ldreq	$t3,[sp,#16*4]		@ pull ctx
+	bne	.Lrounds_16_xx
+
+	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
+	ldr	$t0,[$t3,#0]
+	ldr	$t1,[$t3,#4]
+	ldr	$t2,[$t3,#8]
+	add	$A,$A,$t0
+	ldr	$t0,[$t3,#12]
+	add	$B,$B,$t1
+	ldr	$t1,[$t3,#16]
+	add	$C,$C,$t2
+	ldr	$t2,[$t3,#20]
+	add	$D,$D,$t0
+	ldr	$t0,[$t3,#24]
+	add	$E,$E,$t1
+	ldr	$t1,[$t3,#28]
+	add	$F,$F,$t2
+	ldr	$inp,[sp,#17*4]		@ pull inp
+	ldr	$t2,[sp,#18*4]		@ pull inp+len
+	add	$G,$G,$t0
+	add	$H,$H,$t1
+	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
+	cmp	$inp,$t2
+	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
+	bne	.Loop
+
+	add	sp,sp,#`16+3`*4	@ destroy frame
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r11,pc}
+#else
+	ldmia	sp!,{r4-r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	sha256_block_data_order,.-sha256_block_data_order
+___
+######################################################################
+# NEON stuff
+#
+{{{
+my @X=map("q$_",(0..3));
+my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
+my $Xfer=$t4;
+my $j=0;
+
+sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
+sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
+
+sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+  my $arg = pop;
+    $arg = "#$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+sub Xupdate()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vshr_u32	($T2,$T0,$sigma0[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vshr_u32	($T1,$T0,$sigma0[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vsli_32	($T2,$T0,32-$sigma0[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vshr_u32	($T3,$T0,$sigma0[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&veor		($T1,$T1,$T2);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vsli_32	($T3,$T0,32-$sigma0[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &veor		($T5,$T5,$T4);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &veor		($T5,$T5,$T4);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vld1_32	("{$T0}","[$Ktbl,:128]!");
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	($T0,$T0,@X[0]);
+	 while($#insns>=2) { eval(shift(@insns)); }
+	&vst1_32	("{$T0}","[$Xfer,:128]!");
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	push(@X,shift(@X));		# "rotate" X[]
+}
+
+sub Xpreload()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vld1_32	("{$T0}","[$Ktbl,:128]!");
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vrev32_8	(@X[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	($T0,$T0,@X[0]);
+	 foreach (@insns) { eval; }	# remaining instructions
+	&vst1_32	("{$T0}","[$Xfer,:128]!");
+
+	push(@X,shift(@X));		# "rotate" X[]
+}
+
+sub body_00_15 () {
+	(
+	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
+	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
+	'&eor	($t1,$f,$g)',
+	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
+	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
+	'&and	($t1,$t1,$e)',
+	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
+	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
+	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
+	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
+	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
+	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
+	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
+	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
+	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
+	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
+	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
+	'&add	($d,$d,$h)',			# d+=h
+	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
+	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
+	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
+	)
+}
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.global	sha256_block_data_order_neon
+.type	sha256_block_data_order_neon,%function
+.align	4
+sha256_block_data_order_neon:
+.LNEON:
+	stmdb	sp!,{r4-r12,lr}
+
+	sub	$H,sp,#16*4+16
+	adrl	$Ktbl,K256
+	bic	$H,$H,#15		@ align for 128-bit stores
+	mov	$t2,sp
+	mov	sp,$H			@ alloca
+	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
+
+	vld1.8		{@X[0]},[$inp]!
+	vld1.8		{@X[1]},[$inp]!
+	vld1.8		{@X[2]},[$inp]!
+	vld1.8		{@X[3]},[$inp]!
+	vld1.32		{$T0},[$Ktbl,:128]!
+	vld1.32		{$T1},[$Ktbl,:128]!
+	vld1.32		{$T2},[$Ktbl,:128]!
+	vld1.32		{$T3},[$Ktbl,:128]!
+	vrev32.8	@X[0],@X[0]		@ yes, even on
+	str		$ctx,[sp,#64]
+	vrev32.8	@X[1],@X[1]		@ big-endian
+	str		$inp,[sp,#68]
+	mov		$Xfer,sp
+	vrev32.8	@X[2],@X[2]
+	str		$len,[sp,#72]
+	vrev32.8	@X[3],@X[3]
+	str		$t2,[sp,#76]		@ save original sp
+	vadd.i32	$T0,$T0,@X[0]
+	vadd.i32	$T1,$T1,@X[1]
+	vst1.32		{$T0},[$Xfer,:128]!
+	vadd.i32	$T2,$T2,@X[2]
+	vst1.32		{$T1},[$Xfer,:128]!
+	vadd.i32	$T3,$T3,@X[3]
+	vst1.32		{$T2},[$Xfer,:128]!
+	vst1.32		{$T3},[$Xfer,:128]!
+
+	ldmia		$ctx,{$A-$H}
+	sub		$Xfer,$Xfer,#64
+	ldr		$t1,[sp,#0]
+	eor		$t2,$t2,$t2
+	eor		$t3,$B,$C
+	b		.L_00_48
+
+.align	4
+.L_00_48:
+___
+	&Xupdate(\&body_00_15);
+	&Xupdate(\&body_00_15);
+	&Xupdate(\&body_00_15);
+	&Xupdate(\&body_00_15);
+$code.=<<___;
+	teq	$t1,#0				@ check for K256 terminator
+	ldr	$t1,[sp,#0]
+	sub	$Xfer,$Xfer,#64
+	bne	.L_00_48
+
+	ldr		$inp,[sp,#68]
+	ldr		$t0,[sp,#72]
+	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
+	teq		$inp,$t0
+	it		eq
+	subeq		$inp,$inp,#64		@ avoid SEGV
+	vld1.8		{@X[0]},[$inp]!		@ load next input block
+	vld1.8		{@X[1]},[$inp]!
+	vld1.8		{@X[2]},[$inp]!
+	vld1.8		{@X[3]},[$inp]!
+	it		ne
+	strne		$inp,[sp,#68]
+	mov		$Xfer,sp
+___
+	&Xpreload(\&body_00_15);
+	&Xpreload(\&body_00_15);
+	&Xpreload(\&body_00_15);
+	&Xpreload(\&body_00_15);
+$code.=<<___;
+	ldr	$t0,[$t1,#0]
+	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
+	ldr	$t2,[$t1,#4]
+	ldr	$t3,[$t1,#8]
+	ldr	$t4,[$t1,#12]
+	add	$A,$A,$t0			@ accumulate
+	ldr	$t0,[$t1,#16]
+	add	$B,$B,$t2
+	ldr	$t2,[$t1,#20]
+	add	$C,$C,$t3
+	ldr	$t3,[$t1,#24]
+	add	$D,$D,$t4
+	ldr	$t4,[$t1,#28]
+	add	$E,$E,$t0
+	str	$A,[$t1],#4
+	add	$F,$F,$t2
+	str	$B,[$t1],#4
+	add	$G,$G,$t3
+	str	$C,[$t1],#4
+	add	$H,$H,$t4
+	str	$D,[$t1],#4
+	stmia	$t1,{$E-$H}
+
+	ittte	ne
+	movne	$Xfer,sp
+	ldrne	$t1,[sp,#0]
+	eorne	$t2,$t2,$t2
+	ldreq	sp,[sp,#76]			@ restore original sp
+	itt	ne
+	eorne	$t3,$B,$C
+	bne	.L_00_48
+
+	ldmia	sp!,{r4-r12,pc}
+.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+___
+}}}
+######################################################################
+# ARMv8 stuff
+#
+{{{
+my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
+my @MSG=map("q$_",(8..11));
+my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
+my $Ktbl="r3";
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
+# else
+#  define INST(a,b,c,d)	.byte	a,b,c,d
+# endif
+
+.type	sha256_block_data_order_armv8,%function
+.align	5
+sha256_block_data_order_armv8:
+.LARMv8:
+	vld1.32	{$ABCD,$EFGH},[$ctx]
+# ifdef __thumb2__
+	adr	$Ktbl,.LARMv8
+	sub	$Ktbl,$Ktbl,#.LARMv8-K256
+# else
+	adrl	$Ktbl,K256
+# endif
+	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
+
+.Loop_v8:
+	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
+	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
+	vld1.32		{$W0},[$Ktbl]!
+	vrev32.8	@MSG[0],@MSG[0]
+	vrev32.8	@MSG[1],@MSG[1]
+	vrev32.8	@MSG[2],@MSG[2]
+	vrev32.8	@MSG[3],@MSG[3]
+	vmov		$ABCD_SAVE,$ABCD	@ offload
+	vmov		$EFGH_SAVE,$EFGH
+	teq		$inp,$len
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+	vld1.32		{$W1},[$Ktbl]!
+	vadd.i32	$W0,$W0,@MSG[0]
+	sha256su0	@MSG[0],@MSG[1]
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+	sha256su1	@MSG[0],@MSG[2],@MSG[3]
+___
+	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+	vld1.32		{$W1},[$Ktbl]!
+	vadd.i32	$W0,$W0,@MSG[0]
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+
+	vld1.32		{$W0},[$Ktbl]!
+	vadd.i32	$W1,$W1,@MSG[1]
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W1
+	sha256h2	$EFGH,$abcd,$W1
+
+	vld1.32		{$W1},[$Ktbl]
+	vadd.i32	$W0,$W0,@MSG[2]
+	sub		$Ktbl,$Ktbl,#256-16	@ rewind
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+
+	vadd.i32	$W1,$W1,@MSG[3]
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W1
+	sha256h2	$EFGH,$abcd,$W1
+
+	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
+	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
+	it		ne
+	bne		.Loop_v8
+
+	vst1.32		{$ABCD,$EFGH},[$ctx]
+
+	ret		@ bx lr
+.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+___
+}}}
+$code.=<<___;
+.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align	2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm   OPENSSL_armcap_P,4,4
+#endif
+___
+
+open SELF,$0;
+while(<SELF>) {
+	next if (/^#!/);
+	last if (!s/^#/@/ and !/^$/);
+	print;
+}
+close SELF;
+
+{   my  %opcode = (
+	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
+	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);
+
+    sub unsha256 {
+	my ($mnemonic,$arg)=@_;
+
+	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+					 |(($2&7)<<17)|(($2&8)<<4)
+					 |(($3&7)<<1) |(($3&8)<<2);
+	    # since ARMv7 instructions are always encoded little-endian.
+	    # correct solution is to use .inst directive, but older
+	    # assemblers don't implement it:-(
+	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
+			$word&0xff,($word>>8)&0xff,
+			($word>>16)&0xff,($word>>24)&0xff,
+			$mnemonic,$arg;
+	}
+    }
+}
+
+foreach (split($/,$code)) {
+
+	s/\`([^\`]*)\`/eval $1/geo;
+
+	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
+
+	s/\bret\b/bx	lr/go		or
+	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4
+
+	print $_,"\n";
+}
+
+close STDOUT; # enforce flush
diff --git a/arch/arm/crypto/sha256-core.S_shipped b/arch/arm/crypto/sha256-core.S_shipped
new file mode 100644
index 0000000..555a1a8
--- /dev/null
+++ b/arch/arm/crypto/sha256-core.S_shipped
@@ -0,0 +1,2808 @@
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ SHA256 block procedure for ARMv4. May 2007.
+
+@ Performance is ~2x better than gcc 3.4 generated code and in "abso-
+@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+@ byte [on single-issue Xscale PXA250 core].
+
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
+@ Cortex A8 core and ~20 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 16%
+@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+@ September 2013.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process one
+@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+@ code (meaning that latter performs sub-optimally, nothing was done
+@ about it).
+
+@ May 2014.
+@
+@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code	32
+#else
+.syntax unified
+# ifdef __thumb2__
+#  define adrl adr
+.thumb
+# else
+.code   32
+# endif
+#endif
+
+.type	K256,%object
+.align	5
+K256:
+.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size	K256,.-K256
+.word	0				@ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-sha256_block_data_order
+#endif
+.align	5
+
+.global	sha256_block_data_order
+.type	sha256_block_data_order,%function
+sha256_block_data_order:
+#if __ARM_ARCH__<7
+	sub	r3,pc,#8		@ sha256_block_data_order
+#else
+	adr	r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+	ldr	r12,.LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+	tst	r12,#ARMV8_SHA256
+	bne	.LARMv8
+	tst	r12,#ARMV7_NEON
+	bne	.LNEON
+#endif
+	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
+	stmdb	sp!,{r0,r1,r2,r4-r11,lr}
+	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11}
+	sub	r14,r3,#256+32	@ K256
+	sub	sp,sp,#16*4		@ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6		@ magic
+	eor	r12,r12,r12
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 0
+# if 0==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 0
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 0==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#0*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 0==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 0<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#2*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#15*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 1
+# if 1==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 1
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 1==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#1*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 1==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 1<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#3*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#0*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 2
+# if 2==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 2
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 2==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#2*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 2==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 2<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#4*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#1*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 3
+# if 3==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 3
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 3==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#3*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 3==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 3<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#5*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#2*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 4
+# if 4==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 4
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 4==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#4*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 4==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 4<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#6*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#3*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 5
+# if 5==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 5
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 5==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#5*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 5==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 5<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#7*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#4*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 6
+# if 6==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 6
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 6==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#6*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 6==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 6<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#8*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#5*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 7
+# if 7==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 7
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 7==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#7*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 7==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 7<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#9*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#6*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 8
+# if 8==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 8
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 8==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#8*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 8==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 8<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#10*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#7*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 9
+# if 9==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 9
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 9==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#9*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 9==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 9<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#11*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#8*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 10
+# if 10==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 10
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 10==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#10*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 10==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 10<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#12*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#9*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 11
+# if 11==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 11
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 11==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#11*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 11==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 11<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#13*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#10*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 12
+# if 12==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 12
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 12==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#12*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 12==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 12<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#14*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#11*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 13
+# if 13==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 13
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 13==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#13*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 13==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 13<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#15*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#12*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 14
+# if 14==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 14
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 14==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#14*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 14==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 14<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#0*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#13*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 15
+# if 15==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 15
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 15==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#15*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 15==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 15<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#1*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#14*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+.Lrounds_16_xx:
+	@ ldr	r2,[sp,#1*4]		@ 16
+	@ ldr	r1,[sp,#14*4]
+	mov	r0,r2,ror#7
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#0*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#9*4]
+
+	add	r12,r12,r0
+	eor	r0,r8,r8,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#0*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 16==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 16<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#2*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#15*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#2*4]		@ 17
+	@ ldr	r1,[sp,#15*4]
+	mov	r0,r2,ror#7
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#1*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#10*4]
+
+	add	r3,r3,r0
+	eor	r0,r7,r7,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#1*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 17==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 17<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#3*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#0*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#3*4]		@ 18
+	@ ldr	r1,[sp,#0*4]
+	mov	r0,r2,ror#7
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#2*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#11*4]
+
+	add	r12,r12,r0
+	eor	r0,r6,r6,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#2*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 18==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 18<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#4*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#1*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#4*4]		@ 19
+	@ ldr	r1,[sp,#1*4]
+	mov	r0,r2,ror#7
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#3*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#12*4]
+
+	add	r3,r3,r0
+	eor	r0,r5,r5,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#3*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 19==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 19<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#5*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#2*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#5*4]		@ 20
+	@ ldr	r1,[sp,#2*4]
+	mov	r0,r2,ror#7
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#4*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#13*4]
+
+	add	r12,r12,r0
+	eor	r0,r4,r4,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#4*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 20==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 20<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#6*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#3*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#6*4]		@ 21
+	@ ldr	r1,[sp,#3*4]
+	mov	r0,r2,ror#7
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#5*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#14*4]
+
+	add	r3,r3,r0
+	eor	r0,r11,r11,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#5*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 21==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 21<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#7*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#4*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#7*4]		@ 22
+	@ ldr	r1,[sp,#4*4]
+	mov	r0,r2,ror#7
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#6*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#15*4]
+
+	add	r12,r12,r0
+	eor	r0,r10,r10,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#6*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 22==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 22<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#8*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#5*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#8*4]		@ 23
+	@ ldr	r1,[sp,#5*4]
+	mov	r0,r2,ror#7
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#7*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#0*4]
+
+	add	r3,r3,r0
+	eor	r0,r9,r9,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#7*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 23==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 23<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#9*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#6*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#9*4]		@ 24
+	@ ldr	r1,[sp,#6*4]
+	mov	r0,r2,ror#7
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#8*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#1*4]
+
+	add	r12,r12,r0
+	eor	r0,r8,r8,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#8*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 24==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 24<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#10*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#7*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#10*4]		@ 25
+	@ ldr	r1,[sp,#7*4]
+	mov	r0,r2,ror#7
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#9*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#2*4]
+
+	add	r3,r3,r0
+	eor	r0,r7,r7,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#9*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 25==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 25<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#11*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#8*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#11*4]		@ 26
+	@ ldr	r1,[sp,#8*4]
+	mov	r0,r2,ror#7
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#10*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#3*4]
+
+	add	r12,r12,r0
+	eor	r0,r6,r6,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#10*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 26==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 26<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#12*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#9*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#12*4]		@ 27
+	@ ldr	r1,[sp,#9*4]
+	mov	r0,r2,ror#7
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#11*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#4*4]
+
+	add	r3,r3,r0
+	eor	r0,r5,r5,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#11*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 27==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 27<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#13*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#10*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#13*4]		@ 28
+	@ ldr	r1,[sp,#10*4]
+	mov	r0,r2,ror#7
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#12*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#5*4]
+
+	add	r12,r12,r0
+	eor	r0,r4,r4,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#12*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 28==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 28<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#14*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#11*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#14*4]		@ 29
+	@ ldr	r1,[sp,#11*4]
+	mov	r0,r2,ror#7
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#13*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#6*4]
+
+	add	r3,r3,r0
+	eor	r0,r11,r11,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#13*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 29==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 29<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#15*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#12*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#15*4]		@ 30
+	@ ldr	r1,[sp,#12*4]
+	mov	r0,r2,ror#7
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#14*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#7*4]
+
+	add	r12,r12,r0
+	eor	r0,r10,r10,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#14*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 30==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 30<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#0*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#13*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#0*4]		@ 31
+	@ ldr	r1,[sp,#13*4]
+	mov	r0,r2,ror#7
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#15*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#8*4]
+
+	add	r3,r3,r0
+	eor	r0,r9,r9,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#15*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 31==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 31<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#1*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#14*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	ite	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	ldreq	r3,[sp,#16*4]		@ pull ctx
+	bne	.Lrounds_16_xx
+
+	add	r4,r4,r12		@ h+=Maj(a,b,c) from the past
+	ldr	r0,[r3,#0]
+	ldr	r2,[r3,#4]
+	ldr	r12,[r3,#8]
+	add	r4,r4,r0
+	ldr	r0,[r3,#12]
+	add	r5,r5,r2
+	ldr	r2,[r3,#16]
+	add	r6,r6,r12
+	ldr	r12,[r3,#20]
+	add	r7,r7,r0
+	ldr	r0,[r3,#24]
+	add	r8,r8,r2
+	ldr	r2,[r3,#28]
+	add	r9,r9,r12
+	ldr	r1,[sp,#17*4]		@ pull inp
+	ldr	r12,[sp,#18*4]		@ pull inp+len
+	add	r10,r10,r0
+	add	r11,r11,r2
+	stmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}
+	cmp	r1,r12
+	sub	r14,r14,#256	@ rewind Ktbl
+	bne	.Loop
+
+	add	sp,sp,#19*4	@ destroy frame
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r11,pc}
+#else
+	ldmia	sp!,{r4-r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+.size	sha256_block_data_order,.-sha256_block_data_order
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.global	sha256_block_data_order_neon
+.type	sha256_block_data_order_neon,%function
+.align	4
+sha256_block_data_order_neon:
+.LNEON:
+	stmdb	sp!,{r4-r12,lr}
+
+	sub	r11,sp,#16*4+16
+	adrl	r14,K256
+	bic	r11,r11,#15		@ align for 128-bit stores
+	mov	r12,sp
+	mov	sp,r11			@ alloca
+	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
+
+	vld1.8		{q0},[r1]!
+	vld1.8		{q1},[r1]!
+	vld1.8		{q2},[r1]!
+	vld1.8		{q3},[r1]!
+	vld1.32		{q8},[r14,:128]!
+	vld1.32		{q9},[r14,:128]!
+	vld1.32		{q10},[r14,:128]!
+	vld1.32		{q11},[r14,:128]!
+	vrev32.8	q0,q0		@ yes, even on
+	str		r0,[sp,#64]
+	vrev32.8	q1,q1		@ big-endian
+	str		r1,[sp,#68]
+	mov		r1,sp
+	vrev32.8	q2,q2
+	str		r2,[sp,#72]
+	vrev32.8	q3,q3
+	str		r12,[sp,#76]		@ save original sp
+	vadd.i32	q8,q8,q0
+	vadd.i32	q9,q9,q1
+	vst1.32		{q8},[r1,:128]!
+	vadd.i32	q10,q10,q2
+	vst1.32		{q9},[r1,:128]!
+	vadd.i32	q11,q11,q3
+	vst1.32		{q10},[r1,:128]!
+	vst1.32		{q11},[r1,:128]!
+
+	ldmia		r0,{r4-r11}
+	sub		r1,r1,#64
+	ldr		r2,[sp,#0]
+	eor		r12,r12,r12
+	eor		r3,r5,r6
+	b		.L_00_48
+
+.align	4
+.L_00_48:
+	vext.8	q8,q0,q1,#4
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	vext.8	q9,q2,q3,#4
+	add	r4,r4,r12
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vadd.i32	q0,q0,q9
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#4]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	veor	q9,q9,q10
+	add	r10,r10,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	vshr.u32	d24,d7,#17
+	add	r11,r11,r3
+	and	r2,r2,r7
+	veor	q9,q9,q11
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	vsli.32	d24,d7,#15
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	vshr.u32	d25,d7,#10
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	vadd.i32	q0,q0,q9
+	add	r10,r10,r2
+	ldr	r2,[sp,#8]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r6,r6,r10
+	vshr.u32	d24,d7,#19
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	vsli.32	d24,d7,#13
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	veor	d25,d25,d24
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	vadd.i32	d0,d0,d25
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	vshr.u32	d24,d0,#17
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	vsli.32	d24,d0,#15
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	vshr.u32	d25,d0,#10
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#12]
+	and	r3,r3,r12
+	vshr.u32	d24,d0,#19
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	vld1.32	{q8},[r14,:128]!
+	add	r8,r8,r2
+	vsli.32	d24,d0,#13
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	veor	d25,d25,d24
+	add	r9,r9,r3
+	and	r2,r2,r5
+	vadd.i32	d1,d1,d25
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	vadd.i32	q8,q8,q0
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#16]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	vst1.32	{q8},[r1,:128]!
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vext.8	q8,q1,q2,#4
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	vext.8	q9,q3,q0,#4
+	add	r8,r8,r12
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vadd.i32	q1,q1,q9
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#20]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	veor	q9,q9,q10
+	add	r6,r6,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	vshr.u32	d24,d1,#17
+	add	r7,r7,r3
+	and	r2,r2,r11
+	veor	q9,q9,q11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	vsli.32	d24,d1,#15
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	vshr.u32	d25,d1,#10
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	vadd.i32	q1,q1,q9
+	add	r6,r6,r2
+	ldr	r2,[sp,#24]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r10,r10,r6
+	vshr.u32	d24,d1,#19
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	vsli.32	d24,d1,#13
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	veor	d25,d25,d24
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	vadd.i32	d2,d2,d25
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	vshr.u32	d24,d2,#17
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	vsli.32	d24,d2,#15
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	vshr.u32	d25,d2,#10
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#28]
+	and	r3,r3,r12
+	vshr.u32	d24,d2,#19
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	vld1.32	{q8},[r14,:128]!
+	add	r4,r4,r2
+	vsli.32	d24,d2,#13
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	veor	d25,d25,d24
+	add	r5,r5,r3
+	and	r2,r2,r9
+	vadd.i32	d3,d3,d25
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	vadd.i32	q8,q8,q1
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[sp,#32]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	vst1.32	{q8},[r1,:128]!
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	vext.8	q8,q2,q3,#4
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	vext.8	q9,q0,q1,#4
+	add	r4,r4,r12
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vadd.i32	q2,q2,q9
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#36]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	veor	q9,q9,q10
+	add	r10,r10,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	vshr.u32	d24,d3,#17
+	add	r11,r11,r3
+	and	r2,r2,r7
+	veor	q9,q9,q11
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	vsli.32	d24,d3,#15
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	vshr.u32	d25,d3,#10
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	vadd.i32	q2,q2,q9
+	add	r10,r10,r2
+	ldr	r2,[sp,#40]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r6,r6,r10
+	vshr.u32	d24,d3,#19
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	vsli.32	d24,d3,#13
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	veor	d25,d25,d24
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	vadd.i32	d4,d4,d25
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	vshr.u32	d24,d4,#17
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	vsli.32	d24,d4,#15
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	vshr.u32	d25,d4,#10
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#44]
+	and	r3,r3,r12
+	vshr.u32	d24,d4,#19
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	vld1.32	{q8},[r14,:128]!
+	add	r8,r8,r2
+	vsli.32	d24,d4,#13
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	veor	d25,d25,d24
+	add	r9,r9,r3
+	and	r2,r2,r5
+	vadd.i32	d5,d5,d25
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	vadd.i32	q8,q8,q2
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#48]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	vst1.32	{q8},[r1,:128]!
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vext.8	q8,q3,q0,#4
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	vext.8	q9,q1,q2,#4
+	add	r8,r8,r12
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vadd.i32	q3,q3,q9
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#52]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	veor	q9,q9,q10
+	add	r6,r6,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	vshr.u32	d24,d5,#17
+	add	r7,r7,r3
+	and	r2,r2,r11
+	veor	q9,q9,q11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	vsli.32	d24,d5,#15
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	vshr.u32	d25,d5,#10
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	vadd.i32	q3,q3,q9
+	add	r6,r6,r2
+	ldr	r2,[sp,#56]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r10,r10,r6
+	vshr.u32	d24,d5,#19
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	vsli.32	d24,d5,#13
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	veor	d25,d25,d24
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	vadd.i32	d6,d6,d25
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	vshr.u32	d24,d6,#17
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	vsli.32	d24,d6,#15
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	vshr.u32	d25,d6,#10
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#60]
+	and	r3,r3,r12
+	vshr.u32	d24,d6,#19
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	vld1.32	{q8},[r14,:128]!
+	add	r4,r4,r2
+	vsli.32	d24,d6,#13
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	veor	d25,d25,d24
+	add	r5,r5,r3
+	and	r2,r2,r9
+	vadd.i32	d7,d7,d25
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	vadd.i32	q8,q8,q3
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[r14]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	vst1.32	{q8},[r1,:128]!
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	teq	r2,#0				@ check for K256 terminator
+	ldr	r2,[sp,#0]
+	sub	r1,r1,#64
+	bne	.L_00_48
+
+	ldr		r1,[sp,#68]
+	ldr		r0,[sp,#72]
+	sub		r14,r14,#256	@ rewind r14
+	teq		r1,r0
+	it		eq
+	subeq		r1,r1,#64		@ avoid SEGV
+	vld1.8		{q0},[r1]!		@ load next input block
+	vld1.8		{q1},[r1]!
+	vld1.8		{q2},[r1]!
+	vld1.8		{q3},[r1]!
+	it		ne
+	strne		r1,[sp,#68]
+	mov		r1,sp
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vrev32.8	q0,q0
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vadd.i32	q8,q8,q0
+	ldr	r2,[sp,#4]
+	and	r3,r3,r12
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	add	r10,r10,r2
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3
+	and	r2,r2,r7
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	add	r10,r10,r2
+	ldr	r2,[sp,#8]
+	and	r12,r12,r3
+	add	r6,r6,r10
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	ldr	r2,[sp,#12]
+	and	r3,r3,r12
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	add	r8,r8,r2
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3
+	and	r2,r2,r5
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#16]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vst1.32	{q8},[r1,:128]!
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vrev32.8	q1,q1
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vadd.i32	q8,q8,q1
+	ldr	r2,[sp,#20]
+	and	r3,r3,r12
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	add	r6,r6,r2
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3
+	and	r2,r2,r11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	add	r6,r6,r2
+	ldr	r2,[sp,#24]
+	and	r12,r12,r3
+	add	r10,r10,r6
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	ldr	r2,[sp,#28]
+	and	r3,r3,r12
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	add	r4,r4,r2
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3
+	and	r2,r2,r9
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[sp,#32]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	vst1.32	{q8},[r1,:128]!
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vrev32.8	q2,q2
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vadd.i32	q8,q8,q2
+	ldr	r2,[sp,#36]
+	and	r3,r3,r12
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	add	r10,r10,r2
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3
+	and	r2,r2,r7
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	add	r10,r10,r2
+	ldr	r2,[sp,#40]
+	and	r12,r12,r3
+	add	r6,r6,r10
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	ldr	r2,[sp,#44]
+	and	r3,r3,r12
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	add	r8,r8,r2
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3
+	and	r2,r2,r5
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#48]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vst1.32	{q8},[r1,:128]!
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vrev32.8	q3,q3
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vadd.i32	q8,q8,q3
+	ldr	r2,[sp,#52]
+	and	r3,r3,r12
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	add	r6,r6,r2
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3
+	and	r2,r2,r11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	add	r6,r6,r2
+	ldr	r2,[sp,#56]
+	and	r12,r12,r3
+	add	r10,r10,r6
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	ldr	r2,[sp,#60]
+	and	r3,r3,r12
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	add	r4,r4,r2
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3
+	and	r2,r2,r9
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[sp,#64]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	vst1.32	{q8},[r1,:128]!
+	ldr	r0,[r2,#0]
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	ldr	r12,[r2,#4]
+	ldr	r3,[r2,#8]
+	ldr	r1,[r2,#12]
+	add	r4,r4,r0			@ accumulate
+	ldr	r0,[r2,#16]
+	add	r5,r5,r12
+	ldr	r12,[r2,#20]
+	add	r6,r6,r3
+	ldr	r3,[r2,#24]
+	add	r7,r7,r1
+	ldr	r1,[r2,#28]
+	add	r8,r8,r0
+	str	r4,[r2],#4
+	add	r9,r9,r12
+	str	r5,[r2],#4
+	add	r10,r10,r3
+	str	r6,[r2],#4
+	add	r11,r11,r1
+	str	r7,[r2],#4
+	stmia	r2,{r8-r11}
+
+	ittte	ne
+	movne	r1,sp
+	ldrne	r2,[sp,#0]
+	eorne	r12,r12,r12
+	ldreq	sp,[sp,#76]			@ restore original sp
+	itt	ne
+	eorne	r3,r5,r6
+	bne	.L_00_48
+
+	ldmia	sp!,{r4-r12,pc}
+.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
+# else
+#  define INST(a,b,c,d)	.byte	a,b,c,d
+# endif
+
+.type	sha256_block_data_order_armv8,%function
+.align	5
+sha256_block_data_order_armv8:
+.LARMv8:
+	vld1.32	{q0,q1},[r0]
+# ifdef __thumb2__
+	adr	r3,.LARMv8
+	sub	r3,r3,#.LARMv8-K256
+# else
+	adrl	r3,K256
+# endif
+	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
+
+.Loop_v8:
+	vld1.8		{q8-q9},[r1]!
+	vld1.8		{q10-q11},[r1]!
+	vld1.32		{q12},[r3]!
+	vrev32.8	q8,q8
+	vrev32.8	q9,q9
+	vrev32.8	q10,q10
+	vrev32.8	q11,q11
+	vmov		q14,q0	@ offload
+	vmov		q15,q1
+	teq		r1,r2
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q10
+	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q11
+	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q10
+	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q11
+	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q10
+	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q11
+	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+
+	vld1.32		{q13},[r3]
+	vadd.i32	q12,q12,q10
+	sub		r3,r3,#256-16	@ rewind
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+
+	vadd.i32	q13,q13,q11
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+
+	vadd.i32	q0,q0,q14
+	vadd.i32	q1,q1,q15
+	it		ne
+	bne		.Loop_v8
+
+	vst1.32		{q0,q1},[r0]
+
+	bx	lr		@ bx lr
+.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro@openssl.org>"
+.align	2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm   OPENSSL_armcap_P,4,4
+#endif
diff --git a/arch/arm/crypto/sha256_glue.c b/arch/arm/crypto/sha256_glue.c
new file mode 100644
index 0000000..25ceba6
--- /dev/null
+++ b/arch/arm/crypto/sha256_glue.c
@@ -0,0 +1,246 @@
+/*
+ * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
+ * using optimized ARM assembler and NEON instructions.
+ *
+ * Copyright © 2015 Google Inc.
+ *
+ * This file is based on sha256_ssse3_glue.c:
+ *   Copyright (C) 2013 Intel Corporation
+ *   Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/simd.h>
+#include <asm/neon.h>
+#include "sha256_glue.h"
+
+asmlinkage void sha256_block_data_order(u32 *digest, const void *data,
+				      unsigned int num_blks);
+
+
+int sha256_init(struct shash_desc *desc)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	sctx->state[0] = SHA256_H0;
+	sctx->state[1] = SHA256_H1;
+	sctx->state[2] = SHA256_H2;
+	sctx->state[3] = SHA256_H3;
+	sctx->state[4] = SHA256_H4;
+	sctx->state[5] = SHA256_H5;
+	sctx->state[6] = SHA256_H6;
+	sctx->state[7] = SHA256_H7;
+	sctx->count = 0;
+
+	return 0;
+}
+
+int sha224_init(struct shash_desc *desc)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	sctx->state[0] = SHA224_H0;
+	sctx->state[1] = SHA224_H1;
+	sctx->state[2] = SHA224_H2;
+	sctx->state[3] = SHA224_H3;
+	sctx->state[4] = SHA224_H4;
+	sctx->state[5] = SHA224_H5;
+	sctx->state[6] = SHA224_H6;
+	sctx->state[7] = SHA224_H7;
+	sctx->count = 0;
+
+	return 0;
+}
+
+int __sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len,
+		    unsigned int partial)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int done = 0;
+
+	sctx->count += len;
+
+	if (partial) {
+		done = SHA256_BLOCK_SIZE - partial;
+		memcpy(sctx->buf + partial, data, done);
+		sha256_block_data_order(sctx->state, sctx->buf, 1);
+	}
+
+	if (len - done >= SHA256_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+		sha256_block_data_order(sctx->state, data + done, rounds);
+		done += rounds * SHA256_BLOCK_SIZE;
+	}
+
+	memcpy(sctx->buf, data + done, len - done);
+
+	return 0;
+}
+
+int sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+
+	/* Handle the fast case right here */
+	if (partial + len < SHA256_BLOCK_SIZE) {
+		sctx->count += len;
+		memcpy(sctx->buf + partial, data, len);
+
+		return 0;
+	}
+
+	return __sha256_update(desc, data, len, partial);
+}
+
+/* Add padding and return the message digest. */
+static int sha256_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int i, index, padlen;
+	__be32 *dst = (__be32 *)out;
+	__be64 bits;
+	static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+	/* save number of bits */
+	bits = cpu_to_be64(sctx->count << 3);
+
+	/* Pad out to 56 mod 64 and append length */
+	index = sctx->count % SHA256_BLOCK_SIZE;
+	padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
+
+	/* We need to fill a whole block for __sha256_update */
+	if (padlen <= 56) {
+		sctx->count += padlen;
+		memcpy(sctx->buf + index, padding, padlen);
+	} else {
+		__sha256_update(desc, padding, padlen, index);
+	}
+	__sha256_update(desc, (const u8 *)&bits, sizeof(bits), 56);
+
+	/* Store state in digest */
+	for (i = 0; i < 8; i++)
+		dst[i] = cpu_to_be32(sctx->state[i]);
+
+	/* Wipe context */
+	memset(sctx, 0, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha224_final(struct shash_desc *desc, u8 *out)
+{
+	u8 D[SHA256_DIGEST_SIZE];
+
+	sha256_final(desc, D);
+
+	memcpy(out, D, SHA224_DIGEST_SIZE);
+	memzero_explicit(D, SHA256_DIGEST_SIZE);
+
+	return 0;
+}
+
+int sha256_export(struct shash_desc *desc, void *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(out, sctx, sizeof(*sctx));
+
+	return 0;
+}
+
+int sha256_import(struct shash_desc *desc, const void *in)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(sctx, in, sizeof(*sctx));
+
+	return 0;
+}
+
+static struct shash_alg algs[] = { {
+	.digestsize	=	SHA256_DIGEST_SIZE,
+	.init		=	sha256_init,
+	.update		=	sha256_update,
+	.final		=	sha256_final,
+	.export		=	sha256_export,
+	.import		=	sha256_import,
+	.descsize	=	sizeof(struct sha256_state),
+	.statesize	=	sizeof(struct sha256_state),
+	.base		=	{
+		.cra_name	=	"sha256",
+		.cra_driver_name =	"sha256-asm",
+		.cra_priority	=	150,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA256_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+}, {
+	.digestsize	=	SHA224_DIGEST_SIZE,
+	.init		=	sha224_init,
+	.update		=	sha256_update,
+	.final		=	sha224_final,
+	.export		=	sha256_export,
+	.import		=	sha256_import,
+	.descsize	=	sizeof(struct sha256_state),
+	.statesize	=	sizeof(struct sha256_state),
+	.base		=	{
+		.cra_name	=	"sha224",
+		.cra_driver_name =	"sha224-asm",
+		.cra_priority	=	150,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA224_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+} };
+
+static int __init sha256_mod_init(void)
+{
+	int res = crypto_register_shashes(algs, ARRAY_SIZE(algs));
+
+	if (res < 0)
+		return res;
+
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon()) {
+		res = crypto_register_shashes(sha256_neon_algs,
+					      ARRAY_SIZE(sha256_neon_algs));
+
+		if (res < 0)
+			crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+	}
+
+	return res;
+}
+
+static void __exit sha256_mod_fini(void)
+{
+	crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon())
+		crypto_unregister_shashes(sha256_neon_algs,
+					  ARRAY_SIZE(sha256_neon_algs));
+}
+
+module_init(sha256_mod_init);
+module_exit(sha256_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm (ARM), including NEON");
+
+MODULE_ALIAS_CRYPTO("sha256");
diff --git a/arch/arm/crypto/sha256_glue.h b/arch/arm/crypto/sha256_glue.h
new file mode 100644
index 0000000..0312f4f
--- /dev/null
+++ b/arch/arm/crypto/sha256_glue.h
@@ -0,0 +1,23 @@
+#ifndef _CRYPTO_SHA256_GLUE_H
+#define _CRYPTO_SHA256_GLUE_H
+
+#include <linux/crypto.h>
+#include <crypto/sha.h>
+
+extern struct shash_alg sha256_neon_algs[2];
+
+extern int sha256_init(struct shash_desc *desc);
+
+extern int sha224_init(struct shash_desc *desc);
+
+extern int __sha256_update(struct shash_desc *desc, const u8 *data,
+			   unsigned int len, unsigned int partial);
+
+extern int sha256_update(struct shash_desc *desc, const u8 *data,
+			 unsigned int len);
+
+extern int sha256_export(struct shash_desc *desc, void *out);
+
+extern int sha256_import(struct shash_desc *desc, const void *in);
+
+#endif /* _CRYPTO_SHA256_GLUE_H */
diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c
new file mode 100644
index 0000000..d0f168d
--- /dev/null
+++ b/arch/arm/crypto/sha256_neon_glue.c
@@ -0,0 +1,172 @@
+/*
+ * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
+ * using NEON instructions.
+ *
+ * Copyright © 2015 Google Inc.
+ *
+ * This file is based on sha512_neon_glue.c:
+ *   Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/simd.h>
+#include <asm/neon.h>
+#include "sha256_glue.h"
+
+asmlinkage void sha256_block_data_order_neon(u32 *digest, const void *data,
+				      unsigned int num_blks);
+
+
+static int __sha256_neon_update(struct shash_desc *desc, const u8 *data,
+				unsigned int len, unsigned int partial)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int done = 0;
+
+	sctx->count += len;
+
+	if (partial) {
+		done = SHA256_BLOCK_SIZE - partial;
+		memcpy(sctx->buf + partial, data, done);
+		sha256_block_data_order_neon(sctx->state, sctx->buf, 1);
+	}
+
+	if (len - done >= SHA256_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+		sha256_block_data_order_neon(sctx->state, data + done, rounds);
+		done += rounds * SHA256_BLOCK_SIZE;
+	}
+
+	memcpy(sctx->buf, data + done, len - done);
+
+	return 0;
+}
+
+static int sha256_neon_update(struct shash_desc *desc, const u8 *data,
+			      unsigned int len)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+	int res;
+
+	/* Handle the fast case right here */
+	if (partial + len < SHA256_BLOCK_SIZE) {
+		sctx->count += len;
+		memcpy(sctx->buf + partial, data, len);
+
+		return 0;
+	}
+
+	if (!may_use_simd()) {
+		res = __sha256_update(desc, data, len, partial);
+	} else {
+		kernel_neon_begin();
+		res = __sha256_neon_update(desc, data, len, partial);
+		kernel_neon_end();
+	}
+
+	return res;
+}
+
+/* Add padding and return the message digest. */
+static int sha256_neon_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int i, index, padlen;
+	__be32 *dst = (__be32 *)out;
+	__be64 bits;
+	static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+	/* save number of bits */
+	bits = cpu_to_be64(sctx->count << 3);
+
+	/* Pad out to 56 mod 64 and append length */
+	index = sctx->count % SHA256_BLOCK_SIZE;
+	padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
+
+	if (!may_use_simd()) {
+		sha256_update(desc, padding, padlen);
+		sha256_update(desc, (const u8 *)&bits, sizeof(bits));
+	} else {
+		kernel_neon_begin();
+		/* We need to fill a whole block for __sha256_neon_update() */
+		if (padlen <= 56) {
+			sctx->count += padlen;
+			memcpy(sctx->buf + index, padding, padlen);
+		} else {
+			__sha256_neon_update(desc, padding, padlen, index);
+		}
+		__sha256_neon_update(desc, (const u8 *)&bits,
+					sizeof(bits), 56);
+		kernel_neon_end();
+	}
+
+	/* Store state in digest */
+	for (i = 0; i < 8; i++)
+		dst[i] = cpu_to_be32(sctx->state[i]);
+
+	/* Wipe context */
+	memzero_explicit(sctx, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha224_neon_final(struct shash_desc *desc, u8 *out)
+{
+	u8 D[SHA256_DIGEST_SIZE];
+
+	sha256_neon_final(desc, D);
+
+	memcpy(out, D, SHA224_DIGEST_SIZE);
+	memzero_explicit(D, SHA256_DIGEST_SIZE);
+
+	return 0;
+}
+
+struct shash_alg sha256_neon_algs[] = { {
+	.digestsize	=	SHA256_DIGEST_SIZE,
+	.init		=	sha256_init,
+	.update		=	sha256_neon_update,
+	.final		=	sha256_neon_final,
+	.export		=	sha256_export,
+	.import		=	sha256_import,
+	.descsize	=	sizeof(struct sha256_state),
+	.statesize	=	sizeof(struct sha256_state),
+	.base		=	{
+		.cra_name	=	"sha256",
+		.cra_driver_name =	"sha256-neon",
+		.cra_priority	=	250,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA256_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+}, {
+	.digestsize	=	SHA224_DIGEST_SIZE,
+	.init		=	sha224_init,
+	.update		=	sha256_neon_update,
+	.final		=	sha224_neon_final,
+	.export		=	sha256_export,
+	.import		=	sha256_import,
+	.descsize	=	sizeof(struct sha256_state),
+	.statesize	=	sizeof(struct sha256_state),
+	.base		=	{
+		.cra_name	=	"sha224",
+		.cra_driver_name =	"sha224-neon",
+		.cra_priority	=	250,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA224_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+} };

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCHv3] arm: crypto: Add optimized SHA-256/224
@ 2015-03-30  8:37     ` Sami Tolvanen
  0 siblings, 0 replies; 66+ messages in thread
From: Sami Tolvanen @ 2015-03-30  8:37 UTC (permalink / raw)
  To: linux-arm-kernel

Add Andy Polyakov's optimized assembly and NEON implementations for
SHA-256/224.

The sha256-armv4.pl script for generating the assembly code is from
OpenSSL commit 51f8d095562f36cdaa6893597b5c609e943b0565.

Compared to sha256-generic these implementations have the following
tcrypt speed improvements on Motorola Nexus 6 (Snapdragon 805):

  bs    b/u      sha256-neon  sha256-asm
  16    16       x1.32        x1.19
  64    16       x1.27        x1.15
  64    64       x1.36        x1.20
  256   16       x1.22        x1.11
  256   64       x1.36        x1.19
  256   256      x1.59        x1.23
  1024  16       x1.21        x1.10
  1024  256      x1.65        x1.23
  1024  1024     x1.76        x1.25
  2048  16       x1.21        x1.10
  2048  256      x1.66        x1.23
  2048  1024     x1.78        x1.25
  2048  2048     x1.79        x1.25
  4096  16       x1.20        x1.09
  4096  256      x1.66        x1.23
  4096  1024     x1.79        x1.26
  4096  4096     x1.82        x1.26
  8192  16       x1.20        x1.09
  8192  256      x1.67        x1.23
  8192  1024     x1.80        x1.26
  8192  4096     x1.85        x1.28
  8192  8192     x1.85        x1.27

Where bs refers to block size and b/u to bytes per update.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Cc: Andy Polyakov <appro@openssl.org>

---
Changes since v2:
  Updated sha256-armv4.pl and sha256-core.S_shipped

Changes since v1:
  Rebased to Herbert's cryptodev tree
  Include sha256-armv4.pl and use it to generate sha256-core.S
  Add integer-only assembly version as sha256-asm
  Add support for SHA-224 to the glue code
  Change priority for sha256/224-ce to 300

---
 arch/arm/crypto/Kconfig               |    7 
 arch/arm/crypto/Makefile              |    8 
 arch/arm/crypto/sha2-ce-glue.c        |    4 
 arch/arm/crypto/sha256-armv4.pl       |  713 ++++++
 arch/arm/crypto/sha256-core.S_shipped | 2775 ++++++++++++++++++++++++
 arch/arm/crypto/sha256_glue.c         |  246 ++
 arch/arm/crypto/sha256_glue.h         |   23 
 arch/arm/crypto/sha256_neon_glue.c    |  172 +
 8 files changed, 3945 insertions(+), 3 deletions(-)

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index d63f319..458729d 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -46,6 +46,13 @@ config CRYPTO_SHA2_ARM_CE
 	  SHA-256 secure hash standard (DFIPS 180-2) implemented
 	  using special ARMv8 Crypto Extensions.
 
+config CRYPTO_SHA256_ARM
+	tristate "SHA-224/256 digest algorithm (ARM-asm and NEON)"
+	select CRYPTO_HASH
+	help
+	  SHA-256 secure hash standard (DFIPS 180-2) implemented
+	  using optimized ARM assembler and NEON, when available.
+
 config CRYPTO_SHA512_ARM_NEON
 	tristate "SHA384 and SHA512 digest algorithm (ARM NEON)"
 	depends on KERNEL_MODE_NEON
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index 9a273bd..ef46e89 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
 obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
+obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
 obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o
@@ -16,6 +17,8 @@ aes-arm-y	:= aes-armv4.o aes_glue.o
 aes-arm-bs-y	:= aesbs-core.o aesbs-glue.o
 sha1-arm-y	:= sha1-armv4-large.o sha1_glue.o
 sha1-arm-neon-y	:= sha1-armv7-neon.o sha1_neon_glue.o
+sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
+sha256-arm-y	:= sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
 sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o
 sha1-arm-ce-y	:= sha1-ce-core.o sha1-ce-glue.o
 sha2-arm-ce-y	:= sha2-ce-core.o sha2-ce-glue.o
@@ -28,4 +31,7 @@ quiet_cmd_perl = PERL    $@
 $(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
 	$(call cmd,perl)
 
-.PRECIOUS: $(obj)/aesbs-core.S
+$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
+	$(call cmd,perl)
+
+.PRECIOUS: $(obj)/aesbs-core.S $(obj)/sha256-core.S
diff --git a/arch/arm/crypto/sha2-ce-glue.c b/arch/arm/crypto/sha2-ce-glue.c
index 9ffe8ad..0449eca 100644
--- a/arch/arm/crypto/sha2-ce-glue.c
+++ b/arch/arm/crypto/sha2-ce-glue.c
@@ -163,7 +163,7 @@ static struct shash_alg algs[] = { {
 	.base			= {
 		.cra_name		= "sha224",
 		.cra_driver_name	= "sha224-ce",
-		.cra_priority		= 200,
+		.cra_priority		= 300,
 		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
 		.cra_blocksize		= SHA256_BLOCK_SIZE,
 		.cra_module		= THIS_MODULE,
@@ -180,7 +180,7 @@ static struct shash_alg algs[] = { {
 	.base			= {
 		.cra_name		= "sha256",
 		.cra_driver_name	= "sha256-ce",
-		.cra_priority		= 200,
+		.cra_priority		= 300,
 		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
 		.cra_blocksize		= SHA256_BLOCK_SIZE,
 		.cra_module		= THIS_MODULE,
diff --git a/arch/arm/crypto/sha256-armv4.pl b/arch/arm/crypto/sha256-armv4.pl
new file mode 100644
index 0000000..fac0533
--- /dev/null
+++ b/arch/arm/crypto/sha256-armv4.pl
@@ -0,0 +1,716 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Permission to use under GPL terms is granted.
+# ====================================================================
+
+# SHA256 block procedure for ARMv4. May 2007.
+
+# Performance is ~2x better than gcc 3.4 generated code and in "abso-
+# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+# byte [on single-issue Xscale PXA250 core].
+
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 22% improvement on
+# Cortex A8 core and ~20 cycles per processed byte.
+
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+# September 2013.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process one
+# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+# code (meaning that latter performs sub-optimally, nothing was done
+# about it).
+
+# May 2014.
+#
+# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$ctx="r0";	$t0="r0";
+$inp="r1";	$t4="r1";
+$len="r2";	$t1="r2";
+$T1="r3";	$t3="r3";
+$A="r4";
+$B="r5";
+$C="r6";
+$D="r7";
+$E="r8";
+$F="r9";
+$G="r10";
+$H="r11";
+ at V=($A,$B,$C,$D,$E,$F,$G,$H);
+$t2="r12";
+$Ktbl="r14";
+
+ at Sigma0=( 2,13,22);
+ at Sigma1=( 6,11,25);
+ at sigma0=( 7,18, 3);
+ at sigma1=(17,19,10);
+
+sub BODY_00_15 {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___ if ($i<16);
+#if __ARM_ARCH__>=7
+	@ ldr	$t1,[$inp],#4			@ $i
+# if $i==15
+	str	$inp,[sp,#17*4]			@ make room for $t4
+# endif
+	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
+	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	$t1,$t1
+# endif
+#else
+	@ ldrb	$t1,[$inp,#3]			@ $i
+	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
+	ldrb	$t2,[$inp,#2]
+	ldrb	$t0,[$inp,#1]
+	orr	$t1,$t1,$t2,lsl#8
+	ldrb	$t2,[$inp],#4
+	orr	$t1,$t1,$t0,lsl#16
+# if $i==15
+	str	$inp,[sp,#17*4]			@ make room for $t4
+# endif
+	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+	orr	$t1,$t1,$t2,lsl#24
+	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
+#endif
+___
+$code.=<<___;
+	ldr	$t2,[$Ktbl],#4			@ *K256++
+	add	$h,$h,$t1			@ h+=X[i]
+	str	$t1,[sp,#`$i%16`*4]
+	eor	$t1,$f,$g
+	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
+	and	$t1,$t1,$e
+	add	$h,$h,$t2			@ h+=K256[i]
+	eor	$t1,$t1,$g			@ Ch(e,f,g)
+	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
+	add	$h,$h,$t1			@ h+=Ch(e,f,g)
+#if $i==31
+	and	$t2,$t2,#0xff
+	cmp	$t2,#0xf2			@ done?
+#endif
+#if $i<15
+# if __ARM_ARCH__>=7
+	ldr	$t1,[$inp],#4			@ prefetch
+# else
+	ldrb	$t1,[$inp,#3]
+# endif
+	eor	$t2,$a,$b			@ a^b, b^c in next round
+#else
+	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
+	eor	$t2,$a,$b			@ a^b, b^c in next round
+	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
+#endif
+	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
+	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
+	add	$d,$d,$h			@ d+=h
+	eor	$t3,$t3,$b			@ Maj(a,b,c)
+	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
+	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
+___
+	($t2,$t3)=($t3,$t2);
+}
+
+sub BODY_16_XX {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___;
+	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
+	@ ldr	$t4,[sp,#`($i+14)%16`*4]
+	mov	$t0,$t1,ror#$sigma0[0]
+	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
+	mov	$t2,$t4,ror#$sigma1[0]
+	eor	$t0,$t0,$t1,ror#$sigma0[1]
+	eor	$t2,$t2,$t4,ror#$sigma1[1]
+	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
+	ldr	$t1,[sp,#`($i+0)%16`*4]
+	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
+	ldr	$t4,[sp,#`($i+9)%16`*4]
+
+	add	$t2,$t2,$t0
+	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
+	add	$t1,$t1,$t2
+	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
+	add	$t1,$t1,$t4			@ X[i]
+___
+	&BODY_00_15(@_);
+}
+
+$code=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code	32
+#else
+.syntax unified
+# ifdef __thumb2__
+#  define adrl adr
+.thumb
+# else
+.code   32
+# endif
+#endif
+
+.type	K256,%object
+.align	5
+K256:
+.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size	K256,.-K256
+.word	0				@ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-sha256_block_data_order
+#endif
+.align	5
+
+.global	sha256_block_data_order
+.type	sha256_block_data_order,%function
+sha256_block_data_order:
+#if __ARM_ARCH__<7
+	sub	r3,pc,#8		@ sha256_block_data_order
+#else
+	adr	r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+	ldr	r12,.LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+	tst	r12,#ARMV8_SHA256
+	bne	.LARMv8
+	tst	r12,#ARMV7_NEON
+	bne	.LNEON
+#endif
+	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
+	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
+	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
+	sub	$Ktbl,r3,#256+32	@ K256
+	sub	sp,sp,#16*4		@ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+	ldr	$t1,[$inp],#4
+# else
+	ldrb	$t1,[$inp,#3]
+# endif
+	eor	$t3,$B,$C		@ magic
+	eor	$t2,$t2,$t2
+___
+for($i=0;$i<16;$i++)	{ &BODY_00_15($i, at V); unshift(@V,pop(@V)); }
+$code.=".Lrounds_16_xx:\n";
+for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+#if __ARM_ARCH__>=7
+	ite	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	ldreq	$t3,[sp,#16*4]		@ pull ctx
+	bne	.Lrounds_16_xx
+
+	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
+	ldr	$t0,[$t3,#0]
+	ldr	$t1,[$t3,#4]
+	ldr	$t2,[$t3,#8]
+	add	$A,$A,$t0
+	ldr	$t0,[$t3,#12]
+	add	$B,$B,$t1
+	ldr	$t1,[$t3,#16]
+	add	$C,$C,$t2
+	ldr	$t2,[$t3,#20]
+	add	$D,$D,$t0
+	ldr	$t0,[$t3,#24]
+	add	$E,$E,$t1
+	ldr	$t1,[$t3,#28]
+	add	$F,$F,$t2
+	ldr	$inp,[sp,#17*4]		@ pull inp
+	ldr	$t2,[sp,#18*4]		@ pull inp+len
+	add	$G,$G,$t0
+	add	$H,$H,$t1
+	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
+	cmp	$inp,$t2
+	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
+	bne	.Loop
+
+	add	sp,sp,#`16+3`*4	@ destroy frame
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r11,pc}
+#else
+	ldmia	sp!,{r4-r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	sha256_block_data_order,.-sha256_block_data_order
+___
+######################################################################
+# NEON stuff
+#
+{{{
+my @X=map("q$_",(0..3));
+my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
+my $Xfer=$t4;
+my $j=0;
+
+sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
+sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
+
+sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+  my $arg = pop;
+    $arg = "#$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',', at _,$arg)."\n";
+}
+
+sub Xupdate()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+	&vext_8		($T0, at X[0], at X[1],4);	# X[1..4]
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vext_8		($T1, at X[2], at X[3],4);	# X[9..12]
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vshr_u32	($T2,$T0,$sigma0[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	(@X[0], at X[0],$T1);	# X[0..3] += X[9..12]
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vshr_u32	($T1,$T0,$sigma0[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vsli_32	($T2,$T0,32-$sigma0[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vshr_u32	($T3,$T0,$sigma0[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&veor		($T1,$T1,$T2);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vsli_32	($T3,$T0,32-$sigma0[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	(@X[0], at X[0],$T1);	# X[0..3] += sigma0(X[1..4])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &veor		($T5,$T5,$T4);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &veor		($T5,$T5,$T4);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vld1_32	("{$T0}","[$Ktbl,:128]!");
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	($T0,$T0, at X[0]);
+	 while($#insns>=2) { eval(shift(@insns)); }
+	&vst1_32	("{$T0}","[$Xfer,:128]!");
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	push(@X,shift(@X));		# "rotate" X[]
+}
+
+sub Xpreload()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vld1_32	("{$T0}","[$Ktbl,:128]!");
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vrev32_8	(@X[0], at X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vadd_i32	($T0,$T0, at X[0]);
+	 foreach (@insns) { eval; }	# remaining instructions
+	&vst1_32	("{$T0}","[$Xfer,:128]!");
+
+	push(@X,shift(@X));		# "rotate" X[]
+}
+
+sub body_00_15 () {
+	(
+	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
+	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
+	'&eor	($t1,$f,$g)',
+	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
+	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
+	'&and	($t1,$t1,$e)',
+	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
+	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
+	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
+	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
+	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
+	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
+	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
+	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
+	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
+	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
+	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
+	'&add	($d,$d,$h)',			# d+=h
+	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
+	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
+	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
+	)
+}
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.global	sha256_block_data_order_neon
+.type	sha256_block_data_order_neon,%function
+.align	4
+sha256_block_data_order_neon:
+.LNEON:
+	stmdb	sp!,{r4-r12,lr}
+
+	sub	$H,sp,#16*4+16
+	adrl	$Ktbl,K256
+	bic	$H,$H,#15		@ align for 128-bit stores
+	mov	$t2,sp
+	mov	sp,$H			@ alloca
+	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
+
+	vld1.8		{@X[0]},[$inp]!
+	vld1.8		{@X[1]},[$inp]!
+	vld1.8		{@X[2]},[$inp]!
+	vld1.8		{@X[3]},[$inp]!
+	vld1.32		{$T0},[$Ktbl,:128]!
+	vld1.32		{$T1},[$Ktbl,:128]!
+	vld1.32		{$T2},[$Ktbl,:128]!
+	vld1.32		{$T3},[$Ktbl,:128]!
+	vrev32.8	@X[0], at X[0]		@ yes, even on
+	str		$ctx,[sp,#64]
+	vrev32.8	@X[1], at X[1]		@ big-endian
+	str		$inp,[sp,#68]
+	mov		$Xfer,sp
+	vrev32.8	@X[2], at X[2]
+	str		$len,[sp,#72]
+	vrev32.8	@X[3], at X[3]
+	str		$t2,[sp,#76]		@ save original sp
+	vadd.i32	$T0,$T0, at X[0]
+	vadd.i32	$T1,$T1, at X[1]
+	vst1.32		{$T0},[$Xfer,:128]!
+	vadd.i32	$T2,$T2, at X[2]
+	vst1.32		{$T1},[$Xfer,:128]!
+	vadd.i32	$T3,$T3, at X[3]
+	vst1.32		{$T2},[$Xfer,:128]!
+	vst1.32		{$T3},[$Xfer,:128]!
+
+	ldmia		$ctx,{$A-$H}
+	sub		$Xfer,$Xfer,#64
+	ldr		$t1,[sp,#0]
+	eor		$t2,$t2,$t2
+	eor		$t3,$B,$C
+	b		.L_00_48
+
+.align	4
+.L_00_48:
+___
+	&Xupdate(\&body_00_15);
+	&Xupdate(\&body_00_15);
+	&Xupdate(\&body_00_15);
+	&Xupdate(\&body_00_15);
+$code.=<<___;
+	teq	$t1,#0				@ check for K256 terminator
+	ldr	$t1,[sp,#0]
+	sub	$Xfer,$Xfer,#64
+	bne	.L_00_48
+
+	ldr		$inp,[sp,#68]
+	ldr		$t0,[sp,#72]
+	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
+	teq		$inp,$t0
+	it		eq
+	subeq		$inp,$inp,#64		@ avoid SEGV
+	vld1.8		{@X[0]},[$inp]!		@ load next input block
+	vld1.8		{@X[1]},[$inp]!
+	vld1.8		{@X[2]},[$inp]!
+	vld1.8		{@X[3]},[$inp]!
+	it		ne
+	strne		$inp,[sp,#68]
+	mov		$Xfer,sp
+___
+	&Xpreload(\&body_00_15);
+	&Xpreload(\&body_00_15);
+	&Xpreload(\&body_00_15);
+	&Xpreload(\&body_00_15);
+$code.=<<___;
+	ldr	$t0,[$t1,#0]
+	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
+	ldr	$t2,[$t1,#4]
+	ldr	$t3,[$t1,#8]
+	ldr	$t4,[$t1,#12]
+	add	$A,$A,$t0			@ accumulate
+	ldr	$t0,[$t1,#16]
+	add	$B,$B,$t2
+	ldr	$t2,[$t1,#20]
+	add	$C,$C,$t3
+	ldr	$t3,[$t1,#24]
+	add	$D,$D,$t4
+	ldr	$t4,[$t1,#28]
+	add	$E,$E,$t0
+	str	$A,[$t1],#4
+	add	$F,$F,$t2
+	str	$B,[$t1],#4
+	add	$G,$G,$t3
+	str	$C,[$t1],#4
+	add	$H,$H,$t4
+	str	$D,[$t1],#4
+	stmia	$t1,{$E-$H}
+
+	ittte	ne
+	movne	$Xfer,sp
+	ldrne	$t1,[sp,#0]
+	eorne	$t2,$t2,$t2
+	ldreq	sp,[sp,#76]			@ restore original sp
+	itt	ne
+	eorne	$t3,$B,$C
+	bne	.L_00_48
+
+	ldmia	sp!,{r4-r12,pc}
+.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+___
+}}}
+######################################################################
+# ARMv8 stuff
+#
+{{{
+my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
+my @MSG=map("q$_",(8..11));
+my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
+my $Ktbl="r3";
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
+# else
+#  define INST(a,b,c,d)	.byte	a,b,c,d
+# endif
+
+.type	sha256_block_data_order_armv8,%function
+.align	5
+sha256_block_data_order_armv8:
+.LARMv8:
+	vld1.32	{$ABCD,$EFGH},[$ctx]
+# ifdef __thumb2__
+	adr	$Ktbl,.LARMv8
+	sub	$Ktbl,$Ktbl,#.LARMv8-K256
+# else
+	adrl	$Ktbl,K256
+# endif
+	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
+
+.Loop_v8:
+	vld1.8		{@MSG[0]- at MSG[1]},[$inp]!
+	vld1.8		{@MSG[2]- at MSG[3]},[$inp]!
+	vld1.32		{$W0},[$Ktbl]!
+	vrev32.8	@MSG[0], at MSG[0]
+	vrev32.8	@MSG[1], at MSG[1]
+	vrev32.8	@MSG[2], at MSG[2]
+	vrev32.8	@MSG[3], at MSG[3]
+	vmov		$ABCD_SAVE,$ABCD	@ offload
+	vmov		$EFGH_SAVE,$EFGH
+	teq		$inp,$len
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+	vld1.32		{$W1},[$Ktbl]!
+	vadd.i32	$W0,$W0, at MSG[0]
+	sha256su0	@MSG[0], at MSG[1]
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+	sha256su1	@MSG[0], at MSG[2], at MSG[3]
+___
+	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+	vld1.32		{$W1},[$Ktbl]!
+	vadd.i32	$W0,$W0, at MSG[0]
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+
+	vld1.32		{$W0},[$Ktbl]!
+	vadd.i32	$W1,$W1, at MSG[1]
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W1
+	sha256h2	$EFGH,$abcd,$W1
+
+	vld1.32		{$W1},[$Ktbl]
+	vadd.i32	$W0,$W0, at MSG[2]
+	sub		$Ktbl,$Ktbl,#256-16	@ rewind
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+
+	vadd.i32	$W1,$W1,@MSG[3]
+	vmov		$abcd,$ABCD
+	sha256h		$ABCD,$EFGH,$W1
+	sha256h2	$EFGH,$abcd,$W1
+
+	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
+	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
+	it		ne
+	bne		.Loop_v8
+
+	vst1.32		{$ABCD,$EFGH},[$ctx]
+
+	ret		@ bx lr
+.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+___
+}}}
+$code.=<<___;
+.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align	2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm   OPENSSL_armcap_P,4,4
+#endif
+___
+
+open SELF,$0;
+while(<SELF>) {
+	next if (/^#!/);
+	last if (!s/^#/@/ and !/^$/);
+	print;
+}
+close SELF;
+
+{   my  %opcode = (
+	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
+	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);
+
+    sub unsha256 {
+	my ($mnemonic,$arg)=@_;
+
+	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+					 |(($2&7)<<17)|(($2&8)<<4)
+					 |(($3&7)<<1) |(($3&8)<<2);
+	    # since ARMv7 instructions are always encoded little-endian.
+	    # correct solution is to use .inst directive, but older
+	    # assemblers don't implement it:-(
+	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
+			$word&0xff,($word>>8)&0xff,
+			($word>>16)&0xff,($word>>24)&0xff,
+			$mnemonic,$arg;
+	}
+    }
+}
+
+foreach (split($/,$code)) {
+
+	s/\`([^\`]*)\`/eval $1/geo;
+
+	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
+
+	s/\bret\b/bx	lr/go		or
+	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4
+
+	print $_,"\n";
+}
+
+close STDOUT; # enforce flush
diff --git a/arch/arm/crypto/sha256-core.S_shipped b/arch/arm/crypto/sha256-core.S_shipped
new file mode 100644
index 0000000..555a1a8
--- /dev/null
+++ b/arch/arm/crypto/sha256-core.S_shipped
@@ -0,0 +1,2808 @@
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ SHA256 block procedure for ARMv4. May 2007.
+
+@ Performance is ~2x better than gcc 3.4 generated code and in "abso-
+@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+@ byte [on single-issue Xscale PXA250 core].
+
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
+@ Cortex A8 core and ~20 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 16%
+@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+@ September 2013.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process one
+@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+@ code (meaning that latter performs sub-optimally, nothing was done
+@ about it).
+
+@ May 2014.
+@
+@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code	32
+#else
+.syntax unified
+# ifdef __thumb2__
+#  define adrl adr
+.thumb
+# else
+.code   32
+# endif
+#endif
+
+.type	K256,%object
+.align	5
+K256:
+.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size	K256,.-K256
+.word	0				@ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-sha256_block_data_order
+#endif
+.align	5
+
+.global	sha256_block_data_order
+.type	sha256_block_data_order,%function
+sha256_block_data_order:
+#if __ARM_ARCH__<7
+	sub	r3,pc,#8		@ sha256_block_data_order
+#else
+	adr	r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+	ldr	r12,.LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+	tst	r12,#ARMV8_SHA256
+	bne	.LARMv8
+	tst	r12,#ARMV7_NEON
+	bne	.LNEON
+#endif
+	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
+	stmdb	sp!,{r0,r1,r2,r4-r11,lr}
+	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11}
+	sub	r14,r3,#256+32	@ K256
+	sub	sp,sp,#16*4		@ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6		@ magic
+	eor	r12,r12,r12
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 0
+# if 0==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 0
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 0==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#0*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 0==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 0<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#2*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#15*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 1
+# if 1==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 1
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 1==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#1*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 1==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 1<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#3*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#0*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 2
+# if 2==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 2
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 2==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#2*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 2==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 2<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#4*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#1*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 3
+# if 3==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 3
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 3==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#3*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 3==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 3<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#5*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#2*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 4
+# if 4==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 4
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 4==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#4*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 4==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 4<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#6*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#3*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 5
+# if 5==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 5
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 5==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#5*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 5==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 5<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#7*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#4*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 6
+# if 6==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 6
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 6==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#6*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 6==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 6<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#8*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#5*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 7
+# if 7==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 7
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 7==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#7*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 7==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 7<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#9*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#6*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 8
+# if 8==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 8
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 8==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#8*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 8==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 8<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#10*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#7*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 9
+# if 9==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 9
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 9==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#9*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 9==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 9<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#11*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#8*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 10
+# if 10==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 10
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 10==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#10*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 10==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 10<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#12*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#9*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 11
+# if 11==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 11
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 11==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#11*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 11==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 11<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#13*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#10*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 12
+# if 12==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 12
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 12==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#12*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 12==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 12<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#14*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#11*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 13
+# if 13==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 13
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 13==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#13*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 13==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 13<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#15*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#12*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 14
+# if 14==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 14
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 14==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#14*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 14==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 14<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#0*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#13*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 15
+# if 15==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 15
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 15==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#15*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 15==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 15<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#1*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#14*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+.Lrounds_16_xx:
+	@ ldr	r2,[sp,#1*4]		@ 16
+	@ ldr	r1,[sp,#14*4]
+	mov	r0,r2,ror#7
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#0*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#9*4]
+
+	add	r12,r12,r0
+	eor	r0,r8,r8,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#0*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 16==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 16<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#2*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#15*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#2*4]		@ 17
+	@ ldr	r1,[sp,#15*4]
+	mov	r0,r2,ror#7
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#1*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#10*4]
+
+	add	r3,r3,r0
+	eor	r0,r7,r7,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#1*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 17==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 17<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#3*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#0*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#3*4]		@ 18
+	@ ldr	r1,[sp,#0*4]
+	mov	r0,r2,ror#7
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#2*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#11*4]
+
+	add	r12,r12,r0
+	eor	r0,r6,r6,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#2*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 18==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 18<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#4*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#1*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#4*4]		@ 19
+	@ ldr	r1,[sp,#1*4]
+	mov	r0,r2,ror#7
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#3*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#12*4]
+
+	add	r3,r3,r0
+	eor	r0,r5,r5,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#3*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 19==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 19<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#5*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#2*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#5*4]		@ 20
+	@ ldr	r1,[sp,#2*4]
+	mov	r0,r2,ror#7
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#4*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#13*4]
+
+	add	r12,r12,r0
+	eor	r0,r4,r4,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#4*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 20==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 20<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#6*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#3*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#6*4]		@ 21
+	@ ldr	r1,[sp,#3*4]
+	mov	r0,r2,ror#7
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#5*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#14*4]
+
+	add	r3,r3,r0
+	eor	r0,r11,r11,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#5*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 21==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 21<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#7*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#4*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#7*4]		@ 22
+	@ ldr	r1,[sp,#4*4]
+	mov	r0,r2,ror#7
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#6*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#15*4]
+
+	add	r12,r12,r0
+	eor	r0,r10,r10,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#6*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 22==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 22<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#8*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#5*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#8*4]		@ 23
+	@ ldr	r1,[sp,#5*4]
+	mov	r0,r2,ror#7
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#7*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#0*4]
+
+	add	r3,r3,r0
+	eor	r0,r9,r9,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#7*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 23==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 23<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#9*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#6*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#9*4]		@ 24
+	@ ldr	r1,[sp,#6*4]
+	mov	r0,r2,ror#7
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#8*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#1*4]
+
+	add	r12,r12,r0
+	eor	r0,r8,r8,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#8*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 24==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 24<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#10*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#7*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#10*4]		@ 25
+	@ ldr	r1,[sp,#7*4]
+	mov	r0,r2,ror#7
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#9*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#2*4]
+
+	add	r3,r3,r0
+	eor	r0,r7,r7,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#9*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 25==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 25<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#11*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#8*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#11*4]		@ 26
+	@ ldr	r1,[sp,#8*4]
+	mov	r0,r2,ror#7
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#10*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#3*4]
+
+	add	r12,r12,r0
+	eor	r0,r6,r6,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#10*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 26==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 26<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#12*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#9*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#12*4]		@ 27
+	@ ldr	r1,[sp,#9*4]
+	mov	r0,r2,ror#7
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#11*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#4*4]
+
+	add	r3,r3,r0
+	eor	r0,r5,r5,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#11*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 27==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 27<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#13*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#10*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#13*4]		@ 28
+	@ ldr	r1,[sp,#10*4]
+	mov	r0,r2,ror#7
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#12*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#5*4]
+
+	add	r12,r12,r0
+	eor	r0,r4,r4,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#12*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 28==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 28<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#14*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#11*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#14*4]		@ 29
+	@ ldr	r1,[sp,#11*4]
+	mov	r0,r2,ror#7
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#13*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#6*4]
+
+	add	r3,r3,r0
+	eor	r0,r11,r11,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#13*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 29==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 29<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#15*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#12*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#15*4]		@ 30
+	@ ldr	r1,[sp,#12*4]
+	mov	r0,r2,ror#7
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#14*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#7*4]
+
+	add	r12,r12,r0
+	eor	r0,r10,r10,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#14*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 30==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 30<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#0*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#13*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#0*4]		@ 31
+	@ ldr	r1,[sp,#13*4]
+	mov	r0,r2,ror#7
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#15*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#8*4]
+
+	add	r3,r3,r0
+	eor	r0,r9,r9,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#15*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 31==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 31<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#1*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#14*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	ite	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	ldreq	r3,[sp,#16*4]		@ pull ctx
+	bne	.Lrounds_16_xx
+
+	add	r4,r4,r12		@ h+=Maj(a,b,c) from the past
+	ldr	r0,[r3,#0]
+	ldr	r2,[r3,#4]
+	ldr	r12,[r3,#8]
+	add	r4,r4,r0
+	ldr	r0,[r3,#12]
+	add	r5,r5,r2
+	ldr	r2,[r3,#16]
+	add	r6,r6,r12
+	ldr	r12,[r3,#20]
+	add	r7,r7,r0
+	ldr	r0,[r3,#24]
+	add	r8,r8,r2
+	ldr	r2,[r3,#28]
+	add	r9,r9,r12
+	ldr	r1,[sp,#17*4]		@ pull inp
+	ldr	r12,[sp,#18*4]		@ pull inp+len
+	add	r10,r10,r0
+	add	r11,r11,r2
+	stmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}
+	cmp	r1,r12
+	sub	r14,r14,#256	@ rewind Ktbl
+	bne	.Loop
+
+	add	sp,sp,#19*4	@ destroy frame
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r11,pc}
+#else
+	ldmia	sp!,{r4-r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+.size	sha256_block_data_order,.-sha256_block_data_order
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.global	sha256_block_data_order_neon
+.type	sha256_block_data_order_neon,%function
+.align	4
+sha256_block_data_order_neon:
+.LNEON:
+	stmdb	sp!,{r4-r12,lr}
+
+	sub	r11,sp,#16*4+16
+	adrl	r14,K256
+	bic	r11,r11,#15		@ align for 128-bit stores
+	mov	r12,sp
+	mov	sp,r11			@ alloca
+	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
+
+	vld1.8		{q0},[r1]!
+	vld1.8		{q1},[r1]!
+	vld1.8		{q2},[r1]!
+	vld1.8		{q3},[r1]!
+	vld1.32		{q8},[r14,:128]!
+	vld1.32		{q9},[r14,:128]!
+	vld1.32		{q10},[r14,:128]!
+	vld1.32		{q11},[r14,:128]!
+	vrev32.8	q0,q0		@ yes, even on
+	str		r0,[sp,#64]
+	vrev32.8	q1,q1		@ big-endian
+	str		r1,[sp,#68]
+	mov		r1,sp
+	vrev32.8	q2,q2
+	str		r2,[sp,#72]
+	vrev32.8	q3,q3
+	str		r12,[sp,#76]		@ save original sp
+	vadd.i32	q8,q8,q0
+	vadd.i32	q9,q9,q1
+	vst1.32		{q8},[r1,:128]!
+	vadd.i32	q10,q10,q2
+	vst1.32		{q9},[r1,:128]!
+	vadd.i32	q11,q11,q3
+	vst1.32		{q10},[r1,:128]!
+	vst1.32		{q11},[r1,:128]!
+
+	ldmia		r0,{r4-r11}
+	sub		r1,r1,#64
+	ldr		r2,[sp,#0]
+	eor		r12,r12,r12
+	eor		r3,r5,r6
+	b		.L_00_48
+
+.align	4
+.L_00_48:
+	vext.8	q8,q0,q1,#4
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	vext.8	q9,q2,q3,#4
+	add	r4,r4,r12
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vadd.i32	q0,q0,q9
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#4]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	veor	q9,q9,q10
+	add	r10,r10,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	vshr.u32	d24,d7,#17
+	add	r11,r11,r3
+	and	r2,r2,r7
+	veor	q9,q9,q11
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	vsli.32	d24,d7,#15
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	vshr.u32	d25,d7,#10
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	vadd.i32	q0,q0,q9
+	add	r10,r10,r2
+	ldr	r2,[sp,#8]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r6,r6,r10
+	vshr.u32	d24,d7,#19
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	vsli.32	d24,d7,#13
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	veor	d25,d25,d24
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	vadd.i32	d0,d0,d25
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	vshr.u32	d24,d0,#17
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	vsli.32	d24,d0,#15
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	vshr.u32	d25,d0,#10
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#12]
+	and	r3,r3,r12
+	vshr.u32	d24,d0,#19
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	vld1.32	{q8},[r14,:128]!
+	add	r8,r8,r2
+	vsli.32	d24,d0,#13
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	veor	d25,d25,d24
+	add	r9,r9,r3
+	and	r2,r2,r5
+	vadd.i32	d1,d1,d25
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	vadd.i32	q8,q8,q0
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#16]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	vst1.32	{q8},[r1,:128]!
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vext.8	q8,q1,q2,#4
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	vext.8	q9,q3,q0,#4
+	add	r8,r8,r12
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vadd.i32	q1,q1,q9
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#20]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	veor	q9,q9,q10
+	add	r6,r6,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	vshr.u32	d24,d1,#17
+	add	r7,r7,r3
+	and	r2,r2,r11
+	veor	q9,q9,q11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	vsli.32	d24,d1,#15
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	vshr.u32	d25,d1,#10
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	vadd.i32	q1,q1,q9
+	add	r6,r6,r2
+	ldr	r2,[sp,#24]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r10,r10,r6
+	vshr.u32	d24,d1,#19
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	vsli.32	d24,d1,#13
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	veor	d25,d25,d24
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	vadd.i32	d2,d2,d25
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	vshr.u32	d24,d2,#17
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	vsli.32	d24,d2,#15
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	vshr.u32	d25,d2,#10
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#28]
+	and	r3,r3,r12
+	vshr.u32	d24,d2,#19
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	vld1.32	{q8},[r14,:128]!
+	add	r4,r4,r2
+	vsli.32	d24,d2,#13
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	veor	d25,d25,d24
+	add	r5,r5,r3
+	and	r2,r2,r9
+	vadd.i32	d3,d3,d25
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	vadd.i32	q8,q8,q1
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[sp,#32]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	vst1.32	{q8},[r1,:128]!
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	vext.8	q8,q2,q3,#4
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	vext.8	q9,q0,q1,#4
+	add	r4,r4,r12
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vadd.i32	q2,q2,q9
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#36]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	veor	q9,q9,q10
+	add	r10,r10,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	vshr.u32	d24,d3,#17
+	add	r11,r11,r3
+	and	r2,r2,r7
+	veor	q9,q9,q11
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	vsli.32	d24,d3,#15
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	vshr.u32	d25,d3,#10
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	vadd.i32	q2,q2,q9
+	add	r10,r10,r2
+	ldr	r2,[sp,#40]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r6,r6,r10
+	vshr.u32	d24,d3,#19
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	vsli.32	d24,d3,#13
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	veor	d25,d25,d24
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	vadd.i32	d4,d4,d25
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	vshr.u32	d24,d4,#17
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	vsli.32	d24,d4,#15
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	vshr.u32	d25,d4,#10
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#44]
+	and	r3,r3,r12
+	vshr.u32	d24,d4,#19
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	vld1.32	{q8},[r14,:128]!
+	add	r8,r8,r2
+	vsli.32	d24,d4,#13
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	veor	d25,d25,d24
+	add	r9,r9,r3
+	and	r2,r2,r5
+	vadd.i32	d5,d5,d25
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	vadd.i32	q8,q8,q2
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#48]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	vst1.32	{q8},[r1,:128]!
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vext.8	q8,q3,q0,#4
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	vext.8	q9,q1,q2,#4
+	add	r8,r8,r12
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vadd.i32	q3,q3,q9
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#52]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	veor	q9,q9,q10
+	add	r6,r6,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	vshr.u32	d24,d5,#17
+	add	r7,r7,r3
+	and	r2,r2,r11
+	veor	q9,q9,q11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	vsli.32	d24,d5,#15
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	vshr.u32	d25,d5,#10
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	vadd.i32	q3,q3,q9
+	add	r6,r6,r2
+	ldr	r2,[sp,#56]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r10,r10,r6
+	vshr.u32	d24,d5,#19
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	vsli.32	d24,d5,#13
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	veor	d25,d25,d24
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	vadd.i32	d6,d6,d25
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	vshr.u32	d24,d6,#17
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	vsli.32	d24,d6,#15
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	vshr.u32	d25,d6,#10
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#60]
+	and	r3,r3,r12
+	vshr.u32	d24,d6,#19
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	vld1.32	{q8},[r14,:128]!
+	add	r4,r4,r2
+	vsli.32	d24,d6,#13
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	veor	d25,d25,d24
+	add	r5,r5,r3
+	and	r2,r2,r9
+	vadd.i32	d7,d7,d25
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	vadd.i32	q8,q8,q3
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[r14]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	vst1.32	{q8},[r1,:128]!
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	teq	r2,#0				@ check for K256 terminator
+	ldr	r2,[sp,#0]
+	sub	r1,r1,#64
+	bne	.L_00_48
+
+	ldr		r1,[sp,#68]
+	ldr		r0,[sp,#72]
+	sub		r14,r14,#256	@ rewind r14
+	teq		r1,r0
+	it		eq
+	subeq		r1,r1,#64		@ avoid SEGV
+	vld1.8		{q0},[r1]!		@ load next input block
+	vld1.8		{q1},[r1]!
+	vld1.8		{q2},[r1]!
+	vld1.8		{q3},[r1]!
+	it		ne
+	strne		r1,[sp,#68]
+	mov		r1,sp
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vrev32.8	q0,q0
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vadd.i32	q8,q8,q0
+	ldr	r2,[sp,#4]
+	and	r3,r3,r12
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	add	r10,r10,r2
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3
+	and	r2,r2,r7
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	add	r10,r10,r2
+	ldr	r2,[sp,#8]
+	and	r12,r12,r3
+	add	r6,r6,r10
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	ldr	r2,[sp,#12]
+	and	r3,r3,r12
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	add	r8,r8,r2
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3
+	and	r2,r2,r5
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#16]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vst1.32	{q8},[r1,:128]!
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vrev32.8	q1,q1
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vadd.i32	q8,q8,q1
+	ldr	r2,[sp,#20]
+	and	r3,r3,r12
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	add	r6,r6,r2
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3
+	and	r2,r2,r11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	add	r6,r6,r2
+	ldr	r2,[sp,#24]
+	and	r12,r12,r3
+	add	r10,r10,r6
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	ldr	r2,[sp,#28]
+	and	r3,r3,r12
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	add	r4,r4,r2
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3
+	and	r2,r2,r9
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[sp,#32]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	vst1.32	{q8},[r1,:128]!
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vrev32.8	q2,q2
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vadd.i32	q8,q8,q2
+	ldr	r2,[sp,#36]
+	and	r3,r3,r12
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	add	r10,r10,r2
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3
+	and	r2,r2,r7
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	add	r10,r10,r2
+	ldr	r2,[sp,#40]
+	and	r12,r12,r3
+	add	r6,r6,r10
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	ldr	r2,[sp,#44]
+	and	r3,r3,r12
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	add	r8,r8,r2
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3
+	and	r2,r2,r5
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#48]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vst1.32	{q8},[r1,:128]!
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vrev32.8	q3,q3
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vadd.i32	q8,q8,q3
+	ldr	r2,[sp,#52]
+	and	r3,r3,r12
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	add	r6,r6,r2
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3
+	and	r2,r2,r11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	add	r6,r6,r2
+	ldr	r2,[sp,#56]
+	and	r12,r12,r3
+	add	r10,r10,r6
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	ldr	r2,[sp,#60]
+	and	r3,r3,r12
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	add	r4,r4,r2
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3
+	and	r2,r2,r9
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[sp,#64]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	vst1.32	{q8},[r1,:128]!
+	ldr	r0,[r2,#0]
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	ldr	r12,[r2,#4]
+	ldr	r3,[r2,#8]
+	ldr	r1,[r2,#12]
+	add	r4,r4,r0			@ accumulate
+	ldr	r0,[r2,#16]
+	add	r5,r5,r12
+	ldr	r12,[r2,#20]
+	add	r6,r6,r3
+	ldr	r3,[r2,#24]
+	add	r7,r7,r1
+	ldr	r1,[r2,#28]
+	add	r8,r8,r0
+	str	r4,[r2],#4
+	add	r9,r9,r12
+	str	r5,[r2],#4
+	add	r10,r10,r3
+	str	r6,[r2],#4
+	add	r11,r11,r1
+	str	r7,[r2],#4
+	stmia	r2,{r8-r11}
+
+	ittte	ne
+	movne	r1,sp
+	ldrne	r2,[sp,#0]
+	eorne	r12,r12,r12
+	ldreq	sp,[sp,#76]			@ restore original sp
+	itt	ne
+	eorne	r3,r5,r6
+	bne	.L_00_48
+
+	ldmia	sp!,{r4-r12,pc}
+.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
+# else
+#  define INST(a,b,c,d)	.byte	a,b,c,d
+# endif
+
+.type	sha256_block_data_order_armv8,%function
+.align	5
+sha256_block_data_order_armv8:
+.LARMv8:
+	vld1.32	{q0,q1},[r0]
+# ifdef __thumb2__
+	adr	r3,.LARMv8
+	sub	r3,r3,#.LARMv8-K256
+# else
+	adrl	r3,K256
+# endif
+	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
+
+.Loop_v8:
+	vld1.8		{q8-q9},[r1]!
+	vld1.8		{q10-q11},[r1]!
+	vld1.32		{q12},[r3]!
+	vrev32.8	q8,q8
+	vrev32.8	q9,q9
+	vrev32.8	q10,q10
+	vrev32.8	q11,q11
+	vmov		q14,q0	@ offload
+	vmov		q15,q1
+	teq		r1,r2
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q10
+	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q11
+	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q10
+	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q11
+	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q10
+	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q11
+	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
+	vld1.32		{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+
+	vld1.32		{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+
+	vld1.32		{q13},[r3]
+	vadd.i32	q12,q12,q10
+	sub		r3,r3,#256-16	@ rewind
+	vmov		q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+
+	vadd.i32	q13,q13,q11
+	vmov		q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+
+	vadd.i32	q0,q0,q14
+	vadd.i32	q1,q1,q15
+	it		ne
+	bne		.Loop_v8
+
+	vst1.32		{q0,q1},[r0]
+
+	bx	lr		@ bx lr
+.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro@openssl.org>"
+.align	2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm   OPENSSL_armcap_P,4,4
+#endif
diff --git a/arch/arm/crypto/sha256_glue.c b/arch/arm/crypto/sha256_glue.c
new file mode 100644
index 0000000..25ceba6
--- /dev/null
+++ b/arch/arm/crypto/sha256_glue.c
@@ -0,0 +1,246 @@
+/*
+ * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
+ * using optimized ARM assembler and NEON instructions.
+ *
+ * Copyright ? 2015 Google Inc.
+ *
+ * This file is based on sha256_ssse3_glue.c:
+ *   Copyright (C) 2013 Intel Corporation
+ *   Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/simd.h>
+#include <asm/neon.h>
+#include "sha256_glue.h"
+
+asmlinkage void sha256_block_data_order(u32 *digest, const void *data,
+				      unsigned int num_blks);
+
+
+int sha256_init(struct shash_desc *desc)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	sctx->state[0] = SHA256_H0;
+	sctx->state[1] = SHA256_H1;
+	sctx->state[2] = SHA256_H2;
+	sctx->state[3] = SHA256_H3;
+	sctx->state[4] = SHA256_H4;
+	sctx->state[5] = SHA256_H5;
+	sctx->state[6] = SHA256_H6;
+	sctx->state[7] = SHA256_H7;
+	sctx->count = 0;
+
+	return 0;
+}
+
+int sha224_init(struct shash_desc *desc)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	sctx->state[0] = SHA224_H0;
+	sctx->state[1] = SHA224_H1;
+	sctx->state[2] = SHA224_H2;
+	sctx->state[3] = SHA224_H3;
+	sctx->state[4] = SHA224_H4;
+	sctx->state[5] = SHA224_H5;
+	sctx->state[6] = SHA224_H6;
+	sctx->state[7] = SHA224_H7;
+	sctx->count = 0;
+
+	return 0;
+}
+
+int __sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len,
+		    unsigned int partial)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int done = 0;
+
+	sctx->count += len;
+
+	if (partial) {
+		done = SHA256_BLOCK_SIZE - partial;
+		memcpy(sctx->buf + partial, data, done);
+		sha256_block_data_order(sctx->state, sctx->buf, 1);
+	}
+
+	if (len - done >= SHA256_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+		sha256_block_data_order(sctx->state, data + done, rounds);
+		done += rounds * SHA256_BLOCK_SIZE;
+	}
+
+	memcpy(sctx->buf, data + done, len - done);
+
+	return 0;
+}
+
+int sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+
+	/* Handle the fast case right here */
+	if (partial + len < SHA256_BLOCK_SIZE) {
+		sctx->count += len;
+		memcpy(sctx->buf + partial, data, len);
+
+		return 0;
+	}
+
+	return __sha256_update(desc, data, len, partial);
+}
+
+/* Add padding and return the message digest. */
+static int sha256_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int i, index, padlen;
+	__be32 *dst = (__be32 *)out;
+	__be64 bits;
+	static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+	/* save number of bits */
+	bits = cpu_to_be64(sctx->count << 3);
+
+	/* Pad out to 56 mod 64 and append length */
+	index = sctx->count % SHA256_BLOCK_SIZE;
+	padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
+
+	/* We need to fill a whole block for __sha256_update */
+	if (padlen <= 56) {
+		sctx->count += padlen;
+		memcpy(sctx->buf + index, padding, padlen);
+	} else {
+		__sha256_update(desc, padding, padlen, index);
+	}
+	__sha256_update(desc, (const u8 *)&bits, sizeof(bits), 56);
+
+	/* Store state in digest */
+	for (i = 0; i < 8; i++)
+		dst[i] = cpu_to_be32(sctx->state[i]);
+
+	/* Wipe context */
+	memset(sctx, 0, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha224_final(struct shash_desc *desc, u8 *out)
+{
+	u8 D[SHA256_DIGEST_SIZE];
+
+	sha256_final(desc, D);
+
+	memcpy(out, D, SHA224_DIGEST_SIZE);
+	memzero_explicit(D, SHA256_DIGEST_SIZE);
+
+	return 0;
+}
+
+int sha256_export(struct shash_desc *desc, void *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(out, sctx, sizeof(*sctx));
+
+	return 0;
+}
+
+int sha256_import(struct shash_desc *desc, const void *in)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(sctx, in, sizeof(*sctx));
+
+	return 0;
+}
+
+static struct shash_alg algs[] = { {
+	.digestsize	=	SHA256_DIGEST_SIZE,
+	.init		=	sha256_init,
+	.update		=	sha256_update,
+	.final		=	sha256_final,
+	.export		=	sha256_export,
+	.import		=	sha256_import,
+	.descsize	=	sizeof(struct sha256_state),
+	.statesize	=	sizeof(struct sha256_state),
+	.base		=	{
+		.cra_name	=	"sha256",
+		.cra_driver_name =	"sha256-asm",
+		.cra_priority	=	150,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA256_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+}, {
+	.digestsize	=	SHA224_DIGEST_SIZE,
+	.init		=	sha224_init,
+	.update		=	sha256_update,
+	.final		=	sha224_final,
+	.export		=	sha256_export,
+	.import		=	sha256_import,
+	.descsize	=	sizeof(struct sha256_state),
+	.statesize	=	sizeof(struct sha256_state),
+	.base		=	{
+		.cra_name	=	"sha224",
+		.cra_driver_name =	"sha224-asm",
+		.cra_priority	=	150,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA224_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+} };
+
+static int __init sha256_mod_init(void)
+{
+	int res = crypto_register_shashes(algs, ARRAY_SIZE(algs));
+
+	if (res < 0)
+		return res;
+
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon()) {
+		res = crypto_register_shashes(sha256_neon_algs,
+					      ARRAY_SIZE(sha256_neon_algs));
+
+		if (res < 0)
+			crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+	}
+
+	return res;
+}
+
+static void __exit sha256_mod_fini(void)
+{
+	crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon())
+		crypto_unregister_shashes(sha256_neon_algs,
+					  ARRAY_SIZE(sha256_neon_algs));
+}
+
+module_init(sha256_mod_init);
+module_exit(sha256_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm (ARM), including NEON");
+
+MODULE_ALIAS_CRYPTO("sha256");
diff --git a/arch/arm/crypto/sha256_glue.h b/arch/arm/crypto/sha256_glue.h
new file mode 100644
index 0000000..0312f4f
--- /dev/null
+++ b/arch/arm/crypto/sha256_glue.h
@@ -0,0 +1,23 @@
+#ifndef _CRYPTO_SHA256_GLUE_H
+#define _CRYPTO_SHA256_GLUE_H
+
+#include <linux/crypto.h>
+#include <crypto/sha.h>
+
+extern struct shash_alg sha256_neon_algs[2];
+
+extern int sha256_init(struct shash_desc *desc);
+
+extern int sha224_init(struct shash_desc *desc);
+
+extern int __sha256_update(struct shash_desc *desc, const u8 *data,
+			   unsigned int len, unsigned int partial);
+
+extern int sha256_update(struct shash_desc *desc, const u8 *data,
+			 unsigned int len);
+
+extern int sha256_export(struct shash_desc *desc, void *out);
+
+extern int sha256_import(struct shash_desc *desc, const void *in);
+
+#endif /* _CRYPTO_SHA256_GLUE_H */
diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c
new file mode 100644
index 0000000..d0f168d
--- /dev/null
+++ b/arch/arm/crypto/sha256_neon_glue.c
@@ -0,0 +1,172 @@
+/*
+ * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
+ * using NEON instructions.
+ *
+ * Copyright ? 2015 Google Inc.
+ *
+ * This file is based on sha512_neon_glue.c:
+ *   Copyright ? 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/simd.h>
+#include <asm/neon.h>
+#include "sha256_glue.h"
+
+asmlinkage void sha256_block_data_order_neon(u32 *digest, const void *data,
+				      unsigned int num_blks);
+
+
+static int __sha256_neon_update(struct shash_desc *desc, const u8 *data,
+				unsigned int len, unsigned int partial)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int done = 0;
+
+	sctx->count += len;
+
+	if (partial) {
+		done = SHA256_BLOCK_SIZE - partial;
+		memcpy(sctx->buf + partial, data, done);
+		sha256_block_data_order_neon(sctx->state, sctx->buf, 1);
+	}
+
+	if (len - done >= SHA256_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+		sha256_block_data_order_neon(sctx->state, data + done, rounds);
+		done += rounds * SHA256_BLOCK_SIZE;
+	}
+
+	memcpy(sctx->buf, data + done, len - done);
+
+	return 0;
+}
+
+static int sha256_neon_update(struct shash_desc *desc, const u8 *data,
+			      unsigned int len)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+	int res;
+
+	/* Handle the fast case right here */
+	if (partial + len < SHA256_BLOCK_SIZE) {
+		sctx->count += len;
+		memcpy(sctx->buf + partial, data, len);
+
+		return 0;
+	}
+
+	if (!may_use_simd()) {
+		res = __sha256_update(desc, data, len, partial);
+	} else {
+		kernel_neon_begin();
+		res = __sha256_neon_update(desc, data, len, partial);
+		kernel_neon_end();
+	}
+
+	return res;
+}
+
+/* Add padding and return the message digest. */
+static int sha256_neon_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int i, index, padlen;
+	__be32 *dst = (__be32 *)out;
+	__be64 bits;
+	static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+	/* save number of bits */
+	bits = cpu_to_be64(sctx->count << 3);
+
+	/* Pad out to 56 mod 64 and append length */
+	index = sctx->count % SHA256_BLOCK_SIZE;
+	padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
+
+	if (!may_use_simd()) {
+		sha256_update(desc, padding, padlen);
+		sha256_update(desc, (const u8 *)&bits, sizeof(bits));
+	} else {
+		kernel_neon_begin();
+		/* We need to fill a whole block for __sha256_neon_update() */
+		if (padlen <= 56) {
+			sctx->count += padlen;
+			memcpy(sctx->buf + index, padding, padlen);
+		} else {
+			__sha256_neon_update(desc, padding, padlen, index);
+		}
+		__sha256_neon_update(desc, (const u8 *)&bits,
+					sizeof(bits), 56);
+		kernel_neon_end();
+	}
+
+	/* Store state in digest */
+	for (i = 0; i < 8; i++)
+		dst[i] = cpu_to_be32(sctx->state[i]);
+
+	/* Wipe context */
+	memzero_explicit(sctx, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha224_neon_final(struct shash_desc *desc, u8 *out)
+{
+	u8 D[SHA256_DIGEST_SIZE];
+
+	sha256_neon_final(desc, D);
+
+	memcpy(out, D, SHA224_DIGEST_SIZE);
+	memzero_explicit(D, SHA256_DIGEST_SIZE);
+
+	return 0;
+}
+
+struct shash_alg sha256_neon_algs[] = { {
+	.digestsize	=	SHA256_DIGEST_SIZE,
+	.init		=	sha256_init,
+	.update		=	sha256_neon_update,
+	.final		=	sha256_neon_final,
+	.export		=	sha256_export,
+	.import		=	sha256_import,
+	.descsize	=	sizeof(struct sha256_state),
+	.statesize	=	sizeof(struct sha256_state),
+	.base		=	{
+		.cra_name	=	"sha256",
+		.cra_driver_name =	"sha256-neon",
+		.cra_priority	=	250,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA256_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+}, {
+	.digestsize	=	SHA224_DIGEST_SIZE,
+	.init		=	sha224_init,
+	.update		=	sha256_neon_update,
+	.final		=	sha224_neon_final,
+	.export		=	sha256_export,
+	.import		=	sha256_import,
+	.descsize	=	sizeof(struct sha256_state),
+	.statesize	=	sizeof(struct sha256_state),
+	.base		=	{
+		.cra_name	=	"sha224",
+		.cra_driver_name =	"sha224-neon",
+		.cra_priority	=	250,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA224_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+} };

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* Re: [PATCHv3] arm: crypto: Add optimized SHA-256/224
  2015-03-30  8:37     ` Sami Tolvanen
@ 2015-04-01 12:43       ` Herbert Xu
  -1 siblings, 0 replies; 66+ messages in thread
From: Herbert Xu @ 2015-04-01 12:43 UTC (permalink / raw)
  To: Sami Tolvanen
  Cc: linux-arm-kernel, linux-crypto, ard.biesheuvel, davem, Andy Polyakov

On Mon, Mar 30, 2015 at 09:37:27AM +0100, Sami Tolvanen wrote:
> Add Andy Polyakov's optimized assembly and NEON implementations for
> SHA-256/224.
> 
> The sha256-armv4.pl script for generating the assembly code is from
> OpenSSL commit 51f8d095562f36cdaa6893597b5c609e943b0565.
> 
> Compared to sha256-generic these implementations have the following
> tcrypt speed improvements on Motorola Nexus 6 (Snapdragon 805):

I cannot merge this because none of your emails are making it
to the mailing list so nobody except those on the cc list can
see the patch.

Please refer to section 7 in Documentation/SubmittingPatches.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCHv3] arm: crypto: Add optimized SHA-256/224
@ 2015-04-01 12:43       ` Herbert Xu
  0 siblings, 0 replies; 66+ messages in thread
From: Herbert Xu @ 2015-04-01 12:43 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Mar 30, 2015 at 09:37:27AM +0100, Sami Tolvanen wrote:
> Add Andy Polyakov's optimized assembly and NEON implementations for
> SHA-256/224.
> 
> The sha256-armv4.pl script for generating the assembly code is from
> OpenSSL commit 51f8d095562f36cdaa6893597b5c609e943b0565.
> 
> Compared to sha256-generic these implementations have the following
> tcrypt speed improvements on Motorola Nexus 6 (Snapdragon 805):

I cannot merge this because none of your emails are making it
to the mailing list so nobody except those on the cc list can
see the patch.

Please refer to section 7 in Documentation/SubmittingPatches.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCHv3] arm: crypto: Add optimized SHA-256/224
  2015-04-01 12:43       ` Herbert Xu
@ 2015-04-01 13:19         ` Sami Tolvanen
  -1 siblings, 0 replies; 66+ messages in thread
From: Sami Tolvanen @ 2015-04-01 13:19 UTC (permalink / raw)
  To: Herbert Xu
  Cc: linux-arm-kernel, linux-crypto, ard.biesheuvel, davem, Andy Polyakov

On Wed, Apr 01, 2015 at 08:43:25PM +0800, Herbert Xu wrote:
> I cannot merge this because none of your emails are making it
> to the mailing list so nobody except those on the cc list can
> see the patch.
> 
> Please refer to section 7 in Documentation/SubmittingPatches.

Please find a hosted version of the patch here:

https://raw.githubusercontent.com/samitolvanen/kernelpatches/master/sha256.v3.patch

Sami

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCHv3] arm: crypto: Add optimized SHA-256/224
@ 2015-04-01 13:19         ` Sami Tolvanen
  0 siblings, 0 replies; 66+ messages in thread
From: Sami Tolvanen @ 2015-04-01 13:19 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Apr 01, 2015 at 08:43:25PM +0800, Herbert Xu wrote:
> I cannot merge this because none of your emails are making it
> to the mailing list so nobody except those on the cc list can
> see the patch.
> 
> Please refer to section 7 in Documentation/SubmittingPatches.

Please find a hosted version of the patch here:

https://raw.githubusercontent.com/samitolvanen/kernelpatches/master/sha256.v3.patch

Sami

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCHv3] arm: crypto: Add optimized SHA-256/224
  2015-04-01 13:19         ` Sami Tolvanen
@ 2015-04-03 10:04           ` Herbert Xu
  -1 siblings, 0 replies; 66+ messages in thread
From: Herbert Xu @ 2015-04-03 10:04 UTC (permalink / raw)
  To: Sami Tolvanen
  Cc: linux-arm-kernel, linux-crypto, ard.biesheuvel, davem, Andy Polyakov

On Wed, Apr 01, 2015 at 02:19:24PM +0100, Sami Tolvanen wrote:
> On Wed, Apr 01, 2015 at 08:43:25PM +0800, Herbert Xu wrote:
> > I cannot merge this because none of your emails are making it
> > to the mailing list so nobody except those on the cc list can
> > see the patch.
> > 
> > Please refer to section 7 in Documentation/SubmittingPatches.
> 
> Please find a hosted version of the patch here:
> 
> https://raw.githubusercontent.com/samitolvanen/kernelpatches/master/sha256.v3.patch

Patch applied.  Thanks!
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCHv3] arm: crypto: Add optimized SHA-256/224
@ 2015-04-03 10:04           ` Herbert Xu
  0 siblings, 0 replies; 66+ messages in thread
From: Herbert Xu @ 2015-04-03 10:04 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Apr 01, 2015 at 02:19:24PM +0100, Sami Tolvanen wrote:
> On Wed, Apr 01, 2015 at 08:43:25PM +0800, Herbert Xu wrote:
> > I cannot merge this because none of your emails are making it
> > to the mailing list so nobody except those on the cc list can
> > see the patch.
> > 
> > Please refer to section 7 in Documentation/SubmittingPatches.
> 
> Please find a hosted version of the patch here:
> 
> https://raw.githubusercontent.com/samitolvanen/kernelpatches/master/sha256.v3.patch

Patch applied.  Thanks!
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 66+ messages in thread

end of thread, other threads:[~2015-04-03 10:04 UTC | newest]

Thread overview: 66+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-03-16 15:48 [PATCH] arm: crypto: Add NEON optimized SHA-256 Sami Tolvanen
2015-03-16 15:48 ` Sami Tolvanen
2015-03-16 16:08 ` Ard Biesheuvel
2015-03-16 16:08   ` Ard Biesheuvel
2015-03-16 16:23   ` Sami Tolvanen
2015-03-16 16:23     ` Sami Tolvanen
2015-03-17  6:56     ` Ard Biesheuvel
2015-03-17  6:56       ` Ard Biesheuvel
2015-03-17 15:09       ` Andy Polyakov
2015-03-17 15:09         ` Andy Polyakov
2015-03-17 15:21         ` Sami Tolvanen
2015-03-17 15:21           ` Sami Tolvanen
2015-03-17 15:51         ` Ard Biesheuvel
2015-03-17 15:51           ` Ard Biesheuvel
2015-03-23 13:50 ` [PATCHv2] arm: crypto: Add optimized SHA-256/224 Sami Tolvanen
2015-03-23 13:50   ` Sami Tolvanen
2015-03-23 18:26   ` Ard Biesheuvel
2015-03-23 18:26     ` Ard Biesheuvel
2015-03-24 11:35     ` Herbert Xu
2015-03-24 11:35       ` Herbert Xu
2015-03-24 11:40       ` Ard Biesheuvel
2015-03-24 11:40         ` Ard Biesheuvel
2015-03-24 11:46         ` Herbert Xu
2015-03-24 11:46           ` Herbert Xu
2015-03-24 11:57           ` Ard Biesheuvel
2015-03-24 11:57             ` Ard Biesheuvel
2015-03-24 11:32   ` Herbert Xu
2015-03-24 11:32     ` Herbert Xu
2015-03-24 11:33     ` Ard Biesheuvel
2015-03-24 11:33       ` Ard Biesheuvel
2015-03-24 12:27   ` Jean-Christophe PLAGNIOL-VILLARD
2015-03-24 12:27     ` Jean-Christophe PLAGNIOL-VILLARD
2015-03-24 12:42     ` Ard Biesheuvel
2015-03-24 12:42       ` Ard Biesheuvel
2015-03-24 13:05       ` Jean-Christophe PLAGNIOL-VILLARD
2015-03-24 13:05         ` Jean-Christophe PLAGNIOL-VILLARD
2015-03-24 13:06         ` Ard Biesheuvel
2015-03-24 13:06           ` Ard Biesheuvel
2015-03-24 14:46           ` Ard Biesheuvel
2015-03-24 14:46             ` Ard Biesheuvel
2015-03-24 17:05             ` Jean-Christophe PLAGNIOL-VILLARD
2015-03-24 17:05               ` Jean-Christophe PLAGNIOL-VILLARD
2015-03-24 17:40               ` Ard Biesheuvel
2015-03-24 17:40                 ` Ard Biesheuvel
2015-03-24 18:17                 ` Sami Tolvanen
2015-03-24 18:17                   ` Sami Tolvanen
2015-03-25  6:49                   ` Ard Biesheuvel
2015-03-25  6:49                     ` Ard Biesheuvel
2015-03-27 10:42             ` Andy Polyakov
2015-03-27 10:42               ` Andy Polyakov
2015-03-27 10:44               ` Ard Biesheuvel
2015-03-27 10:44                 ` Ard Biesheuvel
2015-03-27 18:07                 ` Ard Biesheuvel
2015-03-27 18:07                   ` Ard Biesheuvel
2015-03-29 13:27                   ` Andy Polyakov
2015-03-29 13:27                     ` Andy Polyakov
2015-03-30  8:37   ` [PATCHv3] " Sami Tolvanen
2015-03-30  8:37     ` Sami Tolvanen
2015-04-01 12:43     ` Herbert Xu
2015-04-01 12:43       ` Herbert Xu
2015-04-01 13:19       ` Sami Tolvanen
2015-04-01 13:19         ` Sami Tolvanen
2015-04-03 10:04         ` Herbert Xu
2015-04-03 10:04           ` Herbert Xu
2015-03-25 20:00 ` [PATCH] arm: crypto: Add NEON optimized SHA-256 Jean-Christophe PLAGNIOL-VILLARD
2015-03-25 20:00   ` Jean-Christophe PLAGNIOL-VILLARD

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.