From: <zbigniew.bodek@caviumnetworks.com>
To: <pablo.de.lara.guarch@intel.com>, <jerin.jacob@caviumnetworks.com>
Cc: <dev@dpdk.org>,
Zbigniew Bodek <zbigniew.bodek@caviumnetworks.com>,
Emery Davis <emery.davis@caviumnetworks.com>
Subject: [PATCH v2 05/12] crypto/armv8: Add AES+SHA1 crypto operations for ARMv8
Date: Tue, 6 Dec 2016 18:32:58 -0800 [thread overview]
Message-ID: <1481077985-4224-6-git-send-email-zbigniew.bodek@caviumnetworks.com> (raw)
In-Reply-To: <1481077985-4224-1-git-send-email-zbigniew.bodek@caviumnetworks.com>
From: Zbigniew Bodek <zbigniew.bodek@caviumnetworks.com>
This patch adds AES-128-CBC + SHA1 low-level
crypto operations for ARMv8 processors.
The assembly code is a base for an optimized PMD
and is currently excluded from the build.
This code is optimized to provide performance boost
for combined operations such as encryption + HMAC
generation, decryption + HMAC validation.
Introduced operations add support for AES-128-CBC in
combination with:
SHA1 MAC, SHA1 HMAC
Signed-off-by: Zbigniew Bodek <zbigniew.bodek@caviumnetworks.com>
Signed-off-by: Emery Davis <emery.davis@caviumnetworks.com>
---
drivers/crypto/armv8/asm/aes128cbc_sha1_hmac.S | 1719 ++++++++++++++++++++
drivers/crypto/armv8/asm/sha1_hmac_aes128cbc_dec.S | 1650 +++++++++++++++++++
2 files changed, 3369 insertions(+)
create mode 100644 drivers/crypto/armv8/asm/aes128cbc_sha1_hmac.S
create mode 100644 drivers/crypto/armv8/asm/sha1_hmac_aes128cbc_dec.S
diff --git a/drivers/crypto/armv8/asm/aes128cbc_sha1_hmac.S b/drivers/crypto/armv8/asm/aes128cbc_sha1_hmac.S
new file mode 100644
index 0000000..8b8348a
--- /dev/null
+++ b/drivers/crypto/armv8/asm/aes128cbc_sha1_hmac.S
@@ -0,0 +1,1719 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (C) Cavium networks Ltd. 2016.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Cavium networks nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "assym.s"
+
+/*
+ * Description:
+ *
+ * Combined Enc/Auth Primitive = aes128cbc/sha1_hmac
+ *
+ * Operations:
+ *
+ * out = encrypt-AES128CBC(in)
+ * return_hash_ptr = SHA1(o_key_pad | SHA1(i_key_pad | out))
+ *
+ * Prototype:
+ * void aes128cbc_sha1_hmac(uint8_t *csrc, uint8_t *cdst,
+ * uint8_t *dsrc, uint8_t *ddst,
+ * uint64_t len, crypto_arg_t *arg)
+ *
+ * Registers used:
+ *
+ * aes128cbc_sha1_hmac(
+ * csrc, x0 (cipher src address)
+ * cdst, x1 (cipher dst address)
+ * dsrc, x2 (digest src address - ignored)
+ * ddst, x3 (digest dst address)
+ * len, x4 (length)
+ * arg x5 :
+ * arg->cipher.key (round keys)
+ * arg->cipher.iv (initialization vector)
+ * arg->digest.hmac.i_key_pad (partially hashed i_key_pad)
+ * arg->digest.hmac.o_key_pad (partially hashed o_key_pad)
+ * )
+ *
+ * Routine register definitions:
+ *
+ * v0 - v3 -- aes results
+ * v4 - v7 -- round consts for sha
+ * v8 - v18 -- round keys
+ * v19 -- temp register for SHA1
+ * v20 -- ABCD copy (q20)
+ * v21 -- sha working state (q21)
+ * v22 -- sha working state (q22)
+ * v23 -- temp register for SHA1
+ * v24 -- sha state ABCD
+ * v25 -- sha state E
+ * v26 -- sha block 0
+ * v27 -- sha block 1
+ * v28 -- sha block 2
+ * v29 -- sha block 3
+ * v30 -- reserved
+ * v31 -- reserved
+ *
+ * Constraints:
+ *
+ * The variable "len" must be a multiple of 16, otherwise results are not
+ * defined. For AES partial blocks the user is required to pad the input
+ * to a multiple of 16 bytes.
+ *
+ * Short lengths are not optimized at < 12 AES blocks
+ */
+
+ .file "aes128cbc_sha1_hmac.S"
+ .text
+ .cpu generic+fp+simd+crypto+crc
+ .global aes128cbc_sha1_hmac
+ .type aes128cbc_sha1_hmac,%function
+
+
+ .align 4
+.Lrcon:
+ .word 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999
+ .word 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1
+ .word 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc
+ .word 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6
+
+aes128cbc_sha1_hmac:
+/*
+ * Entry: x0 = csrc, x1 = cdst, x3 = ddst, x4 = len, x5 = arg.
+ * The incoming x2 (dsrc) is never read; x2 is reused below as the
+ * round-key pointer.
+ */
+/* fetch args */
+ ldr x6, [x5, #HMAC_IKEYPAD]
+ /* init ABCD, E: 32 bytes are loaded, E is taken from lane 0 of v25 (s25) */
+ ld1 {v24.4s, v25.4s},[x6]
+ /* save pointer to o_key_pad partial hash (x6 is kept until the outer hash) */
+ ldr x6, [x5, #HMAC_OKEYPAD]
+
+ ldr x2, [x5, #CIPHER_KEY] /* x2 = pointer to the round keys */
+ ldr x5, [x5, #CIPHER_IV] /* x5 = pointer to the IV; arg pointer is dropped */
+
+/*
+ * init sha state, prefetch, check for small cases.
+ * Note that the output is prefetched as a load, for the in-place case
+ */
+ prfm PLDL1KEEP,[x0,0] /* pref next aes_ptr_in */
+ prfm PLDL1KEEP,[x1,0] /* pref next aes_ptr_out */
+ lsr x10,x4,4 /* aes_blocks = len/16 */
+ cmp x10,12 /* no main loop if <12 */
+ /* signed compare is safe: the logical shift above clears the top bits */
+ b.lt .Lshort_cases /* branch if < 12 */
+
+ /* protect registers */
+ sub sp,sp,8*16
+ mov x9,sp /* copy for address mode */
+ st1 {v8.16b - v11.16b},[x9],4*16
+ st1 {v12.16b - v15.16b},[x9]
+
+ /* proceed */
+ ld1 {v3.16b},[x5] /* get 1st ivec */
+ /* read first aes block, bump aes_ptr_in */
+ ld1 {v0.16b},[x0],16
+ mov x11,x4 /* len -> x11 needed at end */
+ lsr x12,x11,6 /* total_blocks */
+/*
+ * now we can do the loop prolog, 1st aes sequence of 4 blocks
+ */
+ ld1 {v8.16b},[x2],16 /* rk[0] */
+ ld1 {v9.16b},[x2],16 /* rk[1] */
+ eor v0.16b,v0.16b,v3.16b /* xor w/ ivec (modeop) */
+ ld1 {v10.16b},[x2],16 /* rk[2] */
+
+/* aes xform 0 */
+ aese v0.16b,v8.16b
+ prfm PLDL1KEEP,[x0,64] /* pref next aes_ptr_in */
+ aesmc v0.16b,v0.16b
+ ld1 {v11.16b},[x2],16 /* rk[3] */
+ aese v0.16b,v9.16b
+ prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out */
+ /* base address for sha round consts */
+ adr x8,.Lrcon
+ aesmc v0.16b,v0.16b
+ ld1 {v12.16b},[x2],16 /* rk[4] */
+ aese v0.16b,v10.16b
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v1.16b},[x0],16
+ aesmc v0.16b,v0.16b
+ ld1 {v13.16b},[x2],16 /* rk[5] */
+ aese v0.16b,v11.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v14.16b},[x2],16 /* rk[6] */
+ aese v0.16b,v12.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v15.16b},[x2],16 /* rk[7] */
+ aese v0.16b,v13.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.16b},[x2],16 /* rk[8] */
+ aese v0.16b,v14.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.16b},[x2],16 /* rk[9] */
+ aese v0.16b,v15.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v18.16b},[x2],16 /* rk[10] */
+ aese v0.16b,v16.16b
+ mov x4,x1 /* sha_ptr_in = aes_ptr_out */
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v17.16b
+ eor v0.16b,v0.16b,v18.16b /* res 0 */
+
+ eor v1.16b,v1.16b,v0.16b /* xor w/ ivec (modeop) */
+
+/* aes xform 1 */
+ aese v1.16b,v8.16b
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v2.16b},[x0],16
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v9.16b
+ prfm PLDL1KEEP,[x8,0*64] /* rcon */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v10.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v11.16b
+ /* save aes res, bump aes_out_ptr */
+ st1 {v0.16b},[x1],16
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v12.16b
+ prfm PLDL1KEEP,[x8,2*64] /* rcon */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v13.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v14.16b
+ prfm PLDL1KEEP,[x8,4*64] /* rcon */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v15.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v16.16b
+ prfm PLDL1KEEP,[x8,6*64] /* rcon */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v17.16b
+ prfm PLDL1KEEP,[x8,8*64] /* rcon */
+ eor v1.16b,v1.16b,v18.16b /* res 1 */
+
+ eor v2.16b,v2.16b,v1.16b /* xor w/ivec (modeop) */
+
+/* aes xform 2 */
+ aese v2.16b,v8.16b
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v3.16b},[x0],16
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v9.16b
+ mov x2,x0 /* lead_ptr = aes_ptr_in */
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v10.16b
+ prfm PLDL1KEEP,[x8,10*64] /* rcon */
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v11.16b
+ /* save aes res, bump aes_out_ptr */
+ st1 {v1.16b},[x1],16
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v12.16b
+ prfm PLDL1KEEP,[x8,12*64] /* rcon */
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v13.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v14.16b
+ prfm PLDL1KEEP,[x8,14*64] /* rcon */
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v15.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v16.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b /* res 2 */
+
+ eor v3.16b,v3.16b,v2.16b /* xor w/ ivec (modeop) */
+
+/* aes xform 3 */
+ aese v3.16b,v8.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v9.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v10.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v11.16b
+ /* save aes res, bump aes_out_ptr */
+ st1 {v2.16b},[x1],16
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v12.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v13.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v14.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v15.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v16.16b
+ /* main_blocks = total_blocks - 1 */
+ sub x7,x12,1
+ and x13,x10,3 /* aes_blocks_left */
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v17.16b
+ eor v3.16b,v3.16b,v18.16b /* res 3 */
+
+/*
+ * Note, aes_blocks_left := number after
+ * the main (sha) block is done. Can be 0
+ */
+ /* save aes res, bump aes_out_ptr */
+ st1 {v3.16b},[x1],16
+
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+/*
+ * main combined loop CBC
+ */
+.Lmain_loop:
+/*
+ * Because mov, rev32 and eor each have a busy cycle,
+ * this takes longer than it looks.
+ * That's OK since there are 6 cycles before we can use the load anyway;
+ * so this goes as fast as it can without SW pipelining (too complicated
+ * given the code size).
+ */
+ rev32 v26.16b,v0.16b /* fix endian w0, aes res 0 */
+ /* next aes block, update aes_ptr_in */
+ ld1 {v0.16b},[x0],16
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+ prfm PLDL1KEEP,[x2,64] /* pref next lead_ptr */
+ rev32 v27.16b,v1.16b /* fix endian w1, aes res 1 */
+ /* pref next aes_ptr_out, streaming */
+ prfm PLDL1KEEP,[x1,64]
+ eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */
+
+/* aes xform 0, sha quad 0 */
+ aese v0.16b,v8.16b
+ rev32 v28.16b,v2.16b /* fix endian w2, aes res 2 */
+ aesmc v0.16b,v0.16b
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v1.16b},[x0],16
+ aese v0.16b,v9.16b
+ add v19.4s,v4.4s,v26.4s
+ aesmc v0.16b,v0.16b
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aese v0.16b,v10.16b
+ sha1h s22,s24
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v11.16b
+ add v23.4s,v4.4s,v27.4s
+ /* no place to get rid of this stall */
+ rev32 v29.16b,v3.16b /* fix endian w3, aes res 3 */
+ aesmc v0.16b,v0.16b
+ sha1c q24,s25,v19.4s
+ aese v0.16b,v12.16b
+ sha1su1 v26.4s,v29.4s
+ aesmc v0.16b,v0.16b
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aese v0.16b,v13.16b
+ sha1h s21,s24
+ add v19.4s,v4.4s,v28.4s
+ aesmc v0.16b,v0.16b
+ sha1c q24,s22,v23.4s
+ aese v0.16b,v14.16b
+ add v23.4s,v4.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+ aesmc v0.16b,v0.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aese v0.16b,v15.16b
+ sha1h s22,s24
+ aesmc v0.16b,v0.16b
+ sha1c q24,s21,v19.4s
+ aese v0.16b,v16.16b
+ sha1su1 v28.4s,v27.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesmc v0.16b,v0.16b
+ sha1h s21,s24
+ aese v0.16b,v17.16b
+ sha1c q24,s22,v23.4s
+ add v19.4s,v4.4s,v26.4s
+ sha1su1 v29.4s,v28.4s
+ eor v0.16b,v0.16b,v18.16b /* final res 0 */
+ add v23.4s,v5.4s,v27.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+/* aes xform 1, sha quad 1 */
+ eor v1.16b,v1.16b,v0.16b /* mode op 1 xor w/prev value */
+ /* save aes res, bump aes_out_ptr */
+ st1 {v0.16b},[x1],16
+ aese v1.16b,v8.16b
+ add v19.4s,v5.4s,v28.4s
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v9.16b
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1h s21,s24
+ aesmc v1.16b,v1.16b
+ sha1p q24,s22,v23.4s
+ aese v1.16b,v10.16b
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v2.16b},[x0],16
+ add v23.4s,v5.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v11.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesmc v1.16b,v1.16b
+ sha1h s22,s24
+ aese v1.16b,v12.16b
+ sha1p q24,s21,v19.4s
+ sha1su1 v28.4s,v27.4s
+ aesmc v1.16b,v1.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aese v1.16b,v13.16b
+ sha1h s21,s24
+ aesmc v1.16b,v1.16b
+ sha1p q24,s22,v23.4s
+ aese v1.16b,v14.16b
+ add v19.4s,v5.4s,v26.4s
+ sha1su1 v29.4s,v28.4s
+ aesmc v1.16b,v1.16b
+ add x2,x2,64 /* bump lead_ptr */
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aese v1.16b,v15.16b
+ sha1h s22,s24
+ add v23.4s,v5.4s,v27.4s
+ aesmc v1.16b,v1.16b
+ sha1p q24,s21,v19.4s
+ aese v1.16b,v16.16b
+ sha1su1 v26.4s,v29.4s
+ aesmc v1.16b,v1.16b
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aese v1.16b,v17.16b
+ sha1h s21,s24
+ eor v1.16b,v1.16b,v18.16b /* res xf 1 */
+ sha1p q24,s22,v23.4s
+ add v23.4s,v6.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+/* mode op 2: CBC chaining, the previous ciphertext (v1) is the next "ivec" */
+ eor v2.16b,v2.16b,v1.16b /* mode op 2 xor w/prev value */
+
+/* aes xform 2, sha quad 2 */
+ aese v2.16b,v8.16b
+ /* save aes res, bump aes_out_ptr */
+ st1 {v1.16b},[x1],16
+ aesmc v2.16b,v2.16b
+ add v19.4s,v6.4s,v28.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aese v2.16b,v9.16b
+ sha1h s22,s24
+ aesmc v2.16b,v2.16b
+ sha1m q24,s21,v19.4s
+ aese v2.16b,v10.16b
+ sha1su1 v28.4s,v27.4s
+ aesmc v2.16b,v2.16b
+
+ aese v2.16b,v11.16b
+ add v19.4s,v6.4s,v26.4s
+ aesmc v2.16b,v2.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aese v2.16b,v12.16b
+ sha1h s21,s24
+ aesmc v2.16b,v2.16b
+ sha1m q24,s22,v23.4s
+ aese v2.16b,v13.16b
+ sha1su1 v29.4s,v28.4s
+ aesmc v2.16b,v2.16b
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v3.16b},[x0],16
+ aese v2.16b,v14.16b
+ add v23.4s,v6.4s,v27.4s
+ aesmc v2.16b,v2.16b
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aese v2.16b,v15.16b
+ sha1h s22,s24
+ aesmc v2.16b,v2.16b
+ sha1m q24,s21,v19.4s
+ aese v2.16b,v16.16b
+ add v19.4s,v6.4s,v28.4s
+ aesmc v2.16b,v2.16b
+ sha1su1 v26.4s,v29.4s
+ aese v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b /* res 2 */
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ add v23.4s,v7.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su1 v28.4s,v27.4s
+
+/* mode op 3 */
+ eor v3.16b,v3.16b,v2.16b /* xor w/prev value */
+
+/* aes xform 3, sha quad 3 */
+ aese v3.16b,v8.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesmc v3.16b,v3.16b
+ /* save aes res, bump aes_out_ptr */
+ st1 {v2.16b},[x1],16
+ aese v3.16b,v9.16b
+ sha1h s21,s24
+ aesmc v3.16b,v3.16b
+ sha1p q24,s22,v23.4s
+ aese v3.16b,v10.16b
+ sha1su1 v29.4s,v28.4s
+ aesmc v3.16b,v3.16b
+ add v19.4s,v7.4s,v26.4s
+ aese v3.16b,v11.16b
+ sha1h s22,s24
+ aesmc v3.16b,v3.16b
+ sha1p q24,s21,v19.4s
+ aese v3.16b,v12.16b
+ aesmc v3.16b,v3.16b
+ add v23.4s,v7.4s,v27.4s
+ aese v3.16b,v13.16b
+ sha1h s21,s24
+ aesmc v3.16b,v3.16b
+ sha1p q24,s22,v23.4s
+ aese v3.16b,v14.16b
+ sub x7,x7,1 /* dec block count */
+ aesmc v3.16b,v3.16b
+ add v19.4s,v7.4s,v28.4s
+ aese v3.16b,v15.16b
+ sha1h s22,s24
+ aesmc v3.16b,v3.16b
+ sha1p q24,s21,v19.4s
+ aese v3.16b,v16.16b
+ aesmc v3.16b,v3.16b
+ add v23.4s,v7.4s,v29.4s
+ aese v3.16b,v17.16b
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ eor v3.16b,v3.16b,v18.16b /* aes res 3 */
+
+ add v25.4s,v25.4s,v21.4s
+ add v24.4s,v24.4s,v20.4s
+ /* save aes res, bump aes_out_ptr */
+ st1 {v3.16b},[x1],16
+ cbnz x7,.Lmain_loop /* loop if more to do */
+
+
+/*
+ * epilog, process remaining aes blocks and b-2 sha block
+ * do this inline (no loop) to overlap with the sha part
+ * note there are 0-3 aes blocks left.
+ */
+ rev32 v26.16b,v0.16b /* fix endian w0 */
+ rev32 v27.16b,v1.16b /* fix endian w1 */
+ rev32 v28.16b,v2.16b /* fix endian w2 */
+ rev32 v29.16b,v3.16b /* fix endian w3 */
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+ cbz x13, .Lbm2fromQ0 /* skip if none left */
+ /* local copy of aes_blocks_left */
+ subs x14,x13,1
+
+/*
+ * mode op 0
+ * read next aes block, update aes_ptr_in
+ */
+ ld1 {v0.16b},[x0],16
+ eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */
+
+/* aes xform 0, sha quad 0 */
+ add v19.4s,v4.4s,v26.4s
+ aese v0.16b,v8.16b
+ add v23.4s,v4.4s,v27.4s
+ aesmc v0.16b,v0.16b
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aese v0.16b,v9.16b
+ sha1h s22,s24
+ aesmc v0.16b,v0.16b
+ sha1c q24,s25,v19.4s
+ aese v0.16b,v10.16b
+ sha1su1 v26.4s,v29.4s
+ add v19.4s,v4.4s,v28.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v11.16b
+ sha1h s21,s24
+ aesmc v0.16b,v0.16b
+ sha1c q24,s22,v23.4s
+ aese v0.16b,v12.16b
+ sha1su1 v27.4s,v26.4s
+ add v23.4s,v4.4s,v29.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v13.16b
+ sha1h s22,s24
+ aesmc v0.16b,v0.16b
+ sha1c q24,s21,v19.4s
+ aese v0.16b,v14.16b
+ sha1su1 v28.4s,v27.4s
+ add v19.4s,v4.4s,v26.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v15.16b
+ sha1h s21,s24
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v16.16b
+ sha1c q24,s22,v23.4s
+ sha1su1 v29.4s,v28.4s
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v17.16b
+ eor v0.16b,v0.16b,v18.16b
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+ /* save aes res, bump aes_out_ptr */
+ st1 {v0.16b},[x1],16
+ /* if aes_blocks_left_count == 0 */
+ beq .Lbm2fromQ1
+/*
+ * mode op 1
+ * read next aes block, update aes_ptr_in
+ */
+ ld1 {v1.16b},[x0],16
+
+ eor v1.16b,v1.16b,v0.16b /* xor w/ prev value */
+
+/* aes xform 1, sha quad 1 */
+ add v23.4s,v5.4s,v27.4s
+ aese v1.16b,v8.16b
+ add v19.4s,v5.4s,v28.4s
+ aesmc v1.16b,v1.16b
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aese v1.16b,v9.16b
+ sha1h s21,s24
+ aesmc v1.16b,v1.16b
+ sha1p q24,s22,v23.4s
+ aese v1.16b,v10.16b
+ sha1su1 v27.4s,v26.4s
+ add v23.4s,v5.4s,v29.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesmc v1.16b,v1.16b
+ subs x14,x14,1 /* dec counter */
+ aese v1.16b,v11.16b
+ sha1h s22,s24
+ aesmc v1.16b,v1.16b
+ sha1p q24,s21,v19.4s
+ aese v1.16b,v12.16b
+ sha1su1 v28.4s,v27.4s
+ add v19.4s,v5.4s,v26.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v13.16b
+ sha1h s21,s24
+ aesmc v1.16b,v1.16b
+ sha1p q24,s22,v23.4s
+ aese v1.16b,v14.16b
+ sha1su1 v29.4s,v28.4s
+ add v23.4s,v5.4s,v27.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v15.16b
+ sha1h s22,s24
+ aesmc v1.16b,v1.16b
+ sha1p q24,s21,v19.4s
+ aese v1.16b,v16.16b
+ sha1su1 v26.4s,v29.4s
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+ /* save aes res, bump aes_out_ptr */
+ st1 {v1.16b},[x1],16
+ /* if aes_blocks_left_count == 0 */
+ beq .Lbm2fromQ2
+
+/*
+ * mode op 2
+ * read next aes block, update aes_ptr_in
+ */
+ ld1 {v2.16b},[x0],16
+ eor v2.16b,v2.16b,v1.16b /* xor w/ prev value */
+
+/* aes xform 2, sha quad 2 */
+ add v19.4s,v6.4s,v28.4s
+ aese v2.16b,v8.16b
+ add v23.4s,v6.4s,v29.4s
+ aesmc v2.16b,v2.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aese v2.16b,v9.16b
+ sha1h s22,s24
+ aesmc v2.16b,v2.16b
+ sha1m q24,s21,v19.4s
+ aese v2.16b,v10.16b
+ sha1su1 v28.4s,v27.4s
+ add v19.4s,v6.4s,v26.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v11.16b
+ sha1h s21,s24
+ aesmc v2.16b,v2.16b
+ sha1m q24,s22,v23.4s
+ aese v2.16b,v12.16b
+ sha1su1 v29.4s,v28.4s
+ add v23.4s,v6.4s,v27.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v13.16b
+ sha1h s22,s24
+ aesmc v2.16b,v2.16b
+ sha1m q24,s21,v19.4s
+ aese v2.16b,v14.16b
+ sha1su1 v26.4s,v29.4s
+ add v19.4s,v6.4s,v28.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v15.16b
+ sha1h s21,s24
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v16.16b
+ sha1m q24,s22,v23.4s
+ sha1su1 v27.4s,v26.4s
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+ /* save aes res, bump aes_out_ptr */
+ st1 {v2.16b},[x1],16
+ /* join common code at Quad 3 */
+ b .Lbm2fromQ3
+
+/*
+ * now there is the b-2 sha block before the final one. Execution takes over
+ * in the appropriate part of this depending on how many aes blocks were left.
+ * If there were none, the whole thing is executed.
+ */
+.Lbm2fromQ0:
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s25,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v4.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v4.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v4.4s,v29.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+.Lbm2fromQ1:
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v5.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v5.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v5.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+.Lbm2fromQ2:
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v6.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v6.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v6.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+.Lbm2fromQ3:
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ eor v26.16b,v26.16b,v26.16b /* zero reg */
+ sha1p q24,s22,v23.4s
+
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ eor v27.16b,v27.16b,v27.16b /* zero reg */
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ eor v28.16b,v28.16b,v28.16b /* zero reg */
+ sha1p q24,s22,v23.4s
+
+ add v25.4s,v25.4s,v21.4s
+ add v24.4s,v24.4s,v20.4s
+
+/*
+ * now we can do the final block, either all padding or 1-3 aes blocks
+ * len in x11, aes_blocks_left in x13. should move the aes data setup of this
+ * to the last aes bit.
+ */
+/*
+ * Build the final SHA-1 block of the inner hash: the 0-3 leftover AES
+ * results (v0-v2), the 0x80 pad byte and the 64-bit message length in bits.
+ * In:  x11 = len in bytes, x13 = aes_blocks_left (0-3), v26-v28 zeroed.
+ * Out: v26-v29 = message words ready for the SHA-1 rounds,
+ *      x9 = total length in bits, x12 = upper 32 bits of that length.
+ */
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+ mov w15,0x80 /* that's the 1 of the pad */
+ /* Add one SHA-1 block since hash is calculated including i_key_pad */
+ add x11, x11, #64
+ /*
+ * Convert the byte count to bits first and split it afterwards, so that
+ * the three bits shifted out of the low word carry into len_hi. Shifting
+ * the two halves separately loses them once the length reaches 2^29 bytes.
+ */
+ lsl x9,x11,3 /* total length in bits */
+ mov v26.b[0],w15 /* assume block 0 is dst */
+ lsr x12,x9,32 /* len_hi = upper 32 bits of bit length */
+ eor v29.16b,v29.16b,v29.16b /* zero reg */
+/*
+ * places the 0x80 in the correct block, copies the appropriate data
+ */
+ cbz x13,.Lpad100 /* no data to get */
+ mov v26.16b,v0.16b
+ sub x14,x13,1 /* dec amount left */
+ mov v27.b[0],w15 /* assume block 1 is dst */
+ cbz x14,.Lpad100 /* branch if done */
+ mov v27.16b,v1.16b
+ sub x14,x14,1 /* dec amount left */
+ mov v28.b[0],w15 /* assume block 2 is dst */
+ cbz x14,.Lpad100 /* branch if done */
+ mov v28.16b,v2.16b
+ /* three data blocks: pad goes to the top byte of word 0 of block 3 */
+ mov v29.b[3],w15 /* block 3, doesn't get rev'd */
+/*
+ * The length words come from the bit count computed above:
+ * bits = (len + 64) << 3
+ * len_hi = (uint32_t)(bits >> 32); (x12)
+ * len_lo = (uint32_t)bits; (w9)
+ * this is done before the if/else above
+ */
+.Lpad100:
+ mov v29.s[3],w9 /* len_lo */
+ mov v29.s[2],w12 /* len_hi */
+/*
+ * note that q29 is already built in the correct format, so no swap required
+ */
+ rev32 v26.16b,v26.16b /* fix endian w0 */
+ rev32 v27.16b,v27.16b /* fix endian w1 */
+ rev32 v28.16b,v28.16b /* fix endian w2 */
+/*
+ * do last sha of pad block
+ */
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s25,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v4.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v4.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v4.4s,v29.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v5.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v5.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v5.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v6.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v6.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v6.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ /*
+ * Inner digest becomes the first 20 bytes of the outer message block:
+ * v26 = ABCD, lane 0 of v27 = E. The digest words are already in the
+ * word order the SHA-1 rounds expect, so no rev32 is applied.
+ */
+ add v26.4s,v24.4s,v20.4s
+ add v27.4s,v25.4s,v21.4s
+ /*
+ * Lanes 1-3 of v25 were loaded from the 12 bytes that follow E in the
+ * i_key_pad buffer and are never modified, so they end up in v27.
+ * Keep only lane 0; the scalar move zeroes the rest of the register,
+ * which makes the result independent of the caller zero-padding that
+ * buffer.
+ */
+ mov s27,v27.s[0]
+
+ /* Calculate final HMAC */
+ eor v28.16b, v28.16b, v28.16b
+ eor v29.16b, v29.16b, v29.16b
+ /* load o_key_pad partial hash */
+ ld1 {v24.16b,v25.16b}, [x6]
+
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+
+ /* Pad byte goes right after the 20-byte digest: top byte of word 5 */
+ mov w11, #0x80 /* that's the 1 of the pad */
+ mov v27.b[7], w11
+
+ mov x11, #64+20 /* size of o_key_pad + inner hash */
+ lsl x11, x11, 3 /* length in bits */
+ /* move length to the end of the block */
+ mov v29.s[3], w11
+ lsr x11, x11, 32
+ mov v29.s[2], w11 /* and the higher part */
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s25,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v4.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v4.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v4.4s,v29.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v5.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v5.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v5.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v6.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v6.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v6.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v25.4s,v25.4s,v21.4s
+ add v24.4s,v24.4s,v20.4s
+
+ rev32 v24.16b, v24.16b
+ rev32 v25.16b, v25.16b
+
+ st1 {v24.16b}, [x3],16
+ st1 {v25.s}[0], [x3]
+
+ /*
+ * Restore the callee-saved v8-v15 while the save area is still inside
+ * the allocated stack, and only then release it. Memory below sp must
+ * not be accessed: an asynchronous signal frame may overwrite it.
+ */
+ mov x9,sp /* copy for address mode */
+ ld1 {v8.16b - v11.16b},[x9],4*16
+ ld1 {v12.16b - v15.16b},[x9]
+ add sp,sp,8*16 /* free the 128-byte save area */
+
+ ret
+
+/*
+ * These are the short cases (less efficient), here used for 1-11 aes blocks.
+ * x10 = aes_blocks
+ */
+.Lshort_cases:
+ /*
+ * Setup for the short path: save v8-v15, load the IV, all eleven round
+ * keys and the four SHA-1 round constants, and clear the SHA message
+ * registers so the padding can be dropped in as blocks are produced.
+ */
+ /* protect registers: 8 x 16 bytes for v8-v15 */
+ sub sp,sp,8*16
+ mov x9,sp /* copy for address mode */
+ st1 {v8.16b - v11.16b},[x9],4*16
+ st1 {v12.16b - v15.16b},[x9]
+
+ ld1 {v3.16b},[x5] /* get ivec */
+ ld1 {v8.16b-v11.16b},[x2],64 /* rk[0-3] */
+ ld1 {v12.16b-v15.16b},[x2],64 /* rk[4-7] */
+ ld1 {v16.16b-v18.16b},[x2] /* rk[8-10] */
+ adr x8,.Lrcon /* rcon */
+ mov w15,0x80 /* sha padding byte (the "1" bit of the pad) */
+
+ /* x10 = len/16 was computed before the branch here */
+ lsl x11,x10,4 /* len = aes_blocks*16 */
+
+ eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */
+ eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */
+ eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */
+ eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */
+
+ mov x9,x8 /* top of rcon */
+
+ /* each constant is replicated in all four lanes of its register */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+/*
+ * the idea in the short loop (at least 1) is to break out with the padding
+ * already in place excepting the final word.
+ */
+.Lshort_loop:
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v0.16b},[x0],16
+ eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */
+
+/* aes xform 0 */
+ aese v0.16b,v8.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v9.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v10.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v11.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v12.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v13.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v14.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v15.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v17.16b
+ eor v0.16b,v0.16b,v18.16b
+ /* assume this was final block */
+ mov v27.b[3],w15
+ /* save aes res, bump aes_out_ptr */
+ st1 {v0.16b},[x1],16
+ /* load res to sha 0, endian swap */
+ rev32 v26.16b,v0.16b
+ sub x10,x10,1 /* dec num_blocks */
+ cbz x10,.Lpost_short_loop /* break if no more */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v1.16b},[x0],16
+ eor v1.16b,v1.16b,v0.16b /* xor w/ prev value */
+
+/* aes xform 1 */
+ aese v1.16b,v8.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v9.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v10.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v11.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v12.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v13.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v14.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v15.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ /* assume this was final block */
+ mov v28.b[3],w15
+ /* save aes res, bump aes_out_ptr */
+ st1 {v1.16b},[x1],16
+ /* load res to sha 1, endian swap */
+ rev32 v27.16b,v1.16b
+ sub x10,x10,1 /* dec num_blocks */
+ cbz x10,.Lpost_short_loop /* break if no more */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v2.16b},[x0],16
+ eor v2.16b,v2.16b,v1.16b /* xor w/ prev value */
+
+/* aes xform 2 */
+ aese v2.16b,v8.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v9.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v10.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v11.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v12.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v13.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v14.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v15.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v16.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b
+ /* assume this was final block */
+ mov v29.b[3],w15
+ /* save aes res, bump aes_out_ptr */
+ st1 {v2.16b},[x1],16
+ /* load res to sha 2, endian swap */
+ rev32 v28.16b,v2.16b
+ sub x10,x10,1 /* dec num_blocks */
+ cbz x10,.Lpost_short_loop /* break if no more */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v3.16b},[x0],16
+ eor v3.16b,v3.16b,v2.16b /* xor w/prev value */
+
+/* aes xform 3 */
+ aese v3.16b,v8.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v9.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v10.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v11.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v12.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v13.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v14.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v15.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v16.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v17.16b
+ eor v3.16b,v3.16b,v18.16b
+ /* load res to sha 3, endian swap */
+ rev32 v29.16b,v3.16b
+ /* save aes res, bump aes_out_ptr */
+ st1 {v3.16b},[x1],16
+/*
+ * now we have the sha1 to do for these 4 aes blocks
+ */
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s25,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v4.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v4.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v4.4s,v29.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v5.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v5.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v5.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v6.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v6.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v6.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v25.4s,v25.4s,v21.4s
+ add v24.4s,v24.4s,v20.4s
+
+ eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */
+ eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */
+ eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */
+ eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */
+ /* assume this was final block */
+ mov v26.b[3],w15
+
+ sub x10,x10,1 /* dec num_blocks */
+ cbnz x10,.Lshort_loop /* keep looping if more */
+/*
+ * there are between 0 and 3 aes blocks in the final sha1 blocks
+ */
+.Lpost_short_loop:
+ /* Add one SHA-1 block since hash is calculated including i_key_pad */
+ add x11, x11, #64
+ lsr x12,x11,32 /* len_hi */
+ and x13,x11,0xffffffff /* len_lo */
+ lsl x12,x12,3 /* len_hi in bits */
+ lsl x13,x13,3 /* len_lo in bits */
+
+ mov v29.s[3],w13 /* len_lo */
+ mov v29.s[2],w12 /* len_hi */
+
+ /* do final block */
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s25,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v4.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v4.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v4.4s,v29.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v5.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v5.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v5.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v6.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v6.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v6.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v26.4s,v24.4s,v20.4s
+ add v27.4s,v25.4s,v21.4s
+
+ /* Calculate final HMAC */
+ eor v28.16b, v28.16b, v28.16b
+ eor v29.16b, v29.16b, v29.16b
+ /* load o_key_pad partial hash */
+ ld1 {v24.16b,v25.16b}, [x6]
+
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+
+ /* Set padding 1 to the first reg */
+ mov w11, #0x80 /* that's the 1 of the pad */
+ mov v27.b[7], w11
+
+ mov x11, #64+20 /* size of o_key_pad + inner hash */
+ lsl x11, x11, 3
+ /* move length to the end of the block */
+ mov v29.s[3], w11
+ lsr x11, x11, 32
+ mov v29.s[2], w11 /* and the higher part */
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s25,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v4.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v4.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v4.4s,v29.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v5.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v5.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v5.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v6.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v6.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v6.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v25.4s,v25.4s,v21.4s
+ add v24.4s,v24.4s,v20.4s
+
+ rev32 v24.16b, v24.16b
+ rev32 v25.16b, v25.16b
+
+ st1 {v24.16b}, [x3],16
+ st1 {v25.s}[0], [x3]
+
+ mov x9,sp /* x9 = base of v8-v15 saved at .Lshort_cases */
+ ld1 {v8.16b - v11.16b},[x9],4*16 /* reload while area is still >= sp */
+ ld1 {v12.16b - v15.16b},[x9] /* (below sp may be overwritten, e.g. by a signal) */
+ add sp,sp,8*16 /* release the frame only after the reloads */
+
+ ret
+
+ .size aes128cbc_sha1_hmac, .-aes128cbc_sha1_hmac
diff --git a/drivers/crypto/armv8/asm/sha1_hmac_aes128cbc_dec.S b/drivers/crypto/armv8/asm/sha1_hmac_aes128cbc_dec.S
new file mode 100644
index 0000000..a5a9e85
--- /dev/null
+++ b/drivers/crypto/armv8/asm/sha1_hmac_aes128cbc_dec.S
@@ -0,0 +1,1650 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (C) Cavium networks Ltd. 2016.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Cavium networks nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "assym.s"
+
+/*
+ * Description:
+ *
+ * Combined Auth/Dec Primitive = sha1_hmac/aes128cbc
+ *
+ * Operations:
+ *
+ * out = decrypt-AES128CBC(in)
+ * return_hash_ptr = SHA1(o_key_pad | SHA1(i_key_pad | in))
+ *
+ * Prototype:
+ *
+ * void sha1_hmac_aes128cbc_dec(uint8_t *csrc, uint8_t *cdst,
+ * uint8_t *dsrc, uint8_t *ddst,
+ * uint64_t len, crypto_arg_t *arg)
+ *
+ * Registers used:
+ *
+ * sha1_hmac_aes128cbc_dec(
+ * csrc, x0 (cipher src address)
+ * cdst, x1 (cipher dst address)
+ * dsrc, x2 (digest src address - ignored)
+ * ddst, x3 (digest dst address)
+ * len, x4 (length)
+ * arg x5 :
+ * arg->cipher.key (round keys)
+ * arg->cipher.iv (initialization vector)
+ * arg->digest.hmac.i_key_pad (partially hashed i_key_pad)
+ * arg->digest.hmac.o_key_pad (partially hashed o_key_pad)
+ * )
+ *
+ * Routine register definitions:
+ *
+ * v0 - v3 -- aes results
+ * v4 - v7 -- round consts for sha
+ * v8 - v18 -- round keys
+ * v19 -- temp register for SHA1
+ * v20 -- ABCD copy (q20)
+ * v21 -- sha working state (q21)
+ * v22 -- sha working state (q22)
+ * v23 -- temp register for SHA1
+ * v24 -- sha state ABCD
+ * v25 -- sha state E
+ * v26 -- sha block 0
+ * v27 -- sha block 1
+ * v28 -- sha block 2
+ * v29 -- sha block 3
+ * v30 -- reserved
+ * v31 -- reserved
+ *
+ *
+ * Constraints:
+ *
+ * The variable "len" must be a multiple of 16,
+ * otherwise results are not defined. For AES partial blocks the user
+ * is required to pad the input to modulus 16 = 0.
+ *
+ * Short lengths are less optimized at < 16 AES blocks,
+ * however they are somewhat optimized, and more so than the enc/auth versions.
+ */
+ .file "sha1_hmac_aes128cbc_dec.S"
+ .text
+ .cpu generic+fp+simd+crypto+crc
+ .global sha1_hmac_aes128cbc_dec
+ .type sha1_hmac_aes128cbc_dec,%function
+
+
+ .p2align 4 /* 16-byte boundary; each row below is loaded as one vector */
+.Lrcon:
+ .long 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999 /* key0 -> v4, used with sha1c */
+ .long 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1 /* key1 -> v5, used with sha1p */
+ .long 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc /* key2 -> v6, used with sha1m */
+ .long 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6 /* key3 -> v7, used with sha1p */
+
+sha1_hmac_aes128cbc_dec:
+/* fetch args */
+ ldr x6, [x5, #HMAC_IKEYPAD]
+ /* init ABCD, E */
+ ld1 {v24.4s, v25.4s},[x6]
+ /* save pointer to o_key_pad partial hash */
+ ldr x6, [x5, #HMAC_OKEYPAD]
+
+ ldr x2, [x5, #CIPHER_KEY]
+ ldr x5, [x5, #CIPHER_IV]
+/*
+ * init sha state, prefetch, check for small cases.
+ * Note that the output is prefetched as a load, for the in-place case
+ */
+ prfm PLDL1KEEP,[x0,0] /* pref next *in */
+ prfm PLDL1KEEP,[x1,0] /* pref next aes_ptr_out */
+ lsr x10,x4,4 /* aes_blocks = len/16 */
+ cmp x10,16 /* no main loop if <16 */
+ blt .Lshort_cases /* branch if < 16 */
+
+/* protect registers */
+ sub sp,sp,8*16
+ mov x11,x4 /* len -> x11 needed at end */
+ mov x7,sp /* copy for address mode */
+ ld1 {v30.16b},[x5] /* get 1st ivec */
+ lsr x12,x11,6 /* total_blocks (sha) */
+ mov x4,x0 /* sha_ptr_in = *in */
+ ld1 {v26.16b},[x4],16 /* next w0 */
+ ld1 {v27.16b},[x4],16 /* next w1 */
+ ld1 {v28.16b},[x4],16 /* next w2 */
+ ld1 {v29.16b},[x4],16 /* next w3 */
+
+/*
+ * now we can do the loop prolog, 1st sha1 block
+ */
+ prfm PLDL1KEEP,[x0,64] /* pref next aes_ptr_in */
+ prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out */
+ /* base address for sha round consts */
+ adr x8,.Lrcon
+/*
+ * do the first sha1 block on the plaintext
+ */
+ mov v20.16b,v24.16b /* init working ABCD */
+ st1 {v8.16b},[x7],16
+ st1 {v9.16b},[x7],16
+ rev32 v26.16b,v26.16b /* endian swap w0 */
+ st1 {v10.16b},[x7],16
+ rev32 v27.16b,v27.16b /* endian swap w1 */
+ st1 {v11.16b},[x7],16
+ rev32 v28.16b,v28.16b /* endian swap w2 */
+ st1 {v12.16b},[x7],16
+ rev32 v29.16b,v29.16b /* endian swap w3 */
+ st1 {v13.16b},[x7],16
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+ add v19.4s,v4.4s,v26.4s
+ st1 {v14.16b},[x7],16
+ add v23.4s,v4.4s,v27.4s
+ st1 {v15.16b},[x7],16
+/* quad 0 */
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1h s22,s24
+ ld1 {v8.16b},[x2],16 /* rk[0] */
+ sha1c q24,s25,v19.4s
+ sha1su1 v26.4s,v29.4s
+ ld1 {v9.16b},[x2],16 /* rk[1] */
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1h s21,s24
+ add v19.4s,v4.4s,v28.4s
+ ld1 {v10.16b},[x2],16 /* rk[2] */
+ sha1c q24,s22,v23.4s
+ sha1su1 v27.4s,v26.4s
+ add v23.4s,v4.4s,v29.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1h s22,s24
+ ld1 {v11.16b},[x2],16 /* rk[3] */
+ sha1c q24,s21,v19.4s
+ sha1su1 v28.4s,v27.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ add v19.4s,v4.4s,v26.4s
+ sha1su1 v29.4s,v28.4s
+ add v23.4s,v5.4s,v27.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1h s22,s24
+ ld1 {v12.16b},[x2],16 /* rk[4] */
+ sha1c q24,s21,v19.4s
+ add v19.4s,v5.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+ ld1 {v13.16b},[x2],16 /* rk[5] */
+/* quad 1 */
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1h s21,s24
+ ld1 {v14.16b},[x2],16 /* rk[6] */
+ sha1p q24,s22,v23.4s
+ sha1su1 v27.4s,v26.4s
+ add v23.4s,v5.4s,v29.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1h s22,s24
+ ld1 {v15.16b},[x2],16 /* rk[7] */
+ sha1p q24,s21,v19.4s
+ sha1su1 v28.4s,v27.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ add v19.4s,v5.4s,v26.4s
+ sha1su1 v29.4s,v28.4s
+ add v23.4s,v5.4s,v27.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1h s22,s24
+ ld1 {v16.16b},[x2],16 /* rk[8] */
+ sha1p q24,s21,v19.4s
+ sha1su1 v26.4s,v29.4s
+ ld1 {v17.16b},[x2],16 /* rk[9] */
+ add v19.4s,v6.4s,v28.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1h s21,s24
+ ld1 {v18.16b},[x2],16 /* rk[10] */
+ sha1p q24,s22,v23.4s
+ sha1su1 v27.4s,v26.4s
+/* quad 2 */
+ add v23.4s,v6.4s,v29.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su1 v28.4s,v27.4s
+ add v19.4s,v6.4s,v26.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su1 v29.4s,v28.4s
+ add v23.4s,v6.4s,v27.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ add v19.4s,v6.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ add v23.4s,v7.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su1 v28.4s,v27.4s
+/* quad 3 */
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su1 v29.4s,v28.4s
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ ld1 {v26.16b},[x4],16 /* next w0 */
+ sha1p q24,s21,v19.4s
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ ld1 {v27.16b},[x4],16 /* next w1 */
+ sha1p q24,s22,v23.4s
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ ld1 {v28.16b},[x4],16 /* next w2 */
+ sha1p q24,s21,v19.4s
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ ld1 {v29.16b},[x4],16 /* next w3 */
+ sha1p q24,s22,v23.4s
+
+/*
+ * aes_blocks_left := number after the main (sha) block is done.
+ * can be 0 note we account for the extra unwind in main_blocks
+ */
+ sub x7,x12,2 /* main_blocks=total_blocks-2 */
+ add v24.4s,v24.4s,v20.4s
+ and x13,x10,3 /* aes_blocks_left */
+ ld1 {v0.16b},[x0] /* next aes block, no update */
+ add v25.4s,v25.4s,v21.4s
+ add x2,x0,128 /* lead_ptr = *in */
+ /* next aes block, update aes_ptr_in */
+ ld1 {v31.16b},[x0],16
+
+/*
+ * main combined loop CBC, can be used by auth/enc version
+ */
+.Lmain_loop:
+/*
+ * Because both mov, rev32 and eor have a busy cycle,
+ * this takes longer than it looks.
+ */
+ rev32 v26.16b,v26.16b /* fix endian w0 */
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+ prfm PLDL1KEEP,[x2,64] /* pref next lead_ptr */
+ rev32 v27.16b,v27.16b /* fix endian w1 */
+ /* pref next aes_ptr_out, streaming */
+ prfm PLDL1KEEP,[x1,64]
+/* aes xform 0, sha quad 0 */
+ aesd v0.16b,v8.16b
+ rev32 v28.16b,v28.16b /* fix endian w2 */
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v9.16b
+ add v19.4s,v4.4s,v26.4s
+ aesimc v0.16b,v0.16b
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aesd v0.16b,v10.16b
+ sha1h s22,s24
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v11.16b
+ add v23.4s,v4.4s,v27.4s
+ rev32 v29.16b,v29.16b /* fix endian w3 */
+ /* read next aes block, no update */
+ ld1 {v1.16b},[x0]
+ aesimc v0.16b,v0.16b
+ sha1c q24,s25,v19.4s
+ aesd v0.16b,v12.16b
+ sha1su1 v26.4s,v29.4s
+ aesimc v0.16b,v0.16b
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aesd v0.16b,v13.16b
+ sha1h s21,s24
+ add v19.4s,v4.4s,v28.4s
+ aesimc v0.16b,v0.16b
+ sha1c q24,s22,v23.4s
+ aesd v0.16b,v14.16b
+ add v23.4s,v4.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+ aesimc v0.16b,v0.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesd v0.16b,v15.16b
+ sha1h s22,s24
+ aesimc v0.16b,v0.16b
+ sha1c q24,s21,v19.4s
+ aesd v0.16b,v16.16b
+ sha1su1 v28.4s,v27.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesimc v0.16b,v0.16b
+ sha1h s21,s24
+ aesd v0.16b,v17.16b
+ sha1c q24,s22,v23.4s
+ add v19.4s,v4.4s,v26.4s
+ sha1su1 v29.4s,v28.4s
+ eor v0.16b,v0.16b,v18.16b /* final res 0 */
+ eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */
+ /* get next aes block, with update */
+ ld1 {v30.16b},[x0],16
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ add v23.4s,v5.4s,v27.4s
+ sha1su1 v26.4s,v29.4s
+/* aes xform 1, sha quad 1 */
+ sha1su0 v27.4s,v28.4s,v29.4s
+ /* save aes res, bump aes_out_ptr */
+ st1 {v0.16b},[x1],16
+ aesd v1.16b,v8.16b
+ sha1h s21,s24
+ add v19.4s,v5.4s,v28.4s
+ sha1p q24,s22,v23.4s
+ aesimc v1.16b,v1.16b
+ sha1su1 v27.4s,v26.4s
+ aesd v1.16b,v9.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1h s22,s24
+ aesimc v1.16b,v1.16b
+ sha1p q24,s21,v19.4s
+ aesd v1.16b,v10.16b
+ /* read next aes block, no update */
+ ld1 {v2.16b},[x0]
+ add v23.4s,v5.4s,v29.4s
+ sha1su1 v28.4s,v27.4s
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v11.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesimc v1.16b,v1.16b
+ sha1h s21,s24
+ aesd v1.16b,v12.16b
+ sha1p q24,s22,v23.4s
+ sha1su1 v29.4s,v28.4s
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v13.16b
+ sha1h s22,s24
+ add v19.4s,v5.4s,v26.4s
+ aesimc v1.16b,v1.16b
+ sha1p q24,s21,v19.4s
+ aesd v1.16b,v14.16b
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+ aesimc v1.16b,v1.16b
+ add x2,x2,64 /* bump lead_ptr */
+ aesd v1.16b,v15.16b
+ add v23.4s,v5.4s,v27.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1h s21,s24
+ aesimc v1.16b,v1.16b
+ sha1p q24,s22,v23.4s
+ aesd v1.16b,v16.16b
+ sha1su1 v27.4s,v26.4s
+ add v19.4s,v6.4s,v28.4s
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v17.16b
+ add v23.4s,v6.4s,v29.4s
+ eor v1.16b,v1.16b,v18.16b /* res xf 1 */
+ eor v1.16b,v1.16b,v31.16b /* mode op 1 xor w/prev value */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v31.16b},[x0],16
+/* aes xform 2, sha quad 2 */
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesd v2.16b,v8.16b
+ /* save aes res, bump aes_out_ptr */
+ st1 {v1.16b},[x1],16
+ sha1h s22,s24
+ aesimc v2.16b,v2.16b
+ sha1m q24,s21,v19.4s
+ aesd v2.16b,v9.16b
+ sha1su1 v28.4s,v27.4s
+ aesimc v2.16b,v2.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesd v2.16b,v10.16b
+ sha1h s21,s24
+ aesimc v2.16b,v2.16b
+ sha1m q24,s22,v23.4s
+ aesd v2.16b,v11.16b
+ sha1su1 v29.4s,v28.4s
+ add v19.4s,v6.4s,v26.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v12.16b
+ sha1h s22,s24
+ aesimc v2.16b,v2.16b
+ sha1m q24,s21,v19.4s
+ aesd v2.16b,v13.16b
+ sha1su1 v26.4s,v29.4s
+ add v23.4s,v6.4s,v27.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aesimc v2.16b,v2.16b
+ /* read next aes block, no update */
+ ld1 {v3.16b},[x0]
+ aesd v2.16b,v14.16b
+ sha1h s21,s24
+ aesimc v2.16b,v2.16b
+ sha1m q24,s22,v23.4s
+ aesd v2.16b,v15.16b
+ sha1su1 v27.4s,v26.4s
+ add v19.4s,v6.4s,v28.4s
+ aesimc v2.16b,v2.16b
+ sha1h s22,s24
+ aesd v2.16b,v16.16b
+ sha1m q24,s21,v19.4s
+ aesimc v2.16b,v2.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesd v2.16b,v17.16b
+ sha1su1 v28.4s,v27.4s
+ add v23.4s,v7.4s,v29.4s
+ eor v2.16b,v2.16b,v18.16b /* res 2 */
+ add v19.4s,v7.4s,v26.4s
+ eor v2.16b,v2.16b,v30.16b /* mode of 2 xor w/prev value */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v30.16b},[x0],16
+/* aes xform 3, sha quad 3 */
+ aesd v3.16b,v8.16b
+ aesimc v3.16b,v3.16b
+ /* save aes res, bump aes_out_ptr */
+ st1 {v2.16b},[x1],16
+ aesd v3.16b,v9.16b
+ sha1h s21,s24
+ aesimc v3.16b,v3.16b
+ sha1p q24,s22,v23.4s
+ aesd v3.16b,v10.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesimc v3.16b,v3.16b
+ sha1su1 v29.4s,v28.4s
+ aesd v3.16b,v11.16b
+ sha1h s22,s24
+ ld1 {v26.16b},[x4],16 /* next w0 */
+ aesimc v3.16b,v3.16b
+ sha1p q24,s21,v19.4s
+ aesd v3.16b,v12.16b
+ aesimc v3.16b,v3.16b
+ add v23.4s,v7.4s,v27.4s
+ aesd v3.16b,v13.16b
+ sha1h s21,s24
+ ld1 {v27.16b},[x4],16 /* next w1 */
+ aesimc v3.16b,v3.16b
+ sha1p q24,s22,v23.4s
+ aesd v3.16b,v14.16b
+ sub x7,x7,1 /* dec block count */
+ aesimc v3.16b,v3.16b
+ add v19.4s,v7.4s,v28.4s
+ aesd v3.16b,v15.16b
+ ld1 {v0.16b},[x0] /* next aes block, no update */
+ sha1h s22,s24
+ ld1 {v28.16b},[x4],16 /* next w2 */
+ aesimc v3.16b,v3.16b
+ sha1p q24,s21,v19.4s
+ aesd v3.16b,v16.16b
+ aesimc v3.16b,v3.16b
+ add v23.4s,v7.4s,v29.4s
+ aesd v3.16b,v17.16b
+ sha1h s21,s24
+ ld1 {v29.16b},[x4],16 /* next w3 */
+ sha1p q24,s22,v23.4s
+ add v24.4s,v24.4s,v20.4s
+ eor v3.16b,v3.16b,v18.16b /* aes res 3 */
+ eor v3.16b,v3.16b,v31.16b /* xor w/ prev value */
+ /* next aes block, update aes_ptr_in */
+ ld1 {v31.16b},[x0],16
+ add v25.4s,v25.4s,v21.4s
+ /* save aes res, bump aes_out_ptr */
+ st1 {v3.16b},[x1],16
+ /* loop if more to do */
+ cbnz x7,.Lmain_loop
+/*
+ * now the loop epilog. Since the reads for sha have already been done
+ * in advance, we have to have an extra unwind.
+ * This is why the test for the short cases is 16 and not 12.
+ *
+ * the unwind, which is just the main loop without the tests or final reads.
+ */
+ rev32 v26.16b,v26.16b /* fix endian w0 */
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+ prfm PLDL1KEEP,[x2,64] /* pref next lead_ptr */
+ rev32 v27.16b,v27.16b /* fix endian w1 */
+ /* pref next aes_ptr_out, streaming */
+ prfm PLDL1KEEP,[x1,64]
+/* aes xform 0, sha quad 0 */
+ aesd v0.16b,v8.16b
+ add v19.4s,v4.4s,v26.4s
+ rev32 v28.16b,v28.16b /* fix endian w2 */
+ aesimc v0.16b,v0.16b
+ sha1su0 v26.4s,v27.4s,v28.4s
+ /* read next aes block, no update */
+ ld1 {v1.16b},[x0]
+ aesd v0.16b,v9.16b
+ sha1h s22,s24
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v10.16b
+ add v23.4s,v4.4s,v27.4s
+ aesimc v0.16b,v0.16b
+ sha1c q24,s25,v19.4s
+ aesd v0.16b,v11.16b
+ rev32 v29.16b,v29.16b /* fix endian w3 */
+ aesimc v0.16b,v0.16b
+ sha1su1 v26.4s,v29.4s
+ aesd v0.16b,v12.16b
+ aesimc v0.16b,v0.16b
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aesd v0.16b,v13.16b
+ sha1h s21,s24
+ add v19.4s,v4.4s,v28.4s
+ aesimc v0.16b,v0.16b
+ sha1c q24,s22,v23.4s
+ aesd v0.16b,v14.16b
+ add v23.4s,v4.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+ aesimc v0.16b,v0.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesd v0.16b,v15.16b
+ sha1h s22,s24
+ aesimc v0.16b,v0.16b
+ sha1c q24,s21,v19.4s
+ aesd v0.16b,v16.16b
+ sha1su1 v28.4s,v27.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesimc v0.16b,v0.16b
+ sha1h s21,s24
+ aesd v0.16b,v17.16b
+ sha1c q24,s22,v23.4s
+ add v19.4s,v4.4s,v26.4s
+ sha1su1 v29.4s,v28.4s
+ eor v0.16b,v0.16b,v18.16b /* final res 0 */
+ add v23.4s,v5.4s,v27.4s
+ eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v30.16b},[x0],16
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su1 v26.4s,v29.4s
+/* aes xform 1, sha quad 1 */
+ /* save aes res, bump aes_out_ptr */
+ st1 {v0.16b},[x1],16
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aesd v1.16b,v8.16b
+ sha1h s21,s24
+ add v19.4s,v5.4s,v28.4s
+ aesimc v1.16b,v1.16b
+ sha1p q24,s22,v23.4s
+ aesd v1.16b,v9.16b
+ aesimc v1.16b,v1.16b
+ add v23.4s,v5.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+ aesd v1.16b,v10.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ /* read next aes block, no update */
+ ld1 {v2.16b},[x0]
+ aesimc v1.16b,v1.16b
+ sha1h s22,s24
+ aesd v1.16b,v11.16b
+ sha1p q24,s21,v19.4s
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v12.16b
+ sha1su1 v28.4s,v27.4s
+ aesimc v1.16b,v1.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesd v1.16b,v13.16b
+ sha1h s21,s24
+ aesimc v1.16b,v1.16b
+ sha1p q24,s22,v23.4s
+ aesd v1.16b,v14.16b
+ add v19.4s,v5.4s,v26.4s
+ sha1su1 v29.4s,v28.4s
+ aesimc v1.16b,v1.16b
+ add x2,x2,64 /* bump lead_ptr */
+ aesd v1.16b,v15.16b
+ add v23.4s,v5.4s,v27.4s
+ aesimc v1.16b,v1.16b
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aesd v1.16b,v16.16b
+ sha1h s22,s24
+ aesimc v1.16b,v1.16b
+ sha1p q24,s21,v19.4s
+ aesd v1.16b,v17.16b
+ add v19.4s,v6.4s,v28.4s
+ eor v1.16b,v1.16b,v18.16b /* res xf 1 */
+ sha1su1 v26.4s,v29.4s
+ eor v1.16b,v1.16b,v31.16b /* mode op 1 xor w/prev value */
+ sha1su0 v27.4s,v28.4s,v29.4s
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v31.16b},[x0],16
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ add v23.4s,v6.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+/* mode op 2 */
+/* aes xform 2, sha quad 2 */
+ aesd v2.16b,v8.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ /* save aes res, bump aes_out_ptr */
+ st1 {v1.16b},[x1],16
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v9.16b
+ sha1h s22,s24
+ aesimc v2.16b,v2.16b
+ sha1m q24,s21,v19.4s
+ aesd v2.16b,v10.16b
+ sha1su1 v28.4s,v27.4s
+ aesimc v2.16b,v2.16b
+ add v19.4s,v6.4s,v26.4s
+ aesd v2.16b,v11.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v12.16b
+ sha1h s21,s24
+ aesimc v2.16b,v2.16b
+ sha1m q24,s22,v23.4s
+ aesd v2.16b,v13.16b
+ sha1su1 v29.4s,v28.4s
+ aesimc v2.16b,v2.16b
+ /* read next aes block, no update */
+ ld1 {v3.16b},[x0]
+ aesd v2.16b,v14.16b
+ add v23.4s,v6.4s,v27.4s
+ aesimc v2.16b,v2.16b
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aesd v2.16b,v15.16b
+ sha1h s22,s24
+ aesimc v2.16b,v2.16b
+ sha1m q24,s21,v19.4s
+ aesd v2.16b,v16.16b
+ add v19.4s,v6.4s,v28.4s
+ aesimc v2.16b,v2.16b
+ sha1su1 v26.4s,v29.4s
+ aesd v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b /* res 2 */
+ eor v2.16b,v2.16b,v30.16b /* mode of 2 xor w/prev value */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v30.16b},[x0],16
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ add v23.4s,v7.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su1 v28.4s,v27.4s
+/* mode op 3 */
+/* aes xform 3, sha quad 3 */
+ aesd v3.16b,v8.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesimc v3.16b,v3.16b
+ /* save aes res, bump aes_out_ptr */
+ st1 {v2.16b},[x1],16
+ aesd v3.16b,v9.16b
+ sha1h s21,s24
+ aesimc v3.16b,v3.16b
+ sha1p q24,s22,v23.4s
+ aesd v3.16b,v10.16b
+ sha1su1 v29.4s,v28.4s
+ aesimc v3.16b,v3.16b
+ add v19.4s,v7.4s,v26.4s
+ aesd v3.16b,v11.16b
+ sha1h s22,s24
+ aesimc v3.16b,v3.16b
+ sha1p q24,s21,v19.4s
+ aesd v3.16b,v12.16b
+ /* read first aes block, no bump */
+ ld1 {v0.16b},[x0]
+ aesimc v3.16b,v3.16b
+ add v23.4s,v7.4s,v27.4s
+ aesd v3.16b,v13.16b
+ sha1h s21,s24
+ aesimc v3.16b,v3.16b
+ sha1p q24,s22,v23.4s
+ add v19.4s,v7.4s,v28.4s
+ aesd v3.16b,v14.16b
+ sha1h s22,s24
+ aesimc v3.16b,v3.16b
+ sha1p q24,s21,v19.4s
+ aesd v3.16b,v15.16b
+ add v23.4s,v7.4s,v29.4s
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v16.16b
+ sha1h s21,s24
+ aesimc v3.16b,v3.16b
+ sha1p q24,s22,v23.4s
+ aesd v3.16b,v17.16b
+ eor v3.16b,v3.16b,v18.16b /* aes res 3 */
+ eor v3.16b,v3.16b,v31.16b /* xor w/ prev value */
+ /* read first aes block, bump aes_ptr_in */
+ ld1 {v31.16b},[x0],16
+
+ add v25.4s,v25.4s,v21.4s
+ add v24.4s,v24.4s,v20.4s
+
+/*
+ * now we have to do the 4 aes blocks (b-2) that catch up to where sha is
+ */
+
+/* aes xform 0 */
+ aesd v0.16b,v8.16b
+ /* save aes res, bump aes_out_ptr */
+ st1 {v3.16b},[x1],16
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v9.16b
+ /* read next aes block, no update */
+ ld1 {v1.16b},[x0]
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v10.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v11.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v12.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v13.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v14.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v15.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v17.16b
+ eor v0.16b,v0.16b,v18.16b /* res 0 */
+ eor v0.16b,v0.16b,v30.16b /* xor w/ ivec (modeop) */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v30.16b},[x0],16
+ /* save aes res, bump aes_out_ptr */
+ st1 {v0.16b},[x1],16
+
+/* aes xform 1 */
+ aesd v1.16b,v8.16b
+ /* read next aes block, no update */
+ ld1 {v2.16b},[x0]
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v9.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v10.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v11.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v12.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v13.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v14.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v15.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b /* res 1 */
+ eor v1.16b,v1.16b,v31.16b /* xor w/ ivec (modeop) */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v31.16b},[x0],16
+ /* save aes res, bump aes_out_ptr */
+ st1 {v1.16b},[x1],16
+
+/* aes xform 2 */
+ aesd v2.16b,v8.16b
+ /* read next aes block, no update */
+ ld1 {v3.16b},[x0]
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v9.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v10.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v11.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v12.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v13.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v14.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v15.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v16.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b /* res 2 */
+ eor v2.16b,v2.16b,v30.16b /* xor w/ ivec (modeop) */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v30.16b},[x0],16
+ /* save aes res, bump aes_out_ptr */
+ st1 {v2.16b},[x1],16
+
+/* aes xform 3 */
+ aesd v3.16b,v8.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v9.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v10.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v11.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v12.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v13.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v14.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v15.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v16.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v17.16b
+ eor v3.16b,v3.16b,v18.16b /* res 3 */
+ eor v3.16b,v3.16b,v31.16b /* xor w/ ivec (modeop) */
+ /* save aes res, bump aes_out_ptr */
+ st1 {v3.16b},[x1],16
+/*
+ * Build the final, padded SHA-1 block (also used by the short_cases path).
+ * x13 holds the number of 16-byte AES blocks still unprocessed (0-3); only
+ * that many blocks are loaded from x4, so nothing past the input is read.
+ * The 0x80 pad byte goes right after the data; remaining words are zeroed.
+ */
+.Ljoin_common:
+ mov w15,0x80 /* pad marker: a 1 bit followed by seven 0 bits */
+ cbnz x13,.Lpad100 /* branch if there is some real data */
+ eor v26.16b,v26.16b,v26.16b /* zero the rest */
+ eor v27.16b,v27.16b,v27.16b /* zero the rest */
+ eor v28.16b,v28.16b,v28.16b /* zero the rest */
+ eor v29.16b,v29.16b,v29.16b /* zero the rest */
+ mov v26.b[0],w15 /* no data: pad leads w0 (rev32 at .Lpad_done moves it to the MSB) */
+ b .Lpad_done /* go append the length */
+
+.Lpad100:
+ sub x14,x13,1 /* dec amount left */
+ ld1 {v26.16b},[x4],16 /* next w0 */
+ cbnz x14,.Lpad200 /* branch if there is some real data */
+ eor v27.16b,v27.16b,v27.16b /* zero the rest */
+ eor v28.16b,v28.16b,v28.16b /* zero the rest */
+ eor v29.16b,v29.16b,v29.16b /* zero the rest */
+ mov v27.b[0],w15 /* one data block: pad leads w1 */
+ b .Lpad_done /* go append the length */
+
+.Lpad200:
+ sub x14,x14,1 /* dec amount left */
+ ld1 {v27.16b},[x4],16 /* next w1 */
+ cbnz x14,.Lpad300 /* branch if there is some real data */
+ eor v28.16b,v28.16b,v28.16b /* zero the rest */
+ eor v29.16b,v29.16b,v29.16b /* zero the rest */
+ mov v28.b[0],w15 /* two data blocks: pad leads w2 */
+ b .Lpad_done /* go append the length */
+
+.Lpad300:
+ ld1 {v28.16b},[x4],16 /* next w2 */
+ eor v29.16b,v29.16b,v29.16b /* zero the rest */
+ mov v29.b[3],w15 /* three data blocks: pad leads w3; v29 is not rev32'd, so set the MSB directly */
+
+.Lpad_done:
+ /* Add one SHA-1 block since hash is calculated including i_key_pad */
+ add x11, x11, #64
+ /*
+  * Convert the byte count to a 64-bit bit count before splitting it, so
+  * that the top three bits of the low word carry into the high word.
+  */
+ lsl x14,x11,3 /* total length in bits */
+ lsr x12,x14,32 /* len_hi = bits 63:32 of the bit length */
+ mov v29.s[3],w14 /* len_lo (w14 = bits 31:0 of the bit length) */
+ mov v29.s[2],w12 /* len_hi */
+
+ rev32 v26.16b,v26.16b /* fix endian w0 */
+ rev32 v27.16b,v27.16b /* fix endian w1 */
+ rev32 v28.16b,v28.16b /* fix endian w2 */
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+/*
+ * final sha block
+ * the strategy is to combine the 0-3 aes blocks, which is faster but
+ * a little gourmand on code space.
+ */
+ cbz x13,.Lzero_aes_blocks_left /* none to do */
+ /* read first aes block, bump aes_ptr_in */
+ ld1 {v0.16b},[x0]
+ ld1 {v31.16b},[x0],16
+
+ aesd v0.16b,v8.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v9.16b
+ aesimc v0.16b,v0.16b
+ add v19.4s,v4.4s,v26.4s
+ aesd v0.16b,v10.16b
+ add v23.4s,v4.4s,v27.4s
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v11.16b
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aesimc v0.16b,v0.16b
+ sha1h s22,s24
+ aesd v0.16b,v12.16b
+ sha1c q24,s25,v19.4s
+ sha1su1 v26.4s,v29.4s
+ aesimc v0.16b,v0.16b
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aesd v0.16b,v13.16b
+ sha1h s21,s24
+ aesimc v0.16b,v0.16b
+ sha1c q24,s22,v23.4s
+ aesd v0.16b,v14.16b
+ sha1su1 v27.4s,v26.4s
+ add v19.4s,v4.4s,v28.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesimc v0.16b,v0.16b
+ sha1h s22,s24
+ aesd v0.16b,v15.16b
+ sha1c q24,s21,v19.4s
+ aesimc v0.16b,v0.16b
+ sha1su1 v28.4s,v27.4s
+ add v23.4s,v4.4s,v29.4s
+ aesd v0.16b,v16.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1h s21,s24
+ aesimc v0.16b,v0.16b
+ sha1c q24,s22,v23.4s
+ aesd v0.16b,v17.16b
+ sha1su1 v29.4s,v28.4s
+ eor v3.16b,v0.16b,v18.16b /* res 0 */
+ eor v3.16b,v3.16b,v30.16b /* xor w/ ivec (modeop) */
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+ /* dec counter */
+ sub x13,x13,1
+ /* save aes res, bump aes_out_ptr */
+ st1 {v3.16b},[x1],16
+ cbz x13,.Lfrmquad1
+
+/* aes xform 1 */
+ /* read first aes block, bump aes_ptr_in */
+ ld1 {v0.16b},[x0]
+ ld1 {v30.16b},[x0],16
+ add v23.4s,v5.4s,v27.4s
+ aesd v0.16b,v8.16b
+ add v19.4s,v5.4s,v28.4s
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v9.16b
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v10.16b
+ sha1h s21,s24
+ aesimc v0.16b,v0.16b
+ sha1p q24,s22,v23.4s
+ aesd v0.16b,v11.16b
+ sha1su1 v27.4s,v26.4s
+ aesimc v0.16b,v0.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesd v0.16b,v12.16b
+ sha1h s22,s24
+ aesimc v0.16b,v0.16b
+ sha1p q24,s21,v19.4s
+ aesd v0.16b,v13.16b
+ sha1su1 v28.4s,v27.4s
+ add v23.4s,v5.4s,v29.4s
+ aesimc v0.16b,v0.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesd v0.16b,v14.16b
+ sha1h s21,s24
+ aesimc v0.16b,v0.16b
+ sha1p q24,s22,v23.4s
+ aesd v0.16b,v15.16b
+ sha1su1 v29.4s,v28.4s
+ aesimc v0.16b,v0.16b
+ add v19.4s,v5.4s,v26.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aesd v0.16b,v16.16b
+ sha1h s22,s24
+ aesimc v0.16b,v0.16b
+ sha1p q24,s21,v19.4s
+ aesd v0.16b,v17.16b
+ sha1su1 v26.4s,v29.4s
+ eor v3.16b,v0.16b,v18.16b /* res 0 */
+ eor v3.16b,v3.16b,v31.16b /* xor w/ ivec (modeop) */
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ sub x13,x13,1 /* dec counter */
+ /* save aes res, bump aes_out_ptr */
+ st1 {v3.16b},[x1],16
+ cbz x13,.Lfrmquad2
+
+/* aes xform 2 */
+ /* read first aes block, bump aes_ptr_in */
+ ld1 {v0.16b},[x0],16
+ add v19.4s,v6.4s,v28.4s
+ aesd v0.16b,v8.16b
+ add v23.4s,v6.4s,v29.4s
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v9.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesimc v0.16b,v0.16b
+ sha1h s22,s24
+ aesd v0.16b,v10.16b
+ sha1m q24,s21,v19.4s
+ aesimc v0.16b,v0.16b
+ sha1su1 v28.4s,v27.4s
+ aesd v0.16b,v11.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesimc v0.16b,v0.16b
+ sha1h s21,s24
+ aesd v0.16b,v12.16b
+ sha1m q24,s22,v23.4s
+ aesimc v0.16b,v0.16b
+ sha1su1 v29.4s,v28.4s
+ aesd v0.16b,v13.16b
+ add v19.4s,v6.4s,v26.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v14.16b
+ sha1h s22,s24
+ aesimc v0.16b,v0.16b
+ sha1m q24,s21,v19.4s
+ aesd v0.16b,v15.16b
+ sha1su1 v26.4s,v29.4s
+ aesimc v0.16b,v0.16b
+ add v23.4s,v6.4s,v27.4s
+ aesd v0.16b,v16.16b
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aesimc v0.16b,v0.16b
+ sha1h s21,s24
+ aesd v0.16b,v17.16b
+ sha1m q24,s22,v23.4s
+ eor v3.16b,v0.16b,v18.16b /* res 0 */
+ sha1su1 v27.4s,v26.4s
+ eor v3.16b,v3.16b,v30.16b /* xor w/ ivec (modeop) */
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+ /* save aes res, bump aes_out_ptr */
+ st1 {v3.16b},[x1],16
+ b .Lfrmquad3
+/*
+ * the final block with no aes component, i.e from here there were zero blocks
+ */
+
+.Lzero_aes_blocks_left:
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s25,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v4.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v4.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v4.4s,v29.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+/* quad 1 */
+.Lfrmquad1:
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v5.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v5.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v5.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+/* quad 2 */
+.Lfrmquad2:
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v6.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v6.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v6.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+/* quad 3 */
+.Lfrmquad3:
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v26.4s,v24.4s,v20.4s /* inner digest A-D becomes w0 of the outer block */
+ add v27.4s,v25.4s,v21.4s /* inner digest E lands in lane 0 of w1 */
+
+ /* Calculate final HMAC: one SHA-1 block of inner digest + padding + length */
+ eor v28.16b, v28.16b, v28.16b
+ eor v29.16b, v29.16b, v29.16b
+ /* load o_key_pad partial hash */
+ ld1 {v24.16b,v25.16b}, [x6]
+ /* working ABCD <- ABCD */
+ mov v20.16b,v24.16b
+
+ /* Pad byte follows the 20-byte inner digest: top byte of v27.s[1] */
+ mov w11, #0x80 /* that's the 1 of the pad */
+ mov v27.b[7], w11 /* NOTE(review): assumes the rest of v27 lanes 1-3 is zero - confirm */
+ /* size of o_key_pad + inner hash */
+ mov x11, #64+20
+ /* move length to the end of the block */
+ lsl x11, x11, 3 /* bytes -> bits */
+ mov v29.s[3], w11 /* low 32 bits of the bit length */
+ lsr x11, x11, 32
+ mov v29.s[2], w11 /* and the higher part */
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s25,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v4.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v4.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v4.4s,v29.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v5.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v5.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v5.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v6.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v6.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v6.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v25.4s,v25.4s,v21.4s
+ add v24.4s,v24.4s,v20.4s
+
+ rev32 v24.16b, v24.16b
+ rev32 v25.16b, v25.16b
+
+ st1 {v24.16b}, [x3],16
+ st1 {v25.s}[0], [x3]
+
+ /* Reload callee-saved v8-v15 while the frame is still allocated; memory below sp is not safe to read */
+ mov x9,sp /* copy for address mode */
+ ld1 {v8.16b - v11.16b},[x9],4*16
+ ld1 {v12.16b - v15.16b},[x9]
+ add sp,sp,8*16 /* release the 128-byte save area only after the reload */
+ ret
+
+/*
+ * Short path (less efficient, fully serial), here used for 1-11 aes blocks.
+ * In: x10 = aes_blocks, x0 = in, x1 = out, x2 = round keys, x5 = ivec
+ */
+.Lshort_cases:
+ sub sp,sp,8*16 /* 128-byte save area for v8-v15 */
+ mov x9,sp /* copy for address mode */
+ st1 {v8.16b - v11.16b},[x9],4*16 /* v8-v15 are reused for round keys below */
+ st1 {v12.16b - v15.16b},[x9]
+
+ ld1 {v30.16b},[x5] /* get ivec */
+ ld1 {v8.16b-v11.16b},[x2],64 /* rk[0-3] */
+ ld1 {v12.16b-v15.16b},[x2],64 /* rk[4-7] */
+ ld1 {v16.16b-v18.16b},[x2] /* rk[8-10] */
+ adr x8,.Lrcon /* rcon */
+ lsl x11,x10,4 /* len = aes_blocks*16 */
+ mov x4,x0 /* sha_ptr_in = in */
+
+ mov x9,x8 /* top of rcon */
+
+ ld1 {v4.16b},[x9],16 /* key0: added to the schedule in quad 0 (sha1c) */
+ ld1 {v5.16b},[x9],16 /* key1: added in quad 1 (sha1p) */
+ ld1 {v6.16b},[x9],16 /* key2: added in quad 2 (sha1m) */
+ ld1 {v7.16b},[x9],16 /* key3: added in quad 3 (sha1p) */
+
+/*
+ * This loop does 4 at a time, so that at the end there is a final sha block
+ * and 0-3 aes blocks. Note that everything is done serially
+ * to avoid complication.
+ */
+.Lshort_loop:
+ cmp x10,4 /* check if 4 or more */
+ /* if less, bail to last block */
+ blt .Llast_sha_block
+
+ ld1 {v31.16b},[x4] /* next w no update */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v0.16b},[x4],16
+ rev32 v26.16b,v0.16b /* endian swap for sha */
+ add x0,x0,64
+
+/* aes xform 0 */
+ aesd v0.16b,v8.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v9.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v10.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v11.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v12.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v13.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v14.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v15.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v17.16b
+ eor v0.16b,v0.16b,v18.16b
+ eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */
+
+ ld1 {v30.16b},[x4] /* read no update */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v1.16b},[x4],16
+ rev32 v27.16b,v1.16b /* endian swap for sha */
+ /* save aes res, bump aes_out_ptr */
+ st1 {v0.16b},[x1],16
+
+/* aes xform 1 */
+ aesd v1.16b,v8.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v9.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v10.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v11.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v12.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v13.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v14.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v15.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ eor v1.16b,v1.16b,v31.16b /* xor w/ prev value */
+
+ ld1 {v31.16b},[x4] /* read no update */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v2.16b},[x4],16
+ rev32 v28.16b,v2.16b /* endian swap for sha */
+ /* save aes res, bump aes_out_ptr */
+ st1 {v1.16b},[x1],16
+
+/* aes xform 2 */
+ aesd v2.16b,v8.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v9.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v10.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v11.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v12.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v13.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v14.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v15.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v16.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b
+ eor v2.16b,v2.16b,v30.16b /* xor w/ prev value */
+
+ ld1 {v30.16b},[x4] /* read no update */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v3.16b},[x4],16
+ rev32 v29.16b,v3.16b /* endian swap for sha */
+ /* save aes res, bump aes_out_ptr */
+ st1 {v2.16b},[x1],16
+
+/* aes xform 3 */
+ aesd v3.16b,v8.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v9.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v10.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v11.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v12.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v13.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v14.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v15.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v16.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v17.16b
+ eor v3.16b,v3.16b,v18.16b
+ eor v3.16b,v3.16b,v31.16b /* xor w/ prev value */
+/*
+ * Now do the sha1 for these 4 aes blocks (byte-swapped input is in v26-v29).
+ */
+
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+ /* save aes res, bump aes_out_ptr */
+ st1 {v3.16b},[x1],16
+/* quad 0 */
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s25,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v4.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v4.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v4.4s,v29.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+/* quad 1 */
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v5.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v5.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v5.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+/* quad 2 */
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v6.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v6.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v6.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+/* quad 3: rounds using key3 (v7) start here */
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v25.4s,v25.4s,v21.4s
+ add v24.4s,v24.4s,v20.4s
+
+ sub x10,x10,4 /* 4 less */
+ b .Lshort_loop /* keep looping */
+/*
+ * this is arranged so that we can join the common unwind code
+ * that does the last sha block and the final 0-3 aes blocks
+ */
+.Llast_sha_block:
+ mov x13,x10 /* copy aes blocks for common */
+ b .Ljoin_common /* join common code */
+
+ .size sha1_hmac_aes128cbc_dec, .-sha1_hmac_aes128cbc_dec
--
1.9.1
next prev parent reply other threads:[~2016-12-06 17:35 UTC|newest]
Thread overview: 100+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-12-04 11:33 [PATCH] Add crypto PMD optimized for ARMv8 zbigniew.bodek
2016-12-04 11:33 ` [PATCH 1/3] mk: fix build of assembly files for ARM64 zbigniew.bodek
2016-12-04 11:33 ` [PATCH 2/3] crypto/armv8: add PMD optimized for ARMv8 processors zbigniew.bodek
2016-12-04 11:33 ` [PATCH 3/3] app/test: add ARMv8 crypto tests and test vectors zbigniew.bodek
2016-12-07 2:32 ` [PATCH v2 00/12] Add crypto PMD optimized for ARMv8 zbigniew.bodek
2016-12-07 2:32 ` [PATCH v2 01/12] mk: fix build of assembly files for ARM64 zbigniew.bodek
2016-12-21 14:46 ` De Lara Guarch, Pablo
2017-01-04 17:33 ` [PATCH v3 0/8] Add crypto PMD optimized for ARMv8 zbigniew.bodek
2017-01-04 17:33 ` [PATCH v3 1/8] mk: fix build of assembly files for ARM64 zbigniew.bodek
2017-01-13 8:13 ` Hemant Agrawal
2017-01-04 17:33 ` [PATCH v3 2/8] lib: add cryptodev type for the upcoming ARMv8 PMD zbigniew.bodek
2017-01-13 8:16 ` Hemant Agrawal
2017-01-13 15:50 ` Zbigniew Bodek
2017-01-16 5:57 ` Jianbo Liu
2017-01-04 17:33 ` [PATCH v3 3/8] crypto/armv8: add PMD optimized for ARMv8 processors zbigniew.bodek
2017-01-06 2:45 ` Jianbo Liu
2017-01-12 13:12 ` Zbigniew Bodek
2017-01-13 7:41 ` Jianbo Liu
2017-01-13 19:09 ` Zbigniew Bodek
2017-01-13 7:57 ` Hemant Agrawal
2017-01-13 19:15 ` Zbigniew Bodek
2017-01-17 15:48 ` [PATCH v4 0/7] Add crypto PMD optimized for ARMv8 zbigniew.bodek
2017-01-17 15:48 ` [PATCH v4 1/7] lib: add cryptodev type for the upcoming ARMv8 PMD zbigniew.bodek
2017-01-18 2:24 ` Jerin Jacob
2017-01-17 15:48 ` [PATCH v4 2/7] crypto/armv8: add PMD optimized for ARMv8 processors zbigniew.bodek
2017-01-18 14:27 ` [PATCH v5 0/7] Add crypto PMD optimized for ARMv8 zbigniew.bodek
2017-01-18 14:27 ` [PATCH v5 1/7] cryptodev: add cryptodev type for the ARMv8 PMD zbigniew.bodek
2017-01-18 14:27 ` [PATCH v5 2/7] crypto/armv8: add PMD optimized for ARMv8 processors zbigniew.bodek
2017-01-18 20:01 ` [PATCH v6 0/8] Add crypto PMD optimized for ARMv8 zbigniew.bodek
2017-01-18 20:01 ` [PATCH v6 1/8] cryptodev: add cryptodev type for the ARMv8 PMD zbigniew.bodek
2017-01-18 20:01 ` [PATCH v6 2/8] crypto/armv8: add PMD optimized for ARMv8 processors zbigniew.bodek
2017-01-18 20:01 ` [PATCH v6 3/8] mk: add PMD to the build system zbigniew.bodek
2017-01-18 20:01 ` [PATCH v6 4/8] cryptodev/armv8: introduce ARM-specific feature flags zbigniew.bodek
2017-01-18 20:01 ` [PATCH v6 5/8] doc: update documentation about ARMv8 crypto PMD zbigniew.bodek
2017-01-18 20:01 ` [PATCH v6 6/8] crypto/armv8: enable ARMv8 PMD in the configuration zbigniew.bodek
2017-01-18 20:02 ` [PATCH v6 7/8] MAINTAINERS: update MAINTAINERS entry for ARMv8 crypto zbigniew.bodek
2017-01-18 20:02 ` [PATCH v6 8/8] app/test: add ARMv8 crypto tests and test vectors zbigniew.bodek
2017-01-18 21:14 ` [PATCH v6 0/8] Add crypto PMD optimized for ARMv8 De Lara Guarch, Pablo
2017-01-19 10:36 ` Zbigniew Bodek
2017-01-18 14:27 ` [PATCH v5 3/7] mk: add PMD to the build system zbigniew.bodek
2017-01-18 14:27 ` [PATCH v5 4/7] doc: update documentation about ARMv8 crypto PMD zbigniew.bodek
2017-01-18 17:05 ` De Lara Guarch, Pablo
2017-01-18 19:52 ` Zbigniew Bodek
2017-01-18 19:54 ` De Lara Guarch, Pablo
2017-01-18 14:27 ` [PATCH v5 5/7] crypto/armv8: enable ARMv8 PMD in the configuration zbigniew.bodek
2017-01-18 14:27 ` [PATCH v5 6/7] MAINTAINERS: update MAINTAINERS entry for ARMv8 crypto zbigniew.bodek
2017-01-18 14:27 ` [PATCH v5 7/7] app/test: add ARMv8 crypto tests and test vectors zbigniew.bodek
2017-01-18 15:23 ` [PATCH v5 0/7] Add crypto PMD optimized for ARMv8 Jerin Jacob
2017-01-17 15:48 ` [PATCH v4 3/7] mk: add PMD to the build system zbigniew.bodek
2017-01-17 15:49 ` [PATCH v4 4/7] doc: update documentation about ARMv8 crypto PMD zbigniew.bodek
2017-01-17 15:49 ` [PATCH v4 5/7] crypto/armv8: enable ARMv8 PMD in the configuration zbigniew.bodek
2017-01-17 15:49 ` [PATCH v4 6/7] MAINTAINERS: update MAINTAINERS entry for ARMv8 crypto zbigniew.bodek
2017-01-17 15:49 ` [PATCH v4 7/7] app/test: add ARMv8 crypto tests and test vectors zbigniew.bodek
2017-01-18 2:26 ` Jerin Jacob
2017-01-04 17:33 ` [PATCH v3 4/8] mk/crypto/armv8: add PMD to the build system zbigniew.bodek
2017-01-04 17:33 ` [PATCH v3 5/8] doc/armv8: update documentation about crypto PMD zbigniew.bodek
2017-01-04 17:33 ` [PATCH v3 6/8] crypto/armv8: enable ARMv8 PMD in the configuration zbigniew.bodek
2017-01-04 17:33 ` [PATCH v3 7/8] crypto/armv8: update MAINTAINERS entry for ARMv8 crypto zbigniew.bodek
2017-01-04 17:33 ` [PATCH v3 8/8] app/test: add ARMv8 crypto tests and test vectors zbigniew.bodek
2017-01-12 10:48 ` De Lara Guarch, Pablo
2017-01-12 11:50 ` Zbigniew Bodek
2017-01-12 12:07 ` De Lara Guarch, Pablo
2017-01-13 9:28 ` Hemant Agrawal
2017-01-10 17:11 ` [PATCH v3 0/8] Add crypto PMD optimized for ARMv8 De Lara Guarch, Pablo
2017-01-10 17:50 ` Zbigniew Bodek
2017-01-13 8:07 ` Hemant Agrawal
2017-01-13 18:59 ` Zbigniew Bodek
2017-01-16 6:57 ` Hemant Agrawal
2017-01-16 8:02 ` Jerin Jacob
2016-12-07 2:32 ` [PATCH v2 02/12] lib: add cryptodev type for the upcoming ARMv8 PMD zbigniew.bodek
2016-12-06 20:27 ` Thomas Monjalon
2016-12-07 19:04 ` Zbigniew Bodek
2016-12-07 20:09 ` Thomas Monjalon
2016-12-09 12:06 ` Declan Doherty
2016-12-07 2:32 ` [PATCH v2 03/12] crypto/armv8: Add core crypto operations for ARMv8 zbigniew.bodek
2016-12-06 20:29 ` Thomas Monjalon
2016-12-06 21:18 ` Jerin Jacob
2016-12-06 21:42 ` Thomas Monjalon
2016-12-06 22:05 ` Jerin Jacob
2016-12-06 22:41 ` Thomas Monjalon
2016-12-06 23:24 ` Jerin Jacob
2016-12-07 15:00 ` Thomas Monjalon
2016-12-07 16:30 ` Jerin Jacob
2016-12-07 2:32 ` [PATCH v2 04/12] crypto/armv8: Add AES+SHA256 " zbigniew.bodek
2016-12-07 2:32 ` zbigniew.bodek [this message]
2016-12-07 2:32 ` [PATCH v2 06/12] crypto/armv8: add PMD optimized for ARMv8 processors zbigniew.bodek
2016-12-21 14:55 ` De Lara Guarch, Pablo
2016-12-07 2:33 ` [PATCH v2 07/12] crypto/armv8: generate ASM symbols automatically zbigniew.bodek
2016-12-07 2:33 ` [PATCH v2 08/12] mk/crypto/armv8: add PMD to the build system zbigniew.bodek
2016-12-21 15:01 ` De Lara Guarch, Pablo
2016-12-07 2:33 ` [PATCH v2 09/12] doc/armv8: update documentation about crypto PMD zbigniew.bodek
2016-12-07 21:13 ` Mcnamara, John
2016-12-07 2:33 ` [PATCH v2 10/12] crypto/armv8: enable ARMv8 PMD in the configuration zbigniew.bodek
2016-12-08 10:24 ` [PATCH v2 00/12] Add crypto PMD optimized for ARMv8 Bruce Richardson
2016-12-08 11:32 ` Zbigniew Bodek
2016-12-08 17:45 ` Jerin Jacob
2016-12-21 15:34 ` Declan Doherty
2016-12-22 4:57 ` Jerin Jacob
2016-12-07 2:36 ` [PATCH v2 11/12] crypto/armv8: update MAINTAINERS entry for ARMv8 crypto zbigniew.bodek
2016-12-07 2:37 ` [PATCH v2 12/12] app/test: add ARMv8 crypto tests and test vectors zbigniew.bodek
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1481077985-4224-6-git-send-email-zbigniew.bodek@caviumnetworks.com \
--to=zbigniew.bodek@caviumnetworks.com \
--cc=dev@dpdk.org \
--cc=emery.davis@caviumnetworks.com \
--cc=jerin.jacob@caviumnetworks.com \
--cc=pablo.de.lara.guarch@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.