linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] crypto: arm64/gcm-ce - unroll factors to 4-way interleave of aes and ghash
@ 2021-09-23  6:30 XiaokangQian
  2021-09-28  6:27 ` Eric Biggers
  2021-12-15  3:04 ` [PATCH v2] " XiaokangQian
  0 siblings, 2 replies; 12+ messages in thread
From: XiaokangQian @ 2021-09-23  6:30 UTC (permalink / raw)
  To: Herbert Xu, David S. Miller, Catalin Marinas, Will Deacon
  Cc: nd, ardb, XiaokangQian, linux-crypto, linux-arm-kernel, linux-kernel

To improve performance on cores with deep piplines such as A72,N1,
implement gcm(aes) using a 4-way interleave of aes and ghash (totally
8 blocks in parallel), which can make full utilize of pipelines rather
than the 4-way interleave we used currently. It can gain about 20% for
big data sizes such that 8k.

This is a complete new version of the GCM part of the combined GCM/GHASH
driver, it will co-exist with the old driver, only serve for big data
sizes. Instead of interleaving four invocations of AES where each chunk
of 64 bytes is encrypted first and then ghashed, the new version uses a
more coarse grained approach where a chunk of 64 bytes is encrypted and
at the same time, one chunk of 64 bytes is ghashed (or ghashed and
decrypted in the converse case).

The table below compares the performance of the old driver and the new
one on various micro-architectures and running in various modes with
various data sizes.

            |     AES-128       |     AES-192       |     AES-256       |
     #bytes | 1024 | 1420 |  8k | 1024 | 1420 |  8k | 1024 | 1420 |  8k |
     -------+------+------+-----+------+------+-----+------+------+-----+
        A72 | 5.5% |  12% | 25% | 2.2% |  9.5%|  23%| -1%  |  6.7%| 19% |
        A57 |-0.5% |  9.3%| 32% | -3%  |  6.3%|  26%| -6%  |  3.3%| 21% |
        N1  | 0.4% |  7.6%|24.5%| -2%  |  5%  |  22%| -4%  |  2.7%| 20% |

Signed-off-by: XiaokangQian <xiaokang.qian@arm.com>
---
 arch/arm64/crypto/Makefile               |    2 +-
 arch/arm64/crypto/ghash-ce-core_unroll.S | 1176 ++++++++++++++++++++++
 arch/arm64/crypto/ghash-ce-glue.c        |  136 ++-
 3 files changed, 1295 insertions(+), 19 deletions(-)
 create mode 100644 arch/arm64/crypto/ghash-ce-core_unroll.S

diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index 09a805cc32d7..068e9d377db2 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -24,7 +24,7 @@ obj-$(CONFIG_CRYPTO_SM4_ARM64_CE) += sm4-ce.o
 sm4-ce-y := sm4-ce-glue.o sm4-ce-core.o
 
 obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
-ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
+ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o ghash-ce-core_unroll.o
 
 obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o
 crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
diff --git a/arch/arm64/crypto/ghash-ce-core_unroll.S b/arch/arm64/crypto/ghash-ce-core_unroll.S
new file mode 100644
index 000000000000..979bca90820f
--- /dev/null
+++ b/arch/arm64/crypto/ghash-ce-core_unroll.S
@@ -0,0 +1,1176 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Accelerated GCM implementation with ARMv8 PMULL instructions
+ * and unroll factors.
+ *
+ * Copyright (C) 2021  Arm.ltd. <xiaokang.qian@arm.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+.arch	armv8-a+crypto
+.text
+
+.macro push_stack
+	stp	x19, x20, [sp, #-112]!
+	stp	x21, x22, [sp, #16]
+	stp	x23, x24, [sp, #32]
+	stp	d8, d9, [sp, #48]
+	stp	d10, d11, [sp, #64]
+	stp	d12, d13, [sp, #80]
+	stp	d14, d15, [sp, #96]
+.endm
+
+.macro pop_stack
+	ldp	x21, x22, [sp, #16]
+	ldp	x23, x24, [sp, #32]
+	ldp	d8, d9, [sp, #48]
+	ldp	d10, d11, [sp, #64]
+	ldp	d12, d13, [sp, #80]
+	ldp	d14, d15, [sp, #96]
+	ldp	x19, x20, [sp], #112
+.endm
+
+.macro load_const
+	movi	v8.8b, #0xc2
+	shl	d8, d8, #56               //mod_constant
+.endm
+
+.macro gcm_tidy_up high:req, mid:req, low:req, tmp1:req, tmp2:req
+	eor	\tmp1\().16b, \low\().16b, \high\().16b //MODULO-karatsuba tidy up
+	eor	\mid\().16b, \mid\().16b, \tmp1\().16b  //MODULO-karatsuba tidy up
+	pmull	\tmp2\().1q, \high\().1d, v8.1d
+	ext	\high\().16b, \high\().16b, \high\().16b, #8
+	eor	\mid\().16b, \mid\().16b, \tmp2\().16b //MODULO - fold into mid
+	eor	\mid\().16b, \mid\().16b, \high\().16b //MODULO - fold into mid
+	pmull	\high\().1q, \mid\().1d, v8.1d  //MODULO - mid 64b align with low
+	ext	\mid\().16b, \mid\().16b, \mid\().16b, #8
+	eor	\low\().16b, \low\().16b, \high\().16b //MODULO - fold into low
+	eor	\low\().16b, \low\().16b, \mid\().16b //MODULO - fold into low
+.endm
+
+.macro karasuba_multiply res:req, h:req, tmp1:req, tmp2:req, tmp3:req
+	pmull	\tmp1\().1q, \res\().1d, \h\().1d    //GHASH final block - low
+	eor	\tmp2\().8b, \tmp2\().8b, \res\().8b //GHASH final block - mid
+	pmull2	\tmp3\().1q, \res\().2d, \h\().2d    //GHASH final block - high
+	pmull	\tmp2\().1q, \tmp2\().1d, v16.1d     //GHASH final block - mid
+	eor	v11.16b, v11.16b, \tmp1\().16b       //GHASH final block - low
+	eor	v9.16b, v9.16b, \tmp3\().16b         //GHASH final block - high
+	eor	v10.16b, v10.16b, \tmp2\().16b       //GHASH final block - mid
+.endm
+
+.macro aes_encrypt_round    block:req,key:req
+	aese	\block\().16b,\key\().16b
+	aesmc	\block\().16b,\block\().16b
+.endm
+
+.macro aes_enc_extra_round rd_num:req
+        .if \rd_num == 12
+	add     x19,x8,#176
+	aes_encrypt_round	v0, v27         //AES block 0 - round 9
+	aes_encrypt_round	v3, v27         //AES block 3 - round 9
+	aes_encrypt_round	v2, v27         //AES block 2 - round 9
+	aes_encrypt_round	v1, v27         //AES block 1 - round 9
+	ldr	q27, [x19],#16                  //load rk9
+	aes_encrypt_round	v0, v28         //AES block 0 - round 10
+	aes_encrypt_round	v2, v28         //AES block 2 - round 10
+	aes_encrypt_round	v1, v28         //AES block 1 - round 10
+	aes_encrypt_round	v3, v28         //AES block 3 - round 10
+	ldr	q28, [x19],#16                  //load rk10
+        .elseif \rd_num == 14
+	aes_encrypt_round	v1, v27          //AES block 1 - round 11
+	aes_encrypt_round	v2, v27          //AES block 2 - round 11
+	aes_encrypt_round	v0, v27          //AES block 0 - round 11
+	aes_encrypt_round	v3, v27          //AES block 3 - round 11
+	ldr	q27, [x19],#16                   //load rk9
+	aes_encrypt_round	v1, v28          //AES block 1 - round 12
+	aes_encrypt_round	v2, v28          //AES block 2 - round 12
+	aes_encrypt_round	v0, v28          //AES block 0 - round 12
+	aes_encrypt_round	v3, v28          //AES block 3 - round 12
+	ldr	q28, [x19],#16                   //load rk10
+        .endif
+	fmov	x13, d28                         //load last second block
+	fmov	x14, v28.d[1]                    //load last second block
+.endm
+
+.macro load_initial_tag    dst:req,buf:req
+	ld1	{\dst\().16b}, [\buf]
+	ext	\dst\().16b, \dst\().16b, \dst\().16b, #8
+	rev64	\dst\().16b, \dst\().16b
+.endm
+
+SYM_FUNC_START(pmull_gcm_encrypt_unroll)
+	cbz	x1, .L128_enc_ret
+	push_stack
+	mov	x16, x4
+	mov	x8, x5
+	mov     x17, x6
+	ldp	x10, x11, [x16]                 //ctr96_b64, ctr96_t32
+	ldp	x13, x14, [x8, #160]            //load rk10
+	load_initial_tag v11,x3
+	lsr	x5, x1, #3                      //byte_len
+	mov	x15, x5
+	ldr	q27, [x8, #144]                 //load rk9
+	add	x4, x0, x1, lsr #3              //end_input_ptr
+	sub	x5, x5, #1                      //byte_len - 1
+	lsr	x12, x11, #32
+	ldr	q15, [x3, #112]                 //load h4l | h4h
+	ext	v15.16b, v15.16b, v15.16b, #8
+	fmov	d1, x10                         //CTR block 1
+	rev	w12, w12                        //rev_ctr32
+	add	w12, w12, #1                    //increment rev_ctr32
+	orr	w11, w11, w11
+	ldr	q18, [x8, #0]                   //load rk0
+	rev	w9, w12                         //CTR block 1
+	add	w12, w12, #1                    //CTR block 1
+	fmov	d3, x10                         //CTR block 3
+	ldr	q28, [x8, #160]                 //load rk10
+	orr	x9, x11, x9, lsl #32            //CTR block 1
+        //load initial counter so that start first AES block quickly
+	ld1	{ v0.16b}, [x16]
+	fmov	v1.d[1], x9                     //CTR block 1
+	rev	w9, w12                         //CTR block 2
+	fmov	d2, x10                         //CTR block 2
+	orr	x9, x11, x9, lsl #32            //CTR block 2
+	add	w12, w12, #1                    //CTR block 2
+	fmov	v2.d[1], x9                     //CTR block 2
+	rev	w9, w12                         //CTR block 3
+	orr	x9, x11, x9, lsl #32            //CTR block 3
+	ldr	q19, [x8, #16]                  //load rk1
+	add	w12, w12, #1                    //CTR block 3
+	fmov	v3.d[1], x9                     //CTR block 3
+	ldr	q14, [x3, #80]                  //load h3l | h3h
+	ext	v14.16b, v14.16b, v14.16b, #8
+	aes_encrypt_round	v1, v18         //AES block 1 - round 0
+	ldr	q20, [x8, #32]                  //load rk2
+	aes_encrypt_round	v2, v18         //AES block 2 - round 0
+	ldr	q12, [x3, #32]                  //load h1l | h1h
+	ext	v12.16b, v12.16b, v12.16b, #8
+	aes_encrypt_round	v0, v18         //AES block 0 - round 0
+	ldr	q26, [x8, #128]                 //load rk8
+	aes_encrypt_round	v3, v18         //AES block 3 - round 0
+	ldr	q21, [x8, #48]                  //load rk3
+	aes_encrypt_round	v2, v19         //AES block 2 - round 1
+	trn2	v17.2d,  v14.2d,    v15.2d      //h4l | h3l
+	aes_encrypt_round	v0, v19         //AES block 0 - round 1
+	ldr	q24, [x8, #96]                  //load rk6
+	aes_encrypt_round	v1, v19         //AES block 1 - round 1
+	ldr	q25, [x8, #112]                 //load rk7
+	aes_encrypt_round	v3, v19         //AES block 3 - round 1
+	trn1	v9.2d, v14.2d,    v15.2d        //h4h | h3h
+	aes_encrypt_round	v0, v20         //AES block 0 - round 2
+	ldr	q23, [x8, #80]                  //load rk5
+	aes_encrypt_round	v1, v20         //AES block 1 - round 2
+	ldr	q13, [x3, #64]                  //load h2l | h2h
+	ext	v13.16b, v13.16b, v13.16b, #8
+	aes_encrypt_round	v3, v20         //AES block 3 - round 2
+	aes_encrypt_round	v2, v20         //AES block 2 - round 2
+	eor	v17.16b, v17.16b, v9.16b        //h4k | h3k
+	aes_encrypt_round	v0, v21         //AES block 0 - round 3
+	aes_encrypt_round	v1, v21         //AES block 1 - round 3
+	aes_encrypt_round	v2, v21         //AES block 2 - round 3
+	ldr	q22, [x8, #64]                  //load rk4
+	aes_encrypt_round	v3, v21         //AES block 3 - round 3
+        //bytes be processed in main loop(at least 1 byte be handled by tail)
+	and	x5, x5, #0xffffffffffffffc0
+	trn2	v16.2d,  v12.2d,    v13.2d      //h2l | h1l
+	aes_encrypt_round	v3, v22          //AES block 3 - round 4
+	add	x5, x5, x0
+	aes_encrypt_round	v2, v22          //AES block 2 - round 4
+	cmp	x0, x5                   //check if we have <= 4 blocks
+	aes_encrypt_round	v0, v22          //AES block 0 - round 4
+	aes_encrypt_round	v3, v23          //AES block 3 - round 5
+	aes_encrypt_round	v2, v23          //AES block 2 - round 5
+	aes_encrypt_round	v0, v23          //AES block 0 - round 5
+	aes_encrypt_round	v3, v24          //AES block 3 - round 6
+	aes_encrypt_round	v1, v22          //AES block 1 - round 4
+	aes_encrypt_round	v2, v24          //AES block 2 - round 6
+	trn1	v8.2d,    v12.2d,    v13.2d     //h2h | h1h
+	aes_encrypt_round	v0, v24          //AES block 0 - round 6
+	aes_encrypt_round	v1, v23          //AES block 1 - round 5
+	aes_encrypt_round	v1, v24          //AES block 1 - round 6
+	aes_encrypt_round	v3, v25          //AES block 3 - round 7
+	aes_encrypt_round	v0, v25          //AES block 0 - round 7
+	aes_encrypt_round	v2, v25          //AES block 2 - round 7
+	aes_encrypt_round	v0, v26          //AES block 0 - round 8
+	aes_encrypt_round	v1, v25          //AES block 1 - round 7
+	aes_encrypt_round	v2, v26          //AES block 2 - round 8
+	aes_encrypt_round	v3, v26          //AES block 3 - round 8
+	aes_encrypt_round	v1, v26          //AES block 1 - round 8
+
+	mov	x6, x17
+	sub	x6, x6, #10
+	cbz	x6, .Lleft_rounds
+	aes_enc_extra_round 12
+	sub	x6, x6, #2
+	cbz	x6, .Lleft_rounds
+	aes_enc_extra_round 14
+
+.Lleft_rounds:
+	aese	v2.16b, v27.16b                  //AES block 2 - round 9
+	aese	v0.16b, v27.16b                 //AES block 0 - round 9
+	eor	v16.16b, v16.16b, v8.16b        //h2k | h1k
+	aese	v1.16b, v27.16b                 //AES block 1 - round 9
+	aese	v3.16b, v27.16b                 //AES block 3 - round 9
+	b.ge	.L128_enc_tail                  //handle tail
+
+	ldp	x6, x7, [x0, #0]                //AES block 0 - load plaintext
+	ldp	x21, x22, [x0, #32]             //AES block 2 - load plaintext
+	ldp	x19, x20, [x0, #16]             //AES block 1 - load plaintext
+	ldp	x23, x24, [x0, #48]             //AES block 3 - load plaintext
+	eor	x6, x6, x13                     //AES block 0 - round 10 low
+	eor	x7, x7, x14                     //AES block 0 - round 10 high
+	eor	x21, x21, x13                   //AES block 2 - round 10 low
+	fmov	d4, x6                          //AES block 0 - mov low
+	eor	x19, x19, x13                   //AES block 1 - round 10 low
+	eor	x22, x22, x14                   //AES block 2 - round 10 high
+	fmov	v4.d[1], x7                     //AES block 0 - mov high
+	fmov	d5, x19                         //AES block 1 - mov low
+	eor	x20, x20, x14                   //AES block 1 - round 10 high
+	eor	x23, x23, x13                   //AES block 3 - round 10 low
+	fmov	v5.d[1], x20                    //AES block 1 - mov high
+	fmov	d6, x21                         //AES block 2 - mov low
+	eor	x24, x24, x14                   //AES block 3 - round 10 high
+	rev	w9, w12                         //CTR block 4
+	fmov	v6.d[1], x22                    //AES block 2 - mov high
+	orr	x9, x11, x9, lsl #32            //CTR block 4
+	eor	v4.16b, v4.16b, v0.16b          //AES block 0 - result
+	fmov	d0, x10                         //CTR block 4
+	add	w12, w12, #1                    //CTR block 4
+	fmov	v0.d[1], x9                     //CTR block 4
+	rev	w9, w12                         //CTR block 5
+	eor	v5.16b, v5.16b, v1.16b          //AES block 1 - result
+	fmov	d1, x10                         //CTR block 5
+	orr	x9, x11, x9, lsl #32            //CTR block 5
+	add	w12, w12, #1                    //CTR block 5
+	add	x0, x0, #64                     //AES input_ptr update
+	fmov	v1.d[1], x9                     //CTR block 5
+	fmov	d7, x23                         //AES block 3 - mov low
+	rev	w9, w12                         //CTR block 6
+	st1	{ v4.16b}, [x2], #16            //AES block 0 - store result
+	fmov	v7.d[1], x24                    //AES block 3 - mov high
+	orr	x9, x11, x9, lsl #32            //CTR block 6
+	add	w12, w12, #1                    //CTR block 6
+	eor	v6.16b, v6.16b, v2.16b          //AES block 2 - result
+	st1	{ v5.16b}, [x2], #16            //AES block 1 - store result
+	fmov	d2, x10                         //CTR block 6
+	cmp	x0, x5                   //check if we have <= 8 blocks
+	fmov	v2.d[1], x9                     //CTR block 6
+	rev	w9, w12                         //CTR block 7
+	st1	{ v6.16b}, [x2], #16            //AES block 2 - store result
+	orr	x9, x11, x9, lsl #32            //CTR block 7
+	eor	v7.16b, v7.16b, v3.16b          //AES block 3 - result
+	st1	{ v7.16b}, [x2], #16            //AES block 3 - store result
+	b.ge	.L128_enc_prepretail            //do prepretail
+.L128_enc_main_loop:	//main	loop start
+	ldp	x23, x24, [x0, #48]           //AES block 4k+3 - load plaintext
+	rev64	v4.16b, v4.16b                  //GHASH block 4k
+	rev64	v6.16b, v6.16b                  //GHASH block 4k+2
+	aes_encrypt_round	v2, v18          //AES block 4k+6 - round 0
+	fmov	d3, x10                         //CTR block 4k+3
+	ext	v11.16b, v11.16b, v11.16b, #8   //PRE 0
+	rev64	v5.16b, v5.16b                  //GHASH block 4k+1
+	aes_encrypt_round	v1, v18          //AES block 4k+5 - round 0
+	add	w12, w12, #1                    //CTR block 4k+3
+	fmov	v3.d[1], x9                     //CTR block 4k+3
+	aes_encrypt_round	v0, v18          //AES block 4k+4 - round 0
+	mov	d31, v6.d[1]                    //GHASH block 4k+2 - mid
+	aes_encrypt_round	v2, v19          //AES block 4k+6 - round 1
+	mov	d30, v5.d[1]                    //GHASH block 4k+1 - mid
+	aes_encrypt_round	v1, v19          //AES block 4k+5 - round 1
+	eor	v4.16b, v4.16b, v11.16b         //PRE 1
+	aes_encrypt_round	v3, v18          //AES block 4k+7 - round 0
+	eor	x24, x24, x14                   //AES block 4k+3 - round 10 high
+	pmull2	v28.1q, v5.2d, v14.2d           //GHASH block 4k+1 - high
+	eor	v31.8b, v31.8b, v6.8b           //GHASH block 4k+2 - mid
+	ldp	x6, x7, [x0, #0]            //AES block 4k+4 - load plaintext
+	aes_encrypt_round	v0, v19          //AES block 4k+4 - round 1
+	rev	w9, w12                         //CTR block 4k+8
+	eor	v30.8b, v30.8b, v5.8b           //GHASH block 4k+1 - mid
+	mov	d8, v4.d[1]                     //GHASH block 4k - mid
+	orr	x9, x11, x9, lsl #32            //CTR block 4k+8
+	pmull2	v9.1q, v4.2d, v15.2d            //GHASH block 4k - high
+	add	w12, w12, #1                    //CTR block 4k+8
+	mov	d10, v17.d[1]                   //GHASH block 4k - mid
+	aes_encrypt_round	v0, v20          //AES block 4k+4 - round 2
+	pmull	v11.1q, v4.1d, v15.1d           //GHASH block 4k - low
+	eor	v8.8b, v8.8b, v4.8b             //GHASH block 4k - mid
+	aes_encrypt_round	v1, v20          //AES block 4k+5 - round 2
+	aes_encrypt_round	v0, v21          //AES block 4k+4 - round 3
+	eor	v9.16b, v9.16b, v28.16b         //GHASH block 4k+1 - high
+	pmull	v28.1q, v6.1d, v13.1d           //GHASH block 4k+2 - low
+	pmull	v10.1q, v8.1d, v10.1d           //GHASH block 4k - mid
+	rev64	v7.16b, v7.16b                  //GHASH block 4k+3
+	pmull	v30.1q, v30.1d, v17.1d          //GHASH block 4k+1 - mid
+	pmull	v29.1q, v5.1d, v14.1d           //GHASH block 4k+1 - low
+	ins	v31.d[1], v31.d[0]              //GHASH block 4k+2 - mid
+	pmull2	v8.1q, v6.2d, v13.2d            //GHASH block 4k+2 - high
+	eor	x7, x7, x14                     //AES block 4k+4 - round 10 high
+	eor	v10.16b, v10.16b, v30.16b       //GHASH block 4k+1 - mid
+	mov	d30, v7.d[1]                    //GHASH block 4k+3 - mid
+	aes_encrypt_round	v3, v19          //AES block 4k+7 - round 1
+	eor	v11.16b, v11.16b, v29.16b       //GHASH block 4k+1 - low
+	aes_encrypt_round	v2, v20          //AES block 4k+6 - round 2
+	eor	x6, x6, x13                     //AES block 4k+4 - round 10 low
+	aes_encrypt_round	v1, v21          //AES block 4k+5 - round 3
+	eor	v30.8b, v30.8b, v7.8b           //GHASH block 4k+3 - mid
+	pmull2	v4.1q, v7.2d, v12.2d            //GHASH block 4k+3 - high
+	aes_encrypt_round	v2, v21          //AES block 4k+6 - round 3
+	eor	v9.16b, v9.16b, v8.16b          //GHASH block 4k+2 - high
+	pmull2	v31.1q, v31.2d, v16.2d          //GHASH block 4k+2 - mid
+	pmull	v29.1q, v7.1d, v12.1d           //GHASH block 4k+3 - low
+	movi	v8.8b, #0xc2
+	pmull	v30.1q, v30.1d, v16.1d          //GHASH block 4k+3 - mid
+	eor	v11.16b, v11.16b, v28.16b       //GHASH block 4k+2 - low
+	aes_encrypt_round	v1, v22          //AES block 4k+5 - round 4
+	aes_encrypt_round	v3, v20          //AES block 4k+7 - round 2
+	shl	d8, d8, #56                     //mod_constant
+	aes_encrypt_round	v0, v22          //AES block 4k+4 - round 4
+	eor	v9.16b, v9.16b, v4.16b          //GHASH block 4k+3 - high
+	aes_encrypt_round	v1, v23          //AES block 4k+5 - round 5
+	ldp	x19, x20, [x0, #16]           //AES block 4k+5 - load plaintext
+	aes_encrypt_round	v3, v21          //AES block 4k+7 - round 3
+	eor	v10.16b, v10.16b, v31.16b       //GHASH block 4k+2 - mid
+	aes_encrypt_round	v0, v23          //AES block 4k+4 - round 5
+	ldp	x21, x22, [x0, #32]           //AES block 4k+6 - load plaintext
+	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
+	eor	v11.16b, v11.16b, v29.16b       //GHASH block 4k+3 - low
+	aes_encrypt_round	v2, v22          //AES block 4k+6 - round 4
+	eor	x19, x19, x13                     //AES block 4k+5 - round 10 low
+	aes_encrypt_round	v3, v22          //AES block 4k+7 - round 4
+	eor	v10.16b, v10.16b, v30.16b       //GHASH block 4k+3 - mid
+	aes_encrypt_round	v1, v24          //AES block 4k+5 - round 6
+	eor	x23, x23, x13                     //AES block 4k+3 - round 10 low
+	aes_encrypt_round	v2, v23          //AES block 4k+6 - round 5
+	eor	v30.16b, v11.16b, v9.16b        //MODULO - karatsuba tidy up
+	fmov	d4, x6                          //AES block 4k+4 - mov low
+	aes_encrypt_round	v0, v24          //AES block 4k+4 - round 6
+	fmov	v4.d[1], x7                     //AES block 4k+4 - mov high
+	add	x0, x0, #64                     //AES input_ptr update
+	fmov	d7, x23                         //AES block 4k+3 - mov low
+	ext	v9.16b, v9.16b, v9.16b, #8      //MODULO - other top alignment
+	aes_encrypt_round	v3, v23          //AES block 4k+7 - round 5
+	fmov	d5, x19                         //AES block 4k+5 - mov low
+	aes_encrypt_round	v0, v25          //AES block 4k+4 - round 7
+	eor	v10.16b, v10.16b, v30.16b       //MODULO - karatsuba tidy up
+	aes_encrypt_round	v2, v24          //AES block 4k+6 - round 6
+	eor	x20, x20, x14                     //AES block 4k+5 - round 10 high
+	aes_encrypt_round	v1, v25          //AES block 4k+5 - round 7
+	fmov	v5.d[1], x20                    //AES block 4k+5 - mov high
+	aes_encrypt_round	v0, v26          //AES block 4k+4 - round 8
+	fmov	v7.d[1], x24                    //AES block 4k+3 - mov high
+	aes_encrypt_round	v3, v24          //AES block 4k+7 - round 6
+	cmp	x0, x5                   //.LOOP CONTROL
+	aes_encrypt_round	v1, v26          //AES block 4k+5 - round 8
+	eor	v10.16b, v10.16b, v31.16b       //MODULO - fold into mid
+	eor	x21, x21, x13                     //AES block 4k+6 - round 10 low
+	eor	x22, x22, x14                     //AES block 4k+6 - round 10 high
+	ldr	q27, [x8, #144]                 //load rk9
+	aes_encrypt_round	v3, v25          //AES block 4k+7 - round 7
+	fmov	d6, x21                         //AES block 4k+6 - mov low
+	fmov	v6.d[1], x22                    //AES block 4k+6 - mov high
+	aes_encrypt_round	v2, v25          //AES block 4k+6 - round 7
+	ldr	q28, [x8, #160]                 //load rk9
+	aes_encrypt_round	v3, v26          //AES block 4k+7 - round 8
+	eor	v10.16b, v10.16b, v9.16b        //MODULO - fold into mid
+	aes_encrypt_round	v2, v26          //AES block 4k+6 - round 8
+	mov	x6, x17
+	sub	x6,x6,#10
+	cbz	x6, .Lleft2_rounds
+	aes_enc_extra_round 12
+	sub	x6,x6,#2
+	cbz	x6, .Lleft2_rounds
+	aes_enc_extra_round 14
+.Lleft2_rounds:
+	aese	v0.16b, v27.16b                 //AES block 4k+4 - round 9
+	eor	v4.16b, v4.16b, v0.16b          //AES block 4k+4 - result
+	fmov	d0, x10                         //CTR block 4k+8
+	fmov	v0.d[1], x9                     //CTR block 4k+8
+	rev	w9, w12                         //CTR block 4k+9
+	add	w12, w12, #1                    //CTR block 4k+9
+	aese	v1.16b, v27.16b                 //AES block 4k+5 - round 9
+	eor	v5.16b, v5.16b, v1.16b          //AES block 4k+5 - result
+	orr	x9, x11, x9, lsl #32            //CTR block 4k+9
+	fmov	d1, x10                         //CTR block 4k+9
+	pmull	v9.1q, v10.1d, v8.1d            //MODULO - mid 64b align with low
+	fmov	v1.d[1], x9                     //CTR block 4k+9
+	rev	w9, w12                         //CTR block 4k+10
+	aese	v2.16b, v27.16b                 //AES block 4k+6 - round 9
+	st1	{ v4.16b}, [x2], #16            //AES block 4k+4 - store result
+	eor	v6.16b, v6.16b, v2.16b          //AES block 4k+6 - result
+	orr	x9, x11, x9, lsl #32            //CTR block 4k+10
+	aese	v3.16b, v27.16b                 //AES block 4k+7 - round 9
+	add	w12, w12, #1                    //CTR block 4k+10
+	ext	v10.16b, v10.16b, v10.16b, #8   //MODULO - other mid alignment
+	fmov	d2, x10                         //CTR block 4k+10
+	eor	v11.16b, v11.16b, v9.16b        //MODULO - fold into low
+	st1	{ v5.16b}, [x2], #16            //AES block 4k+5 - store result
+	fmov	v2.d[1], x9                     //CTR block 4k+10
+	st1	{ v6.16b}, [x2], #16            //AES block 4k+6 - store result
+	rev	w9, w12                         //CTR block 4k+11
+	orr	x9, x11, x9, lsl #32            //CTR block 4k+11
+	eor	v7.16b, v7.16b, v3.16b          //AES block 4k+3 - result
+	eor	v11.16b, v11.16b, v10.16b       //MODULO - fold into low
+	st1	{ v7.16b}, [x2], #16            //AES block 4k+3 - store result
+	b.lt	.L128_enc_main_loop
+.L128_enc_prepretail:	//PREPRETAIL
+	rev64	v4.16b, v4.16b                  //GHASH block 4k (only t0 is free)
+	fmov	d3, x10                         //CTR block 4k+3
+	rev64	v5.16b, v5.16b                  //GHASH block 4k+1 (t0 and t1 free)
+	ext	v11.16b, v11.16b, v11.16b, #8   //PRE 0
+	add	w12, w12, #1                    //CTR block 4k+3
+	fmov	v3.d[1], x9                     //CTR block 4k+3
+	aes_encrypt_round	v1, v18         //AES block 4k+5 - round 0
+	rev64	v6.16b, v6.16b                  //GHASH block 4k+2 (t0, t1, and t2 free)
+	pmull	v29.1q, v5.1d, v14.1d           //GHASH block 4k+1 - low
+	rev64	v7.16b, v7.16b                  //GHASH block 4k+3 (t0, t1, t2 and t3 free)
+	eor	v4.16b, v4.16b, v11.16b         //PRE 1
+	pmull2	v28.1q, v5.2d, v14.2d           //GHASH block 4k+1 - high
+	aes_encrypt_round	v3, v18         //AES block 4k+7 - round 0
+	mov	d30, v5.d[1]                    //GHASH block 4k+1 - mid
+	pmull	v11.1q, v4.1d, v15.1d           //GHASH block 4k - low
+	mov	d8, v4.d[1]                     //GHASH block 4k - mid
+	mov	d31, v6.d[1]                    //GHASH block 4k+2 - mid
+	mov	d10, v17.d[1]                   //GHASH block 4k - mid
+	aes_encrypt_round	v1, v19         //AES block 4k+5 - round 1
+	eor	v30.8b, v30.8b, v5.8b           //GHASH block 4k+1 - mid
+	eor	v8.8b, v8.8b, v4.8b             //GHASH block 4k - mid
+	pmull2	v9.1q, v4.2d, v15.2d            //GHASH block 4k - high
+	eor	v31.8b, v31.8b, v6.8b           //GHASH block 4k+2 - mid
+	aes_encrypt_round	v3, v19         //AES block 4k+7 - round 1
+	pmull	v30.1q, v30.1d, v17.1d          //GHASH block 4k+1 - mid
+	eor	v11.16b, v11.16b, v29.16b       //GHASH block 4k+1 - low
+	pmull	v10.1q, v8.1d, v10.1d           //GHASH block 4k - mid
+	aes_encrypt_round	v0, v18         //AES block 4k+4 - round 0
+	ins	v31.d[1], v31.d[0]              //GHASH block 4k+2 - mid
+	aes_encrypt_round	v2, v18         //AES block 4k+6 - round 0
+	eor	v10.16b, v10.16b, v30.16b       //GHASH block 4k+1 - mid
+	mov	d30, v7.d[1]                    //GHASH block 4k+3 - mid
+	aes_encrypt_round	v0, v19          //AES block 4k+4 - round 1
+	eor	v9.16b, v9.16b, v28.16b         //GHASH block 4k+1 - high
+	pmull2	v31.1q, v31.2d, v16.2d          //GHASH block 4k+2 - mid
+	pmull2	v8.1q, v6.2d, v13.2d            //GHASH block 4k+2 - high
+	eor	v30.8b, v30.8b, v7.8b           //GHASH block 4k+3 - mid
+	pmull2	v4.1q, v7.2d, v12.2d            //GHASH block 4k+3 - high
+	pmull	v28.1q, v6.1d, v13.1d           //GHASH block 4k+2 - low
+	aes_encrypt_round	v2, v19          //AES block 4k+6 - round 1
+	eor	v9.16b, v9.16b, v8.16b          //GHASH block 4k+2 - high
+	aes_encrypt_round	v0, v20          //AES block 4k+4 - round 2
+	pmull	v29.1q, v7.1d, v12.1d           //GHASH block 4k+3 - low
+	movi	v8.8b, #0xc2
+	aes_encrypt_round	v2, v20          //AES block 4k+6 - round 2
+	eor	v11.16b, v11.16b, v28.16b       //GHASH block 4k+2 - low
+	aes_encrypt_round	v3, v20          //AES block 4k+7 - round 2
+	pmull	v30.1q, v30.1d, v16.1d          //GHASH block 4k+3 - mid
+	eor	v10.16b, v10.16b, v31.16b       //GHASH block 4k+2 - mid
+	aes_encrypt_round	v2, v21          //AES block 4k+6 - round 3
+	aes_encrypt_round	v1, v20          //AES block 4k+5 - round 2
+	eor	v9.16b, v9.16b, v4.16b          //GHASH block 4k+3 - high
+	aes_encrypt_round	v0, v21          //AES block 4k+4 - round 3
+	eor	v10.16b, v10.16b, v30.16b       //GHASH block 4k+3 - mid
+	shl	d8, d8, #56               //mod_constant
+	aes_encrypt_round	v1, v21          //AES block 4k+5 - round 3
+	eor	v11.16b, v11.16b, v29.16b       //GHASH block 4k+3 - low
+	aes_encrypt_round	v0, v22          //AES block 4k+4 - round 4
+	pmull	v28.1q, v9.1d, v8.1d
+	eor	v10.16b, v10.16b, v9.16b        //karatsuba tidy up
+	aes_encrypt_round	v1, v22          //AES block 4k+5 - round 4
+	aes_encrypt_round	v0, v23          //AES block 4k+4 - round 5
+	ext	v9.16b, v9.16b, v9.16b, #8
+	aes_encrypt_round	v3, v21          //AES block 4k+7 - round 3
+	aes_encrypt_round	v2, v22          //AES block 4k+6 - round 4
+	eor	v10.16b, v10.16b, v11.16b
+	aes_encrypt_round	v0, v24          //AES block 4k+4 - round 6
+	aes_encrypt_round	v3, v22          //AES block 4k+7 - round 4
+	aes_encrypt_round	v1, v23          //AES block 4k+5 - round 5
+	aes_encrypt_round	v2, v23          //AES block 4k+6 - round 5
+	eor	v10.16b, v10.16b, v28.16b
+	aes_encrypt_round	v3, v23          //AES block 4k+7 - round 5
+	aes_encrypt_round	v1, v24          //AES block 4k+5 - round 6
+	aes_encrypt_round	v2, v24          //AES block 4k+6 - round 6
+	aes_encrypt_round	v3, v24          //AES block 4k+7 - round 6
+	eor	v10.16b, v10.16b, v9.16b
+	ldr	q27, [x8, #144]                                //load rk9
+	aes_encrypt_round	v0, v25          //AES block 4k+4 - round 7
+	aes_encrypt_round	v2, v25          //AES block 4k+6 - round 7
+	aes_encrypt_round	v3, v25          //AES block 4k+7 - round 7
+	pmull	v28.1q, v10.1d, v8.1d
+	aes_encrypt_round	v1, v25          //AES block 4k+5 - round 7
+	ext	v10.16b, v10.16b, v10.16b, #8
+	aes_encrypt_round	v3, v26          //AES block 4k+7 - round 8
+	aes_encrypt_round	v0, v26          //AES block 4k+4 - round 8
+	eor	v11.16b, v11.16b, v28.16b
+	aes_encrypt_round	v1, v26          //AES block 4k+5 - round 8
+	ldr	q28, [x8, #160]                                //load rk9
+	aes_encrypt_round	v2, v26          //AES block 4k+6 - round 8
+
+	mov	x6, x17
+	sub	x6,x6,#10
+	cbz	x6, .Lleft3_rounds
+	aes_enc_extra_round 12
+	sub	x6,x6,#2
+	cbz	x6, .Lleft3_rounds
+	aes_enc_extra_round 14
+
+.Lleft3_rounds:
+	aese	v0.16b, v27.16b                 //AES block 4k+4 - round 9
+	aese	v3.16b, v27.16b                 //AES block 4k+7 - round 9
+	aese	v1.16b, v27.16b                 //AES block 4k+5 - round 9
+	eor	v11.16b, v11.16b, v10.16b
+	aese	v2.16b, v27.16b                 //AES block 4k+6 - round 9
+.L128_enc_tail:	//TAIL
+	sub	x5, x4, x0   //main_end_input_ptr is number of bytes left
+	ldp	x6, x7, [x0], #16           //AES block 4k+4 - load plaintext
+	cmp	x5, #48
+	ext	v8.16b, v11.16b, v11.16b, #8    //prepare final partial tag
+	eor	x6, x6, x13                     //AES block 4k+4 - round 10 low
+	eor	x7, x7, x14                     //AES block 4k+4 - round 10 high
+	fmov	d4, x6                          //AES block 4k+4 - mov low
+	fmov	v4.d[1], x7                     //AES block 4k+4 - mov high
+	eor	v5.16b, v4.16b, v0.16b          //AES block 4k+4 - result
+	b.gt	.L128_enc_blocks_more_than_3
+	sub	w12, w12, #1
+	movi	v11.8b, #0
+	mov	v3.16b, v2.16b
+	cmp	x5, #32
+	mov	v2.16b, v1.16b
+	movi	v9.8b, #0
+	movi	v10.8b, #0
+	b.gt	.L128_enc_blocks_more_than_2
+	mov	v3.16b, v1.16b
+	cmp	x5, #16
+	sub	w12, w12, #1
+	b.gt	.L128_enc_blocks_more_than_1
+	sub	w12, w12, #1
+	b	.L128_enc_blocks_less_than_1
+.L128_enc_blocks_more_than_3:	//blocks	left >  3
+	st1	{ v5.16b}, [x2], #16    //AES final-3 block  - store result
+	ldp	x6, x7, [x0], #16       //AES final-2 block-load input low&high
+	rev64	v4.16b, v5.16b          //GHASH final-3 block
+	eor	v4.16b, v4.16b, v8.16b  //feed in partial tag
+	eor	x7, x7, x14             //AES final-2 block - round 10 high
+	eor	x6, x6, x13             //AES final-2 block - round 10 low
+	fmov	d5, x6                  //AES final-2 block - mov low
+	movi	v8.8b, #0               //suppress further partial tag feed in
+	fmov	v5.d[1], x7             //AES final-2 block - mov high
+	pmull	v11.1q, v4.1d, v15.1d   //GHASH final-3 block - low
+	mov	d22, v4.d[1]            //GHASH final-3 block - mid
+	pmull2	v9.1q, v4.2d, v15.2d    //GHASH final-3 block - high
+	mov	d10, v17.d[1]           //GHASH final-3 block - mid
+	eor	v5.16b, v5.16b, v1.16b  //AES final-2 block - result
+	eor	v22.8b, v22.8b, v4.8b   //GHASH final-3 block - mid
+	pmull	v10.1q, v22.1d, v10.1d  //GHASH final-3 block - mid
+.L128_enc_blocks_more_than_2:	//blocks	left >  2
+	st1	{ v5.16b}, [x2], #16    //AES final-2 block - store result
+	rev64	v4.16b, v5.16b          //GHASH final-2 block
+	ldp	x6, x7, [x0], #16       //AES final-1 block-load input low&high
+	eor	v4.16b, v4.16b, v8.16b  //feed in partial tag
+	eor	x6, x6, x13             //AES final-1 block - round 10 low
+	fmov	d5, x6                  //AES final-1 block - mov low
+	eor	x7, x7, x14             //AES final-1 block - round 10 high
+	pmull2	v20.1q, v4.2d, v14.2d   //GHASH final-2 block - high
+	fmov	v5.d[1], x7             //AES final-1 block - mov high
+	mov	d22, v4.d[1]            //GHASH final-2 block - mid
+	pmull	v21.1q, v4.1d, v14.1d   //GHASH final-2 block - low
+	eor	v9.16b, v9.16b, v20.16b //GHASH final-2 block - high
+	eor	v22.8b, v22.8b, v4.8b   //GHASH final-2 block - mid
+	eor	v5.16b, v5.16b, v2.16b  //AES final-1 block - result
+	eor	v11.16b, v11.16b, v21.16b       //GHASH final-2 block - low
+	pmull	v22.1q, v22.1d, v17.1d  //GHASH final-2 block - mid
+	movi	v8.8b, #0               //suppress further partial tag feed in
+	eor	v10.16b, v10.16b, v22.16b       //GHASH final-2 block - mid
+.L128_enc_blocks_more_than_1:	//blocks	left >  1
+	st1	{ v5.16b}, [x2], #16    //AES final-1 block - store result
+	rev64	v4.16b, v5.16b          //GHASH final-1 block
+	ldp	x6, x7, [x0], #16       //AES final block - load input low & high
+	eor	v4.16b, v4.16b, v8.16b  //feed in partial tag
+	eor	x7, x7, x14             //AES final block - round 10 high
+	eor	x6, x6, x13             //AES final block - round 10 low
+	fmov	d5, x6                  //AES final block - mov low
+	pmull2	v20.1q, v4.2d, v13.2d   //GHASH final-1 block - high
+	fmov	v5.d[1], x7             //AES final block - mov high
+	mov	d22, v4.d[1]            //GHASH final-1 block - mid
+	pmull	v21.1q, v4.1d, v13.1d   //GHASH final-1 block - low
+	eor	v22.8b, v22.8b, v4.8b   //GHASH final-1 block - mid
+	eor	v5.16b, v5.16b, v3.16b  //AES final block - result
+	ins	v22.d[1], v22.d[0]      //GHASH final-1 block - mid
+	pmull2	v22.1q, v22.2d, v16.2d  //GHASH final-1 block - mid
+	eor	v11.16b, v11.16b, v21.16b       //GHASH final-1 block - low
+	eor	v9.16b, v9.16b, v20.16b         //GHASH final-1 block - high
+	eor	v10.16b, v10.16b, v22.16b       //GHASH final-1 block - mid
+	movi	v8.8b, #0                       //suppress further partial tag feed in
+.L128_enc_blocks_less_than_1:	//blocks	left <= 1
+	and	x1, x1, #127                    //bit_length %= 128
+	mvn	x13, xzr                        //rk10_l = 0xffffffffffffffff
+	mvn	x14, xzr                        //rk10_h = 0xffffffffffffffff
+	sub	x1, x1, #128                    //bit_length -= 128
+	neg	x1, x1                          //bit_length = 128 - #bits
+	and	x1, x1, #127                    //bit_length %= 128
+	lsr	x14, x14, x1
+	cmp	x1, #64
+	csel	x6, x13, x14, lt
+	csel	x7, x14, xzr, lt
+	fmov	d0, x6                          //ctr0b is mask for last block
+	fmov	v0.d[1], x7
+        //possibly partial last block has zeroes in highest bits
+	and	v5.16b, v5.16b, v0.16b
+	rev64	v4.16b, v5.16b                  //GHASH final block
+	eor	v4.16b, v4.16b, v8.16b          //feed in partial tag
+	mov	d8, v4.d[1]                     //GHASH final block - mid
+        //load existing bytes where the possibly partial last block is to be stored
+	ld1	{ v18.16b}, [x2]
+	rev	w9, w12
+	karasuba_multiply v4, v12, v20, v8, v21
+	load_const
+	gcm_tidy_up v9, v10, v11, v30, v31
+        //insert existing bytes in top end of result
+	bif	v5.16b, v18.16b, v0.16b
+	st1	{ v5.16b}, [x2]                 //store all 16B
+	str	w9, [x16, #12]                  //store the updated counter
+	mov	x0, x15
+	st1	{ v11.16b }, [x3]
+	pop_stack
+	ret
+.L128_enc_ret:
+	mov	w0, #0x0
+	ret
+SYM_FUNC_END(pmull_gcm_encrypt_unroll)
+
+SYM_FUNC_START(pmull_gcm_decrypt_unroll)
+	cbz	x1, .L128_dec_ret
+        push_stack
+
+	mov	x16, x4
+	mov	x8, x5
+	lsr	x5, x1, #3                      //byte_len
+	mov	x15, x5
+        mov     x17, x6
+	ldp	x10, x11, [x16]                 //ctr96_b64, ctr96_t32
+	sub	x5, x5, #1                      //byte_len - 1
+	ldr	q18, [x8, #0]                   //load rk0
+	and	x5, x5, #0xffffffffffffffc0
+	ld1	{ v0.16b}, [x16]
+	ldr	q28, [x8, #160]                 //load rk10
+	ldr	q13, [x3, #64]                  //load h2l | h2h
+	ext	v13.16b, v13.16b, v13.16b, #8
+	lsr	x12, x11, #32
+	fmov	d2, x10                         //CTR block 2
+	ldr	q19, [x8, #16]                  //load rk1
+	orr	w11, w11, w11
+	rev	w12, w12                        //rev_ctr32
+	fmov	d1, x10                         //CTR block 1
+	add	w12, w12, #1                    //increment rev_ctr32
+	aes_encrypt_round	v0, v18         //AES block 0 - round 0
+	rev	w9, w12                         //CTR block 1
+	orr	x9, x11, x9, lsl #32            //CTR block 1
+	ldr	q20, [x8, #32]                  //load rk2
+	add	w12, w12, #1                    //CTR block 1
+	fmov	v1.d[1], x9                     //CTR block 1
+	rev	w9, w12                         //CTR block 2
+	add	w12, w12, #1                    //CTR block 2
+	aes_encrypt_round	v0, v19         //AES block 0 - round 1
+	orr	x9, x11, x9, lsl #32            //CTR block 2
+	fmov	v2.d[1], x9                     //CTR block 2
+	rev	w9, w12                         //CTR block 3
+	fmov	d3, x10                         //CTR block 3
+	orr	x9, x11, x9, lsl #32            //CTR block 3
+	add	w12, w12, #1                    //CTR block 3
+	fmov	v3.d[1], x9                     //CTR block 3
+	add	x4, x0, x1, lsr #3              //end_input_ptr
+	aes_encrypt_round	v1, v18          //AES block 1 - round 0
+	ldr	q21, [x8, #48]                                 //load rk3
+	aes_encrypt_round	v0, v20          //AES block 0 - round 2
+	ldr	q24, [x8, #96]                                 //load rk6
+	aes_encrypt_round	v2, v18          //AES block 2 - round 0
+	ldr	q25, [x8, #112]                                //load rk7
+	aes_encrypt_round	v1, v19          //AES block 1 - round 1
+	ldr	q22, [x8, #64]                                 //load rk4
+	aes_encrypt_round	v3, v18          //AES block 3 - round 0
+	aes_encrypt_round	v2, v19          //AES block 2 - round 1
+	aes_encrypt_round	v1, v20          //AES block 1 - round 2
+	ldp	x13, x14, [x8, #160]                     //load rk10
+	aes_encrypt_round	v3, v19          //AES block 3 - round 1
+	load_initial_tag v11,x3
+	aes_encrypt_round	v0, v21          //AES block 0 - round 3
+	ldr	q23, [x8, #80]                                 //load rk5
+	aes_encrypt_round	v1, v21          //AES block 1 - round 3
+	aes_encrypt_round	v3, v20          //AES block 3 - round 2
+	aes_encrypt_round	v2, v20          //AES block 2 - round 2
+	ldr	q27, [x8, #144]                                //load rk9
+	aes_encrypt_round	v1, v22          //AES block 1 - round 4
+	aes_encrypt_round	v3, v21          //AES block 3 - round 3
+	aes_encrypt_round	v2, v21          //AES block 2 - round 3
+	ldr	q14, [x3, #80]                         //load h3l | h3h
+	ext	v14.16b, v14.16b, v14.16b, #8
+	aes_encrypt_round	v0, v22          //AES block 0 - round 4
+	ldr	q26, [x8, #128]                                //load rk8
+	aes_encrypt_round	v1, v23          //AES block 1 - round 5
+	aes_encrypt_round	v2, v22          //AES block 2 - round 4
+	aes_encrypt_round	v3, v22          //AES block 3 - round 4
+	aes_encrypt_round	v0, v23          //AES block 0 - round 5
+	aes_encrypt_round	v2, v23          //AES block 2 - round 5
+	ldr	q12, [x3, #32]                         //load h1l | h1h
+	ext	v12.16b, v12.16b, v12.16b, #8
+	aes_encrypt_round	v3, v23          //AES block 3 - round 5
+	aes_encrypt_round	v0, v24          //AES block 0 - round 6
+	aes_encrypt_round	v1, v24          //AES block 1 - round 6
+	aes_encrypt_round	v3, v24          //AES block 3 - round 6
+	aes_encrypt_round	v2, v24          //AES block 2 - round 6
+	trn1	v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
+	ldr	q15, [x3, #112]                        //load h4l | h4h
+	ext	v15.16b, v15.16b, v15.16b, #8
+	trn2	v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
+	add	x5, x5, x0
+	aes_encrypt_round	v1, v25          //AES block 1 - round 7
+	aes_encrypt_round	v2, v25          //AES block 2 - round 7
+	aes_encrypt_round	v0, v25          //AES block 0 - round 7
+	eor	v16.16b, v16.16b, v8.16b                     //h2k | h1k
+	aes_encrypt_round	v3, v25          //AES block 3 - round 7
+	aes_encrypt_round	v1, v26          //AES block 1 - round 8
+	trn2	v17.2d,  v14.2d,    v15.2d                      //h4l | h3l
+	aes_encrypt_round	v2, v26          //AES block 2 - round 8
+	aes_encrypt_round	v3, v26          //AES block 3 - round 8
+	aes_encrypt_round	v0, v26          //AES block 0 - round 8
+
+	mov     x6, x17
+	sub    x6, x6, #10
+	cbz     x6, .Lleft_dec_rounds
+	aes_enc_extra_round 12
+	sub    x6, x6, #2
+	cbz     x6, .Lleft_dec_rounds
+	aes_enc_extra_round 14
+
+.Lleft_dec_rounds:
+	trn1	v9.2d, v14.2d,    v15.2d        //h4h | h3h
+	aese	v2.16b, v27.16b                 //AES block 2 - round 9
+	aese	v3.16b, v27.16b                 //AES block 3 - round 9
+	aese	v0.16b, v27.16b                 //AES block 0 - round 9
+	cmp	x0, x5                   //check if we have <= 4 blocks
+	aese	v1.16b, v27.16b                 //AES block 1 - round 9
+	eor	v17.16b, v17.16b, v9.16b        //h4k | h3k
+	b.ge	.L128_dec_tail                  //handle tail
+	ldr	q5, [x0, #16]                   //AES block 1 - load ciphertext
+	ldr	q4, [x0, #0]                    //AES block 0 - load ciphertext
+	eor	v1.16b, v5.16b, v1.16b          //AES block 1 - result
+	ldr	q6, [x0, #32]                   //AES block 2 - load ciphertext
+	eor	v0.16b, v4.16b, v0.16b          //AES block 0 - result
+	rev64	v4.16b, v4.16b                  //GHASH block 0
+	rev	w9, w12                         //CTR block 4
+	orr	x9, x11, x9, lsl #32            //CTR block 4
+	add	w12, w12, #1                    //CTR block 4
+	ldr	q7, [x0, #48]                   //AES block 3 - load ciphertext
+	rev64	v5.16b, v5.16b                  //GHASH block 1
+	add	x0, x0, #64                     //AES input_ptr update
+	mov	x19, v1.d[0]                    //AES block 1 - mov low
+	mov	x20, v1.d[1]                    //AES block 1 - mov high
+	mov	x6, v0.d[0]                     //AES block 0 - mov low
+	cmp	x0, x5                   //check if we have <= 8 blocks
+	mov	x7, v0.d[1]                     //AES block 0 - mov high
+	fmov	d0, x10                         //CTR block 4
+	fmov	v0.d[1], x9                     //CTR block 4
+	rev	w9, w12                         //CTR block 5
+	eor	x19, x19, x13                   //AES block 1 - round 10 low
+	fmov	d1, x10                         //CTR block 5
+	add	w12, w12, #1                    //CTR block 5
+	orr	x9, x11, x9, lsl #32            //CTR block 5
+	fmov	v1.d[1], x9                     //CTR block 5
+	rev	w9, w12                         //CTR block 6
+	add	w12, w12, #1                    //CTR block 6
+	orr	x9, x11, x9, lsl #32            //CTR block 6
+	eor	x20, x20, x14                   //AES block 1 - round 10 high
+	eor	x6, x6, x13                   //AES block 0 - round 10 low
+	eor	v2.16b, v6.16b, v2.16b          //AES block 2 - result
+	eor	x7, x7, x14                   //AES block 0 - round 10 high
+	stp	x6, x7, [x2], #16               //AES block 0 - store result
+	stp	x19, x20, [x2], #16             //AES block 1 - store result
+	b.ge	.L128_dec_prepretail            //do prepretail
+.L128_dec_main_loop:	//main	loop start
+	eor	v3.16b, v7.16b, v3.16b          //AES block 4k+3 - result
+	ext	v11.16b, v11.16b, v11.16b, #8   //PRE 0
+	mov	x21, v2.d[0]                    //AES block 4k+2 - mov low
+	pmull2	v28.1q, v5.2d, v14.2d           //GHASH block 4k+1 - high
+	mov	x22, v2.d[1]                    //AES block 4k+2 - mov high
+	aes_encrypt_round	v1, v18          //AES block 4k+5 - round 0
+	fmov	d2, x10                         //CTR block 4k+6
+	rev64	v6.16b, v6.16b                  //GHASH block 4k+2
+	fmov	v2.d[1], x9                     //CTR block 4k+6
+	rev	w9, w12                         //CTR block 4k+7
+	mov	x23, v3.d[0]                    //AES block 4k+3 - mov low
+	eor	v4.16b, v4.16b, v11.16b         //PRE 1
+	mov	d30, v5.d[1]                    //GHASH block 4k+1 - mid
+	aes_encrypt_round	v1, v19         //AES block 4k+5 - round 1
+	rev64	v7.16b, v7.16b                  //GHASH block 4k+3
+	pmull	v29.1q, v5.1d, v14.1d           //GHASH block 4k+1 - low
+	mov	x24, v3.d[1]                    //AES block 4k+3 - mov high
+	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
+	pmull	v11.1q, v4.1d, v15.1d           //GHASH block 4k - low
+	fmov	d3, x10                         //CTR block 4k+7
+	eor	v30.8b, v30.8b, v5.8b           //GHASH block 4k+1 - mid
+	aes_encrypt_round	v1, v20          //AES block 4k+5 - round 2
+	fmov	v3.d[1], x9                     //CTR block 4k+7
+	aes_encrypt_round	v2, v18          //AES block 4k+6 - round 0
+	mov	d10, v17.d[1]                   //GHASH block 4k - mid
+	pmull2	v9.1q, v4.2d, v15.2d            //GHASH block 4k - high
+	eor	v11.16b, v11.16b, v29.16b       //GHASH block 4k+1 - low
+	pmull	v29.1q, v7.1d, v12.1d           //GHASH block 4k+3 - low
+	aes_encrypt_round	v1, v21          //AES block 4k+5 - round 3
+	mov	d8, v4.d[1]                     //GHASH block 4k - mid
+	aes_encrypt_round	v3, v18          //AES block 4k+7 - round 0
+	eor	v9.16b, v9.16b, v28.16b         //GHASH block 4k+1 - high
+	aes_encrypt_round	v0, v18          //AES block 4k+4 - round 0
+	pmull	v28.1q, v6.1d, v13.1d           //GHASH block 4k+2 - low
+	eor	v8.8b, v8.8b, v4.8b             //GHASH block 4k - mid
+	aes_encrypt_round	v3, v19          //AES block 4k+7 - round 1
+	eor	x23, x23, x13                   //AES block 4k+3 - round 10 low
+	pmull	v30.1q, v30.1d, v17.1d          //GHASH block 4k+1 - mid
+	eor	x22, x22, x14                   //AES block 4k+2 - round 10 high
+	mov	d31, v6.d[1]                    //GHASH block 4k+2 - mid
+	aes_encrypt_round	v0, v19          //AES block 4k+4 - round 1
+	eor	v11.16b, v11.16b, v28.16b       //GHASH block 4k+2 - low
+	pmull	v10.1q, v8.1d, v10.1d           //GHASH block 4k - mid
+	aes_encrypt_round	v3, v20          //AES block 4k+7 - round 2
+	eor	v31.8b, v31.8b, v6.8b           //GHASH block 4k+2 - mid
+	aes_encrypt_round	v0, v20          //AES block 4k+4 - round 2
+	aes_encrypt_round	v1, v22          //AES block 4k+5 - round 4
+	eor	v10.16b, v10.16b, v30.16b       //GHASH block 4k+1 - mid
+	pmull2	v8.1q, v6.2d, v13.2d            //GHASH block 4k+2 - high
+	aes_encrypt_round	v0, v21          //AES block 4k+4 - round 3
+	ins	v31.d[1], v31.d[0]              //GHASH block 4k+2 - mid
+	pmull2	v4.1q, v7.2d, v12.2d            //GHASH block 4k+3 - high
+	aes_encrypt_round	v2, v19          //AES block 4k+6 - round 1
+	mov	d30, v7.d[1]                    //GHASH block 4k+3 - mid
+	aes_encrypt_round	v0, v22          //AES block 4k+4 - round 4
+	eor	v9.16b, v9.16b, v8.16b          //GHASH block 4k+2 - high
+	pmull2	v31.1q, v31.2d, v16.2d          //GHASH block 4k+2 - mid
+	eor	x24, x24, x14                   //AES block 4k+3 - round 10 high
+	aes_encrypt_round	v2, v20          //AES block 4k+6 - round 2
+	eor	v30.8b, v30.8b, v7.8b           //GHASH block 4k+3 - mid
+	aes_encrypt_round	v1, v23          //AES block 4k+5 - round 5
+	eor	x21, x21, x13                   //AES block 4k+2 - round 10 low
+	aes_encrypt_round	v0, v23          //AES block 4k+4 - round 5
+	movi	v8.8b, #0xc2
+	aes_encrypt_round	v2, v21          //AES block 4k+6 - round 3
+	eor	v11.16b, v11.16b, v29.16b       //GHASH block 4k+3 - low
+	aes_encrypt_round	v1, v24          //AES block 4k+5 - round 6
+	aes_encrypt_round	v0, v24          //AES block 4k+4 - round 6
+	eor	v10.16b, v10.16b, v31.16b       //GHASH block 4k+2 - mid
+	aes_encrypt_round	v2, v22          //AES block 4k+6 - round 4
+	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
+	pmull	v30.1q, v30.1d, v16.1d          //GHASH block 4k+3 - mid
+	eor	v9.16b, v9.16b, v4.16b          //GHASH block 4k+3 - high
+	ldr	q4, [x0, #0]                    //AES block 4k+4 - load cipher
+	aes_encrypt_round	v1, v25          //AES block 4k+5 - round 7
+	add	w12, w12, #1                    //CTR block 4k+7
+	aes_encrypt_round	v0, v25          //AES block 4k+4 - round 7
+	shl	d8, d8, #56                     //mod_constant
+	aes_encrypt_round	v2, v23          //AES block 4k+6 - round 5
+	eor	v10.16b, v10.16b, v30.16b       //GHASH block 4k+3 - mid
+	aes_encrypt_round	v1, v26          //AES block 4k+5 - round 8
+	stp	x23, x24, [x2], #16             //AES block 4k+3 - store result
+	aes_encrypt_round	v0, v26          //AES block 4k+4 - round 8
+	eor	v30.16b, v11.16b, v9.16b        //MODULO - karatsuba tidy up
+        ldr	q27, [x8, #144]                 //load rk9
+	aes_encrypt_round	v3, v21         //AES block 4k+7 - round 3
+	rev	w9, w12                         //CTR block 4k+8
+	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
+	ldr	q5, [x0, #16]                   //AES block 4k+5 - load ciphertext
+	ext	v9.16b, v9.16b, v9.16b, #8      //MODULO - other top alignment
+	ldr	q28, [x8, #160]                 //load rk9
+	orr	x9, x11, x9, lsl #32            //CTR block 4k+8
+	aes_encrypt_round	v3, v22         //AES block 4k+7 - round 4
+	eor	v10.16b, v10.16b, v30.16b       //MODULO - karatsuba tidy up
+	aes_encrypt_round	v2, v24         //AES block 4k+6 - round 6
+	aes_encrypt_round	v3, v23         //AES block 4k+7 - round 5
+	ldr	q6, [x0, #32]                   //AES block 4k+6 - load ciphertext
+	add	w12, w12, #1                    //CTR block 4k+8
+	eor	v10.16b, v10.16b, v31.16b       //MODULO - fold into mid
+	aes_encrypt_round	v2, v25         //AES block 4k+6 - round 7
+	ldr	q7, [x0, #48]                   //AES block 4k+3 - load ciphertext
+	aes_encrypt_round	v3, v24         //AES block 4k+7 - round 6
+	add	x0, x0, #64                     //AES input_ptr update
+	aes_encrypt_round	v3, v25         //AES block 4k+7 - round 7
+	eor	v10.16b, v10.16b, v9.16b        //MODULO - fold into mid
+	aes_encrypt_round	v2, v26         //AES block 4k+6 - round 8
+	aes_encrypt_round	v3, v26         //AES block 4k+7 - round 8
+
+	mov	x18, x17
+	sub	x18,x18,#10
+	cbz	x18, .Lleft2_dec_rounds
+	aes_enc_extra_round	12
+	sub	x18,x18,#2
+	cbz	x18, .Lleft2_dec_rounds
+	aes_enc_extra_round 14
+
+.Lleft2_dec_rounds:
+	aese	v0.16b, v27.16b                 //AES block 4k+4 - round 9
+	aese	v1.16b, v27.16b                 //AES block 4k+5 - round 9
+	eor	v0.16b, v4.16b, v0.16b          //AES block 4k+4 - result
+	eor	v1.16b, v5.16b, v1.16b          //AES block 4k+5 - result
+	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
+	rev64	v5.16b, v5.16b                  //GHASH block 4k+5
+	mov	x7, v0.d[1]                     //AES block 4k+4 - mov high
+	mov	x6, v0.d[0]                     //AES block 4k+4 - mov low
+	fmov	d0, x10                         //CTR block 4k+8
+	fmov	v0.d[1], x9                     //CTR block 4k+8
+	rev	w9, w12                         //CTR block 4k+9
+	aese	v2.16b, v27.16b                 //AES block 4k+6 - round 9
+	orr	x9, x11, x9, lsl #32            //CTR block 4k+9
+	ext	v10.16b, v10.16b, v10.16b, #8   //MODULO - other mid alignment
+	eor	x7, x7, x14                   //AES block 4k+4 - round 10 high
+	eor	v11.16b, v11.16b, v8.16b        //MODULO - fold into low
+	mov	x20, v1.d[1]                    //AES block 4k+5 - mov high
+	eor	x6, x6, x13                   //AES block 4k+4 - round 10 low
+	eor	v2.16b, v6.16b, v2.16b          //AES block 4k+6 - result
+	mov	x19, v1.d[0]                    //AES block 4k+5 - mov low
+	add	w12, w12, #1                    //CTR block 4k+9
+	aese	v3.16b, v27.16b                 //AES block 4k+7 - round 9
+	fmov	d1, x10                         //CTR block 4k+9
+	cmp	x0, x5                   //.LOOP CONTROL
+	rev64	v4.16b, v4.16b                  //GHASH block 4k+4
+	eor	v11.16b, v11.16b, v10.16b       //MODULO - fold into low
+	fmov	v1.d[1], x9                     //CTR block 4k+9
+	rev	w9, w12                         //CTR block 4k+10
+	add	w12, w12, #1                    //CTR block 4k+10
+	eor	x20, x20, x14                   //AES block 4k+5 - round 10 high
+	stp	x6, x7, [x2], #16        //AES block 4k+4 - store result
+	eor	x19, x19, x13                   //AES block 4k+5 - round 10 low
+	stp	x19, x20, [x2], #16        //AES block 4k+5 - store result
+	orr	x9, x11, x9, lsl #32            //CTR block 4k+10
+	b.lt	.L128_dec_main_loop
+.L128_dec_prepretail:	//PREPRETAIL
+	ext	v11.16b, v11.16b, v11.16b, #8   //PRE 0
+	mov	x21, v2.d[0]                    //AES block 4k+2 - mov low
+	mov	d30, v5.d[1]                    //GHASH block 4k+1 - mid
+	aes_encrypt_round	v0, v18         //AES block 4k+4 - round 0
+	eor	v3.16b, v7.16b, v3.16b          //AES block 4k+3 - result
+	aes_encrypt_round	v1, v18         //AES block 4k+5 - round 0
+	mov	x22, v2.d[1]                    //AES block 4k+2 - mov high
+	eor	v4.16b, v4.16b, v11.16b         //PRE 1
+	fmov	d2, x10                         //CTR block 4k+6
+	rev64	v6.16b, v6.16b                  //GHASH block 4k+2
+	aes_encrypt_round	v0, v19         //AES block 4k+4 - round 1
+	fmov	v2.d[1], x9                     //CTR block 4k+6
+	rev	w9, w12                         //CTR block 4k+7
+	mov	x23, v3.d[0]                    //AES block 4k+3 - mov low
+	eor	v30.8b, v30.8b, v5.8b           //GHASH block 4k+1 - mid
+	pmull	v11.1q, v4.1d, v15.1d           //GHASH block 4k - low
+	mov	d10, v17.d[1]                   //GHASH block 4k - mid
+	mov	x24, v3.d[1]                    //AES block 4k+3 - mov high
+	aes_encrypt_round	v1, v19         //AES block 4k+5 - round 1
+	mov	d31, v6.d[1]                    //GHASH block 4k+2 - mid
+	aes_encrypt_round	v0, v20         //AES block 4k+4 - round 2
+	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
+	pmull	v29.1q, v5.1d, v14.1d           //GHASH block 4k+1 - low
+	mov	d8, v4.d[1]                     //GHASH block 4k - mid
+	fmov	d3, x10                         //CTR block 4k+7
+	aes_encrypt_round	v2, v18         //AES block 4k+6 - round 0
+	fmov	v3.d[1], x9                     //CTR block 4k+7
+	pmull	v30.1q, v30.1d, v17.1d          //GHASH block 4k+1 - mid
+	eor	v31.8b, v31.8b, v6.8b           //GHASH block 4k+2 - mid
+	rev64	v7.16b, v7.16b                  //GHASH block 4k+3
+	aes_encrypt_round	v2, v19         //AES block 4k+6 - round 1
+	eor	v8.8b, v8.8b, v4.8b             //GHASH block 4k - mid
+	pmull2	v9.1q, v4.2d, v15.2d            //GHASH block 4k - high
+	aes_encrypt_round	v3, v18         //AES block 4k+7 - round 0
+	ins	v31.d[1], v31.d[0]              //GHASH block 4k+2 - mid
+	pmull2	v28.1q, v5.2d, v14.2d           //GHASH block 4k+1 - high
+	pmull	v10.1q, v8.1d, v10.1d           //GHASH block 4k - mid
+	eor	v11.16b, v11.16b, v29.16b       //GHASH block 4k+1 - low
+	pmull	v29.1q, v7.1d, v12.1d           //GHASH block 4k+3 - low
+	pmull2	v31.1q, v31.2d, v16.2d          //GHASH block 4k+2 - mid
+	eor	v9.16b, v9.16b, v28.16b         //GHASH block 4k+1 - high
+	eor	v10.16b, v10.16b, v30.16b       //GHASH block 4k+1 - mid
+	pmull2	v4.1q, v7.2d, v12.2d            //GHASH block 4k+3 - high
+	pmull2	v8.1q, v6.2d, v13.2d            //GHASH block 4k+2 - high
+	mov	d30, v7.d[1]                    //GHASH block 4k+3 - mid
+	aes_encrypt_round	v1, v20         //AES block 4k+5 - round 2
+	eor	v10.16b, v10.16b, v31.16b       //GHASH block 4k+2 - mid
+	pmull	v28.1q, v6.1d, v13.1d           //GHASH block 4k+2 - low
+	eor	v9.16b, v9.16b, v8.16b          //GHASH block 4k+2 - high
+	movi	v8.8b, #0xc2
+	aes_encrypt_round	v3, v19         //AES block 4k+7 - round 1
+	eor	v30.8b, v30.8b, v7.8b           //GHASH block 4k+3 - mid
+	eor	v11.16b, v11.16b, v28.16b       //GHASH block 4k+2 - low
+	aes_encrypt_round	v2, v20         //AES block 4k+6 - round 2
+	eor	v9.16b, v9.16b, v4.16b          //GHASH block 4k+3 - high
+	aes_encrypt_round	v3, v20         //AES block 4k+7 - round 2
+	eor	x23, x23, x13                   //AES block 4k+3 - round 10 low
+	pmull	v30.1q, v30.1d, v16.1d          //GHASH block 4k+3 - mid
+	eor	x21, x21, x13                   //AES block 4k+2 - round 10 low
+	eor	v11.16b, v11.16b, v29.16b       //GHASH block 4k+3 - low
+	aes_encrypt_round	v2, v21         //AES block 4k+6 - round 3
+	aes_encrypt_round	v1, v21         //AES block 4k+5 - round 3
+	shl	d8, d8, #56               //mod_constant
+	aes_encrypt_round	v0, v21         //AES block 4k+4 - round 3
+	aes_encrypt_round	v2, v22         //AES block 4k+6 - round 4
+	eor	v10.16b, v10.16b, v30.16b       //GHASH block 4k+3 - mid
+	aes_encrypt_round	v1, v22         //AES block 4k+5 - round 4
+	aes_encrypt_round	v3, v21         //AES block 4k+7 - round 3
+	eor	v30.16b, v11.16b, v9.16b        //MODULO - karatsuba tidy up
+	aes_encrypt_round	v2, v23         //AES block 4k+6 - round 5
+	aes_encrypt_round	v1, v23         //AES block 4k+5 - round 5
+	aes_encrypt_round	v3, v22         //AES block 4k+7 - round 4
+	aes_encrypt_round	v0, v22         //AES block 4k+4 - round 4
+	eor	v10.16b, v10.16b, v30.16b       //MODULO - karatsuba tidy up
+	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
+	aes_encrypt_round	v1, v24         //AES block 4k+5 - round 6
+	ext	v9.16b, v9.16b, v9.16b, #8      //MODULO - other top alignment
+	aes_encrypt_round	v3, v23         //AES block 4k+7 - round 5
+	aes_encrypt_round	v0, v23         //AES block 4k+4 - round 5
+	eor	v10.16b, v10.16b, v31.16b       //MODULO - fold into mid
+	aes_encrypt_round	v1, v25         //AES block 4k+5 - round 7
+	aes_encrypt_round	v2, v24         //AES block 4k+6 - round 6
+	ldr	q27, [x8, #144]                 //load rk9
+	aes_encrypt_round	v0, v24         //AES block 4k+4 - round 6
+	aes_encrypt_round	v1, v26         //AES block 4k+5 - round 8
+	eor	v10.16b, v10.16b, v9.16b        //MODULO - fold into mid
+	aes_encrypt_round	v3, v24         //AES block 4k+7 - round 6
+	ldr	q28, [x8, #160]                 //load rk9
+	aes_encrypt_round	v0, v25         //AES block 4k+4 - round 7
+	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
+	eor	x24, x24, x14                   //AES block 4k+3 - round 10 high
+	aes_encrypt_round	v2, v25         //AES block 4k+6 - round 7
+	ext	v10.16b, v10.16b, v10.16b, #8   //MODULO - other mid alignment
+	aes_encrypt_round	v3, v25         //AES block 4k+7 - round 7
+	aes_encrypt_round	v0, v26         //AES block 4k+4 - round 8
+	eor	v11.16b, v11.16b, v8.16b        //MODULO - fold into low
+	aes_encrypt_round	v2, v26         //AES block 4k+6 - round 8
+	aes_encrypt_round	v3, v26         //AES block 4k+7 - round 8
+	mov	x6, x17
+	sub	x6,x6,#10
+	cbz	x6, .Lleft3_dec_rounds
+	aes_enc_extra_round	12
+	sub	x6,x6,#2
+	cbz	x6, .Lleft3_dec_rounds
+	aes_enc_extra_round	14
+.Lleft3_dec_rounds:
+	eor	x22, x22, x14                   //AES block 4k+2 - round 10 high
+	aese	v0.16b, v27.16b                 //AES block 4k+4 - round 9
+	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
+	aese	v1.16b, v27.16b                 //AES block 4k+5 - round 9
+	aese	v2.16b, v27.16b                 //AES block 4k+6 - round 9
+	add	w12, w12, #1                    //CTR block 4k+7
+	stp	x23, x24, [x2], #16        //AES block 4k+3 - store result
+	aese	v3.16b, v27.16b                 //AES block 4k+7 - round 9
+	eor	v11.16b, v11.16b, v10.16b       //MODULO - fold into low
+.L128_dec_tail:	//TAIL
+	sub	x5, x4, x0   //main_end_input_ptr is number of bytes left
+	ld1	{ v5.16b}, [x0], #16            //AES block 4k+4 - load cipher
+	eor	v0.16b, v5.16b, v0.16b          //AES block 4k+4 - result
+	mov	x7, v0.d[1]                     //AES block 4k+4 - mov high
+	mov	x6, v0.d[0]                     //AES block 4k+4 - mov low
+	cmp	x5, #48
+	eor	x7, x7, x14                     //AES block 4k+4 - round 10 high
+	ext	v8.16b, v11.16b, v11.16b, #8    //prepare final partial tag
+	eor	x6, x6, x13                     //AES block 4k+4 - round 10 low
+	b.gt	.L128_dec_blocks_more_than_3
+	mov	v3.16b, v2.16b
+	sub	w12, w12, #1
+	movi	v11.8b, #0
+	movi	v9.8b, #0
+	mov	v2.16b, v1.16b
+	movi	v10.8b, #0
+	cmp	x5, #32
+	b.gt	.L128_dec_blocks_more_than_2
+	cmp	x5, #16
+	mov	v3.16b, v1.16b
+	sub	w12, w12, #1
+	b.gt	.L128_dec_blocks_more_than_1
+	sub	w12, w12, #1
+	b	.L128_dec_blocks_less_than_1
+.L128_dec_blocks_more_than_3:	//blocks	left >  3
+	rev64	v4.16b, v5.16b                  //GHASH final-3 block
+	ld1	{ v5.16b}, [x0], #16            //final-2 block - load cipher
+	eor	v4.16b, v4.16b, v8.16b          //feed in partial tag
+	mov	d10, v17.d[1]                   //GHASH final-3 block - mid
+	stp	x6, x7, [x2], #16        //AES final-3 block  - store result
+	eor	v0.16b, v5.16b, v1.16b          //AES final-2 block - result
+	mov	d22, v4.d[1]                    //GHASH final-3 block - mid
+	mov	x7, v0.d[1]                     //AES final-2 block - mov high
+	pmull	v11.1q, v4.1d, v15.1d           //GHASH final-3 block - low
+	mov	x6, v0.d[0]                     //AES final-2 block - mov low
+	pmull2	v9.1q, v4.2d, v15.2d            //GHASH final-3 block - high
+	eor	v22.8b, v22.8b, v4.8b           //GHASH final-3 block - mid
+	movi	v8.8b, #0                       //suppress further partial tag
+	eor	x7, x7, x14                   //final-2 block - round 10 high
+	pmull	v10.1q, v22.1d, v10.1d          //GHASH final-3 block - mid
+	eor	x6, x6, x13                   //AES final-2 block - round 10 low
+.L128_dec_blocks_more_than_2:	//blocks	left >  2
+	rev64	v4.16b, v5.16b                  //GHASH final-2 block
+	ld1	{ v5.16b}, [x0], #16          //final-1 block - load ciphertext
+	eor	v4.16b, v4.16b, v8.16b          //feed in partial tag
+	eor	v0.16b, v5.16b, v2.16b          //AES final-1 block - result
+	stp	x6, x7, [x2], #16        //AES final-2 block  - store result
+	mov	d22, v4.d[1]                    //GHASH final-2 block - mid
+	pmull	v21.1q, v4.1d, v14.1d           //GHASH final-2 block - low
+	pmull2	v20.1q, v4.2d, v14.2d           //GHASH final-2 block - high
+	mov	x6, v0.d[0]                     //AES final-1 block - mov low
+	mov	x7, v0.d[1]                     //AES final-1 block - mov high
+	eor	v22.8b, v22.8b, v4.8b           //GHASH final-2 block - mid
+	movi	v8.8b, #0                       //suppress further partial tag
+	pmull	v22.1q, v22.1d, v17.1d          //GHASH final-2 block - mid
+	eor	x6, x6, x13                   //AES final-1 block - round 10 low
+	eor	v11.16b, v11.16b, v21.16b       //GHASH final-2 block - low
+	eor	v9.16b, v9.16b, v20.16b         //GHASH final-2 block - high
+	eor	v10.16b, v10.16b, v22.16b       //GHASH final-2 block - mid
+	eor	x7, x7, x14                   //final-1 block - round 10 high
+.L128_dec_blocks_more_than_1:	//blocks	left >  1
+	rev64	v4.16b, v5.16b                  //GHASH final-1 block
+	ld1	{ v5.16b}, [x0], #16            //final block - load ciphertext
+	eor	v4.16b, v4.16b, v8.16b          //feed in partial tag
+	mov	d22, v4.d[1]                    //GHASH final-1 block - mid
+	eor	v0.16b, v5.16b, v3.16b          //AES final block - result
+	eor	v22.8b, v22.8b, v4.8b           //GHASH final-1 block - mid
+	stp	x6, x7, [x2], #16        //AES final-1 block  - store result
+	mov	x6, v0.d[0]                     //AES final block - mov low
+	mov	x7, v0.d[1]                     //AES final block - mov high
+	ins	v22.d[1], v22.d[0]              //GHASH final-1 block - mid
+	pmull	v21.1q, v4.1d, v13.1d           //GHASH final-1 block - low
+	pmull2	v20.1q, v4.2d, v13.2d           //GHASH final-1 block - high
+	pmull2	v22.1q, v22.2d, v16.2d          //GHASH final-1 block - mid
+	movi	v8.8b, #0                       //suppress further partial tag
+	eor	v11.16b, v11.16b, v21.16b       //GHASH final-1 block - low
+	eor	v9.16b, v9.16b, v20.16b         //GHASH final-1 block - high
+	eor	x7, x7, x14                   //AES final block - round 10 high
+	eor	x6, x6, x13                   //AES final block - round 10 low
+	eor	v10.16b, v10.16b, v22.16b       //GHASH final-1 block - mid
+.L128_dec_blocks_less_than_1:	//blocks        left <= 1
+	mvn	x14, xzr                        //rk10_h = 0xffffffffffffffff
+	and	x1, x1, #127                    //bit_length %= 128
+	mvn	x13, xzr                        //rk10_l = 0xffffffffffffffff
+	sub	x1, x1, #128                    //bit_length -= 128
+	neg	x1, x1          //bit_length = 128 - #bits in input
+	and	x1, x1, #127    //bit_length %= 128
+	lsr	x14, x14, x1    //rk10_h is mask for top 64b of last block
+	cmp	x1, #64
+	csel	x10, x14, xzr, lt
+	csel	x9, x13, x14, lt
+	fmov	d0, x9                          //ctr0b is mask for last block
+	mov	v0.d[1], x10
+	and	v5.16b, v5.16b, v0.16b
+	rev64	v4.16b, v5.16b                  //GHASH final block
+	eor	v4.16b, v4.16b, v8.16b          //feed in partial tag
+	ldp	x4, x5, [x2] //load existing bytes we need to not overwrite
+	and	x7, x7, x10
+	mov	d8, v4.d[1]                     //GHASH final block - mid
+	bic	x4, x4, x9                      //mask out low existing bytes
+	and	x6, x6, x9
+	rev	w9, w12
+	bic	x5, x5, x10                     //mask out high existing bytes
+	orr	x6, x6, x4
+	str	w9, [x16, #12]                  //store the updated counter
+	orr	x7, x7, x5
+	stp	x6, x7, [x2]
+	karasuba_multiply v4, v12, v20, v8, v21
+	load_const
+	gcm_tidy_up v9, v10, v11, v30, v31
+	mov	x0, x15
+	st1	{ v11.16b }, [x3]
+	pop_stack
+	ret
+.L128_dec_ret:
+	mov	w0, #0x0
+	ret
+SYM_FUNC_END(pmull_gcm_decrypt_unroll)
+.align	2
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
index 720cd3a58da3..7e59736ed122 100644
--- a/arch/arm64/crypto/ghash-ce-glue.c
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -29,6 +29,7 @@ MODULE_ALIAS_CRYPTO("ghash");
 #define GHASH_BLOCK_SIZE	16
 #define GHASH_DIGEST_SIZE	16
 #define GCM_IV_SIZE		12
+#define UNROLL_DATA_SIZE	1024
 
 struct ghash_key {
 	be128			k;
@@ -59,6 +60,17 @@ asmlinkage int pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[],
 				 u64 const h[][2], u64 dg[], u8 ctr[],
 				 u32 const rk[], int rounds, const u8 l[],
 				 const u8 tag[], u64 authsize);
+asmlinkage size_t pmull_gcm_encrypt_unroll(const unsigned char *in,
+				 size_t len,
+				 unsigned char *out,
+				 u64 Xi[][2],
+				 unsigned char ivec[16],
+				 const void *key, int rounds);
+asmlinkage size_t pmull_gcm_decrypt_unroll(const uint8_t *ciphertext,
+				 uint64_t plaintext_length,
+				 uint8_t *plaintext, uint64_t Xi[][2],
+				 unsigned char ivec[16], const void *key,
+				 int rounds);
 
 static int ghash_init(struct shash_desc *desc)
 {
@@ -98,11 +110,15 @@ void ghash_do_simd_update(int blocks, u64 dg[], const char *src,
 			  void (*simd_update)(int blocks, u64 dg[],
 					      const char *src,
 					      u64 const h[][2],
-					      const char *head))
+					      const char *head),
+			  int unroll4_flag)
 {
 	if (likely(crypto_simd_usable())) {
 		kernel_neon_begin();
-		simd_update(blocks, dg, src, key->h, head);
+		if (unroll4_flag)
+			simd_update(blocks, dg, src, &key->h[6], head);
+		else
+			simd_update(blocks, dg, src, key->h, head);
 		kernel_neon_end();
 	} else {
 		ghash_do_update(blocks, dg, src, key, head);
@@ -140,7 +156,7 @@ static int ghash_update(struct shash_desc *desc, const u8 *src,
 
 			ghash_do_simd_update(chunk, ctx->digest, src, key,
 					     partial ? ctx->buf : NULL,
-					     pmull_ghash_update_p8);
+					     pmull_ghash_update_p8, 0);
 
 			blocks -= chunk;
 			src += chunk * GHASH_BLOCK_SIZE;
@@ -163,7 +179,7 @@ static int ghash_final(struct shash_desc *desc, u8 *dst)
 		memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);
 
 		ghash_do_simd_update(1, ctx->digest, ctx->buf, key, NULL,
-				     pmull_ghash_update_p8);
+				     pmull_ghash_update_p8, 0);
 	}
 	put_unaligned_be64(ctx->digest[1], dst);
 	put_unaligned_be64(ctx->digest[0], dst + 8);
@@ -255,6 +271,16 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey,
 	gf128mul_lle(&h, &ctx->ghash_key.k);
 	ghash_reflect(ctx->ghash_key.h[3], &h);
 
+	ghash_reflect(ctx->ghash_key.h[6], &ctx->ghash_key.k);
+	h = ctx->ghash_key.k;
+	gf128mul_lle(&h, &ctx->ghash_key.k);
+	ghash_reflect(ctx->ghash_key.h[8], &h);
+
+	gf128mul_lle(&h, &ctx->ghash_key.k);
+	ghash_reflect(ctx->ghash_key.h[9], &h);
+
+	gf128mul_lle(&h, &ctx->ghash_key.k);
+	ghash_reflect(ctx->ghash_key.h[11], &h);
 	return 0;
 }
 
@@ -272,7 +298,7 @@ static int gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
 }
 
 static void gcm_update_mac(u64 dg[], const u8 *src, int count, u8 buf[],
-			   int *buf_count, struct gcm_aes_ctx *ctx)
+			   int *buf_count, struct gcm_aes_ctx *ctx, int unroll4_flag)
 {
 	if (*buf_count > 0) {
 		int buf_added = min(count, GHASH_BLOCK_SIZE - *buf_count);
@@ -289,7 +315,7 @@ static void gcm_update_mac(u64 dg[], const u8 *src, int count, u8 buf[],
 
 		ghash_do_simd_update(blocks, dg, src, &ctx->ghash_key,
 				     *buf_count ? buf : NULL,
-				     pmull_ghash_update_p64);
+				     pmull_ghash_update_p64, unroll4_flag);
 
 		src += blocks * GHASH_BLOCK_SIZE;
 		count %= GHASH_BLOCK_SIZE;
@@ -302,7 +328,7 @@ static void gcm_update_mac(u64 dg[], const u8 *src, int count, u8 buf[],
 	}
 }
 
-static void gcm_calculate_auth_mac(struct aead_request *req, u64 dg[])
+static void gcm_calculate_auth_mac(struct aead_request *req, u64 dg[], int unroll4_flag)
 {
 	struct crypto_aead *aead = crypto_aead_reqtfm(req);
 	struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead);
@@ -323,7 +349,7 @@ static void gcm_calculate_auth_mac(struct aead_request *req, u64 dg[])
 		}
 		p = scatterwalk_map(&walk);
 
-		gcm_update_mac(dg, p, n, buf, &buf_count, ctx);
+		gcm_update_mac(dg, p, n, buf, &buf_count, ctx, unroll4_flag);
 		len -= n;
 
 		scatterwalk_unmap(p);
@@ -334,7 +360,7 @@ static void gcm_calculate_auth_mac(struct aead_request *req, u64 dg[])
 	if (buf_count) {
 		memset(&buf[buf_count], 0, GHASH_BLOCK_SIZE - buf_count);
 		ghash_do_simd_update(1, dg, buf, &ctx->ghash_key, NULL,
-				     pmull_ghash_update_p64);
+				     pmull_ghash_update_p64, unroll4_flag);
 	}
 }
 
@@ -350,14 +376,21 @@ static int gcm_encrypt(struct aead_request *req)
 	be128 lengths;
 	u8 *tag;
 	int err;
+	int unroll4_flag = 0;
 
 	lengths.a = cpu_to_be64(req->assoclen * 8);
 	lengths.b = cpu_to_be64(req->cryptlen * 8);
 
+	if (req->cryptlen >= UNROLL_DATA_SIZE)
+		unroll4_flag = 1;
 	if (req->assoclen)
-		gcm_calculate_auth_mac(req, dg);
+		gcm_calculate_auth_mac(req, dg, unroll4_flag);
 
 	memcpy(iv, req->iv, GCM_IV_SIZE);
+	if (unroll4_flag) {
+		ctx->ghash_key.h[4][1] = cpu_to_be64(((u64 *)dg)[0]);
+		ctx->ghash_key.h[4][0] = cpu_to_be64(((u64 *)dg)[1]);
+	}
 	put_unaligned_be32(2, iv + GCM_IV_SIZE);
 
 	err = skcipher_walk_aead_encrypt(&walk, req, false);
@@ -378,11 +411,38 @@ static int gcm_encrypt(struct aead_request *req)
 				tag = NULL;
 			}
 
-			kernel_neon_begin();
-			pmull_gcm_encrypt(nbytes, dst, src, ctx->ghash_key.h,
+			if (unroll4_flag) {
+				kernel_neon_begin();
+				pmull_gcm_encrypt_unroll(src, nbytes*8, dst, &ctx->ghash_key.h[4],
+						iv, ctx->aes_key.key_enc, nrounds);
+				kernel_neon_end();
+				if (tag) {
+					kernel_neon_begin();
+					pmull_ghash_update_p64(1, ctx->ghash_key.h[4],
+						tag, &ctx->ghash_key.h[6], NULL);
+					kernel_neon_end();
+
+					memcpy((u8 *)dg, ctx->ghash_key.h[4], GHASH_BLOCK_SIZE);
+					put_unaligned_be64(dg[1], tag);
+					put_unaligned_be64(dg[0], tag + 8);
+					put_unaligned_be32(1, iv + GCM_IV_SIZE);
+					aes_encrypt(&ctx->aes_key, iv, iv);
+					crypto_xor(tag, iv, AES_BLOCK_SIZE);
+				} else {
+
+					memcpy((u8 *)dg, ctx->ghash_key.h[4], GHASH_BLOCK_SIZE);
+					put_unaligned_be64(dg[1],
+						(unsigned char *)ctx->ghash_key.h[4]);
+					put_unaligned_be64(dg[0],
+						((unsigned char *)ctx->ghash_key.h[4] + 8));
+				}
+			} else {
+				kernel_neon_begin();
+				pmull_gcm_encrypt(nbytes, dst, src, ctx->ghash_key.h,
 					  dg, iv, ctx->aes_key.key_enc, nrounds,
 					  tag);
-			kernel_neon_end();
+				kernel_neon_end();
+			}
 
 			if (unlikely(!nbytes))
 				break;
@@ -465,14 +525,22 @@ static int gcm_decrypt(struct aead_request *req)
 	be128 lengths;
 	u8 *tag;
 	int err;
+	int unroll4_flag = 0;
 
 	lengths.a = cpu_to_be64(req->assoclen * 8);
 	lengths.b = cpu_to_be64((req->cryptlen - authsize) * 8);
 
+	if (req->cryptlen >= UNROLL_DATA_SIZE)
+		unroll4_flag = 1;
+
 	if (req->assoclen)
-		gcm_calculate_auth_mac(req, dg);
+		gcm_calculate_auth_mac(req, dg, unroll4_flag);
 
 	memcpy(iv, req->iv, GCM_IV_SIZE);
+	if (unroll4_flag) {
+		ctx->ghash_key.h[4][1] = cpu_to_be64(((u64 *)dg)[0]);
+		ctx->ghash_key.h[4][0] = cpu_to_be64(((u64 *)dg)[1]);
+	}
 	put_unaligned_be32(2, iv + GCM_IV_SIZE);
 
 	scatterwalk_map_and_copy(otag, req->src,
@@ -499,12 +567,44 @@ static int gcm_decrypt(struct aead_request *req)
 				tag = NULL;
 			}
 
-			kernel_neon_begin();
-			ret = pmull_gcm_decrypt(nbytes, dst, src,
+			if (unroll4_flag) {
+				kernel_neon_begin();
+				pmull_gcm_decrypt_unroll(src, nbytes*8, dst, &ctx->ghash_key.h[4],
+						iv, ctx->aes_key.key_enc, nrounds);
+				kernel_neon_end();
+
+				if (tag) {
+					kernel_neon_begin();
+					pmull_ghash_update_p64(1, ctx->ghash_key.h[4], tag,
+							(u64 (*)[2])ctx->ghash_key.h[6], NULL);
+					kernel_neon_end();
+
+					memcpy((u8 *)dg, ctx->ghash_key.h[4], GHASH_BLOCK_SIZE);
+					put_unaligned_be64(dg[1], tag);
+					put_unaligned_be64(dg[0], tag + 8);
+					put_unaligned_be32(1, iv + GCM_IV_SIZE);
+					aes_encrypt(&ctx->aes_key, iv, iv);
+					crypto_xor(tag, iv, AES_BLOCK_SIZE);
+					ret = crypto_memneq(tag, otag, authsize);
+					if (unlikely(ret)) {
+						memzero_explicit(tag, AES_BLOCK_SIZE);
+						break;
+					}
+				} else {
+					memcpy((u8 *)dg, ctx->ghash_key.h[4], GHASH_BLOCK_SIZE);
+					put_unaligned_be64(dg[1],
+						(unsigned char *)ctx->ghash_key.h[4]);
+					put_unaligned_be64(dg[0],
+						((unsigned char *)ctx->ghash_key.h[4] + 8));
+				}
+			} else {
+				kernel_neon_begin();
+				ret = pmull_gcm_decrypt(nbytes, dst, src,
 						ctx->ghash_key.h,
 						dg, iv, ctx->aes_key.key_enc,
 						nrounds, tag, otag, authsize);
-			kernel_neon_end();
+				kernel_neon_end();
+			}
 
 			if (unlikely(!nbytes))
 				break;
@@ -592,7 +692,7 @@ static struct aead_alg gcm_aes_alg = {
 	.base.cra_priority	= 300,
 	.base.cra_blocksize	= 1,
 	.base.cra_ctxsize	= sizeof(struct gcm_aes_ctx) +
-				  4 * sizeof(u64[2]),
+				  12 * sizeof(u64[2]),
 	.base.cra_module	= THIS_MODULE,
 };
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH] crypto: arm64/gcm-ce - unroll factors to 4-way interleave of aes and ghash
  2021-09-23  6:30 [PATCH] crypto: arm64/gcm-ce - unroll factors to 4-way interleave of aes and ghash XiaokangQian
@ 2021-09-28  6:27 ` Eric Biggers
  2021-09-28 21:04   ` Ard Biesheuvel
  2021-12-15  3:04 ` [PATCH v2] " XiaokangQian
  1 sibling, 1 reply; 12+ messages in thread
From: Eric Biggers @ 2021-09-28  6:27 UTC (permalink / raw)
  To: XiaokangQian
  Cc: Herbert Xu, David S. Miller, Catalin Marinas, Will Deacon, nd,
	ardb, linux-crypto, linux-arm-kernel, linux-kernel

On Thu, Sep 23, 2021 at 06:30:25AM +0000, XiaokangQian wrote:
> To improve performance on cores with deep piplines such as A72,N1,
> implement gcm(aes) using a 4-way interleave of aes and ghash (totally
> 8 blocks in parallel), which can make full utilize of pipelines rather
> than the 4-way interleave we used currently. It can gain about 20% for
> big data sizes such that 8k.
> 
> This is a complete new version of the GCM part of the combined GCM/GHASH
> driver, it will co-exist with the old driver, only serve for big data
> sizes. Instead of interleaving four invocations of AES where each chunk
> of 64 bytes is encrypted first and then ghashed, the new version uses a
> more coarse grained approach where a chunk of 64 bytes is encrypted and
> at the same time, one chunk of 64 bytes is ghashed (or ghashed and
> decrypted in the converse case).
> 
> The table below compares the performance of the old driver and the new
> one on various micro-architectures and running in various modes with
> various data sizes.
> 
>             |     AES-128       |     AES-192       |     AES-256       |
>      #bytes | 1024 | 1420 |  8k | 1024 | 1420 |  8k | 1024 | 1420 |  8k |
>      -------+------+------+-----+------+------+-----+------+------+-----+
>         A72 | 5.5% |  12% | 25% | 2.2% |  9.5%|  23%| -1%  |  6.7%| 19% |
>         A57 |-0.5% |  9.3%| 32% | -3%  |  6.3%|  26%| -6%  |  3.3%| 21% |
>         N1  | 0.4% |  7.6%|24.5%| -2%  |  5%  |  22%| -4%  |  2.7%| 20% |
> 
> Signed-off-by: XiaokangQian <xiaokang.qian@arm.com>

Does this pass the self-tests, including the fuzz tests which are enabled by
CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y?

- Eric

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] crypto: arm64/gcm-ce - unroll factors to 4-way interleave of aes and ghash
  2021-09-28  6:27 ` Eric Biggers
@ 2021-09-28 21:04   ` Ard Biesheuvel
  2021-09-30  1:32     ` Xiaokang Qian
  2021-12-13 18:29     ` Will Deacon
  0 siblings, 2 replies; 12+ messages in thread
From: Ard Biesheuvel @ 2021-09-28 21:04 UTC (permalink / raw)
  To: Eric Biggers
  Cc: XiaokangQian, Herbert Xu, David S. Miller, Catalin Marinas,
	Will Deacon, nd, Linux Crypto Mailing List, Linux ARM,
	Linux Kernel Mailing List

On Tue, 28 Sept 2021 at 08:27, Eric Biggers <ebiggers@kernel.org> wrote:
>
> On Thu, Sep 23, 2021 at 06:30:25AM +0000, XiaokangQian wrote:
> > To improve performance on cores with deep piplines such as A72,N1,
> > implement gcm(aes) using a 4-way interleave of aes and ghash (totally
> > 8 blocks in parallel), which can make full utilize of pipelines rather
> > than the 4-way interleave we used currently. It can gain about 20% for
> > big data sizes such that 8k.
> >
> > This is a complete new version of the GCM part of the combined GCM/GHASH
> > driver, it will co-exist with the old driver, only serve for big data
> > sizes. Instead of interleaving four invocations of AES where each chunk
> > of 64 bytes is encrypted first and then ghashed, the new version uses a
> > more coarse grained approach where a chunk of 64 bytes is encrypted and
> > at the same time, one chunk of 64 bytes is ghashed (or ghashed and
> > decrypted in the converse case).
> >
> > The table below compares the performance of the old driver and the new
> > one on various micro-architectures and running in various modes with
> > various data sizes.
> >
> >             |     AES-128       |     AES-192       |     AES-256       |
> >      #bytes | 1024 | 1420 |  8k | 1024 | 1420 |  8k | 1024 | 1420 |  8k |
> >      -------+------+------+-----+------+------+-----+------+------+-----+
> >         A72 | 5.5% |  12% | 25% | 2.2% |  9.5%|  23%| -1%  |  6.7%| 19% |
> >         A57 |-0.5% |  9.3%| 32% | -3%  |  6.3%|  26%| -6%  |  3.3%| 21% |
> >         N1  | 0.4% |  7.6%|24.5%| -2%  |  5%  |  22%| -4%  |  2.7%| 20% |
> >
> > Signed-off-by: XiaokangQian <xiaokang.qian@arm.com>
>
> Does this pass the self-tests, including the fuzz tests which are enabled by
> CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y?
>

Please test both little-endian and big-endian. (Note that you don't
need a big-endian user space for this - the self tests are executed
before the rootfs is mounted)

Also, you will have to rebase this onto the latest cryptodev tree,
which carries some changes I made recently to this driver.

Finally, I'd like to discuss whether we really need two separate
drivers here. The 1k data point is not as relevant as the other ones,
which show a worthwhile speedup for all micro architectures and data
sizes (although I will give this a spin on TX2 myself when I have the
chance)

*If* we switch to this implementation completely, I would like to keep
the improvement I added recently to the decrypt path to compare the
tag using SIMD code, rather than copying it out and using memcmp().
Could you look into adopting this for this version as well?

-- 
Ard.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [PATCH] crypto: arm64/gcm-ce - unroll factors to 4-way interleave of aes and ghash
  2021-09-28 21:04   ` Ard Biesheuvel
@ 2021-09-30  1:32     ` Xiaokang Qian
  2021-09-30 14:57       ` Ard Biesheuvel
  2021-12-13 18:29     ` Will Deacon
  1 sibling, 1 reply; 12+ messages in thread
From: Xiaokang Qian @ 2021-09-30  1:32 UTC (permalink / raw)
  To: Ard Biesheuvel, Eric Biggers
  Cc: Herbert Xu, David S. Miller, Catalin Marinas, Will Deacon, nd,
	Linux Crypto Mailing List, Linux ARM, Linux Kernel Mailing List

Thanks for the review.

I will firstly change the decrypt path to compare the tag using SIMD code, and then  pass all of the self tests include fuzz tests(enabled by CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y), big endian ,little endian tests.

About the 1K data point, I just remember that the 1420 bytes packet is commonly used in IPSEC.


-----Original Message-----
From: Ard Biesheuvel <ardb@kernel.org> 
Sent: Wednesday, September 29, 2021 5:04 AM
To: Eric Biggers <ebiggers@kernel.org>
Cc: Xiaokang Qian <Xiaokang.Qian@arm.com>; Herbert Xu <herbert@gondor.apana.org.au>; David S. Miller <davem@davemloft.net>; Catalin Marinas <Catalin.Marinas@arm.com>; Will Deacon <will@kernel.org>; nd <nd@arm.com>; Linux Crypto Mailing List <linux-crypto@vger.kernel.org>; Linux ARM <linux-arm-kernel@lists.infradead.org>; Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
Subject: Re: [PATCH] crypto: arm64/gcm-ce - unroll factors to 4-way interleave of aes and ghash

On Tue, 28 Sept 2021 at 08:27, Eric Biggers <ebiggers@kernel.org> wrote:
>
> On Thu, Sep 23, 2021 at 06:30:25AM +0000, XiaokangQian wrote:
> > To improve performance on cores with deep piplines such as A72,N1, 
> > implement gcm(aes) using a 4-way interleave of aes and ghash 
> > (totally
> > 8 blocks in parallel), which can make full utilize of pipelines 
> > rather than the 4-way interleave we used currently. It can gain 
> > about 20% for big data sizes such that 8k.
> >
> > This is a complete new version of the GCM part of the combined 
> > GCM/GHASH driver, it will co-exist with the old driver, only serve 
> > for big data sizes. Instead of interleaving four invocations of AES 
> > where each chunk of 64 bytes is encrypted first and then ghashed, 
> > the new version uses a more coarse grained approach where a chunk of 
> > 64 bytes is encrypted and at the same time, one chunk of 64 bytes is 
> > ghashed (or ghashed and decrypted in the converse case).
> >
> > The table below compares the performance of the old driver and the 
> > new one on various micro-architectures and running in various modes 
> > with various data sizes.
> >
> >             |     AES-128       |     AES-192       |     AES-256       |
> >      #bytes | 1024 | 1420 |  8k | 1024 | 1420 |  8k | 1024 | 1420 |  8k |
> >      -------+------+------+-----+------+------+-----+------+------+-----+
> >         A72 | 5.5% |  12% | 25% | 2.2% |  9.5%|  23%| -1%  |  6.7%| 19% |
> >         A57 |-0.5% |  9.3%| 32% | -3%  |  6.3%|  26%| -6%  |  3.3%| 21% |
> >         N1  | 0.4% |  7.6%|24.5%| -2%  |  5%  |  22%| -4%  |  2.7%| 
> > 20% |
> >
> > Signed-off-by: XiaokangQian <xiaokang.qian@arm.com>
>
> Does this pass the self-tests, including the fuzz tests which are 
> enabled by CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y?
>

Please test both little-endian and big-endian. (Note that you don't need a big-endian user space for this - the self tests are executed before the rootfs is mounted)

Also, you will have to rebase this onto the latest cryptodev tree, which carries some changes I made recently to this driver.

Finally, I'd like to discuss whether we really need two separate drivers here. The 1k data point is not as relevant as the other ones, which show a worthwhile speedup for all micro architectures and data sizes (although I will give this a spin on TX2 myself when I have the
chance)

*If* we switch to this implementation completely, I would like to keep the improvement I added recently to the decrypt path to compare the tag using SIMD code, rather than copying it out and using memcmp().
Could you look into adopting this for this version as well?

--
Ard.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] crypto: arm64/gcm-ce - unroll factors to 4-way interleave of aes and ghash
  2021-09-30  1:32     ` Xiaokang Qian
@ 2021-09-30 14:57       ` Ard Biesheuvel
  2021-10-15  8:58         ` Xiaokang Qian
  0 siblings, 1 reply; 12+ messages in thread
From: Ard Biesheuvel @ 2021-09-30 14:57 UTC (permalink / raw)
  To: Xiaokang Qian
  Cc: Eric Biggers, Herbert Xu, David S. Miller, Catalin Marinas,
	Will Deacon, nd, Linux Crypto Mailing List, Linux ARM,
	Linux Kernel Mailing List

On Thu, 30 Sept 2021 at 03:32, Xiaokang Qian <Xiaokang.Qian@arm.com> wrote:
>
> Thanks for the review.
>
> I will firstly change the decrypt path to compare the tag using SIMD code, and then  pass all of the self tests include fuzz tests(enabled by CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y), big endian ,little endian tests.
>

OK

> About the 1K data point, I just remember that the 1420 bytes packet is commonly used in IPSEC.
>

Yes, but your code is faster than the existing code for 1420 byte
packets, right? So why should we keep the original code? We don't use
GCM for block storage, and if IPsec throughput is a key performance
metric for your system, you are likely to be using the maximum packet
size so 1420 bytes not 1k.


>
> -----Original Message-----
> From: Ard Biesheuvel <ardb@kernel.org>
> Sent: Wednesday, September 29, 2021 5:04 AM
> To: Eric Biggers <ebiggers@kernel.org>
> Cc: Xiaokang Qian <Xiaokang.Qian@arm.com>; Herbert Xu <herbert@gondor.apana.org.au>; David S. Miller <davem@davemloft.net>; Catalin Marinas <Catalin.Marinas@arm.com>; Will Deacon <will@kernel.org>; nd <nd@arm.com>; Linux Crypto Mailing List <linux-crypto@vger.kernel.org>; Linux ARM <linux-arm-kernel@lists.infradead.org>; Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
> Subject: Re: [PATCH] crypto: arm64/gcm-ce - unroll factors to 4-way interleave of aes and ghash
>
> On Tue, 28 Sept 2021 at 08:27, Eric Biggers <ebiggers@kernel.org> wrote:
> >
> > On Thu, Sep 23, 2021 at 06:30:25AM +0000, XiaokangQian wrote:
> > > To improve performance on cores with deep piplines such as A72,N1,
> > > implement gcm(aes) using a 4-way interleave of aes and ghash
> > > (totally
> > > 8 blocks in parallel), which can make full utilize of pipelines
> > > rather than the 4-way interleave we used currently. It can gain
> > > about 20% for big data sizes such that 8k.
> > >
> > > This is a complete new version of the GCM part of the combined
> > > GCM/GHASH driver, it will co-exist with the old driver, only serve
> > > for big data sizes. Instead of interleaving four invocations of AES
> > > where each chunk of 64 bytes is encrypted first and then ghashed,
> > > the new version uses a more coarse grained approach where a chunk of
> > > 64 bytes is encrypted and at the same time, one chunk of 64 bytes is
> > > ghashed (or ghashed and decrypted in the converse case).
> > >
> > > The table below compares the performance of the old driver and the
> > > new one on various micro-architectures and running in various modes
> > > with various data sizes.
> > >
> > >             |     AES-128       |     AES-192       |     AES-256       |
> > >      #bytes | 1024 | 1420 |  8k | 1024 | 1420 |  8k | 1024 | 1420 |  8k |
> > >      -------+------+------+-----+------+------+-----+------+------+-----+
> > >         A72 | 5.5% |  12% | 25% | 2.2% |  9.5%|  23%| -1%  |  6.7%| 19% |
> > >         A57 |-0.5% |  9.3%| 32% | -3%  |  6.3%|  26%| -6%  |  3.3%| 21% |
> > >         N1  | 0.4% |  7.6%|24.5%| -2%  |  5%  |  22%| -4%  |  2.7%|
> > > 20% |
> > >
> > > Signed-off-by: XiaokangQian <xiaokang.qian@arm.com>
> >
> > Does this pass the self-tests, including the fuzz tests which are
> > enabled by CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y?
> >
>
> Please test both little-endian and big-endian. (Note that you don't need a big-endian user space for this - the self tests are executed before the rootfs is mounted)
>
> Also, you will have to rebase this onto the latest cryptodev tree, which carries some changes I made recently to this driver.
>
> Finally, I'd like to discuss whether we really need two separate drivers here. The 1k data point is not as relevant as the other ones, which show a worthwhile speedup for all micro architectures and data sizes (although I will give this a spin on TX2 myself when I have the
> chance)
>
> *If* we switch to this implementation completely, I would like to keep the improvement I added recently to the decrypt path to compare the tag using SIMD code, rather than copying it out and using memcmp().
> Could you look into adopting this for this version as well?
>
> --
> Ard.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [PATCH] crypto: arm64/gcm-ce - unroll factors to 4-way interleave of aes and ghash
  2021-09-30 14:57       ` Ard Biesheuvel
@ 2021-10-15  8:58         ` Xiaokang Qian
  0 siblings, 0 replies; 12+ messages in thread
From: Xiaokang Qian @ 2021-10-15  8:58 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Eric Biggers, Herbert Xu, David S. Miller, Catalin Marinas,
	Will Deacon, nd, Linux Crypto Mailing List, Linux ARM,
	Linux Kernel Mailing List



On Thu, September 30, 2021 10:57 PM, Ard Biesheuvel <ardb@kernel.org>
wrote:
> 
> On Thu, 30 Sept 2021 at 03:32, Xiaokang Qian <Xiaokang.Qian@arm.com>
> wrote:
> >
> > Thanks for the review.
> >
> > I will firstly change the decrypt path to compare the tag using SIMD code,
> and then  pass all of the self tests include fuzz tests(enabled by
> CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y), big endian ,little endian
> tests.
> >
> 
> OK
> 
> > About the 1K data point, I just remember that the 1420 bytes packet is
> commonly used in IPSEC.
> >
> 
> Yes, but your code is faster than the existing code for 1420 byte packets, right?
> So why should we keep the original code? We don't use GCM for block
> storage, and if IPsec throughput is a key performance metric for your system,
> you are likely to be using the maximum packet size so 1420 bytes not 1k.
> 
> 

Yes, the code is faster than the existing code for 1420 bytes packets, and the bigger the data size, the more the performance is uplifted.
But there is one issue,  our code will interleave 4 blocks for crypto-AES instructions and another 4 blocks for ghash(pmull) in parallel, so 
it's more friendly to the bigger data size but not friendly to the smaller ones.
For the data size that is smaller than 1k data size, the performance  will have some regression. 
So we keep the two driver exist together.

> >
> > -----Original Message-----
> > From: Ard Biesheuvel <ardb@kernel.org>
> > Sent: Wednesday, September 29, 2021 5:04 AM
> > To: Eric Biggers <ebiggers@kernel.org>
> > Cc: Xiaokang Qian <Xiaokang.Qian@arm.com>; Herbert Xu
> > <herbert@gondor.apana.org.au>; David S. Miller <davem@davemloft.net>;
> > Catalin Marinas <Catalin.Marinas@arm.com>; Will Deacon
> > <will@kernel.org>; nd <nd@arm.com>; Linux Crypto Mailing List
> > <linux-crypto@vger.kernel.org>; Linux ARM
> > <linux-arm-kernel@lists.infradead.org>; Linux Kernel Mailing List
> > <linux-kernel@vger.kernel.org>
> > Subject: Re: [PATCH] crypto: arm64/gcm-ce - unroll factors to 4-way
> > interleave of aes and ghash
> >
> > On Tue, 28 Sept 2021 at 08:27, Eric Biggers <ebiggers@kernel.org> wrote:
> > >
> > > On Thu, Sep 23, 2021 at 06:30:25AM +0000, XiaokangQian wrote:
> > > > To improve performance on cores with deep piplines such as A72,N1,
> > > > implement gcm(aes) using a 4-way interleave of aes and ghash
> > > > (totally
> > > > 8 blocks in parallel), which can make full utilize of pipelines
> > > > rather than the 4-way interleave we used currently. It can gain
> > > > about 20% for big data sizes such that 8k.
> > > >
> > > > This is a complete new version of the GCM part of the combined
> > > > GCM/GHASH driver, it will co-exist with the old driver, only serve
> > > > for big data sizes. Instead of interleaving four invocations of
> > > > AES where each chunk of 64 bytes is encrypted first and then
> > > > ghashed, the new version uses a more coarse grained approach where
> > > > a chunk of
> > > > 64 bytes is encrypted and at the same time, one chunk of 64 bytes
> > > > is ghashed (or ghashed and decrypted in the converse case).
> > > >
> > > > The table below compares the performance of the old driver and the
> > > > new one on various micro-architectures and running in various
> > > > modes with various data sizes.
> > > >
> > > >             |     AES-128       |     AES-192       |     AES-256       |
> > > >      #bytes | 1024 | 1420 |  8k | 1024 | 1420 |  8k | 1024 | 1420 |  8k |
> > > >      -------+------+------+-----+------+------+-----+------+------+-----+
> > > >         A72 | 5.5% |  12% | 25% | 2.2% |  9.5%|  23%| -1%  |  6.7%| 19% |
> > > >         A57 |-0.5% |  9.3%| 32% | -3%  |  6.3%|  26%| -6%  |  3.3%| 21% |
> > > >         N1  | 0.4% |  7.6%|24.5%| -2%  |  5%  |  22%| -4%  |
> > > > 2.7%| 20% |
> > > >
> > > > Signed-off-by: XiaokangQian <xiaokang.qian@arm.com>
> > >
> > > Does this pass the self-tests, including the fuzz tests which are
> > > enabled by CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y?
> > >
> >
> > Please test both little-endian and big-endian. (Note that you don't
> > need a big-endian user space for this - the self tests are executed
> > before the rootfs is mounted)
> >
> > Also, you will have to rebase this onto the latest cryptodev tree, which
> carries some changes I made recently to this driver.
> >
> > Finally, I'd like to discuss whether we really need two separate
> > drivers here. The 1k data point is not as relevant as the other ones,
> > which show a worthwhile speedup for all micro architectures and data
> > sizes (although I will give this a spin on TX2 myself when I have the
> > chance)
> >
> > *If* we switch to this implementation completely, I would like to keep the
> improvement I added recently to the decrypt path to compare the tag using
> SIMD code, rather than copying it out and using memcmp().
> > Could you look into adopting this for this version as well?
> >
> > --
> > Ard.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] crypto: arm64/gcm-ce - unroll factors to 4-way interleave of aes and ghash
  2021-09-28 21:04   ` Ard Biesheuvel
  2021-09-30  1:32     ` Xiaokang Qian
@ 2021-12-13 18:29     ` Will Deacon
  2021-12-14  1:39       ` Xiaokang Qian
  1 sibling, 1 reply; 12+ messages in thread
From: Will Deacon @ 2021-12-13 18:29 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Eric Biggers, XiaokangQian, Herbert Xu, David S. Miller,
	Catalin Marinas, nd, Linux Crypto Mailing List, Linux ARM,
	Linux Kernel Mailing List

On Tue, Sep 28, 2021 at 11:04:03PM +0200, Ard Biesheuvel wrote:
> On Tue, 28 Sept 2021 at 08:27, Eric Biggers <ebiggers@kernel.org> wrote:
> >
> > On Thu, Sep 23, 2021 at 06:30:25AM +0000, XiaokangQian wrote:
> > > To improve performance on cores with deep piplines such as A72,N1,
> > > implement gcm(aes) using a 4-way interleave of aes and ghash (totally
> > > 8 blocks in parallel), which can make full utilize of pipelines rather
> > > than the 4-way interleave we used currently. It can gain about 20% for
> > > big data sizes such that 8k.
> > >
> > > This is a complete new version of the GCM part of the combined GCM/GHASH
> > > driver, it will co-exist with the old driver, only serve for big data
> > > sizes. Instead of interleaving four invocations of AES where each chunk
> > > of 64 bytes is encrypted first and then ghashed, the new version uses a
> > > more coarse grained approach where a chunk of 64 bytes is encrypted and
> > > at the same time, one chunk of 64 bytes is ghashed (or ghashed and
> > > decrypted in the converse case).
> > >
> > > The table below compares the performance of the old driver and the new
> > > one on various micro-architectures and running in various modes with
> > > various data sizes.
> > >
> > >             |     AES-128       |     AES-192       |     AES-256       |
> > >      #bytes | 1024 | 1420 |  8k | 1024 | 1420 |  8k | 1024 | 1420 |  8k |
> > >      -------+------+------+-----+------+------+-----+------+------+-----+
> > >         A72 | 5.5% |  12% | 25% | 2.2% |  9.5%|  23%| -1%  |  6.7%| 19% |
> > >         A57 |-0.5% |  9.3%| 32% | -3%  |  6.3%|  26%| -6%  |  3.3%| 21% |
> > >         N1  | 0.4% |  7.6%|24.5%| -2%  |  5%  |  22%| -4%  |  2.7%| 20% |
> > >
> > > Signed-off-by: XiaokangQian <xiaokang.qian@arm.com>
> >
> > Does this pass the self-tests, including the fuzz tests which are enabled by
> > CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y?
> >
> 
> Please test both little-endian and big-endian. (Note that you don't
> need a big-endian user space for this - the self tests are executed
> before the rootfs is mounted)
> 
> Also, you will have to rebase this onto the latest cryptodev tree,
> which carries some changes I made recently to this driver.

XiaokangQian -- did you post an updated version of this? It would end up
going via Herbert, but I was keeping half an eye on it and it all seems
to have gone quiet.

Thanks,

Will

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [PATCH] crypto: arm64/gcm-ce - unroll factors to 4-way interleave of aes and ghash
  2021-12-13 18:29     ` Will Deacon
@ 2021-12-14  1:39       ` Xiaokang Qian
  2021-12-14 15:59         ` Ard Biesheuvel
  0 siblings, 1 reply; 12+ messages in thread
From: Xiaokang Qian @ 2021-12-14  1:39 UTC (permalink / raw)
  To: Will Deacon, Ard Biesheuvel
  Cc: Eric Biggers, Herbert Xu, David S. Miller, Catalin Marinas, nd,
	Linux Crypto Mailing List, Linux ARM, Linux Kernel Mailing List

Hi Will:
I will post the update version 2 of this patch today or tomorrow.
Sorry for the delay.

> -----Original Message-----
> From: Will Deacon <will@kernel.org>
> Sent: Tuesday, December 14, 2021 2:29 AM
> To: Ard Biesheuvel <ardb@kernel.org>
> Cc: Eric Biggers <ebiggers@kernel.org>; Xiaokang Qian
> <Xiaokang.Qian@arm.com>; Herbert Xu <herbert@gondor.apana.org.au>;
> David S. Miller <davem@davemloft.net>; Catalin Marinas
> <Catalin.Marinas@arm.com>; nd <nd@arm.com>; Linux Crypto Mailing List
> <linux-crypto@vger.kernel.org>; Linux ARM <linux-arm-
> kernel@lists.infradead.org>; Linux Kernel Mailing List <linux-
> kernel@vger.kernel.org>
> Subject: Re: [PATCH] crypto: arm64/gcm-ce - unroll factors to 4-way
> interleave of aes and ghash
> 
> On Tue, Sep 28, 2021 at 11:04:03PM +0200, Ard Biesheuvel wrote:
> > On Tue, 28 Sept 2021 at 08:27, Eric Biggers <ebiggers@kernel.org> wrote:
> > >
> > > On Thu, Sep 23, 2021 at 06:30:25AM +0000, XiaokangQian wrote:
> > > > To improve performance on cores with deep piplines such as A72,N1,
> > > > implement gcm(aes) using a 4-way interleave of aes and ghash
> > > > (totally
> > > > 8 blocks in parallel), which can make full utilize of pipelines
> > > > rather than the 4-way interleave we used currently. It can gain
> > > > about 20% for big data sizes such that 8k.
> > > >
> > > > This is a complete new version of the GCM part of the combined
> > > > GCM/GHASH driver, it will co-exist with the old driver, only serve
> > > > for big data sizes. Instead of interleaving four invocations of
> > > > AES where each chunk of 64 bytes is encrypted first and then
> > > > ghashed, the new version uses a more coarse grained approach where
> > > > a chunk of 64 bytes is encrypted and at the same time, one chunk
> > > > of 64 bytes is ghashed (or ghashed and decrypted in the converse case).
> > > >
> > > > The table below compares the performance of the old driver and the
> > > > new one on various micro-architectures and running in various
> > > > modes with various data sizes.
> > > >
> > > >             |     AES-128       |     AES-192       |     AES-256       |
> > > >      #bytes | 1024 | 1420 |  8k | 1024 | 1420 |  8k | 1024 | 1420 |  8k |
> > > >      -------+------+------+-----+------+------+-----+------+------+-----+
> > > >         A72 | 5.5% |  12% | 25% | 2.2% |  9.5%|  23%| -1%  |  6.7%| 19% |
> > > >         A57 |-0.5% |  9.3%| 32% | -3%  |  6.3%|  26%| -6%  |  3.3%| 21% |
> > > >         N1  | 0.4% |  7.6%|24.5%| -2%  |  5%  |  22%| -4%  |
> > > > 2.7%| 20% |
> > > >
> > > > Signed-off-by: XiaokangQian <xiaokang.qian@arm.com>
> > >
> > > Does this pass the self-tests, including the fuzz tests which are
> > > enabled by CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y?
> > >
> >
> > Please test both little-endian and big-endian. (Note that you don't
> > need a big-endian user space for this - the self tests are executed
> > before the rootfs is mounted)
> >
> > Also, you will have to rebase this onto the latest cryptodev tree,
> > which carries some changes I made recently to this driver.
> 
> XiaokangQian -- did you post an updated version of this? It would end up
> going via Herbert, but I was keeping half an eye on it and it all seems to have
> gone quiet.
> 
> Thanks,
> 
> Will

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] crypto: arm64/gcm-ce - unroll factors to 4-way interleave of aes and ghash
  2021-12-14  1:39       ` Xiaokang Qian
@ 2021-12-14 15:59         ` Ard Biesheuvel
  2021-12-15  5:48           ` Xiaokang Qian
  0 siblings, 1 reply; 12+ messages in thread
From: Ard Biesheuvel @ 2021-12-14 15:59 UTC (permalink / raw)
  To: Xiaokang Qian
  Cc: Will Deacon, Eric Biggers, Herbert Xu, David S. Miller,
	Catalin Marinas, nd, Linux Crypto Mailing List, Linux ARM,
	Linux Kernel Mailing List

On Tue, 14 Dec 2021 at 02:40, Xiaokang Qian <Xiaokang.Qian@arm.com> wrote:
>
> Hi Will:
> I will post the update version 2 of this patch today or tomorrow.
> Sorry for the delay.
>

Great, but please make sure you run the extended test suite.

I applied this version of the patch to test the performance delta
between the old and the new version on TX2, but it hit a failure in
the self test:

[    0.592203] alg: aead: gcm-aes-ce decryption unexpectedly succeeded
on test vector "random: alen=91 plen=5326 authsize=16 klen=32
novrfy=1"; expected_error=-EBADMSG, cfg="random: inplace use_finup
src_divs=[100.0%@+3779] key_offset=43"

It's non-deterministic, though, so it may take a few attempts to reproduce it.

As for the performance delta, your code is 18% slower on TX2 for 1420
byte packets using AES-256 (and 9% slower on AES-192). In your
results, AES-256 does not outperform the old code as much as it does
with smaller key sizes either.

Is this something that can be solved? If not, the numbers are not as
appealing, to be honest, given the substantial performance regressions
on the other micro-architecture.

-- 
Ard.



Tcrypt output follows


OLD CODE

testing speed of gcm(aes) (gcm-aes-ce) encryption
test 0 (128 bit key, 16 byte blocks): 2023626 operations in 1 seconds
(32378016 bytes)
test 1 (128 bit key, 64 byte blocks): 2005175 operations in 1 seconds
(128331200 bytes)
test 2 (128 bit key, 256 byte blocks): 1408367 operations in 1 seconds
(360541952 bytes)
test 3 (128 bit key, 512 byte blocks): 1011877 operations in 1 seconds
(518081024 bytes)
test 4 (128 bit key, 1024 byte blocks): 646552 operations in 1 seconds
(662069248 bytes)
test 5 (128 bit key, 1420 byte blocks): 490188 operations in 1 seconds
(696066960 bytes)
test 6 (128 bit key, 4096 byte blocks): 204423 operations in 1 seconds
(837316608 bytes)
test 7 (128 bit key, 8192 byte blocks): 105149 operations in 1 seconds
(861380608 bytes)
test 8 (192 bit key, 16 byte blocks): 1924506 operations in 1 seconds
(30792096 bytes)
test 9 (192 bit key, 64 byte blocks): 1944413 operations in 1 seconds
(124442432 bytes)
test 10 (192 bit key, 256 byte blocks): 1337001 operations in 1
seconds (342272256 bytes)
test 11 (192 bit key, 512 byte blocks): 941146 operations in 1 seconds
(481866752 bytes)
test 12 (192 bit key, 1024 byte blocks): 590614 operations in 1
seconds (604788736 bytes)
test 13 (192 bit key, 1420 byte blocks): 443363 operations in 1
seconds (629575460 bytes)
test 14 (192 bit key, 4096 byte blocks): 182890 operations in 1
seconds (749117440 bytes)
test 15 (192 bit key, 8192 byte blocks): 93813 operations in 1 seconds
(768516096 bytes)
test 16 (256 bit key, 16 byte blocks): 1886970 operations in 1 seconds
(30191520 bytes)
test 17 (256 bit key, 64 byte blocks): 1893574 operations in 1 seconds
(121188736 bytes)
test 18 (256 bit key, 256 byte blocks): 1245478 operations in 1
seconds (318842368 bytes)
test 19 (256 bit key, 512 byte blocks): 865507 operations in 1 seconds
(443139584 bytes)
test 20 (256 bit key, 1024 byte blocks): 537822 operations in 1
seconds (550729728 bytes)
test 21 (256 bit key, 1420 byte blocks): 401451 operations in 1
seconds (570060420 bytes)
test 22 (256 bit key, 4096 byte blocks): 164378 operations in 1
seconds (673292288 bytes)
test 23 (256 bit key, 8192 byte blocks): 84205 operations in 1 seconds
(689807360 bytes)


NEW CODE

testing speed of gcm(aes) (gcm-aes-ce) encryption
test 0 (128 bit key, 16 byte blocks): 1894587 operations in 1 seconds
(30313392 bytes)
test 1 (128 bit key, 64 byte blocks): 1910971 operations in 1 seconds
(122302144 bytes)
test 2 (128 bit key, 256 byte blocks): 1360037 operations in 1 seconds
(348169472 bytes)
test 3 (128 bit key, 512 byte blocks): 985577 operations in 1 seconds
(504615424 bytes)
test 4 (128 bit key, 1024 byte blocks): 569656 operations in 1 seconds
(583327744 bytes)
test 5 (128 bit key, 1420 byte blocks): 462129 operations in 1 seconds
(656223180 bytes)
test 6 (128 bit key, 4096 byte blocks): 215284 operations in 1 seconds
(881803264 bytes)
test 7 (128 bit key, 8192 byte blocks): 115459 operations in 1 seconds
(945840128 bytes)
test 8 (192 bit key, 16 byte blocks): 1825915 operations in 1 seconds
(29214640 bytes)
test 9 (192 bit key, 64 byte blocks): 1836850 operations in 1 seconds
(117558400 bytes)
test 10 (192 bit key, 256 byte blocks): 1281626 operations in 1
seconds (328096256 bytes)
test 11 (192 bit key, 512 byte blocks): 913114 operations in 1 seconds
(467514368 bytes)
test 12 (192 bit key, 1024 byte blocks): 504804 operations in 1
seconds (516919296 bytes)
test 13 (192 bit key, 1420 byte blocks): 405749 operations in 1
seconds (576163580 bytes)
test 14 (192 bit key, 4096 byte blocks): 183999 operations in 1
seconds (753659904 bytes)
test 15 (192 bit key, 8192 byte blocks): 97914 operations in 1 seconds
(802111488 bytes)
test 16 (256 bit key, 16 byte blocks): 1776659 operations in 1 seconds
(28426544 bytes)
test 17 (256 bit key, 64 byte blocks): 1781110 operations in 1 seconds
(113991040 bytes)
test 18 (256 bit key, 256 byte blocks): 1206511 operations in 1
seconds (308866816 bytes)
test 19 (256 bit key, 512 byte blocks): 846284 operations in 1 seconds
(433297408 bytes)
test 20 (256 bit key, 1024 byte blocks): 424405 operations in 1
seconds (434590720 bytes)
test 21 (256 bit key, 1420 byte blocks): 331558 operations in 1
seconds (470812360 bytes)
test 22 (256 bit key, 4096 byte blocks): 143821 operations in 1
seconds (589090816 bytes)
test 23 (256 bit key, 8192 byte blocks): 75641 operations in 1 seconds
(619651072 bytes)

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH v2] crypto: arm64/gcm-ce - unroll factors to 4-way interleave of aes and ghash
  2021-09-23  6:30 [PATCH] crypto: arm64/gcm-ce - unroll factors to 4-way interleave of aes and ghash XiaokangQian
  2021-09-28  6:27 ` Eric Biggers
@ 2021-12-15  3:04 ` XiaokangQian
  1 sibling, 0 replies; 12+ messages in thread
From: XiaokangQian @ 2021-12-15  3:04 UTC (permalink / raw)
  To: Herbert Xu, David S. Miller, Catalin Marinas, Will Deacon
  Cc: nd, Ard Biesheuvel, XiaokangQian, linux-crypto, linux-arm-kernel,
	linux-kernel

To improve performance on cores with deep piplines such as A72,N1,
implement gcm(aes) using a 4-way interleave of aes and ghash (totally
8 blocks in parallel), which can make full utilize of pipelines rather
than the 4-way interleave we used currently. It can gain about 20% for
big data sizes such that 8k.

This is a complete new version of the GCM part of the combined GCM/GHASH
driver, it will co-exist with the old driver, only serve for big data
sizes. Instead of interleaving four invocations of AES where each chunk
of 64 bytes is encrypted first and then ghashed, the new version uses a
more coarse grained approach where a chunk of 64 bytes is encrypted and
at the same time, one chunk of 64 bytes is ghashed (or ghashed and
decrypted in the converse case).

The table below compares the performance of the old driver and the new
one on various micro-architectures and running in various modes with
various data sizes.

            |     AES-128       |     AES-192       |     AES-256       |
     #bytes | 1024 | 1420 |  8k | 1024 | 1420 |  8k | 1024 | 1420 |  8k |
     -------+------+------+-----+------+------+-----+------+------+-----+
        A72 | 5.5% |  12% | 25% | 2.2% |  9.5%|  23%| -1%  |  6.7%| 19% |
        A57 |-0.5% |  9.3%| 32% | -3%  |  6.3%|  26%| -6%  |  3.3%| 21% |
        N1  | 0.4% |  7.6%|24.5%| -2%  |  5%  |  22%| -4%  |  2.7%| 20% |

Signed-off-by: XiaokangQian <xiaokang.qian@arm.com>
---
 arch/arm64/crypto/Makefile               |    2 +-
 arch/arm64/crypto/ghash-ce-core_unroll.S | 1333 ++++++++++++++++++++++
 arch/arm64/crypto/ghash-ce-glue.c        |   85 +-
 3 files changed, 1408 insertions(+), 12 deletions(-)
 create mode 100644 arch/arm64/crypto/ghash-ce-core_unroll.S

diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index 09a805cc32d7..068e9d377db2 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -24,7 +24,7 @@ obj-$(CONFIG_CRYPTO_SM4_ARM64_CE) += sm4-ce.o
 sm4-ce-y := sm4-ce-glue.o sm4-ce-core.o
 
 obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
-ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
+ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o ghash-ce-core_unroll.o
 
 obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o
 crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
diff --git a/arch/arm64/crypto/ghash-ce-core_unroll.S b/arch/arm64/crypto/ghash-ce-core_unroll.S
new file mode 100644
index 000000000000..bd754940e76e
--- /dev/null
+++ b/arch/arm64/crypto/ghash-ce-core_unroll.S
@@ -0,0 +1,1333 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Accelerated GCM implementation with ARMv8 PMULL instructions
+ * and unroll factors.
+ *
+ * Copyright (C) 2021  Arm.ltd. <xiaokang.qian@arm.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+.arch	armv8-a+crypto
+.text
+
+.macro push_stack
+	stp	x19, x20, [sp, #-128]!
+	stp	x21, x22, [sp, #16]
+	stp	x23, x24, [sp, #32]
+	stp	x25, x26, [sp, #48]
+	stp	d8, d9, [sp, #64]
+	stp	d10, d11, [sp, #80]
+	stp	d12, d13, [sp, #96]
+	stp	d14, d15, [sp, #112]
+.endm
+
+.macro pop_stack
+	ldp	x21, x22, [sp, #16]
+	ldp	x23, x24, [sp, #32]
+	ldp	x25, x26, [sp, #48]
+	ldp	d8, d9, [sp, #64]
+	ldp	d10, d11, [sp, #80]
+	ldp	d12, d13, [sp, #96]
+	ldp	d14, d15, [sp, #112]
+	ldp	x19, x20, [sp], #128
+.endm
+
+.macro load_const
+	movi	v8.8b, #0xc2
+	shl	d8, d8, #56               //mod_constant
+.endm
+
+.macro gcm_tidy_up high:req, mid:req, low:req, tmp1:req, tmp2:req
+	eor	\tmp1\().16b, \low\().16b, \high\().16b //MODULO-karatsuba tidy up
+	eor	\mid\().16b, \mid\().16b, \tmp1\().16b  //MODULO-karatsuba tidy up
+	pmull	\tmp2\().1q, \high\().1d, v8.1d
+	ext	\high\().16b, \high\().16b, \high\().16b, #8
+	eor	\mid\().16b, \mid\().16b, \tmp2\().16b //MODULO - fold into mid
+	eor	\mid\().16b, \mid\().16b, \high\().16b //MODULO - fold into mid
+	pmull	\high\().1q, \mid\().1d, v8.1d  //MODULO - mid 64b align with low
+	ext	\mid\().16b, \mid\().16b, \mid\().16b, #8
+	eor	\low\().16b, \low\().16b, \high\().16b //MODULO - fold into low
+	eor	\low\().16b, \low\().16b, \mid\().16b //MODULO - fold into low
+.endm
+
+.macro karasuba_multiply res:req, h:req, tmp1:req, tmp2:req, tmp3:req
+	pmull	\tmp1\().1q, \res\().1d, \h\().1d    //GHASH final block - low
+	eor	\tmp2\().8b, \tmp2\().8b, \res\().8b //GHASH final block - mid
+	pmull2	\tmp3\().1q, \res\().2d, \h\().2d    //GHASH final block - high
+	pmull	\tmp2\().1q, \tmp2\().1d, v16.1d     //GHASH final block - mid
+	eor	v11.16b, v11.16b, \tmp1\().16b       //GHASH final block - low
+	eor	v9.16b, v9.16b, \tmp3\().16b         //GHASH final block - high
+	eor	v10.16b, v10.16b, \tmp2\().16b       //GHASH final block - mid
+.endm
+
+.macro aes_encrypt_round    block:req,key:req
+	aese	\block\().16b,\key\().16b
+	aesmc	\block\().16b,\block\().16b
+.endm
+
+.macro aes_enc_extra_round rd_num:req
+	.if \rd_num == 12
+	add     x19,x8,#176
+	aes_encrypt_round	v0, v27         //AES block 0 - round 9
+	aes_encrypt_round	v3, v27         //AES block 3 - round 9
+	aes_encrypt_round	v2, v27         //AES block 2 - round 9
+	aes_encrypt_round	v1, v27         //AES block 1 - round 9
+	ldr	q27, [x19],#16                  //load rk9
+	aes_encrypt_round	v0, v28         //AES block 0 - round 10
+	aes_encrypt_round	v2, v28         //AES block 2 - round 10
+	aes_encrypt_round	v1, v28         //AES block 1 - round 10
+	aes_encrypt_round	v3, v28         //AES block 3 - round 10
+	ldr	q28, [x19],#16                  //load rk10
+	.elseif \rd_num == 14
+	aes_encrypt_round	v1, v27          //AES block 1 - round 11
+	aes_encrypt_round	v2, v27          //AES block 2 - round 11
+	aes_encrypt_round	v0, v27          //AES block 0 - round 11
+	aes_encrypt_round	v3, v27          //AES block 3 - round 11
+	ldr	q27, [x19],#16                   //load rk9
+	aes_encrypt_round	v1, v28          //AES block 1 - round 12
+	aes_encrypt_round	v2, v28          //AES block 2 - round 12
+	aes_encrypt_round	v0, v28          //AES block 0 - round 12
+	aes_encrypt_round	v3, v28          //AES block 3 - round 12
+	ldr	q28, [x19],#16                   //load rk10
+	.endif
+	fmov	x13, d28                         //load last second block
+	fmov	x14, v28.d[1]                    //load last second block
+.endm
+
+.macro aes_enc_iv_init
+	ldr	q18, [x8, #0]                   //load rk0
+	ldr	q19, [x8, #16]                   //load rk1
+	mov	w11, #(0x1 << 24)		// BE '1U'
+	ld1	{v0.16b}, [x25]
+	mov	v0.s[3], w11
+	aes_encrypt_round	v0, v18         //AES block 0 - round 0
+	ldr	q20, [x8, #32]                   //load rk2
+	aes_encrypt_round	v0, v19         //AES block 0 - round 1
+	ldr	q21, [x8, #48]                  //load rk3
+	aes_encrypt_round	v0, v20         //AES block 0 - round 2
+	ldr	q22, [x8, #64]                  //load rk4
+	aes_encrypt_round	v0, v21         //AES block 0 - round 3
+	ldr	q23, [x8, #80]                  //load rk5
+	aes_encrypt_round	v0, v22          //AES block 0 - round 4
+	ldr	q24, [x8, #96]                  //load rk6
+	aes_encrypt_round	v0, v23          //AES block 0 - round 5
+	ldr	q25, [x8, #112]                  //load rk7
+	aes_encrypt_round	v0, v24          //AES block 0 - round 6
+	ldr	q26, [x8, #128]                  //load rk8
+	aes_encrypt_round	v0, v25          //AES block 0 - round 7
+	ldr	q27, [x8, #144]                  //load rk9
+.endm
+
+.macro aes_enc_iv_common rd_num:req
+	.if \rd_num == 12
+	aes_encrypt_round	v0, v26          //AES block 0 - round 8
+	ldr	q26, [x19],#16                  //load rk10
+	aes_encrypt_round	v0, v27         //AES block 0 - round 9
+	ldr	q27, [x19],#16                  //load rk9
+	.elseif \rd_num == 14
+	aes_encrypt_round	v0, v26         //AES block 0 - round 10
+	ldr	q26, [x19],#16                  //load rk10
+	aes_encrypt_round	v0, v27          //AES block 0 - round 11
+	ldr	q27, [x19],#16                   //load rk9
+	.endif
+.endm
+
+.macro aes_enc_iv_final
+	aes_encrypt_round	v0, v26          //AES block 0 - round 12
+	ldr	q26, [x19],#16                   //load rk10
+	aese	v0.16b, v27.16b                 //AES block 0 - round 9
+	eor	v4.16b, v26.16b, v0.16b          //AES block 0 - result
+.endm
+
+.macro load_initial_tag    dst:req,buf:req
+	ld1	{\dst\().16b}, [\buf]
+	ext	\dst\().16b, \dst\().16b, \dst\().16b, #8
+	rev64	\dst\().16b, \dst\().16b
+.endm
+
+SYM_FUNC_START(pmull_gcm_encrypt_unroll)
+	push_stack
+	mov	x25, x4
+	mov	x15, x7
+	mov	x8, x5
+	lsr	x5, x1, #3                      //byte_len
+	mov	x26, x6
+	load_initial_tag v11,x3
+	cbz	x1, .Lenc_final_tag_pre
+	ldp	x10, x11, [x25]                 //ctr96_b64, ctr96_t32
+	ldp	x13, x14, [x8, #160]            //load rk10
+	load_initial_tag v11,x3
+	ldr	q27, [x8, #144]                 //load rk9
+	add	x4, x0, x1, lsr #3              //end_input_ptr
+	sub	x5, x5, #1                      //byte_len - 1
+	lsr	x12, x11, #32
+	ldr	q15, [x3, #112]                 //load h4l | h4h
+	ext	v15.16b, v15.16b, v15.16b, #8
+	fmov	d1, x10                         //CTR block 1
+	rev	w12, w12                        //rev_ctr32
+	add	w12, w12, #1                    //increment rev_ctr32
+	orr	w11, w11, w11
+	ldr	q18, [x8, #0]                   //load rk0
+	rev	w9, w12                         //CTR block 1
+	add	w12, w12, #1                    //CTR block 1
+	fmov	d3, x10                         //CTR block 3
+	ldr	q28, [x8, #160]                 //load rk10
+	orr	x9, x11, x9, lsl #32            //CTR block 1
+	//load initial counter so that start first AES block quickly
+	ld1	{ v0.16b}, [x25]
+	fmov	v1.d[1], x9                     //CTR block 1
+	rev	w9, w12                         //CTR block 2
+	fmov	d2, x10                         //CTR block 2
+	orr	x9, x11, x9, lsl #32            //CTR block 2
+	add	w12, w12, #1                    //CTR block 2
+	fmov	v2.d[1], x9                     //CTR block 2
+	rev	w9, w12                         //CTR block 3
+	orr	x9, x11, x9, lsl #32            //CTR block 3
+	ldr	q19, [x8, #16]                  //load rk1
+	add	w12, w12, #1                    //CTR block 3
+	fmov	v3.d[1], x9                     //CTR block 3
+	ldr	q14, [x3, #80]                  //load h3l | h3h
+	ext	v14.16b, v14.16b, v14.16b, #8
+	aes_encrypt_round	v1, v18         //AES block 1 - round 0
+	ldr	q20, [x8, #32]                  //load rk2
+	aes_encrypt_round	v2, v18         //AES block 2 - round 0
+	ldr	q12, [x3, #32]                  //load h1l | h1h
+	ext	v12.16b, v12.16b, v12.16b, #8
+	aes_encrypt_round	v0, v18         //AES block 0 - round 0
+	ldr	q26, [x8, #128]                 //load rk8
+	aes_encrypt_round	v3, v18         //AES block 3 - round 0
+	ldr	q21, [x8, #48]                  //load rk3
+	aes_encrypt_round	v2, v19         //AES block 2 - round 1
+	trn2	v17.2d,  v14.2d,    v15.2d      //h4l | h3l
+	aes_encrypt_round	v0, v19         //AES block 0 - round 1
+	ldr	q24, [x8, #96]                  //load rk6
+	aes_encrypt_round	v1, v19         //AES block 1 - round 1
+	ldr	q25, [x8, #112]                 //load rk7
+	aes_encrypt_round	v3, v19         //AES block 3 - round 1
+	trn1	v9.2d, v14.2d,    v15.2d        //h4h | h3h
+	aes_encrypt_round	v0, v20         //AES block 0 - round 2
+	ldr	q23, [x8, #80]                  //load rk5
+	aes_encrypt_round	v1, v20         //AES block 1 - round 2
+	ldr	q13, [x3, #64]                  //load h2l | h2h
+	ext	v13.16b, v13.16b, v13.16b, #8
+	aes_encrypt_round	v3, v20         //AES block 3 - round 2
+	aes_encrypt_round	v2, v20         //AES block 2 - round 2
+	eor	v17.16b, v17.16b, v9.16b        //h4k | h3k
+	aes_encrypt_round	v0, v21         //AES block 0 - round 3
+	aes_encrypt_round	v1, v21         //AES block 1 - round 3
+	aes_encrypt_round	v2, v21         //AES block 2 - round 3
+	ldr	q22, [x8, #64]                  //load rk4
+	aes_encrypt_round	v3, v21         //AES block 3 - round 3
+	//bytes be processed in main loop(at least 1 byte be handled by tail)
+	and	x5, x5, #0xffffffffffffffc0
+	trn2	v16.2d,  v12.2d,    v13.2d      //h2l | h1l
+	aes_encrypt_round	v3, v22          //AES block 3 - round 4
+	add	x5, x5, x0
+	aes_encrypt_round	v2, v22          //AES block 2 - round 4
+	cmp	x0, x5                   //check if we have <= 4 blocks
+	aes_encrypt_round	v0, v22          //AES block 0 - round 4
+	aes_encrypt_round	v3, v23          //AES block 3 - round 5
+	aes_encrypt_round	v2, v23          //AES block 2 - round 5
+	aes_encrypt_round	v0, v23          //AES block 0 - round 5
+	aes_encrypt_round	v3, v24          //AES block 3 - round 6
+	aes_encrypt_round	v1, v22          //AES block 1 - round 4
+	aes_encrypt_round	v2, v24          //AES block 2 - round 6
+	trn1	v8.2d,    v12.2d,    v13.2d     //h2h | h1h
+	aes_encrypt_round	v0, v24          //AES block 0 - round 6
+	aes_encrypt_round	v1, v23          //AES block 1 - round 5
+	aes_encrypt_round	v1, v24          //AES block 1 - round 6
+	aes_encrypt_round	v3, v25          //AES block 3 - round 7
+	aes_encrypt_round	v0, v25          //AES block 0 - round 7
+	aes_encrypt_round	v2, v25          //AES block 2 - round 7
+	aes_encrypt_round	v0, v26          //AES block 0 - round 8
+	aes_encrypt_round	v1, v25          //AES block 1 - round 7
+	aes_encrypt_round	v2, v26          //AES block 2 - round 8
+	aes_encrypt_round	v3, v26          //AES block 3 - round 8
+	aes_encrypt_round	v1, v26          //AES block 1 - round 8
+
+	mov	x6, x26
+	sub	x6, x6, #10
+	cbz	x6, .Lleft_rounds
+	aes_enc_extra_round 12
+	sub	x6, x6, #2
+	cbz	x6, .Lleft_rounds
+	aes_enc_extra_round 14
+
+.Lleft_rounds:
+	aese	v2.16b, v27.16b                  //AES block 2 - round 9
+	aese	v0.16b, v27.16b                 //AES block 0 - round 9
+	eor	v16.16b, v16.16b, v8.16b        //h2k | h1k
+	aese	v1.16b, v27.16b                 //AES block 1 - round 9
+	aese	v3.16b, v27.16b                 //AES block 3 - round 9
+	b.ge	.L128_enc_tail                  //handle tail
+
+	ldp	x6, x7, [x0, #0]                //AES block 0 - load plaintext
+	ldp	x21, x22, [x0, #32]             //AES block 2 - load plaintext
+	ldp	x19, x20, [x0, #16]             //AES block 1 - load plaintext
+	ldp	x23, x24, [x0, #48]             //AES block 3 - load plaintext
+	eor	x6, x6, x13                     //AES block 0 - round 10 low
+	eor	x7, x7, x14                     //AES block 0 - round 10 high
+	eor	x21, x21, x13                   //AES block 2 - round 10 low
+	fmov	d4, x6                          //AES block 0 - mov low
+	eor	x19, x19, x13                   //AES block 1 - round 10 low
+	eor	x22, x22, x14                   //AES block 2 - round 10 high
+	fmov	v4.d[1], x7                     //AES block 0 - mov high
+	fmov	d5, x19                         //AES block 1 - mov low
+	eor	x20, x20, x14                   //AES block 1 - round 10 high
+	eor	x23, x23, x13                   //AES block 3 - round 10 low
+	fmov	v5.d[1], x20                    //AES block 1 - mov high
+	fmov	d6, x21                         //AES block 2 - mov low
+	eor	x24, x24, x14                   //AES block 3 - round 10 high
+	rev	w9, w12                         //CTR block 4
+	fmov	v6.d[1], x22                    //AES block 2 - mov high
+	orr	x9, x11, x9, lsl #32            //CTR block 4
+	eor	v4.16b, v4.16b, v0.16b          //AES block 0 - result
+	fmov	d0, x10                         //CTR block 4
+	add	w12, w12, #1                    //CTR block 4
+	fmov	v0.d[1], x9                     //CTR block 4
+	rev	w9, w12                         //CTR block 5
+	eor	v5.16b, v5.16b, v1.16b          //AES block 1 - result
+	fmov	d1, x10                         //CTR block 5
+	orr	x9, x11, x9, lsl #32            //CTR block 5
+	add	w12, w12, #1                    //CTR block 5
+	add	x0, x0, #64                     //AES input_ptr update
+	fmov	v1.d[1], x9                     //CTR block 5
+	fmov	d7, x23                         //AES block 3 - mov low
+	rev	w9, w12                         //CTR block 6
+	st1	{ v4.16b}, [x2], #16            //AES block 0 - store result
+	fmov	v7.d[1], x24                    //AES block 3 - mov high
+	orr	x9, x11, x9, lsl #32            //CTR block 6
+	add	w12, w12, #1                    //CTR block 6
+	eor	v6.16b, v6.16b, v2.16b          //AES block 2 - result
+	st1	{ v5.16b}, [x2], #16            //AES block 1 - store result
+	fmov	d2, x10                         //CTR block 6
+	cmp	x0, x5                   //check if we have <= 8 blocks
+	fmov	v2.d[1], x9                     //CTR block 6
+	rev	w9, w12                         //CTR block 7
+	st1	{ v6.16b}, [x2], #16            //AES block 2 - store result
+	orr	x9, x11, x9, lsl #32            //CTR block 7
+	eor	v7.16b, v7.16b, v3.16b          //AES block 3 - result
+	st1	{ v7.16b}, [x2], #16            //AES block 3 - store result
+	b.ge	.L128_enc_prepretail            //do prepretail
+.L128_enc_main_loop:	//main	loop start
+	ldp	x23, x24, [x0, #48]           //AES block 4k+3 - load plaintext
+	rev64	v4.16b, v4.16b                  //GHASH block 4k
+	rev64	v6.16b, v6.16b                  //GHASH block 4k+2
+	aes_encrypt_round	v2, v18          //AES block 4k+6 - round 0
+	fmov	d3, x10                         //CTR block 4k+3
+	ext	v11.16b, v11.16b, v11.16b, #8   //PRE 0
+	rev64	v5.16b, v5.16b                  //GHASH block 4k+1
+	aes_encrypt_round	v1, v18          //AES block 4k+5 - round 0
+	add	w12, w12, #1                    //CTR block 4k+3
+	fmov	v3.d[1], x9                     //CTR block 4k+3
+	aes_encrypt_round	v0, v18          //AES block 4k+4 - round 0
+	mov	d31, v6.d[1]                    //GHASH block 4k+2 - mid
+	aes_encrypt_round	v2, v19          //AES block 4k+6 - round 1
+	mov	d30, v5.d[1]                    //GHASH block 4k+1 - mid
+	aes_encrypt_round	v1, v19          //AES block 4k+5 - round 1
+	eor	v4.16b, v4.16b, v11.16b         //PRE 1
+	aes_encrypt_round	v3, v18          //AES block 4k+7 - round 0
+	eor	x24, x24, x14                   //AES block 4k+3 - round 10 high
+	pmull2	v28.1q, v5.2d, v14.2d           //GHASH block 4k+1 - high
+	eor	v31.8b, v31.8b, v6.8b           //GHASH block 4k+2 - mid
+	ldp	x6, x7, [x0, #0]            //AES block 4k+4 - load plaintext
+	aes_encrypt_round	v0, v19          //AES block 4k+4 - round 1
+	rev	w9, w12                         //CTR block 4k+8
+	eor	v30.8b, v30.8b, v5.8b           //GHASH block 4k+1 - mid
+	mov	d8, v4.d[1]                     //GHASH block 4k - mid
+	orr	x9, x11, x9, lsl #32            //CTR block 4k+8
+	pmull2	v9.1q, v4.2d, v15.2d            //GHASH block 4k - high
+	add	w12, w12, #1                    //CTR block 4k+8
+	mov	d10, v17.d[1]                   //GHASH block 4k - mid
+	aes_encrypt_round	v0, v20          //AES block 4k+4 - round 2
+	pmull	v11.1q, v4.1d, v15.1d           //GHASH block 4k - low
+	eor	v8.8b, v8.8b, v4.8b             //GHASH block 4k - mid
+	aes_encrypt_round	v1, v20          //AES block 4k+5 - round 2
+	aes_encrypt_round	v0, v21          //AES block 4k+4 - round 3
+	eor	v9.16b, v9.16b, v28.16b         //GHASH block 4k+1 - high
+	pmull	v28.1q, v6.1d, v13.1d           //GHASH block 4k+2 - low
+	pmull	v10.1q, v8.1d, v10.1d           //GHASH block 4k - mid
+	rev64	v7.16b, v7.16b                  //GHASH block 4k+3
+	pmull	v30.1q, v30.1d, v17.1d          //GHASH block 4k+1 - mid
+	pmull	v29.1q, v5.1d, v14.1d           //GHASH block 4k+1 - low
+	ins	v31.d[1], v31.d[0]              //GHASH block 4k+2 - mid
+	pmull2	v8.1q, v6.2d, v13.2d            //GHASH block 4k+2 - high
+	eor	x7, x7, x14                     //AES block 4k+4 - round 10 high
+	eor	v10.16b, v10.16b, v30.16b       //GHASH block 4k+1 - mid
+	mov	d30, v7.d[1]                    //GHASH block 4k+3 - mid
+	aes_encrypt_round	v3, v19          //AES block 4k+7 - round 1
+	eor	v11.16b, v11.16b, v29.16b       //GHASH block 4k+1 - low
+	aes_encrypt_round	v2, v20          //AES block 4k+6 - round 2
+	eor	x6, x6, x13                     //AES block 4k+4 - round 10 low
+	aes_encrypt_round	v1, v21          //AES block 4k+5 - round 3
+	eor	v30.8b, v30.8b, v7.8b           //GHASH block 4k+3 - mid
+	pmull2	v4.1q, v7.2d, v12.2d            //GHASH block 4k+3 - high
+	aes_encrypt_round	v2, v21          //AES block 4k+6 - round 3
+	eor	v9.16b, v9.16b, v8.16b          //GHASH block 4k+2 - high
+	pmull2	v31.1q, v31.2d, v16.2d          //GHASH block 4k+2 - mid
+	pmull	v29.1q, v7.1d, v12.1d           //GHASH block 4k+3 - low
+	movi	v8.8b, #0xc2
+	pmull	v30.1q, v30.1d, v16.1d          //GHASH block 4k+3 - mid
+	eor	v11.16b, v11.16b, v28.16b       //GHASH block 4k+2 - low
+	aes_encrypt_round	v1, v22          //AES block 4k+5 - round 4
+	aes_encrypt_round	v3, v20          //AES block 4k+7 - round 2
+	shl	d8, d8, #56                     //mod_constant
+	aes_encrypt_round	v0, v22          //AES block 4k+4 - round 4
+	eor	v9.16b, v9.16b, v4.16b          //GHASH block 4k+3 - high
+	aes_encrypt_round	v1, v23          //AES block 4k+5 - round 5
+	ldp	x19, x20, [x0, #16]           //AES block 4k+5 - load plaintext
+	aes_encrypt_round	v3, v21          //AES block 4k+7 - round 3
+	eor	v10.16b, v10.16b, v31.16b       //GHASH block 4k+2 - mid
+	aes_encrypt_round	v0, v23          //AES block 4k+4 - round 5
+	ldp	x21, x22, [x0, #32]           //AES block 4k+6 - load plaintext
+	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
+	eor	v11.16b, v11.16b, v29.16b       //GHASH block 4k+3 - low
+	aes_encrypt_round	v2, v22          //AES block 4k+6 - round 4
+	eor	x19, x19, x13                     //AES block 4k+5 - round 10 low
+	aes_encrypt_round	v3, v22          //AES block 4k+7 - round 4
+	eor	v10.16b, v10.16b, v30.16b       //GHASH block 4k+3 - mid
+	aes_encrypt_round	v1, v24          //AES block 4k+5 - round 6
+	eor	x23, x23, x13                     //AES block 4k+3 - round 10 low
+	aes_encrypt_round	v2, v23          //AES block 4k+6 - round 5
+	eor	v30.16b, v11.16b, v9.16b        //MODULO - karatsuba tidy up
+	fmov	d4, x6                          //AES block 4k+4 - mov low
+	aes_encrypt_round	v0, v24          //AES block 4k+4 - round 6
+	fmov	v4.d[1], x7                     //AES block 4k+4 - mov high
+	add	x0, x0, #64                     //AES input_ptr update
+	fmov	d7, x23                         //AES block 4k+3 - mov low
+	ext	v9.16b, v9.16b, v9.16b, #8      //MODULO - other top alignment
+	aes_encrypt_round	v3, v23          //AES block 4k+7 - round 5
+	fmov	d5, x19                         //AES block 4k+5 - mov low
+	aes_encrypt_round	v0, v25          //AES block 4k+4 - round 7
+	eor	v10.16b, v10.16b, v30.16b       //MODULO - karatsuba tidy up
+	aes_encrypt_round	v2, v24         //AES block 4k+6 - round 6
+	eor	x20, x20, x14                   //AES block 4k+5 - round 10 high
+	aes_encrypt_round	v1, v25         //AES block 4k+5 - round 7
+	fmov	v5.d[1], x20                    //AES block 4k+5 - mov high
+	aes_encrypt_round	v0, v26          //AES block 4k+4 - round 8
+	fmov	v7.d[1], x24                    //AES block 4k+3 - mov high
+	aes_encrypt_round	v3, v24          //AES block 4k+7 - round 6
+	cmp	x0, x5                   //.LOOP CONTROL
+	aes_encrypt_round	v1, v26          //AES block 4k+5 - round 8
+	eor	v10.16b, v10.16b, v31.16b       //MODULO - fold into mid
+	eor	x21, x21, x13                   //AES block 4k+6 - round 10 low
+	eor	x22, x22, x14                   //AES block 4k+6 - round 10 high
+	ldr	q27, [x8, #144]                 //load rk9
+	aes_encrypt_round	v3, v25          //AES block 4k+7 - round 7
+	fmov	d6, x21                         //AES block 4k+6 - mov low
+	fmov	v6.d[1], x22                    //AES block 4k+6 - mov high
+	aes_encrypt_round	v2, v25          //AES block 4k+6 - round 7
+	ldr	q28, [x8, #160]                 //load rk9
+	aes_encrypt_round	v3, v26          //AES block 4k+7 - round 8
+	eor	v10.16b, v10.16b, v9.16b        //MODULO - fold into mid
+	aes_encrypt_round	v2, v26          //AES block 4k+6 - round 8
+	mov	x6, x26
+	sub	x6,x6,#10
+	cbz	x6, .Lleft2_rounds
+	aes_enc_extra_round 12
+	sub	x6,x6,#2
+	cbz	x6, .Lleft2_rounds
+	aes_enc_extra_round 14
+.Lleft2_rounds:
+	aese	v0.16b, v27.16b                 //AES block 4k+4 - round 9
+	eor	v4.16b, v4.16b, v0.16b          //AES block 4k+4 - result
+	fmov	d0, x10                         //CTR block 4k+8
+	fmov	v0.d[1], x9                     //CTR block 4k+8
+	rev	w9, w12                         //CTR block 4k+9
+	add	w12, w12, #1                    //CTR block 4k+9
+	aese	v1.16b, v27.16b                 //AES block 4k+5 - round 9
+	eor	v5.16b, v5.16b, v1.16b          //AES block 4k+5 - result
+	orr	x9, x11, x9, lsl #32            //CTR block 4k+9
+	fmov	d1, x10                         //CTR block 4k+9
+	pmull	v9.1q, v10.1d, v8.1d            //MODULO - mid 64b align with low
+	fmov	v1.d[1], x9                     //CTR block 4k+9
+	rev	w9, w12                         //CTR block 4k+10
+	aese	v2.16b, v27.16b                 //AES block 4k+6 - round 9
+	st1	{ v4.16b}, [x2], #16            //AES block 4k+4 - store result
+	eor	v6.16b, v6.16b, v2.16b          //AES block 4k+6 - result
+	orr	x9, x11, x9, lsl #32            //CTR block 4k+10
+	aese	v3.16b, v27.16b                 //AES block 4k+7 - round 9
+	add	w12, w12, #1                    //CTR block 4k+10
+	ext	v10.16b, v10.16b, v10.16b, #8   //MODULO - other mid alignment
+	fmov	d2, x10                         //CTR block 4k+10
+	eor	v11.16b, v11.16b, v9.16b        //MODULO - fold into low
+	st1	{ v5.16b}, [x2], #16            //AES block 4k+5 - store result
+	fmov	v2.d[1], x9                     //CTR block 4k+10
+	st1	{ v6.16b}, [x2], #16            //AES block 4k+6 - store result
+	rev	w9, w12                         //CTR block 4k+11
+	orr	x9, x11, x9, lsl #32            //CTR block 4k+11
+	eor	v7.16b, v7.16b, v3.16b          //AES block 4k+3 - result
+	eor	v11.16b, v11.16b, v10.16b       //MODULO - fold into low
+	st1	{ v7.16b}, [x2], #16            //AES block 4k+3 - store result
+	b.lt	.L128_enc_main_loop
+.L128_enc_prepretail:	//PREPRETAIL
+	rev64	v4.16b, v4.16b                  //GHASH block 4k (only t0 is free)
+	fmov	d3, x10                         //CTR block 4k+3
+	rev64	v5.16b, v5.16b                  //GHASH block 4k+1 (t0 and t1 free)
+	ext	v11.16b, v11.16b, v11.16b, #8   //PRE 0
+	add	w12, w12, #1                    //CTR block 4k+3
+	fmov	v3.d[1], x9                     //CTR block 4k+3
+	aes_encrypt_round	v1, v18         //AES block 4k+5 - round 0
+	rev64	v6.16b, v6.16b                  //GHASH block 4k+2
+	pmull	v29.1q, v5.1d, v14.1d           //GHASH block 4k+1 - low
+	rev64	v7.16b, v7.16b                  //GHASH block 4k+3
+	eor	v4.16b, v4.16b, v11.16b         //PRE 1
+	pmull2	v28.1q, v5.2d, v14.2d           //GHASH block 4k+1 - high
+	aes_encrypt_round	v3, v18         //AES block 4k+7 - round 0
+	mov	d30, v5.d[1]                    //GHASH block 4k+1 - mid
+	pmull	v11.1q, v4.1d, v15.1d           //GHASH block 4k - low
+	mov	d8, v4.d[1]                     //GHASH block 4k - mid
+	mov	d31, v6.d[1]                    //GHASH block 4k+2 - mid
+	mov	d10, v17.d[1]                   //GHASH block 4k - mid
+	aes_encrypt_round	v1, v19         //AES block 4k+5 - round 1
+	eor	v30.8b, v30.8b, v5.8b           //GHASH block 4k+1 - mid
+	eor	v8.8b, v8.8b, v4.8b             //GHASH block 4k - mid
+	pmull2	v9.1q, v4.2d, v15.2d            //GHASH block 4k - high
+	eor	v31.8b, v31.8b, v6.8b           //GHASH block 4k+2 - mid
+	aes_encrypt_round	v3, v19         //AES block 4k+7 - round 1
+	pmull	v30.1q, v30.1d, v17.1d          //GHASH block 4k+1 - mid
+	eor	v11.16b, v11.16b, v29.16b       //GHASH block 4k+1 - low
+	pmull	v10.1q, v8.1d, v10.1d           //GHASH block 4k - mid
+	aes_encrypt_round	v0, v18         //AES block 4k+4 - round 0
+	ins	v31.d[1], v31.d[0]              //GHASH block 4k+2 - mid
+	aes_encrypt_round	v2, v18         //AES block 4k+6 - round 0
+	eor	v10.16b, v10.16b, v30.16b       //GHASH block 4k+1 - mid
+	mov	d30, v7.d[1]                    //GHASH block 4k+3 - mid
+	aes_encrypt_round	v0, v19          //AES block 4k+4 - round 1
+	eor	v9.16b, v9.16b, v28.16b         //GHASH block 4k+1 - high
+	pmull2	v31.1q, v31.2d, v16.2d          //GHASH block 4k+2 - mid
+	pmull2	v8.1q, v6.2d, v13.2d            //GHASH block 4k+2 - high
+	eor	v30.8b, v30.8b, v7.8b           //GHASH block 4k+3 - mid
+	pmull2	v4.1q, v7.2d, v12.2d            //GHASH block 4k+3 - high
+	pmull	v28.1q, v6.1d, v13.1d           //GHASH block 4k+2 - low
+	aes_encrypt_round	v2, v19          //AES block 4k+6 - round 1
+	eor	v9.16b, v9.16b, v8.16b          //GHASH block 4k+2 - high
+	aes_encrypt_round	v0, v20          //AES block 4k+4 - round 2
+	pmull	v29.1q, v7.1d, v12.1d           //GHASH block 4k+3 - low
+	movi	v8.8b, #0xc2
+	aes_encrypt_round	v2, v20          //AES block 4k+6 - round 2
+	eor	v11.16b, v11.16b, v28.16b       //GHASH block 4k+2 - low
+	aes_encrypt_round	v3, v20          //AES block 4k+7 - round 2
+	pmull	v30.1q, v30.1d, v16.1d          //GHASH block 4k+3 - mid
+	eor	v10.16b, v10.16b, v31.16b       //GHASH block 4k+2 - mid
+	aes_encrypt_round	v2, v21          //AES block 4k+6 - round 3
+	aes_encrypt_round	v1, v20          //AES block 4k+5 - round 2
+	eor	v9.16b, v9.16b, v4.16b          //GHASH block 4k+3 - high
+	aes_encrypt_round	v0, v21          //AES block 4k+4 - round 3
+	eor	v10.16b, v10.16b, v30.16b       //GHASH block 4k+3 - mid
+	shl	d8, d8, #56               //mod_constant
+	aes_encrypt_round	v1, v21          //AES block 4k+5 - round 3
+	eor	v11.16b, v11.16b, v29.16b       //GHASH block 4k+3 - low
+	aes_encrypt_round	v0, v22          //AES block 4k+4 - round 4
+	pmull	v28.1q, v9.1d, v8.1d
+	eor	v10.16b, v10.16b, v9.16b        //karatsuba tidy up
+	aes_encrypt_round	v1, v22          //AES block 4k+5 - round 4
+	aes_encrypt_round	v0, v23          //AES block 4k+4 - round 5
+	ext	v9.16b, v9.16b, v9.16b, #8
+	aes_encrypt_round	v3, v21          //AES block 4k+7 - round 3
+	aes_encrypt_round	v2, v22          //AES block 4k+6 - round 4
+	eor	v10.16b, v10.16b, v11.16b
+	aes_encrypt_round	v0, v24          //AES block 4k+4 - round 6
+	aes_encrypt_round	v3, v22          //AES block 4k+7 - round 4
+	aes_encrypt_round	v1, v23          //AES block 4k+5 - round 5
+	aes_encrypt_round	v2, v23          //AES block 4k+6 - round 5
+	eor	v10.16b, v10.16b, v28.16b
+	aes_encrypt_round	v3, v23          //AES block 4k+7 - round 5
+	aes_encrypt_round	v1, v24          //AES block 4k+5 - round 6
+	aes_encrypt_round	v2, v24          //AES block 4k+6 - round 6
+	aes_encrypt_round	v3, v24          //AES block 4k+7 - round 6
+	eor	v10.16b, v10.16b, v9.16b
+	ldr	q27, [x8, #144]                                //load rk9
+	aes_encrypt_round	v0, v25          //AES block 4k+4 - round 7
+	aes_encrypt_round	v2, v25          //AES block 4k+6 - round 7
+	aes_encrypt_round	v3, v25          //AES block 4k+7 - round 7
+	pmull	v28.1q, v10.1d, v8.1d
+	aes_encrypt_round	v1, v25          //AES block 4k+5 - round 7
+	ext	v10.16b, v10.16b, v10.16b, #8
+	aes_encrypt_round	v3, v26          //AES block 4k+7 - round 8
+	aes_encrypt_round	v0, v26          //AES block 4k+4 - round 8
+	eor	v11.16b, v11.16b, v28.16b
+	aes_encrypt_round	v1, v26          //AES block 4k+5 - round 8
+	ldr	q28, [x8, #160]                                //load rk9
+	aes_encrypt_round	v2, v26          //AES block 4k+6 - round 8
+
+	mov	x6, x26
+	sub	x6,x6,#10
+	cbz	x6, .Lleft3_rounds
+	aes_enc_extra_round 12
+	sub	x6,x6,#2
+	cbz	x6, .Lleft3_rounds
+	aes_enc_extra_round 14
+
+.Lleft3_rounds:
+	aese	v0.16b, v27.16b                 //AES block 4k+4 - round 9
+	aese	v3.16b, v27.16b                 //AES block 4k+7 - round 9
+	aese	v1.16b, v27.16b                 //AES block 4k+5 - round 9
+	eor	v11.16b, v11.16b, v10.16b
+	aese	v2.16b, v27.16b                 //AES block 4k+6 - round 9
+.L128_enc_tail:	//TAIL
+	sub	x5, x4, x0   //main_end_input_ptr is number of bytes left
+	ldp	x6, x7, [x0], #16           //AES block 4k+4 - load plaintext
+	cmp	x5, #48
+	ext	v8.16b, v11.16b, v11.16b, #8    //prepare final partial tag
+	eor	x6, x6, x13                     //AES block 4k+4 - round 10 low
+	eor	x7, x7, x14                     //AES block 4k+4 - round 10 high
+	fmov	d4, x6                          //AES block 4k+4 - mov low
+	fmov	v4.d[1], x7                     //AES block 4k+4 - mov high
+	eor	v5.16b, v4.16b, v0.16b          //AES block 4k+4 - result
+	b.gt	.L128_enc_blocks_more_than_3
+	sub	w12, w12, #1
+	movi	v11.8b, #0
+	mov	v3.16b, v2.16b
+	cmp	x5, #32
+	mov	v2.16b, v1.16b
+	movi	v9.8b, #0
+	movi	v10.8b, #0
+	b.gt	.L128_enc_blocks_more_than_2
+	mov	v3.16b, v1.16b
+	cmp	x5, #16
+	sub	w12, w12, #1
+	b.gt	.L128_enc_blocks_more_than_1
+	sub	w12, w12, #1
+	b	.L128_enc_blocks_less_than_1
+.L128_enc_blocks_more_than_3:	//blocks	left >  3
+	st1	{ v5.16b}, [x2], #16    //AES final-3 block  - store result
+	ldp	x6, x7, [x0], #16       //AES final-2 block-load input low&high
+	rev64	v4.16b, v5.16b          //GHASH final-3 block
+	eor	v4.16b, v4.16b, v8.16b  //feed in partial tag
+	eor	x7, x7, x14             //AES final-2 block - round 10 high
+	eor	x6, x6, x13             //AES final-2 block - round 10 low
+	fmov	d5, x6                  //AES final-2 block - mov low
+	movi	v8.8b, #0               //suppress further partial tag feed in
+	fmov	v5.d[1], x7             //AES final-2 block - mov high
+	pmull	v11.1q, v4.1d, v15.1d   //GHASH final-3 block - low
+	mov	d22, v4.d[1]            //GHASH final-3 block - mid
+	pmull2	v9.1q, v4.2d, v15.2d    //GHASH final-3 block - high
+	mov	d10, v17.d[1]           //GHASH final-3 block - mid
+	eor	v5.16b, v5.16b, v1.16b  //AES final-2 block - result
+	eor	v22.8b, v22.8b, v4.8b   //GHASH final-3 block - mid
+	pmull	v10.1q, v22.1d, v10.1d  //GHASH final-3 block - mid
+.L128_enc_blocks_more_than_2:	//blocks	left >  2
+	st1	{ v5.16b}, [x2], #16    //AES final-2 block - store result
+	rev64	v4.16b, v5.16b          //GHASH final-2 block
+	ldp	x6, x7, [x0], #16       //AES final-1 block-load input low&high
+	eor	v4.16b, v4.16b, v8.16b  //feed in partial tag
+	eor	x6, x6, x13             //AES final-1 block - round 10 low
+	fmov	d5, x6                  //AES final-1 block - mov low
+	eor	x7, x7, x14             //AES final-1 block - round 10 high
+	pmull2	v20.1q, v4.2d, v14.2d   //GHASH final-2 block - high
+	fmov	v5.d[1], x7             //AES final-1 block - mov high
+	mov	d22, v4.d[1]            //GHASH final-2 block - mid
+	pmull	v21.1q, v4.1d, v14.1d   //GHASH final-2 block - low
+	eor	v9.16b, v9.16b, v20.16b //GHASH final-2 block - high
+	eor	v22.8b, v22.8b, v4.8b   //GHASH final-2 block - mid
+	eor	v5.16b, v5.16b, v2.16b  //AES final-1 block - result
+	eor	v11.16b, v11.16b, v21.16b       //GHASH final-2 block - low
+	pmull	v22.1q, v22.1d, v17.1d  //GHASH final-2 block - mid
+	movi	v8.8b, #0               //suppress further partial tag feed in
+	eor	v10.16b, v10.16b, v22.16b       //GHASH final-2 block - mid
+.L128_enc_blocks_more_than_1:	//blocks	left >  1
+	st1	{ v5.16b}, [x2], #16    //AES final-1 block - store result
+	rev64	v4.16b, v5.16b          //GHASH final-1 block
+	ldp	x6, x7, [x0], #16       //AES final block - load input low & high
+	eor	v4.16b, v4.16b, v8.16b  //feed in partial tag
+	eor	x7, x7, x14             //AES final block - round 10 high
+	eor	x6, x6, x13             //AES final block - round 10 low
+	fmov	d5, x6                  //AES final block - mov low
+	pmull2	v20.1q, v4.2d, v13.2d   //GHASH final-1 block - high
+	fmov	v5.d[1], x7             //AES final block - mov high
+	mov	d22, v4.d[1]            //GHASH final-1 block - mid
+	pmull	v21.1q, v4.1d, v13.1d   //GHASH final-1 block - low
+	eor	v22.8b, v22.8b, v4.8b   //GHASH final-1 block - mid
+	eor	v5.16b, v5.16b, v3.16b  //AES final block - result
+	ins	v22.d[1], v22.d[0]      //GHASH final-1 block - mid
+	pmull2	v22.1q, v22.2d, v16.2d  //GHASH final-1 block - mid
+	eor	v11.16b, v11.16b, v21.16b       //GHASH final-1 block - low
+	eor	v9.16b, v9.16b, v20.16b         //GHASH final-1 block - high
+	eor	v10.16b, v10.16b, v22.16b       //GHASH final-1 block - mid
+	movi	v8.8b, #0                       //suppress further partial tag feed in
+.L128_enc_blocks_less_than_1:	//blocks	left <= 1
+	and	x1, x1, #127                    //bit_length %= 128
+	mvn	x13, xzr                        //rk10_l = 0xffffffffffffffff
+	mvn	x14, xzr                        //rk10_h = 0xffffffffffffffff
+	sub	x1, x1, #128                    //bit_length -= 128
+	neg	x1, x1                          //bit_length = 128 - #bits
+	and	x1, x1, #127                    //bit_length %= 128
+	lsr	x14, x14, x1
+	cmp	x1, #64
+	csel	x6, x13, x14, lt
+	csel	x7, x14, xzr, lt
+	fmov	d0, x6                          //ctr0b is mask for last block
+	fmov	v0.d[1], x7
+	//possibly partial last block has zeroes in highest bits
+	and	v5.16b, v5.16b, v0.16b
+	rev64	v4.16b, v5.16b                  //GHASH final block
+	eor	v4.16b, v4.16b, v8.16b          //feed in partial tag
+	mov	d8, v4.d[1]                     //GHASH final block - mid
+	//load existing bytes where the possibly partial last block is to be stored
+	ld1	{ v18.16b}, [x2]
+	rev	w9, w12
+	karasuba_multiply v4, v12, v20, v8, v21
+	load_const
+	gcm_tidy_up v9, v10, v11, v30, v31
+	//insert existing bytes in top end of result
+	bif	v5.16b, v18.16b, v0.16b
+	st1	{ v5.16b}, [x2]                 //store all 16B
+	str	w9, [x25, #12]                  //store the updated counter
+	b	.Lenc_final_tag
+
+.Lenc_final_tag_pre:
+	ldr	q12, [x3, #32]                         //load h1l | h1h
+	ext	v12.16b, v12.16b, v12.16b, #8
+	ldr	q13, [x3, #64]                  //load h2l | h2h
+	ext	v13.16b, v13.16b, v13.16b, #8
+	trn2	v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
+	trn1	v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
+	eor	v16.16b, v16.16b, v8.16b                     //h2k | h1k
+.Lenc_final_tag:
+	cbz	x15, .Lrounds_enc_store
+
+	ld1	{ v5.16b}, [x15]            //load length
+	ext	v5.16b, v5.16b, v5.16b, #8   //PRE 0
+	rev64	v5.16b, v5.16b                  //GHASH block 4k+1
+	eor	v4.16b, v5.16b, v11.16b          //final tax eor with length
+	ext	v4.16b, v4.16b, v4.16b, #8   //PRE 0
+	movi	v8.8b, #0                       //suppress further partial tag
+	mov	d8, v4.d[1]                     //GHASH block 4k - mid
+	movi	v11.8b, #0
+	movi	v9.8b, #0
+	movi	v10.8b, #0
+	karasuba_multiply v4, v12, v20, v8, v21
+	load_const
+	gcm_tidy_up v9, v10, v11, v30, v31
+	mov	x6, x26
+	ext	v11.16b, v11.16b, v11.16b, #8   //PRE 0
+	rev64	v11.16b, v11.16b                  //Final has tag
+	aes_enc_iv_init
+
+	add	x19,x8,#160
+	sub	x6,x6,#10
+	cbz	x6, .Lenc_enc_iv_final
+	aes_enc_iv_common       12
+	sub	x6,x6,#2
+	cbz	x6, .Lenc_enc_iv_final
+	aes_enc_iv_common       14
+.Lenc_enc_iv_final:
+	aes_enc_iv_final
+
+	eor	v11.16b, v4.16b, v11.16b          //final tax eor with length
+	st1	{ v11.16b }, [x15]
+	b	.L128_enc_ret
+
+.Lrounds_enc_store:
+	ext	v11.16b, v11.16b, v11.16b, #8   //PRE 0
+	rev64	v11.16b, v11.16b                  //GHASH block 4k+1
+	st1	{ v11.16b }, [x3]
+.L128_enc_ret:
+	mov	w0, #0x0
+	pop_stack
+	ret
+SYM_FUNC_END(pmull_gcm_encrypt_unroll)
+
+SYM_FUNC_START(pmull_gcm_decrypt_unroll)
+	push_stack
+	mov	x25, x4
+	mov	x15, x7
+	mov	x8, x5
+	lsr	x5, x1, #3                      //byte_len
+	mov	x26, x6
+	load_initial_tag v11,x3
+	cbz	x1, .Ldec_final_tag_pre
+
+	ldp	x10, x11, [x25]                 //ctr96_b64, ctr96_t32
+	sub	x5, x5, #1                      //byte_len - 1
+	ldr	q18, [x8, #0]                   //load rk0
+	and	x5, x5, #0xffffffffffffffc0
+	ld1	{ v0.16b}, [x25]
+	ldr	q28, [x8, #160]                 //load rk10
+	ldr	q13, [x3, #64]                  //load h2l | h2h
+	ext	v13.16b, v13.16b, v13.16b, #8
+	lsr	x12, x11, #32
+	fmov	d2, x10                         //CTR block 2
+	ldr	q19, [x8, #16]                  //load rk1
+	orr	w11, w11, w11
+	rev	w12, w12                        //rev_ctr32
+	fmov	d1, x10                         //CTR block 1
+	add	w12, w12, #1                    //increment rev_ctr32
+	aes_encrypt_round	v0, v18         //AES block 0 - round 0
+	rev	w9, w12                         //CTR block 1
+	orr	x9, x11, x9, lsl #32            //CTR block 1
+	ldr	q20, [x8, #32]                  //load rk2
+	add	w12, w12, #1                    //CTR block 1
+	fmov	v1.d[1], x9                     //CTR block 1
+	rev	w9, w12                         //CTR block 2
+	add	w12, w12, #1                    //CTR block 2
+	aes_encrypt_round	v0, v19         //AES block 0 - round 1
+	orr	x9, x11, x9, lsl #32            //CTR block 2
+	fmov	v2.d[1], x9                     //CTR block 2
+	rev	w9, w12                         //CTR block 3
+	fmov	d3, x10                         //CTR block 3
+	orr	x9, x11, x9, lsl #32            //CTR block 3
+	add	w12, w12, #1                    //CTR block 3
+	fmov	v3.d[1], x9                     //CTR block 3
+	add	x4, x0, x1, lsr #3              //end_input_ptr
+	aes_encrypt_round	v1, v18          //AES block 1 - round 0
+	ldr	q21, [x8, #48]                                 //load rk3
+	aes_encrypt_round	v0, v20          //AES block 0 - round 2
+	ldr	q24, [x8, #96]                                 //load rk6
+	aes_encrypt_round	v2, v18          //AES block 2 - round 0
+	ldr	q25, [x8, #112]                                //load rk7
+	aes_encrypt_round	v1, v19          //AES block 1 - round 1
+	ldr	q22, [x8, #64]                                 //load rk4
+	aes_encrypt_round	v3, v18          //AES block 3 - round 0
+	aes_encrypt_round	v2, v19          //AES block 2 - round 1
+	aes_encrypt_round	v1, v20          //AES block 1 - round 2
+	ldp	x13, x14, [x8, #160]                     //load rk10
+	aes_encrypt_round	v3, v19          //AES block 3 - round 1
+	aes_encrypt_round	v0, v21          //AES block 0 - round 3
+	ldr	q23, [x8, #80]                                 //load rk5
+	aes_encrypt_round	v1, v21          //AES block 1 - round 3
+	aes_encrypt_round	v3, v20          //AES block 3 - round 2
+	aes_encrypt_round	v2, v20          //AES block 2 - round 2
+	ldr	q27, [x8, #144]                                //load rk9
+	aes_encrypt_round	v1, v22          //AES block 1 - round 4
+	aes_encrypt_round	v3, v21          //AES block 3 - round 3
+	aes_encrypt_round	v2, v21          //AES block 2 - round 3
+	ldr	q14, [x3, #80]                         //load h3l | h3h
+	ext	v14.16b, v14.16b, v14.16b, #8
+	aes_encrypt_round	v0, v22          //AES block 0 - round 4
+	ldr	q26, [x8, #128]                                //load rk8
+	aes_encrypt_round	v1, v23          //AES block 1 - round 5
+	aes_encrypt_round	v2, v22          //AES block 2 - round 4
+	aes_encrypt_round	v3, v22          //AES block 3 - round 4
+	aes_encrypt_round	v0, v23          //AES block 0 - round 5
+	aes_encrypt_round	v2, v23          //AES block 2 - round 5
+	ldr	q12, [x3, #32]                         //load h1l | h1h
+	ext	v12.16b, v12.16b, v12.16b, #8
+	aes_encrypt_round	v3, v23          //AES block 3 - round 5
+	aes_encrypt_round	v0, v24          //AES block 0 - round 6
+	aes_encrypt_round	v1, v24          //AES block 1 - round 6
+	aes_encrypt_round	v3, v24          //AES block 3 - round 6
+	aes_encrypt_round	v2, v24          //AES block 2 - round 6
+	trn1	v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
+	ldr	q15, [x3, #112]                        //load h4l | h4h
+	ext	v15.16b, v15.16b, v15.16b, #8
+	trn2	v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
+	add	x5, x5, x0
+	aes_encrypt_round	v1, v25          //AES block 1 - round 7
+	aes_encrypt_round	v2, v25          //AES block 2 - round 7
+	aes_encrypt_round	v0, v25          //AES block 0 - round 7
+	eor	v16.16b, v16.16b, v8.16b                     //h2k | h1k
+	aes_encrypt_round	v3, v25          //AES block 3 - round 7
+	aes_encrypt_round	v1, v26          //AES block 1 - round 8
+	trn2	v17.2d,  v14.2d,    v15.2d                      //h4l | h3l
+	aes_encrypt_round	v2, v26          //AES block 2 - round 8
+	aes_encrypt_round	v3, v26          //AES block 3 - round 8
+	aes_encrypt_round	v0, v26          //AES block 0 - round 8
+
+	mov	x6, x26
+	sub	x6, x6, #10
+	cbz	x6, .Lleft_dec_rounds
+	aes_enc_extra_round 12
+	sub	x6, x6, #2
+	cbz	x6, .Lleft_dec_rounds
+	aes_enc_extra_round 14
+
+.Lleft_dec_rounds:
+	trn1	v9.2d, v14.2d,    v15.2d        //h4h | h3h
+	aese	v2.16b, v27.16b                 //AES block 2 - round 9
+	aese	v3.16b, v27.16b                 //AES block 3 - round 9
+	aese	v0.16b, v27.16b                 //AES block 0 - round 9
+	cmp	x0, x5                   //check if we have <= 4 blocks
+	aese	v1.16b, v27.16b                 //AES block 1 - round 9
+	eor	v17.16b, v17.16b, v9.16b        //h4k | h3k
+	b.ge	.L128_dec_tail                  //handle tail
+	ldr	q5, [x0, #16]                   //AES block 1 - load ciphertext
+	ldr	q4, [x0, #0]                    //AES block 0 - load ciphertext
+	eor	v1.16b, v5.16b, v1.16b          //AES block 1 - result
+	ldr	q6, [x0, #32]                   //AES block 2 - load ciphertext
+	eor	v0.16b, v4.16b, v0.16b          //AES block 0 - result
+	rev64	v4.16b, v4.16b                  //GHASH block 0
+	rev	w9, w12                         //CTR block 4
+	orr	x9, x11, x9, lsl #32            //CTR block 4
+	add	w12, w12, #1                    //CTR block 4
+	ldr	q7, [x0, #48]                   //AES block 3 - load ciphertext
+	rev64	v5.16b, v5.16b                  //GHASH block 1
+	add	x0, x0, #64                     //AES input_ptr update
+	mov	x19, v1.d[0]                    //AES block 1 - mov low
+	mov	x20, v1.d[1]                    //AES block 1 - mov high
+	mov	x6, v0.d[0]                     //AES block 0 - mov low
+	cmp	x0, x5                   //check if we have <= 8 blocks
+	mov	x7, v0.d[1]                     //AES block 0 - mov high
+	fmov	d0, x10                         //CTR block 4
+	fmov	v0.d[1], x9                     //CTR block 4
+	rev	w9, w12                         //CTR block 5
+	eor	x19, x19, x13                   //AES block 1 - round 10 low
+	fmov	d1, x10                         //CTR block 5
+	add	w12, w12, #1                    //CTR block 5
+	orr	x9, x11, x9, lsl #32            //CTR block 5
+	fmov	v1.d[1], x9                     //CTR block 5
+	rev	w9, w12                         //CTR block 6
+	add	w12, w12, #1                    //CTR block 6
+	orr	x9, x11, x9, lsl #32            //CTR block 6
+	eor	x20, x20, x14                   //AES block 1 - round 10 high
+	eor	x6, x6, x13                   //AES block 0 - round 10 low
+	eor	v2.16b, v6.16b, v2.16b          //AES block 2 - result
+	eor	x7, x7, x14                   //AES block 0 - round 10 high
+	stp	x6, x7, [x2], #16               //AES block 0 - store result
+	stp	x19, x20, [x2], #16             //AES block 1 - store result
+	b.ge	.L128_dec_prepretail            //do prepretail
+.L128_dec_main_loop:	//main	loop start
+	eor	v3.16b, v7.16b, v3.16b          //AES block 4k+3 - result
+	ext	v11.16b, v11.16b, v11.16b, #8   //PRE 0
+	mov	x21, v2.d[0]                    //AES block 4k+2 - mov low
+	pmull2	v28.1q, v5.2d, v14.2d           //GHASH block 4k+1 - high
+	mov	x22, v2.d[1]                    //AES block 4k+2 - mov high
+	aes_encrypt_round	v1, v18          //AES block 4k+5 - round 0
+	fmov	d2, x10                         //CTR block 4k+6
+	rev64	v6.16b, v6.16b                  //GHASH block 4k+2
+	fmov	v2.d[1], x9                     //CTR block 4k+6
+	rev	w9, w12                         //CTR block 4k+7
+	mov	x23, v3.d[0]                    //AES block 4k+3 - mov low
+	eor	v4.16b, v4.16b, v11.16b         //PRE 1
+	mov	d30, v5.d[1]                    //GHASH block 4k+1 - mid
+	aes_encrypt_round	v1, v19         //AES block 4k+5 - round 1
+	rev64	v7.16b, v7.16b                  //GHASH block 4k+3
+	pmull	v29.1q, v5.1d, v14.1d           //GHASH block 4k+1 - low
+	mov	x24, v3.d[1]                    //AES block 4k+3 - mov high
+	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
+	pmull	v11.1q, v4.1d, v15.1d           //GHASH block 4k - low
+	fmov	d3, x10                         //CTR block 4k+7
+	eor	v30.8b, v30.8b, v5.8b           //GHASH block 4k+1 - mid
+	aes_encrypt_round	v1, v20          //AES block 4k+5 - round 2
+	fmov	v3.d[1], x9                     //CTR block 4k+7
+	aes_encrypt_round	v2, v18          //AES block 4k+6 - round 0
+	mov	d10, v17.d[1]                   //GHASH block 4k - mid
+	pmull2	v9.1q, v4.2d, v15.2d            //GHASH block 4k - high
+	eor	v11.16b, v11.16b, v29.16b       //GHASH block 4k+1 - low
+	pmull	v29.1q, v7.1d, v12.1d           //GHASH block 4k+3 - low
+	aes_encrypt_round	v1, v21          //AES block 4k+5 - round 3
+	mov	d8, v4.d[1]                     //GHASH block 4k - mid
+	aes_encrypt_round	v3, v18          //AES block 4k+7 - round 0
+	eor	v9.16b, v9.16b, v28.16b         //GHASH block 4k+1 - high
+	aes_encrypt_round	v0, v18          //AES block 4k+4 - round 0
+	pmull	v28.1q, v6.1d, v13.1d           //GHASH block 4k+2 - low
+	eor	v8.8b, v8.8b, v4.8b             //GHASH block 4k - mid
+	aes_encrypt_round	v3, v19          //AES block 4k+7 - round 1
+	eor	x23, x23, x13                   //AES block 4k+3 - round 10 low
+	pmull	v30.1q, v30.1d, v17.1d          //GHASH block 4k+1 - mid
+	eor	x22, x22, x14                   //AES block 4k+2 - round 10 high
+	mov	d31, v6.d[1]                    //GHASH block 4k+2 - mid
+	aes_encrypt_round	v0, v19          //AES block 4k+4 - round 1
+	eor	v11.16b, v11.16b, v28.16b       //GHASH block 4k+2 - low
+	pmull	v10.1q, v8.1d, v10.1d           //GHASH block 4k - mid
+	aes_encrypt_round	v3, v20          //AES block 4k+7 - round 2
+	eor	v31.8b, v31.8b, v6.8b           //GHASH block 4k+2 - mid
+	aes_encrypt_round	v0, v20          //AES block 4k+4 - round 2
+	aes_encrypt_round	v1, v22          //AES block 4k+5 - round 4
+	eor	v10.16b, v10.16b, v30.16b       //GHASH block 4k+1 - mid
+	pmull2	v8.1q, v6.2d, v13.2d            //GHASH block 4k+2 - high
+	aes_encrypt_round	v0, v21          //AES block 4k+4 - round 3
+	ins	v31.d[1], v31.d[0]              //GHASH block 4k+2 - mid
+	pmull2	v4.1q, v7.2d, v12.2d            //GHASH block 4k+3 - high
+	aes_encrypt_round	v2, v19          //AES block 4k+6 - round 1
+	mov	d30, v7.d[1]                    //GHASH block 4k+3 - mid
+	aes_encrypt_round	v0, v22          //AES block 4k+4 - round 4
+	eor	v9.16b, v9.16b, v8.16b          //GHASH block 4k+2 - high
+	pmull2	v31.1q, v31.2d, v16.2d          //GHASH block 4k+2 - mid
+	eor	x24, x24, x14                   //AES block 4k+3 - round 10 high
+	aes_encrypt_round	v2, v20          //AES block 4k+6 - round 2
+	eor	v30.8b, v30.8b, v7.8b           //GHASH block 4k+3 - mid
+	aes_encrypt_round	v1, v23          //AES block 4k+5 - round 5
+	eor	x21, x21, x13                   //AES block 4k+2 - round 10 low
+	aes_encrypt_round	v0, v23          //AES block 4k+4 - round 5
+	movi	v8.8b, #0xc2
+	aes_encrypt_round	v2, v21          //AES block 4k+6 - round 3
+	eor	v11.16b, v11.16b, v29.16b       //GHASH block 4k+3 - low
+	aes_encrypt_round	v1, v24          //AES block 4k+5 - round 6
+	aes_encrypt_round	v0, v24          //AES block 4k+4 - round 6
+	eor	v10.16b, v10.16b, v31.16b       //GHASH block 4k+2 - mid
+	aes_encrypt_round	v2, v22          //AES block 4k+6 - round 4
+	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
+	pmull	v30.1q, v30.1d, v16.1d          //GHASH block 4k+3 - mid
+	eor	v9.16b, v9.16b, v4.16b          //GHASH block 4k+3 - high
+	ldr	q4, [x0, #0]                    //AES block 4k+4 - load cipher
+	aes_encrypt_round	v1, v25          //AES block 4k+5 - round 7
+	add	w12, w12, #1                    //CTR block 4k+7
+	aes_encrypt_round	v0, v25          //AES block 4k+4 - round 7
+	shl	d8, d8, #56                     //mod_constant
+	aes_encrypt_round	v2, v23          //AES block 4k+6 - round 5
+	eor	v10.16b, v10.16b, v30.16b       //GHASH block 4k+3 - mid
+	aes_encrypt_round	v1, v26          //AES block 4k+5 - round 8
+	stp	x23, x24, [x2], #16             //AES block 4k+3 - store result
+	aes_encrypt_round	v0, v26          //AES block 4k+4 - round 8
+	eor	v30.16b, v11.16b, v9.16b        //MODULO - karatsuba tidy up
+	ldr	q27, [x8, #144]                 //load rk9
+	aes_encrypt_round	v3, v21         //AES block 4k+7 - round 3
+	rev	w9, w12                         //CTR block 4k+8
+	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
+	ldr	q5, [x0, #16]                   //AES block 4k+5 - ciphertext
+	ext	v9.16b, v9.16b, v9.16b, #8      //MODULO - other top alignment
+	ldr	q28, [x8, #160]                 //load rk9
+	orr	x9, x11, x9, lsl #32            //CTR block 4k+8
+	aes_encrypt_round	v3, v22         //AES block 4k+7 - round 4
+	eor	v10.16b, v10.16b, v30.16b       //MODULO - karatsuba tidy up
+	aes_encrypt_round	v2, v24         //AES block 4k+6 - round 6
+	aes_encrypt_round	v3, v23         //AES block 4k+7 - round 5
+	ldr	q6, [x0, #32]                   //AES block 4k+6 - ciphertext
+	add	w12, w12, #1                    //CTR block 4k+8
+	eor	v10.16b, v10.16b, v31.16b       //MODULO - fold into mid
+	aes_encrypt_round	v2, v25         //AES block 4k+6 - round 7
+	ldr	q7, [x0, #48]                   //AES block 4k+3 - ciphertext
+	aes_encrypt_round	v3, v24         //AES block 4k+7 - round 6
+	add	x0, x0, #64                     //AES input_ptr update
+	aes_encrypt_round	v3, v25         //AES block 4k+7 - round 7
+	eor	v10.16b, v10.16b, v9.16b        //MODULO - fold into mid
+	aes_encrypt_round	v2, v26         //AES block 4k+6 - round 8
+	aes_encrypt_round	v3, v26         //AES block 4k+7 - round 8
+
+	mov	x6, x26
+	sub	x6,x6,#10
+	cbz	x6, .Lleft2_dec_rounds
+	aes_enc_extra_round	12
+	sub	x6,x6,#2
+	cbz	x6, .Lleft2_dec_rounds
+	aes_enc_extra_round 14
+
+.Lleft2_dec_rounds:
+	aese	v0.16b, v27.16b                 //AES block 4k+4 - round 9
+	aese	v1.16b, v27.16b                 //AES block 4k+5 - round 9
+	eor	v0.16b, v4.16b, v0.16b          //AES block 4k+4 - result
+	eor	v1.16b, v5.16b, v1.16b          //AES block 4k+5 - result
+	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
+	rev64	v5.16b, v5.16b                  //GHASH block 4k+5
+	mov	x7, v0.d[1]                     //AES block 4k+4 - mov high
+	mov	x6, v0.d[0]                     //AES block 4k+4 - mov low
+	fmov	d0, x10                         //CTR block 4k+8
+	fmov	v0.d[1], x9                     //CTR block 4k+8
+	rev	w9, w12                         //CTR block 4k+9
+	aese	v2.16b, v27.16b                 //AES block 4k+6 - round 9
+	orr	x9, x11, x9, lsl #32            //CTR block 4k+9
+	ext	v10.16b, v10.16b, v10.16b, #8   //MODULO - other mid alignment
+	eor	x7, x7, x14                   //AES block 4k+4 - round 10 high
+	eor	v11.16b, v11.16b, v8.16b        //MODULO - fold into low
+	mov	x20, v1.d[1]                    //AES block 4k+5 - mov high
+	eor	x6, x6, x13                   //AES block 4k+4 - round 10 low
+	eor	v2.16b, v6.16b, v2.16b          //AES block 4k+6 - result
+	mov	x19, v1.d[0]                    //AES block 4k+5 - mov low
+	add	w12, w12, #1                    //CTR block 4k+9
+	aese	v3.16b, v27.16b                 //AES block 4k+7 - round 9
+	fmov	d1, x10                         //CTR block 4k+9
+	cmp	x0, x5                   //.LOOP CONTROL
+	rev64	v4.16b, v4.16b                  //GHASH block 4k+4
+	eor	v11.16b, v11.16b, v10.16b       //MODULO - fold into low
+	fmov	v1.d[1], x9                     //CTR block 4k+9
+	rev	w9, w12                         //CTR block 4k+10
+	add	w12, w12, #1                    //CTR block 4k+10
+	eor	x20, x20, x14                   //AES block 4k+5 - round 10 high
+	stp	x6, x7, [x2], #16        //AES block 4k+4 - store result
+	eor	x19, x19, x13                   //AES block 4k+5 - round 10 low
+	stp	x19, x20, [x2], #16        //AES block 4k+5 - store result
+	orr	x9, x11, x9, lsl #32            //CTR block 4k+10
+	b.lt	.L128_dec_main_loop
+.L128_dec_prepretail:	//PREPRETAIL
+	ext	v11.16b, v11.16b, v11.16b, #8   //PRE 0
+	mov	x21, v2.d[0]                    //AES block 4k+2 - mov low
+	mov	d30, v5.d[1]                    //GHASH block 4k+1 - mid
+	aes_encrypt_round	v0, v18         //AES block 4k+4 - round 0
+	eor	v3.16b, v7.16b, v3.16b          //AES block 4k+3 - result
+	aes_encrypt_round	v1, v18         //AES block 4k+5 - round 0
+	mov	x22, v2.d[1]                    //AES block 4k+2 - mov high
+	eor	v4.16b, v4.16b, v11.16b         //PRE 1
+	fmov	d2, x10                         //CTR block 4k+6
+	rev64	v6.16b, v6.16b                  //GHASH block 4k+2
+	aes_encrypt_round	v0, v19         //AES block 4k+4 - round 1
+	fmov	v2.d[1], x9                     //CTR block 4k+6
+	rev	w9, w12                         //CTR block 4k+7
+	mov	x23, v3.d[0]                    //AES block 4k+3 - mov low
+	eor	v30.8b, v30.8b, v5.8b           //GHASH block 4k+1 - mid
+	pmull	v11.1q, v4.1d, v15.1d           //GHASH block 4k - low
+	mov	d10, v17.d[1]                   //GHASH block 4k - mid
+	mov	x24, v3.d[1]                    //AES block 4k+3 - mov high
+	aes_encrypt_round	v1, v19         //AES block 4k+5 - round 1
+	mov	d31, v6.d[1]                    //GHASH block 4k+2 - mid
+	aes_encrypt_round	v0, v20         //AES block 4k+4 - round 2
+	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
+	pmull	v29.1q, v5.1d, v14.1d           //GHASH block 4k+1 - low
+	mov	d8, v4.d[1]                     //GHASH block 4k - mid
+	fmov	d3, x10                         //CTR block 4k+7
+	aes_encrypt_round	v2, v18         //AES block 4k+6 - round 0
+	fmov	v3.d[1], x9                     //CTR block 4k+7
+	pmull	v30.1q, v30.1d, v17.1d          //GHASH block 4k+1 - mid
+	eor	v31.8b, v31.8b, v6.8b           //GHASH block 4k+2 - mid
+	rev64	v7.16b, v7.16b                  //GHASH block 4k+3
+	aes_encrypt_round	v2, v19         //AES block 4k+6 - round 1
+	eor	v8.8b, v8.8b, v4.8b             //GHASH block 4k - mid
+	pmull2	v9.1q, v4.2d, v15.2d            //GHASH block 4k - high
+	aes_encrypt_round	v3, v18         //AES block 4k+7 - round 0
+	ins	v31.d[1], v31.d[0]              //GHASH block 4k+2 - mid
+	pmull2	v28.1q, v5.2d, v14.2d           //GHASH block 4k+1 - high
+	pmull	v10.1q, v8.1d, v10.1d           //GHASH block 4k - mid
+	eor	v11.16b, v11.16b, v29.16b       //GHASH block 4k+1 - low
+	pmull	v29.1q, v7.1d, v12.1d           //GHASH block 4k+3 - low
+	pmull2	v31.1q, v31.2d, v16.2d          //GHASH block 4k+2 - mid
+	eor	v9.16b, v9.16b, v28.16b         //GHASH block 4k+1 - high
+	eor	v10.16b, v10.16b, v30.16b       //GHASH block 4k+1 - mid
+	pmull2	v4.1q, v7.2d, v12.2d            //GHASH block 4k+3 - high
+	pmull2	v8.1q, v6.2d, v13.2d            //GHASH block 4k+2 - high
+	mov	d30, v7.d[1]                    //GHASH block 4k+3 - mid
+	aes_encrypt_round	v1, v20         //AES block 4k+5 - round 2
+	eor	v10.16b, v10.16b, v31.16b       //GHASH block 4k+2 - mid
+	pmull	v28.1q, v6.1d, v13.1d           //GHASH block 4k+2 - low
+	eor	v9.16b, v9.16b, v8.16b          //GHASH block 4k+2 - high
+	movi	v8.8b, #0xc2
+	aes_encrypt_round	v3, v19         //AES block 4k+7 - round 1
+	eor	v30.8b, v30.8b, v7.8b           //GHASH block 4k+3 - mid
+	eor	v11.16b, v11.16b, v28.16b       //GHASH block 4k+2 - low
+	aes_encrypt_round	v2, v20         //AES block 4k+6 - round 2
+	eor	v9.16b, v9.16b, v4.16b          //GHASH block 4k+3 - high
+	aes_encrypt_round	v3, v20         //AES block 4k+7 - round 2
+	eor	x23, x23, x13                   //AES block 4k+3 - round 10 low
+	pmull	v30.1q, v30.1d, v16.1d          //GHASH block 4k+3 - mid
+	eor	x21, x21, x13                   //AES block 4k+2 - round 10 low
+	eor	v11.16b, v11.16b, v29.16b       //GHASH block 4k+3 - low
+	aes_encrypt_round	v2, v21         //AES block 4k+6 - round 3
+	aes_encrypt_round	v1, v21         //AES block 4k+5 - round 3
+	shl	d8, d8, #56               //mod_constant
+	aes_encrypt_round	v0, v21         //AES block 4k+4 - round 3
+	aes_encrypt_round	v2, v22         //AES block 4k+6 - round 4
+	eor	v10.16b, v10.16b, v30.16b       //GHASH block 4k+3 - mid
+	aes_encrypt_round	v1, v22         //AES block 4k+5 - round 4
+	aes_encrypt_round	v3, v21         //AES block 4k+7 - round 3
+	eor	v30.16b, v11.16b, v9.16b        //MODULO - karatsuba tidy up
+	aes_encrypt_round	v2, v23         //AES block 4k+6 - round 5
+	aes_encrypt_round	v1, v23         //AES block 4k+5 - round 5
+	aes_encrypt_round	v3, v22         //AES block 4k+7 - round 4
+	aes_encrypt_round	v0, v22         //AES block 4k+4 - round 4
+	eor	v10.16b, v10.16b, v30.16b       //MODULO - karatsuba tidy up
+	pmull	v31.1q, v9.1d, v8.1d           //MODULO - top 64b align with mid
+	aes_encrypt_round	v1, v24         //AES block 4k+5 - round 6
+	ext	v9.16b, v9.16b, v9.16b, #8      //MODULO - other top alignment
+	aes_encrypt_round	v3, v23         //AES block 4k+7 - round 5
+	aes_encrypt_round	v0, v23         //AES block 4k+4 - round 5
+	eor	v10.16b, v10.16b, v31.16b       //MODULO - fold into mid
+	aes_encrypt_round	v1, v25         //AES block 4k+5 - round 7
+	aes_encrypt_round	v2, v24         //AES block 4k+6 - round 6
+	ldr	q27, [x8, #144]                 //load rk9
+	aes_encrypt_round	v0, v24         //AES block 4k+4 - round 6
+	aes_encrypt_round	v1, v26         //AES block 4k+5 - round 8
+	eor	v10.16b, v10.16b, v9.16b        //MODULO - fold into mid
+	aes_encrypt_round	v3, v24         //AES block 4k+7 - round 6
+	ldr	q28, [x8, #160]                 //load rk9
+	aes_encrypt_round	v0, v25         //AES block 4k+4 - round 7
+	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
+	eor	x24, x24, x14                   //AES block 4k+3 - round 10 high
+	aes_encrypt_round	v2, v25         //AES block 4k+6 - round 7
+	ext	v10.16b, v10.16b, v10.16b, #8   //MODULO - other mid alignment
+	aes_encrypt_round	v3, v25         //AES block 4k+7 - round 7
+	aes_encrypt_round	v0, v26         //AES block 4k+4 - round 8
+	eor	v11.16b, v11.16b, v8.16b        //MODULO - fold into low
+	aes_encrypt_round	v2, v26         //AES block 4k+6 - round 8
+	aes_encrypt_round	v3, v26         //AES block 4k+7 - round 8
+	mov	x6, x26
+	sub	x6,x6,#10
+	cbz	x6, .Lleft3_dec_rounds
+	aes_enc_extra_round	12
+	sub	x6,x6,#2
+	cbz	x6, .Lleft3_dec_rounds
+	aes_enc_extra_round	14
+.Lleft3_dec_rounds:
+	eor	x22, x22, x14                   //AES block 4k+2 - round 10 high
+	aese	v0.16b, v27.16b                 //AES block 4k+4 - round 9
+	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
+	aese	v1.16b, v27.16b                 //AES block 4k+5 - round 9
+	aese	v2.16b, v27.16b                 //AES block 4k+6 - round 9
+	add	w12, w12, #1                    //CTR block 4k+7
+	stp	x23, x24, [x2], #16        //AES block 4k+3 - store result
+	aese	v3.16b, v27.16b                 //AES block 4k+7 - round 9
+	eor	v11.16b, v11.16b, v10.16b       //MODULO - fold into low
+.L128_dec_tail:	//TAIL
+	sub	x5, x4, x0   //main_end_input_ptr is number of bytes left
+	ld1	{ v5.16b}, [x0], #16            //AES block 4k+4 - load cipher
+	eor	v0.16b, v5.16b, v0.16b          //AES block 4k+4 - result
+	mov	x7, v0.d[1]                     //AES block 4k+4 - mov high
+	mov	x6, v0.d[0]                     //AES block 4k+4 - mov low
+	cmp	x5, #48
+	eor	x7, x7, x14                     //AES block 4k+4 - round 10 high
+	ext	v8.16b, v11.16b, v11.16b, #8    //prepare final partial tag
+	eor	x6, x6, x13                     //AES block 4k+4 - round 10 low
+	b.gt	.L128_dec_blocks_more_than_3
+	mov	v3.16b, v2.16b
+	sub	w12, w12, #1
+	movi	v11.8b, #0
+	movi	v9.8b, #0
+	mov	v2.16b, v1.16b
+	movi	v10.8b, #0
+	cmp	x5, #32
+	b.gt	.L128_dec_blocks_more_than_2
+	cmp	x5, #16
+	mov	v3.16b, v1.16b
+	sub	w12, w12, #1
+	b.gt	.L128_dec_blocks_more_than_1
+	sub	w12, w12, #1
+	b	.L128_dec_blocks_less_than_1
+.L128_dec_blocks_more_than_3:	//blocks	left >  3
+	rev64	v4.16b, v5.16b                  //GHASH final-3 block
+	ld1	{ v5.16b}, [x0], #16            //final-2 block - load cipher
+	eor	v4.16b, v4.16b, v8.16b          //feed in partial tag
+	mov	d10, v17.d[1]                   //GHASH final-3 block - mid
+	stp	x6, x7, [x2], #16        //AES final-3 block  - store result
+	eor	v0.16b, v5.16b, v1.16b          //AES final-2 block - result
+	mov	d22, v4.d[1]                    //GHASH final-3 block - mid
+	mov	x7, v0.d[1]                     //AES final-2 block - mov high
+	pmull	v11.1q, v4.1d, v15.1d           //GHASH final-3 block - low
+	mov	x6, v0.d[0]                     //AES final-2 block - mov low
+	pmull2	v9.1q, v4.2d, v15.2d            //GHASH final-3 block - high
+	eor	v22.8b, v22.8b, v4.8b           //GHASH final-3 block - mid
+	movi	v8.8b, #0                       //suppress further partial tag
+	eor	x7, x7, x14                   //final-2 block - round 10 high
+	pmull	v10.1q, v22.1d, v10.1d          //GHASH final-3 block - mid
+	eor	x6, x6, x13                   //AES final-2 block - round 10 low
+.L128_dec_blocks_more_than_2:	//blocks	left >  2
+	rev64	v4.16b, v5.16b                  //GHASH final-2 block
+	ld1	{ v5.16b}, [x0], #16          //final-1 block - load ciphertext
+	eor	v4.16b, v4.16b, v8.16b          //feed in partial tag
+	eor	v0.16b, v5.16b, v2.16b          //AES final-1 block - result
+	stp	x6, x7, [x2], #16        //AES final-2 block  - store result
+	mov	d22, v4.d[1]                    //GHASH final-2 block - mid
+	pmull	v21.1q, v4.1d, v14.1d           //GHASH final-2 block - low
+	pmull2	v20.1q, v4.2d, v14.2d           //GHASH final-2 block - high
+	mov	x6, v0.d[0]                     //AES final-1 block - mov low
+	mov	x7, v0.d[1]                     //AES final-1 block - mov high
+	eor	v22.8b, v22.8b, v4.8b           //GHASH final-2 block - mid
+	movi	v8.8b, #0                       //suppress further partial tag
+	pmull	v22.1q, v22.1d, v17.1d          //GHASH final-2 block - mid
+	eor	x6, x6, x13                   //AES final-1 block - round 10 low
+	eor	v11.16b, v11.16b, v21.16b       //GHASH final-2 block - low
+	eor	v9.16b, v9.16b, v20.16b         //GHASH final-2 block - high
+	eor	v10.16b, v10.16b, v22.16b       //GHASH final-2 block - mid
+	eor	x7, x7, x14                   //final-1 block - round 10 high
+.L128_dec_blocks_more_than_1:	//blocks	left >  1
+	rev64	v4.16b, v5.16b                  //GHASH final-1 block
+	ld1	{ v5.16b}, [x0], #16            //final block - load ciphertext
+	eor	v4.16b, v4.16b, v8.16b          //feed in partial tag
+	mov	d22, v4.d[1]                    //GHASH final-1 block - mid
+	eor	v0.16b, v5.16b, v3.16b          //AES final block - result
+	eor	v22.8b, v22.8b, v4.8b           //GHASH final-1 block - mid
+	stp	x6, x7, [x2], #16        //AES final-1 block  - store result
+	mov	x6, v0.d[0]                     //AES final block - mov low
+	mov	x7, v0.d[1]                     //AES final block - mov high
+	ins	v22.d[1], v22.d[0]              //GHASH final-1 block - mid
+	pmull	v21.1q, v4.1d, v13.1d           //GHASH final-1 block - low
+	pmull2	v20.1q, v4.2d, v13.2d           //GHASH final-1 block - high
+	pmull2	v22.1q, v22.2d, v16.2d          //GHASH final-1 block - mid
+	movi	v8.8b, #0                       //suppress further partial tag
+	eor	v11.16b, v11.16b, v21.16b       //GHASH final-1 block - low
+	eor	v9.16b, v9.16b, v20.16b         //GHASH final-1 block - high
+	eor	x7, x7, x14                   //AES final block - round 10 high
+	eor	x6, x6, x13                   //AES final block - round 10 low
+	eor	v10.16b, v10.16b, v22.16b       //GHASH final-1 block - mid
+.L128_dec_blocks_less_than_1:	//blocks        left <= 1
+	mvn	x14, xzr                        //rk10_h = 0xffffffffffffffff
+	and	x1, x1, #127                    //bit_length %= 128
+	mvn	x13, xzr                        //rk10_l = 0xffffffffffffffff
+	sub	x1, x1, #128                    //bit_length -= 128
+	neg	x1, x1          //bit_length = 128 - #bits in input
+	and	x1, x1, #127    //bit_length %= 128
+	lsr	x14, x14, x1    //rk10_h is mask for top 64b of last block
+	cmp	x1, #64
+	csel	x10, x14, xzr, lt
+	csel	x9, x13, x14, lt
+	fmov	d0, x9                          //ctr0b is mask for last block
+	mov	v0.d[1], x10
+	and	v5.16b, v5.16b, v0.16b
+	rev64	v4.16b, v5.16b                  //GHASH final block
+	eor	v4.16b, v4.16b, v8.16b          //feed in partial tag
+	ldp	x4, x5, [x2] //load existing bytes we need to not overwrite
+	and	x7, x7, x10
+	mov	d8, v4.d[1]                     //GHASH final block - mid
+	bic	x4, x4, x9                      //mask out low existing bytes
+	and	x6, x6, x9
+	rev	w9, w12
+	bic	x5, x5, x10                     //mask out high existing bytes
+	orr	x6, x6, x4
+	orr	x7, x7, x5
+	str	w9, [x25, #12]                  //store the updated counter
+	stp	x6, x7, [x2]
+	karasuba_multiply v4, v12, v20, v8, v21
+	load_const
+	gcm_tidy_up v9, v10, v11, v30, v31
+	b	.Ldec_final_tag
+
+.Ldec_final_tag_pre:
+	ldr	q12, [x3, #32]                         //load h1l | h1h
+	ext	v12.16b, v12.16b, v12.16b, #8
+	ldr	q13, [x3, #64]                  //load h2l | h2h
+	ext	v13.16b, v13.16b, v13.16b, #8
+	trn2	v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
+	trn1	v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
+	eor	v16.16b, v16.16b, v8.16b                     //h2k | h1k
+.Ldec_final_tag:
+	cbz	x15, .Lrounds_dec_ret
+
+	ld1	{ v5.16b}, [x15], #16            //load length
+	ext	v5.16b, v5.16b, v5.16b, #8   //PRE 0
+	rev64	v5.16b, v5.16b                  //GHASH block 4k+1
+	eor	v4.16b, v5.16b, v11.16b          //final tax eor with length
+	ext	v4.16b, v4.16b, v4.16b, #8   //PRE 0
+	movi	v8.8b, #0                       //suppress further partial tag
+	mov	d8, v4.d[1]                     //GHASH block 4k - mid
+	movi	v11.8b, #0
+	movi	v9.8b, #0
+	movi	v10.8b, #0
+	karasuba_multiply v4, v12, v20, v8, v21
+	load_const
+	gcm_tidy_up v9, v10, v11, v30, v31
+	mov	x6, x26
+	ext	v11.16b, v11.16b, v11.16b, #8   //PRE 0
+	rev64	v11.16b, v11.16b                  //Final has tag
+	aes_enc_iv_init
+
+	add	x19,x8,#160
+	sub	x6,x6,#10
+	cbz	x6, .Ldec_enc_iv_final
+	aes_enc_iv_common       12
+	sub	x6,x6,#2
+	cbz	x6, .Ldec_enc_iv_final
+	aes_enc_iv_common       14
+.Ldec_enc_iv_final:
+	aes_enc_iv_final
+
+	eor	v11.16b, v4.16b, v11.16b       //final tax eor with length
+	ldp	x9, x10, [sp, #128]            //Load otag pointer and authsize
+	adr_l	x26, .Lpermute_table
+	ld1	{ v5.16b}, [x9], #16           //load otag
+	add	x26, x26, x10
+	ld1	{v9.16b}, [x26]                // load permute vector
+
+	cmeq	v5.16b, v5.16b, v11.16b        // compare tags
+	mvn	v5.16b, v5.16b                 // -1 for fail, 0 for pass
+	tbl	v5.16b, {v5.16b}, v9.16b       // keep authsize bytes only
+	sminv	b0, v5.16b                     // signed minimum across XL
+	smov	w0, v0.b[0]                    // return b0
+
+.Lrounds_dec_ret:
+	ext	v11.16b, v11.16b, v11.16b, #8   //PRE 0
+	rev64	v11.16b, v11.16b
+	st1	{ v11.16b }, [x3]
+	pop_stack
+	ret
+SYM_FUNC_END(pmull_gcm_decrypt_unroll)
+.align	6
+.Lpermute_table:
+        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
+        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
+        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
+        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
+        .previous
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
index 15794fe21a0b..f9a60a99d871 100644
--- a/arch/arm64/crypto/ghash-ce-glue.c
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -29,6 +29,7 @@ MODULE_ALIAS_CRYPTO("ghash");
 #define GHASH_BLOCK_SIZE	16
 #define GHASH_DIGEST_SIZE	16
 #define GCM_IV_SIZE		12
+#define UNROLL_DATA_SIZE	1024
 
 struct ghash_key {
 	be128			k;
@@ -59,6 +60,19 @@ asmlinkage int pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[],
 				 u64 const h[][2], u64 dg[], u8 ctr[],
 				 u32 const rk[], int rounds, const u8 l[],
 				 const u8 tag[], u64 authsize);
+asmlinkage size_t pmull_gcm_encrypt_unroll(const unsigned char *in,
+				 size_t len,
+				 unsigned char *out,
+				 u64 Xi[][2],
+				 unsigned char ivec[16],
+				 const void *key, int rounds,
+				 uint8_t *tag);
+asmlinkage size_t pmull_gcm_decrypt_unroll(const uint8_t *ciphertext,
+				 uint64_t plaintext_length,
+				 uint8_t *plaintext, uint64_t Xi[][2],
+				 unsigned char ivec[16], const void *key,
+				 int rounds, uint8_t *tag,
+                                 uint8_t *otag, uint64_t authsize);
 
 static int ghash_init(struct shash_desc *desc)
 {
@@ -255,6 +269,16 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey,
 	gf128mul_lle(&h, &ctx->ghash_key.k);
 	ghash_reflect(ctx->ghash_key.h[3], &h);
 
+	ghash_reflect(ctx->ghash_key.h[6], &ctx->ghash_key.k);
+	h = ctx->ghash_key.k;
+	gf128mul_lle(&h, &ctx->ghash_key.k);
+	ghash_reflect(ctx->ghash_key.h[8], &h);
+
+	gf128mul_lle(&h, &ctx->ghash_key.k);
+	ghash_reflect(ctx->ghash_key.h[9], &h);
+
+	gf128mul_lle(&h, &ctx->ghash_key.k);
+	ghash_reflect(ctx->ghash_key.h[11], &h);
 	return 0;
 }
 
@@ -350,14 +374,21 @@ static int gcm_encrypt(struct aead_request *req)
 	be128 lengths;
 	u8 *tag;
 	int err;
+	int unroll4_flag = 0;
 
 	lengths.a = cpu_to_be64(req->assoclen * 8);
 	lengths.b = cpu_to_be64(req->cryptlen * 8);
 
+	if (req->cryptlen >= UNROLL_DATA_SIZE)
+		unroll4_flag = 1;
 	if (req->assoclen)
 		gcm_calculate_auth_mac(req, dg);
 
 	memcpy(iv, req->iv, GCM_IV_SIZE);
+	if (unroll4_flag) {
+		ctx->ghash_key.h[4][1] = cpu_to_be64(((u64 *)dg)[0]);
+		ctx->ghash_key.h[4][0] = cpu_to_be64(((u64 *)dg)[1]);
+	}
 	put_unaligned_be32(2, iv + GCM_IV_SIZE);
 
 	err = skcipher_walk_aead_encrypt(&walk, req, false);
@@ -377,11 +408,23 @@ static int gcm_encrypt(struct aead_request *req)
 			tag = NULL;
 		}
 
-		kernel_neon_begin();
-		pmull_gcm_encrypt(nbytes, dst, src, ctx->ghash_key.h,
-				  dg, iv, ctx->aes_key.key_enc, nrounds,
-				  tag);
-		kernel_neon_end();
+		if (unroll4_flag) {
+			kernel_neon_begin();
+			pmull_gcm_encrypt_unroll(src, nbytes*8, dst,
+						 &ctx->ghash_key.h[4],
+						 iv,
+						 ctx->aes_key.key_enc,
+						 nrounds, tag);
+			kernel_neon_end();
+		} else {
+			kernel_neon_begin();
+			pmull_gcm_encrypt(nbytes, dst, src,
+					  ctx->ghash_key.h,
+					  dg, iv,
+					  ctx->aes_key.key_enc,
+					  nrounds, tag);
+			kernel_neon_end();
+		}
 
 		if (unlikely(!nbytes))
 			break;
@@ -418,14 +461,22 @@ static int gcm_decrypt(struct aead_request *req)
 	u8 *tag;
 	int ret;
 	int err;
+	int unroll4_flag = 0;
 
 	lengths.a = cpu_to_be64(req->assoclen * 8);
 	lengths.b = cpu_to_be64((req->cryptlen - authsize) * 8);
 
+	if (req->cryptlen >= UNROLL_DATA_SIZE)
+		unroll4_flag = 1;
+
 	if (req->assoclen)
 		gcm_calculate_auth_mac(req, dg);
 
 	memcpy(iv, req->iv, GCM_IV_SIZE);
+	if (unroll4_flag) {
+		ctx->ghash_key.h[4][1] = cpu_to_be64(((u64 *)dg)[0]);
+		ctx->ghash_key.h[4][0] = cpu_to_be64(((u64 *)dg)[1]);
+	}
 	put_unaligned_be32(2, iv + GCM_IV_SIZE);
 
 	scatterwalk_map_and_copy(otag, req->src,
@@ -449,11 +500,23 @@ static int gcm_decrypt(struct aead_request *req)
 			tag = NULL;
 		}
 
-		kernel_neon_begin();
-		ret = pmull_gcm_decrypt(nbytes, dst, src, ctx->ghash_key.h,
-					dg, iv, ctx->aes_key.key_enc,
-					nrounds, tag, otag, authsize);
-		kernel_neon_end();
+		if (unroll4_flag) {
+			kernel_neon_begin();
+			ret = pmull_gcm_decrypt_unroll(src, nbytes*8,
+						dst,
+						&ctx->ghash_key.h[4], iv,
+						ctx->aes_key.key_enc, nrounds,
+						tag, otag, authsize);
+			kernel_neon_end();
+		} else {
+			kernel_neon_begin();
+			ret = pmull_gcm_decrypt(nbytes, dst, src,
+						ctx->ghash_key.h,
+						dg, iv, ctx->aes_key.key_enc,
+						nrounds, tag, otag, authsize);
+			kernel_neon_end();
+		}
+
 
 		if (unlikely(!nbytes))
 			break;
@@ -485,7 +548,7 @@ static struct aead_alg gcm_aes_alg = {
 	.base.cra_priority	= 300,
 	.base.cra_blocksize	= 1,
 	.base.cra_ctxsize	= sizeof(struct gcm_aes_ctx) +
-				  4 * sizeof(u64[2]),
+				  12 * sizeof(u64[2]),
 	.base.cra_module	= THIS_MODULE,
 };
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* RE: [PATCH] crypto: arm64/gcm-ce - unroll factors to 4-way interleave of aes and ghash
  2021-12-14 15:59         ` Ard Biesheuvel
@ 2021-12-15  5:48           ` Xiaokang Qian
  2021-12-15  7:24             ` Ard Biesheuvel
  0 siblings, 1 reply; 12+ messages in thread
From: Xiaokang Qian @ 2021-12-15  5:48 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Will Deacon, Eric Biggers, Herbert Xu, David S. Miller,
	Catalin Marinas, nd, Linux Crypto Mailing List, Linux ARM,
	Linux Kernel Mailing List

Hi Ard:

I have posted the updated patch with version 2. It has passed the extended test suite and extra tests. 

For the performance data,  it's wired that TX2 had some regressions. Here we find the performance data on TX2 are not stable locally, two times run with same patch(whether old or new), get different performance data,  we happen to meet the same issue on OpenSSL . We will do more investigating on it.
 Anyway, can you firstly help to see whether the updated patch performs well or not. Thanks.

> -----Original Message-----
> From: Ard Biesheuvel <ardb@kernel.org>
> Sent: Tuesday, December 14, 2021 11:59 PM
> To: Xiaokang Qian <Xiaokang.Qian@arm.com>
> Cc: Will Deacon <will@kernel.org>; Eric Biggers <ebiggers@kernel.org>;
> Herbert Xu <herbert@gondor.apana.org.au>; David S. Miller
> <davem@davemloft.net>; Catalin Marinas <Catalin.Marinas@arm.com>; nd
> <nd@arm.com>; Linux Crypto Mailing List <linux-crypto@vger.kernel.org>;
> Linux ARM <linux-arm-kernel@lists.infradead.org>; Linux Kernel Mailing List
> <linux-kernel@vger.kernel.org>
> Subject: Re: [PATCH] crypto: arm64/gcm-ce - unroll factors to 4-way
> interleave of aes and ghash
> 
> On Tue, 14 Dec 2021 at 02:40, Xiaokang Qian <Xiaokang.Qian@arm.com>
> wrote:
> >
> > Hi Will:
> > I will post the update version 2 of this patch today or tomorrow.
> > Sorry for the delay.
> >
> 
> Great, but please make sure you run the extended test suite.
> 
> I applied this version of the patch to test the performance delta between the
> old and the new version on TX2, but it hit a failure in the self test:
> 
> [    0.592203] alg: aead: gcm-aes-ce decryption unexpectedly succeeded
> on test vector "random: alen=91 plen=5326 authsize=16 klen=32 novrfy=1";
> expected_error=-EBADMSG, cfg="random: inplace use_finup
> src_divs=[100.0%@+3779] key_offset=43"
> 
> It's non-deterministic, though, so it may take a few attempts to reproduce it.
> 
> As for the performance delta, your code is 18% slower on TX2 for 1420 byte
> packets using AES-256 (and 9% slower on AES-192). In your results, AES-256
> does not outperform the old code as much as it does with smaller key sizes
> either.
> 
> Is this something that can be solved? If not, the numbers are not as
> appealing, to be honest, given the substantial performance regressions on
> the other micro-architecture.
> 
> --
> Ard.
> 
> 
> 
> Tcrypt output follows
> 
> 
> OLD CODE
> 
> testing speed of gcm(aes) (gcm-aes-ce) encryption
> test 0 (128 bit key, 16 byte blocks): 2023626 operations in 1 seconds
> (32378016 bytes)
> test 1 (128 bit key, 64 byte blocks): 2005175 operations in 1 seconds
> (128331200 bytes)
> test 2 (128 bit key, 256 byte blocks): 1408367 operations in 1 seconds
> (360541952 bytes)
> test 3 (128 bit key, 512 byte blocks): 1011877 operations in 1 seconds
> (518081024 bytes)
> test 4 (128 bit key, 1024 byte blocks): 646552 operations in 1 seconds
> (662069248 bytes)
> test 5 (128 bit key, 1420 byte blocks): 490188 operations in 1 seconds
> (696066960 bytes)
> test 6 (128 bit key, 4096 byte blocks): 204423 operations in 1 seconds
> (837316608 bytes)
> test 7 (128 bit key, 8192 byte blocks): 105149 operations in 1 seconds
> (861380608 bytes)
> test 8 (192 bit key, 16 byte blocks): 1924506 operations in 1 seconds
> (30792096 bytes)
> test 9 (192 bit key, 64 byte blocks): 1944413 operations in 1 seconds
> (124442432 bytes)
> test 10 (192 bit key, 256 byte blocks): 1337001 operations in 1
> seconds (342272256 bytes)
> test 11 (192 bit key, 512 byte blocks): 941146 operations in 1 seconds
> (481866752 bytes)
> test 12 (192 bit key, 1024 byte blocks): 590614 operations in 1
> seconds (604788736 bytes)
> test 13 (192 bit key, 1420 byte blocks): 443363 operations in 1
> seconds (629575460 bytes)
> test 14 (192 bit key, 4096 byte blocks): 182890 operations in 1
> seconds (749117440 bytes)
> test 15 (192 bit key, 8192 byte blocks): 93813 operations in 1 seconds
> (768516096 bytes)
> test 16 (256 bit key, 16 byte blocks): 1886970 operations in 1 seconds
> (30191520 bytes)
> test 17 (256 bit key, 64 byte blocks): 1893574 operations in 1 seconds
> (121188736 bytes)
> test 18 (256 bit key, 256 byte blocks): 1245478 operations in 1
> seconds (318842368 bytes)
> test 19 (256 bit key, 512 byte blocks): 865507 operations in 1 seconds
> (443139584 bytes)
> test 20 (256 bit key, 1024 byte blocks): 537822 operations in 1
> seconds (550729728 bytes)
> test 21 (256 bit key, 1420 byte blocks): 401451 operations in 1
> seconds (570060420 bytes)
> test 22 (256 bit key, 4096 byte blocks): 164378 operations in 1
> seconds (673292288 bytes)
> test 23 (256 bit key, 8192 byte blocks): 84205 operations in 1 seconds
> (689807360 bytes)
> 
> 
> NEW CODE
> 
> testing speed of gcm(aes) (gcm-aes-ce) encryption
> test 0 (128 bit key, 16 byte blocks): 1894587 operations in 1 seconds
> (30313392 bytes)
> test 1 (128 bit key, 64 byte blocks): 1910971 operations in 1 seconds
> (122302144 bytes)
> test 2 (128 bit key, 256 byte blocks): 1360037 operations in 1 seconds
> (348169472 bytes)
> test 3 (128 bit key, 512 byte blocks): 985577 operations in 1 seconds
> (504615424 bytes)
> test 4 (128 bit key, 1024 byte blocks): 569656 operations in 1 seconds
> (583327744 bytes)
> test 5 (128 bit key, 1420 byte blocks): 462129 operations in 1 seconds
> (656223180 bytes)
> test 6 (128 bit key, 4096 byte blocks): 215284 operations in 1 seconds
> (881803264 bytes)
> test 7 (128 bit key, 8192 byte blocks): 115459 operations in 1 seconds
> (945840128 bytes)
> test 8 (192 bit key, 16 byte blocks): 1825915 operations in 1 seconds
> (29214640 bytes)
> test 9 (192 bit key, 64 byte blocks): 1836850 operations in 1 seconds
> (117558400 bytes)
> test 10 (192 bit key, 256 byte blocks): 1281626 operations in 1
> seconds (328096256 bytes)
> test 11 (192 bit key, 512 byte blocks): 913114 operations in 1 seconds
> (467514368 bytes)
> test 12 (192 bit key, 1024 byte blocks): 504804 operations in 1
> seconds (516919296 bytes)
> test 13 (192 bit key, 1420 byte blocks): 405749 operations in 1
> seconds (576163580 bytes)
> test 14 (192 bit key, 4096 byte blocks): 183999 operations in 1
> seconds (753659904 bytes)
> test 15 (192 bit key, 8192 byte blocks): 97914 operations in 1 seconds
> (802111488 bytes)
> test 16 (256 bit key, 16 byte blocks): 1776659 operations in 1 seconds
> (28426544 bytes)
> test 17 (256 bit key, 64 byte blocks): 1781110 operations in 1 seconds
> (113991040 bytes)
> test 18 (256 bit key, 256 byte blocks): 1206511 operations in 1
> seconds (308866816 bytes)
> test 19 (256 bit key, 512 byte blocks): 846284 operations in 1 seconds
> (433297408 bytes)
> test 20 (256 bit key, 1024 byte blocks): 424405 operations in 1
> seconds (434590720 bytes)
> test 21 (256 bit key, 1420 byte blocks): 331558 operations in 1
> seconds (470812360 bytes)
> test 22 (256 bit key, 4096 byte blocks): 143821 operations in 1
> seconds (589090816 bytes)
> test 23 (256 bit key, 8192 byte blocks): 75641 operations in 1 seconds
> (619651072 bytes)

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] crypto: arm64/gcm-ce - unroll factors to 4-way interleave of aes and ghash
  2021-12-15  5:48           ` Xiaokang Qian
@ 2021-12-15  7:24             ` Ard Biesheuvel
  0 siblings, 0 replies; 12+ messages in thread
From: Ard Biesheuvel @ 2021-12-15  7:24 UTC (permalink / raw)
  To: Xiaokang Qian, Eric Biggers
  Cc: Will Deacon, Herbert Xu, David S. Miller, Catalin Marinas, nd,
	Linux Crypto Mailing List, Linux ARM, Linux Kernel Mailing List

On Wed, 15 Dec 2021 at 06:48, Xiaokang Qian <Xiaokang.Qian@arm.com> wrote:
>
> Hi Ard:
>
> I have posted the updated patch with version 2. It has passed the extended test suite and extra tests.
>
> For the performance data,  it's wired that TX2 had some regressions. Here we find the performance data on TX2 are not stable locally, two times run with same patch(whether old or new), get different performance data,  we happen to meet the same issue on OpenSSL . We will do more investigating on it.
>  Anyway, can you firstly help to see whether the updated patch performs well or not. Thanks.
>

I get the same results with this version of the patch, and the results
are highly consistent between runs.

So as it stands, I don't think we should merge this, to be honest. For
the block sizes that matter, this version performs roughly the same on
some micro-architectures, but substantially slower on others (4k and
8k are also slower on TX2 for AES-256). And the larger block sizes
only matter for kTLS anyway, and I don't see the point of kernel TLS
with pure software algorithms - user space can just issue the
instructions directly if TLS is not hardware accelerated.

I do have some minor review comments on the patch itself, but please
only post a v3 if you manage to fix the performance regression:
- push_stack/pop_stack don't need to preserve the D8-15 registers
- karatsuba not karasuba

> > -----Original Message-----
> > From: Ard Biesheuvel <ardb@kernel.org>
> > Sent: Tuesday, December 14, 2021 11:59 PM
> > To: Xiaokang Qian <Xiaokang.Qian@arm.com>
> > Cc: Will Deacon <will@kernel.org>; Eric Biggers <ebiggers@kernel.org>;
> > Herbert Xu <herbert@gondor.apana.org.au>; David S. Miller
> > <davem@davemloft.net>; Catalin Marinas <Catalin.Marinas@arm.com>; nd
> > <nd@arm.com>; Linux Crypto Mailing List <linux-crypto@vger.kernel.org>;
> > Linux ARM <linux-arm-kernel@lists.infradead.org>; Linux Kernel Mailing List
> > <linux-kernel@vger.kernel.org>
> > Subject: Re: [PATCH] crypto: arm64/gcm-ce - unroll factors to 4-way
> > interleave of aes and ghash
> >
> > On Tue, 14 Dec 2021 at 02:40, Xiaokang Qian <Xiaokang.Qian@arm.com>
> > wrote:
> > >
> > > Hi Will:
> > > I will post the update version 2 of this patch today or tomorrow.
> > > Sorry for the delay.
> > >
> >
> > Great, but please make sure you run the extended test suite.
> >
> > I applied this version of the patch to test the performance delta between the
> > old and the new version on TX2, but it hit a failure in the self test:
> >
> > [    0.592203] alg: aead: gcm-aes-ce decryption unexpectedly succeeded
> > on test vector "random: alen=91 plen=5326 authsize=16 klen=32 novrfy=1";
> > expected_error=-EBADMSG, cfg="random: inplace use_finup
> > src_divs=[100.0%@+3779] key_offset=43"
> >
> > It's non-deterministic, though, so it may take a few attempts to reproduce it.
> >
> > As for the performance delta, your code is 18% slower on TX2 for 1420 byte
> > packets using AES-256 (and 9% slower on AES-192). In your results, AES-256
> > does not outperform the old code as much as it does with smaller key sizes
> > either.
> >
> > Is this something that can be solved? If not, the numbers are not as
> > appealing, to be honest, given the substantial performance regressions on
> > the other micro-architecture.
> >
> > --
> > Ard.
> >
> >
> >
> > Tcrypt output follows
> >
> >
> > OLD CODE
> >
> > testing speed of gcm(aes) (gcm-aes-ce) encryption
> > test 0 (128 bit key, 16 byte blocks): 2023626 operations in 1 seconds
> > (32378016 bytes)
> > test 1 (128 bit key, 64 byte blocks): 2005175 operations in 1 seconds
> > (128331200 bytes)
> > test 2 (128 bit key, 256 byte blocks): 1408367 operations in 1 seconds
> > (360541952 bytes)
> > test 3 (128 bit key, 512 byte blocks): 1011877 operations in 1 seconds
> > (518081024 bytes)
> > test 4 (128 bit key, 1024 byte blocks): 646552 operations in 1 seconds
> > (662069248 bytes)
> > test 5 (128 bit key, 1420 byte blocks): 490188 operations in 1 seconds
> > (696066960 bytes)
> > test 6 (128 bit key, 4096 byte blocks): 204423 operations in 1 seconds
> > (837316608 bytes)
> > test 7 (128 bit key, 8192 byte blocks): 105149 operations in 1 seconds
> > (861380608 bytes)
> > test 8 (192 bit key, 16 byte blocks): 1924506 operations in 1 seconds
> > (30792096 bytes)
> > test 9 (192 bit key, 64 byte blocks): 1944413 operations in 1 seconds
> > (124442432 bytes)
> > test 10 (192 bit key, 256 byte blocks): 1337001 operations in 1
> > seconds (342272256 bytes)
> > test 11 (192 bit key, 512 byte blocks): 941146 operations in 1 seconds
> > (481866752 bytes)
> > test 12 (192 bit key, 1024 byte blocks): 590614 operations in 1
> > seconds (604788736 bytes)
> > test 13 (192 bit key, 1420 byte blocks): 443363 operations in 1
> > seconds (629575460 bytes)
> > test 14 (192 bit key, 4096 byte blocks): 182890 operations in 1
> > seconds (749117440 bytes)
> > test 15 (192 bit key, 8192 byte blocks): 93813 operations in 1 seconds
> > (768516096 bytes)
> > test 16 (256 bit key, 16 byte blocks): 1886970 operations in 1 seconds
> > (30191520 bytes)
> > test 17 (256 bit key, 64 byte blocks): 1893574 operations in 1 seconds
> > (121188736 bytes)
> > test 18 (256 bit key, 256 byte blocks): 1245478 operations in 1
> > seconds (318842368 bytes)
> > test 19 (256 bit key, 512 byte blocks): 865507 operations in 1 seconds
> > (443139584 bytes)
> > test 20 (256 bit key, 1024 byte blocks): 537822 operations in 1
> > seconds (550729728 bytes)
> > test 21 (256 bit key, 1420 byte blocks): 401451 operations in 1
> > seconds (570060420 bytes)
> > test 22 (256 bit key, 4096 byte blocks): 164378 operations in 1
> > seconds (673292288 bytes)
> > test 23 (256 bit key, 8192 byte blocks): 84205 operations in 1 seconds
> > (689807360 bytes)
> >
> >
> > NEW CODE
> >
> > testing speed of gcm(aes) (gcm-aes-ce) encryption
> > test 0 (128 bit key, 16 byte blocks): 1894587 operations in 1 seconds
> > (30313392 bytes)
> > test 1 (128 bit key, 64 byte blocks): 1910971 operations in 1 seconds
> > (122302144 bytes)
> > test 2 (128 bit key, 256 byte blocks): 1360037 operations in 1 seconds
> > (348169472 bytes)
> > test 3 (128 bit key, 512 byte blocks): 985577 operations in 1 seconds
> > (504615424 bytes)
> > test 4 (128 bit key, 1024 byte blocks): 569656 operations in 1 seconds
> > (583327744 bytes)
> > test 5 (128 bit key, 1420 byte blocks): 462129 operations in 1 seconds
> > (656223180 bytes)
> > test 6 (128 bit key, 4096 byte blocks): 215284 operations in 1 seconds
> > (881803264 bytes)
> > test 7 (128 bit key, 8192 byte blocks): 115459 operations in 1 seconds
> > (945840128 bytes)
> > test 8 (192 bit key, 16 byte blocks): 1825915 operations in 1 seconds
> > (29214640 bytes)
> > test 9 (192 bit key, 64 byte blocks): 1836850 operations in 1 seconds
> > (117558400 bytes)
> > test 10 (192 bit key, 256 byte blocks): 1281626 operations in 1
> > seconds (328096256 bytes)
> > test 11 (192 bit key, 512 byte blocks): 913114 operations in 1 seconds
> > (467514368 bytes)
> > test 12 (192 bit key, 1024 byte blocks): 504804 operations in 1
> > seconds (516919296 bytes)
> > test 13 (192 bit key, 1420 byte blocks): 405749 operations in 1
> > seconds (576163580 bytes)
> > test 14 (192 bit key, 4096 byte blocks): 183999 operations in 1
> > seconds (753659904 bytes)
> > test 15 (192 bit key, 8192 byte blocks): 97914 operations in 1 seconds
> > (802111488 bytes)
> > test 16 (256 bit key, 16 byte blocks): 1776659 operations in 1 seconds
> > (28426544 bytes)
> > test 17 (256 bit key, 64 byte blocks): 1781110 operations in 1 seconds
> > (113991040 bytes)
> > test 18 (256 bit key, 256 byte blocks): 1206511 operations in 1
> > seconds (308866816 bytes)
> > test 19 (256 bit key, 512 byte blocks): 846284 operations in 1 seconds
> > (433297408 bytes)
> > test 20 (256 bit key, 1024 byte blocks): 424405 operations in 1
> > seconds (434590720 bytes)
> > test 21 (256 bit key, 1420 byte blocks): 331558 operations in 1
> > seconds (470812360 bytes)
> > test 22 (256 bit key, 4096 byte blocks): 143821 operations in 1
> > seconds (589090816 bytes)
> > test 23 (256 bit key, 8192 byte blocks): 75641 operations in 1 seconds
> > (619651072 bytes)

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2021-12-15  7:25 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-23  6:30 [PATCH] crypto: arm64/gcm-ce - unroll factors to 4-way interleave of aes and ghash XiaokangQian
2021-09-28  6:27 ` Eric Biggers
2021-09-28 21:04   ` Ard Biesheuvel
2021-09-30  1:32     ` Xiaokang Qian
2021-09-30 14:57       ` Ard Biesheuvel
2021-10-15  8:58         ` Xiaokang Qian
2021-12-13 18:29     ` Will Deacon
2021-12-14  1:39       ` Xiaokang Qian
2021-12-14 15:59         ` Ard Biesheuvel
2021-12-15  5:48           ` Xiaokang Qian
2021-12-15  7:24             ` Ard Biesheuvel
2021-12-15  3:04 ` [PATCH v2] " XiaokangQian

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).