* [PATCH 0/2] crypto: arm64/aes-ce - implement 5-way interleave for some modes
From: Ard Biesheuvel @ 2019-06-24 17:38 UTC
To: linux-crypto
Cc: ebiggers, Ard Biesheuvel, herbert, linux-arm-kernel, steve.capper
As it turns out, even a 4-way interleave is not sufficient to saturate
the ThunderX2 pipeline with AES instructions, so this series implements
a 5-way interleave for the modes that can be converted without running
out of the registers needed to maintain chaining state across the
encryption operation, i.e., ECB, CBC decryption and CTR.
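For readers new to the technique, here is a minimal C sketch of the
idea using the ACLE NEON intrinsics (the series itself is hand-written
assembly; the function name and loop structure below are illustrative
only, and building it requires -march=armv8-a+crypto):

    #include <arm_neon.h>

    /*
     * Each AESE/AESMC pair has a multi-cycle result latency on deeply
     * pipelined cores, so applying a round to five independent blocks
     * lets the core issue instructions back-to-back instead of
     * stalling on the previous block's result.
     */
    static void aes_rounds_5x(uint8x16_t b[5], const uint8x16_t rk[],
                              int rounds)
    {
            int r, i;

            for (r = 0; r < rounds - 1; r++)  /* all but the final round */
                    for (i = 0; i < 5; i++)   /* 5 independent streams */
                            b[i] = vaesmcq_u8(vaeseq_u8(b[i], rk[r]));

            for (i = 0; i < 5; i++)           /* final round: no MixColumns */
                    b[i] = veorq_u8(vaeseq_u8(b[i], rk[rounds - 1]),
                                    rk[rounds]);
    }

For AES-128, rounds is 10 and rk[] holds the 11 expanded round keys;
reusing each round key across all five blocks is exactly what the
round_Nx macros in patch 1 express in assembly.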
Ard Biesheuvel (2):
crypto: arm64/aes-ce - add 5 way interleave routines
crypto: arm64/aes-ce - implement 5 way interleave for ECB, CBC and CTR
arch/arm64/crypto/aes-ce.S | 60 ++++++----
arch/arm64/crypto/aes-modes.S | 118 +++++++++++++++-----
arch/arm64/crypto/aes-neon.S | 48 +-------
3 files changed, 127 insertions(+), 99 deletions(-)
--
2.20.1
* [PATCH 1/2] crypto: arm64/aes-ce - add 5 way interleave routines
From: Ard Biesheuvel @ 2019-06-24 17:38 UTC
To: linux-crypto
Cc: ebiggers, Ard Biesheuvel, herbert, linux-arm-kernel, steve.capper
In preparation for tweaking the accelerated AES chaining mode routines
to be able to use a 5-way stride, implement the core routines needed
to process 5 blocks of input at a time. While at it, drop the 2-way
versions, which have been unused for a while now.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
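A rough C model of the dispatch this patch sets up (the names below
are illustrative, not kernel interfaces): aes-ce.S defines MAX_STRIDE
as 5 before including aes-modes.S, while aes-neon.S leaves it unset,
so the shared code defaults to a 4-way stride and the 5x helpers are
never assembled for the NEON fallback. The mode loops themselves are
switched over to MAX_STRIDE in the next patch.

    #ifndef MAX_STRIDE
    #define MAX_STRIDE 4                    /* NEON fallback stays 4-way */
    #endif

    static void process_blocks(int n)       /* stand-in for the Nx helpers */
    {
            (void)n;
    }

    static void mode_loop(int blocks)
    {
            while (blocks >= MAX_STRIDE) {  /* the subs/bmi fast path */
                    process_blocks(MAX_STRIDE);
                    blocks -= MAX_STRIDE;
            }
            while (blocks-- > 0)            /* single-block tail loop */
                    process_blocks(1);
    }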
arch/arm64/crypto/aes-ce.S | 58 ++++++++++++--------
arch/arm64/crypto/aes-modes.S | 16 ++++++
arch/arm64/crypto/aes-neon.S | 46 +---------------
3 files changed, 52 insertions(+), 68 deletions(-)
diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S
index 143070510809..0fca5f463406 100644
--- a/arch/arm64/crypto/aes-ce.S
+++ b/arch/arm64/crypto/aes-ce.S
@@ -52,7 +52,7 @@
load_round_keys \rounds, \temp
.endm
- .macro do_enc_Nx, de, mc, k, i0, i1, i2, i3
+ .macro do_enc_Nx, de, mc, k, i0, i1, i2, i3, i4
aes\de \i0\().16b, \k\().16b
aes\mc \i0\().16b, \i0\().16b
.ifnb \i1
@@ -63,27 +63,34 @@
aes\mc \i2\().16b, \i2\().16b
aes\de \i3\().16b, \k\().16b
aes\mc \i3\().16b, \i3\().16b
+ .ifnb \i4
+ aes\de \i4\().16b, \k\().16b
+ aes\mc \i4\().16b, \i4\().16b
+ .endif
.endif
.endif
.endm
- /* up to 4 interleaved encryption rounds with the same round key */
- .macro round_Nx, enc, k, i0, i1, i2, i3
+ /* up to 5 interleaved encryption rounds with the same round key */
+ .macro round_Nx, enc, k, i0, i1, i2, i3, i4
.ifc \enc, e
- do_enc_Nx e, mc, \k, \i0, \i1, \i2, \i3
+ do_enc_Nx e, mc, \k, \i0, \i1, \i2, \i3, \i4
.else
- do_enc_Nx d, imc, \k, \i0, \i1, \i2, \i3
+ do_enc_Nx d, imc, \k, \i0, \i1, \i2, \i3, \i4
.endif
.endm
- /* up to 4 interleaved final rounds */
- .macro fin_round_Nx, de, k, k2, i0, i1, i2, i3
+ /* up to 5 interleaved final rounds */
+ .macro fin_round_Nx, de, k, k2, i0, i1, i2, i3, i4
aes\de \i0\().16b, \k\().16b
.ifnb \i1
aes\de \i1\().16b, \k\().16b
.ifnb \i3
aes\de \i2\().16b, \k\().16b
aes\de \i3\().16b, \k\().16b
+ .ifnb \i4
+ aes\de \i4\().16b, \k\().16b
+ .endif
.endif
.endif
eor \i0\().16b, \i0\().16b, \k2\().16b
@@ -92,47 +99,52 @@
.ifnb \i3
eor \i2\().16b, \i2\().16b, \k2\().16b
eor \i3\().16b, \i3\().16b, \k2\().16b
+ .ifnb \i4
+ eor \i4\().16b, \i4\().16b, \k2\().16b
+ .endif
.endif
.endif
.endm
- /* up to 4 interleaved blocks */
- .macro do_block_Nx, enc, rounds, i0, i1, i2, i3
+ /* up to 5 interleaved blocks */
+ .macro do_block_Nx, enc, rounds, i0, i1, i2, i3, i4
cmp \rounds, #12
blo 2222f /* 128 bits */
beq 1111f /* 192 bits */
- round_Nx \enc, v17, \i0, \i1, \i2, \i3
- round_Nx \enc, v18, \i0, \i1, \i2, \i3
-1111: round_Nx \enc, v19, \i0, \i1, \i2, \i3
- round_Nx \enc, v20, \i0, \i1, \i2, \i3
+ round_Nx \enc, v17, \i0, \i1, \i2, \i3, \i4
+ round_Nx \enc, v18, \i0, \i1, \i2, \i3, \i4
+1111: round_Nx \enc, v19, \i0, \i1, \i2, \i3, \i4
+ round_Nx \enc, v20, \i0, \i1, \i2, \i3, \i4
2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29
- round_Nx \enc, \key, \i0, \i1, \i2, \i3
+ round_Nx \enc, \key, \i0, \i1, \i2, \i3, \i4
.endr
- fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3
+ fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3, \i4
.endm
.macro encrypt_block, in, rounds, t0, t1, t2
do_block_Nx e, \rounds, \in
.endm
- .macro encrypt_block2x, i0, i1, rounds, t0, t1, t2
- do_block_Nx e, \rounds, \i0, \i1
- .endm
-
.macro encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
do_block_Nx e, \rounds, \i0, \i1, \i2, \i3
.endm
- .macro decrypt_block, in, rounds, t0, t1, t2
- do_block_Nx d, \rounds, \in
+ .macro encrypt_block5x, i0, i1, i2, i3, i4, rounds, t0, t1, t2
+ do_block_Nx e, \rounds, \i0, \i1, \i2, \i3, \i4
.endm
- .macro decrypt_block2x, i0, i1, rounds, t0, t1, t2
- do_block_Nx d, \rounds, \i0, \i1
+ .macro decrypt_block, in, rounds, t0, t1, t2
+ do_block_Nx d, \rounds, \in
.endm
.macro decrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
do_block_Nx d, \rounds, \i0, \i1, \i2, \i3
.endm
+ .macro decrypt_block5x, i0, i1, i2, i3, i4, rounds, t0, t1, t2
+ do_block_Nx d, \rounds, \i0, \i1, \i2, \i3, \i4
+ .endm
+
+#define MAX_STRIDE 5
+
#include "aes-modes.S"
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
index 4ebc61375aa6..0dbeaf2ce9b1 100644
--- a/arch/arm64/crypto/aes-modes.S
+++ b/arch/arm64/crypto/aes-modes.S
@@ -13,6 +13,10 @@
.text
.align 4
+#ifndef MAX_STRIDE
+#define MAX_STRIDE 4
+#endif
+
aes_encrypt_block4x:
encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
ret
@@ -23,6 +27,18 @@ aes_decrypt_block4x:
ret
ENDPROC(aes_decrypt_block4x)
+#if MAX_STRIDE == 5
+aes_encrypt_block5x:
+ encrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
+ ret
+ENDPROC(aes_encrypt_block5x)
+
+aes_decrypt_block5x:
+ decrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
+ ret
+ENDPROC(aes_decrypt_block5x)
+#endif
+
/*
* aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int blocks)
diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S
index 29100f692e8a..33bb6af309a3 100644
--- a/arch/arm64/crypto/aes-neon.S
+++ b/arch/arm64/crypto/aes-neon.S
@@ -117,26 +117,9 @@
/*
* Interleaved versions: functionally equivalent to the
- * ones above, but applied to 2 or 4 AES states in parallel.
+ * ones above, but applied to AES states in parallel.
*/
- .macro sub_bytes_2x, in0, in1
- sub v8.16b, \in0\().16b, v15.16b
- tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
- sub v9.16b, \in1\().16b, v15.16b
- tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b
- sub v10.16b, v8.16b, v15.16b
- tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b
- sub v11.16b, v9.16b, v15.16b
- tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b
- sub v8.16b, v10.16b, v15.16b
- tbx \in0\().16b, {v24.16b-v27.16b}, v10.16b
- sub v9.16b, v11.16b, v15.16b
- tbx \in1\().16b, {v24.16b-v27.16b}, v11.16b
- tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b
- tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b
- .endm
-
.macro sub_bytes_4x, in0, in1, in2, in3
sub v8.16b, \in0\().16b, v15.16b
tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
@@ -215,25 +198,6 @@
eor \in1\().16b, \in1\().16b, v11.16b
.endm
- .macro do_block_2x, enc, in0, in1, rounds, rk, rkp, i
- ld1 {v15.4s}, [\rk]
- add \rkp, \rk, #16
- mov \i, \rounds
-1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
- eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
- movi v15.16b, #0x40
- tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */
- tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */
- sub_bytes_2x \in0, \in1
- subs \i, \i, #1
- ld1 {v15.4s}, [\rkp], #16
- beq 2222f
- mix_columns_2x \in0, \in1, \enc
- b 1111b
-2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
- eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
- .endm
-
.macro do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
ld1 {v15.4s}, [\rk]
add \rkp, \rk, #16
@@ -260,14 +224,6 @@
eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */
.endm
- .macro encrypt_block2x, in0, in1, rounds, rk, rkp, i
- do_block_2x 1, \in0, \in1, \rounds, \rk, \rkp, \i
- .endm
-
- .macro decrypt_block2x, in0, in1, rounds, rk, rkp, i
- do_block_2x 0, \in0, \in1, \rounds, \rk, \rkp, \i
- .endm
-
.macro encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
do_block_4x 1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
.endm
--
2.20.1
* [PATCH 2/2] crypto: arm64/aes-ce - implement 5 way interleave for ECB, CBC and CTR
From: Ard Biesheuvel @ 2019-06-24 17:38 UTC
To: linux-crypto
Cc: ebiggers, Ard Biesheuvel, herbert, linux-arm-kernel, steve.capper
This implements 5-way interleaving for ECB, CBC decryption and CTR,
resulting in a speedup of ~11% on Marvell ThunderX2, which has a
very deep pipeline and therefore a high issue latency for NEON
instructions operating on the same registers.
Note that XTS is left alone: implementing a 5-way interleave there
would involve either spilling the calculated tweaks to the stack or
recalculating them after the encryption operation, and doing either
of those would most likely penalize low-end cores.
For ECB this is not a concern at all, given that we have plenty of
spare registers. For CTR and CBC decryption, we take advantage of
the fact that v16 is not used by the CE version of the code (which
is the only one targeted by this optimization), so we can reshuffle
the code a bit and avoid having to spill to memory (with the
exception of one extra reload in the CBC routine).
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
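A hedged C model of the CBC decryption data flow at a 5-way stride
(aes_decrypt_5x and cbc_dec_5x below are illustrative stand-ins, not
kernel interfaces): each plaintext block is XORed with the *previous*
ciphertext block, so the ciphertext must survive the in-place
decryption. The patch keeps three copies in the spare registers v5-v7
and reloads the remaining two blocks from memory (one load more than
the 4-way path, the "one extra reload" mentioned above).

    #include <stdint.h>
    #include <string.h>

    #define AES_BLOCK_SIZE 16

    /* Assumed primitive: decrypts 5 blocks in place, in parallel. */
    void aes_decrypt_5x(uint8_t b[5][AES_BLOCK_SIZE], const void *key);

    void cbc_dec_5x(uint8_t *out, const uint8_t *in, const void *key,
                    uint8_t iv[AES_BLOCK_SIZE], int blocks)
    {
            while (blocks >= 5) {
                    uint8_t b[5][AES_BLOCK_SIZE];
                    uint8_t save[5][AES_BLOCK_SIZE];
                    int i, j;

                    memcpy(b, in, sizeof(b));
                    memcpy(save, in, sizeof(save)); /* chaining values */
                    aes_decrypt_5x(b, key);
                    for (i = 0; i < 5; i++)
                            for (j = 0; j < AES_BLOCK_SIZE; j++)
                                    b[i][j] ^= i ? save[i - 1][j] : iv[j];
                    memcpy(iv, save[4], AES_BLOCK_SIZE); /* last ct = next iv */
                    memcpy(out, b, sizeof(b));
                    in += 5 * AES_BLOCK_SIZE;
                    out += 5 * AES_BLOCK_SIZE;
                    blocks -= 5;
            }
            /* the sub-5 tail is handled one block at a time */
    }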
arch/arm64/crypto/aes-ce.S | 2 +
arch/arm64/crypto/aes-modes.S | 102 ++++++++++++++------
arch/arm64/crypto/aes-neon.S | 2 +
3 files changed, 75 insertions(+), 31 deletions(-)
diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S
index 0fca5f463406..dbfc04ea20c7 100644
--- a/arch/arm64/crypto/aes-ce.S
+++ b/arch/arm64/crypto/aes-ce.S
@@ -18,6 +18,8 @@
.arch armv8-a+crypto
xtsmask .req v16
+ cbciv .req v16
+ vctr .req v16
.macro xts_reload_mask, tmp
.endm
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
index 0dbeaf2ce9b1..42edc34e8f01 100644
--- a/arch/arm64/crypto/aes-modes.S
+++ b/arch/arm64/crypto/aes-modes.S
@@ -17,6 +17,14 @@
#define MAX_STRIDE 4
#endif
+#if MAX_STRIDE == 4
+#define ST4(x...) x
+#define ST5(x...)
+#else
+#define ST4(x...)
+#define ST5(x...) x
+#endif
+
aes_encrypt_block4x:
encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
ret
@@ -53,14 +61,17 @@ AES_ENTRY(aes_ecb_encrypt)
enc_prepare w3, x2, x5
.LecbencloopNx:
- subs w4, w4, #4
+ subs w4, w4, #MAX_STRIDE
bmi .Lecbenc1x
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
- bl aes_encrypt_block4x
+ST4( bl aes_encrypt_block4x )
+ST5( ld1 {v4.16b}, [x1], #16 )
+ST5( bl aes_encrypt_block5x )
st1 {v0.16b-v3.16b}, [x0], #64
+ST5( st1 {v4.16b}, [x0], #16 )
b .LecbencloopNx
.Lecbenc1x:
- adds w4, w4, #4
+ adds w4, w4, #MAX_STRIDE
beq .Lecbencout
.Lecbencloop:
ld1 {v0.16b}, [x1], #16 /* get next pt block */
@@ -81,14 +92,17 @@ AES_ENTRY(aes_ecb_decrypt)
dec_prepare w3, x2, x5
.LecbdecloopNx:
- subs w4, w4, #4
+ subs w4, w4, #MAX_STRIDE
bmi .Lecbdec1x
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
- bl aes_decrypt_block4x
+ST4( bl aes_decrypt_block4x )
+ST5( ld1 {v4.16b}, [x1], #16 )
+ST5( bl aes_decrypt_block5x )
st1 {v0.16b-v3.16b}, [x0], #64
+ST5( st1 {v4.16b}, [x0], #16 )
b .LecbdecloopNx
.Lecbdec1x:
- adds w4, w4, #4
+ adds w4, w4, #MAX_STRIDE
beq .Lecbdecout
.Lecbdecloop:
ld1 {v0.16b}, [x1], #16 /* get next ct block */
@@ -148,39 +162,56 @@ AES_ENTRY(aes_cbc_decrypt)
stp x29, x30, [sp, #-16]!
mov x29, sp
- ld1 {v7.16b}, [x5] /* get iv */
+ ld1 {cbciv.16b}, [x5] /* get iv */
dec_prepare w3, x2, x6
.LcbcdecloopNx:
- subs w4, w4, #4
+ subs w4, w4, #MAX_STRIDE
bmi .Lcbcdec1x
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
+#if MAX_STRIDE == 5
+ ld1 {v4.16b}, [x1], #16 /* get 1 ct block */
+ mov v5.16b, v0.16b
+ mov v6.16b, v1.16b
+ mov v7.16b, v2.16b
+ bl aes_decrypt_block5x
+ sub x1, x1, #32
+ eor v0.16b, v0.16b, cbciv.16b
+ eor v1.16b, v1.16b, v5.16b
+ ld1 {v5.16b}, [x1], #16 /* reload 1 ct block */
+ ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */
+ eor v2.16b, v2.16b, v6.16b
+ eor v3.16b, v3.16b, v7.16b
+ eor v4.16b, v4.16b, v5.16b
+#else
mov v4.16b, v0.16b
mov v5.16b, v1.16b
mov v6.16b, v2.16b
bl aes_decrypt_block4x
sub x1, x1, #16
- eor v0.16b, v0.16b, v7.16b
+ eor v0.16b, v0.16b, cbciv.16b
eor v1.16b, v1.16b, v4.16b
- ld1 {v7.16b}, [x1], #16 /* reload 1 ct block */
+ ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */
eor v2.16b, v2.16b, v5.16b
eor v3.16b, v3.16b, v6.16b
+#endif
st1 {v0.16b-v3.16b}, [x0], #64
+ST5( st1 {v4.16b}, [x0], #16 )
b .LcbcdecloopNx
.Lcbcdec1x:
- adds w4, w4, #4
+ adds w4, w4, #MAX_STRIDE
beq .Lcbcdecout
.Lcbcdecloop:
ld1 {v1.16b}, [x1], #16 /* get next ct block */
mov v0.16b, v1.16b /* ...and copy to v0 */
decrypt_block v0, w3, x2, x6, w7
- eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */
- mov v7.16b, v1.16b /* ct is next iv */
+ eor v0.16b, v0.16b, cbciv.16b /* xor with iv => pt */
+ mov cbciv.16b, v1.16b /* ct is next iv */
st1 {v0.16b}, [x0], #16
subs w4, w4, #1
bne .Lcbcdecloop
.Lcbcdecout:
- st1 {v7.16b}, [x5] /* return iv */
+ st1 {cbciv.16b}, [x5] /* return iv */
ldp x29, x30, [sp], #16
ret
AES_ENDPROC(aes_cbc_decrypt)
@@ -373,51 +404,60 @@ AES_ENTRY(aes_ctr_encrypt)
mov x29, sp
enc_prepare w3, x2, x6
- ld1 {v4.16b}, [x5]
+ ld1 {vctr.16b}, [x5]
- umov x6, v4.d[1] /* keep swabbed ctr in reg */
+ umov x6, vctr.d[1] /* keep swabbed ctr in reg */
rev x6, x6
cmn w6, w4 /* 32 bit overflow? */
bcs .Lctrloop
.LctrloopNx:
- subs w4, w4, #4
+ subs w4, w4, #MAX_STRIDE
bmi .Lctr1x
add w7, w6, #1
- mov v0.16b, v4.16b
+ mov v0.16b, vctr.16b
add w8, w6, #2
- mov v1.16b, v4.16b
+ mov v1.16b, vctr.16b
+ add w9, w6, #3
+ mov v2.16b, vctr.16b
add w9, w6, #3
- mov v2.16b, v4.16b
rev w7, w7
- mov v3.16b, v4.16b
+ mov v3.16b, vctr.16b
rev w8, w8
+ST5( mov v4.16b, vctr.16b )
mov v1.s[3], w7
rev w9, w9
+ST5( add w10, w6, #4 )
mov v2.s[3], w8
+ST5( rev w10, w10 )
mov v3.s[3], w9
+ST5( mov v4.s[3], w10 )
ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */
- bl aes_encrypt_block4x
+ST4( bl aes_encrypt_block4x )
+ST5( bl aes_encrypt_block5x )
eor v0.16b, v5.16b, v0.16b
- ld1 {v5.16b}, [x1], #16 /* get 1 input block */
+ST4( ld1 {v5.16b}, [x1], #16 )
eor v1.16b, v6.16b, v1.16b
+ST5( ld1 {v5.16b-v6.16b}, [x1], #32 )
eor v2.16b, v7.16b, v2.16b
eor v3.16b, v5.16b, v3.16b
+ST5( eor v4.16b, v6.16b, v4.16b )
st1 {v0.16b-v3.16b}, [x0], #64
- add x6, x6, #4
+ST5( st1 {v4.16b}, [x0], #16 )
+ add x6, x6, #MAX_STRIDE
rev x7, x6
- ins v4.d[1], x7
+ ins vctr.d[1], x7
cbz w4, .Lctrout
b .LctrloopNx
.Lctr1x:
- adds w4, w4, #4
+ adds w4, w4, #MAX_STRIDE
beq .Lctrout
.Lctrloop:
- mov v0.16b, v4.16b
+ mov v0.16b, vctr.16b
encrypt_block v0, w3, x2, x8, w7
adds x6, x6, #1 /* increment BE ctr */
rev x7, x6
- ins v4.d[1], x7
+ ins vctr.d[1], x7
bcs .Lctrcarry /* overflow? */
.Lctrcarrydone:
@@ -429,7 +469,7 @@ AES_ENTRY(aes_ctr_encrypt)
bne .Lctrloop
.Lctrout:
- st1 {v4.16b}, [x5] /* return next CTR value */
+ st1 {vctr.16b}, [x5] /* return next CTR value */
ldp x29, x30, [sp], #16
ret
@@ -438,11 +478,11 @@ AES_ENTRY(aes_ctr_encrypt)
b .Lctrout
.Lctrcarry:
- umov x7, v4.d[0] /* load upper word of ctr */
+ umov x7, vctr.d[0] /* load upper word of ctr */
rev x7, x7 /* ... to handle the carry */
add x7, x7, #1
rev x7, x7
- ins v4.d[0], x7
+ ins vctr.d[0], x7
b .Lctrcarrydone
AES_ENDPROC(aes_ctr_encrypt)
diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S
index 33bb6af309a3..8bd66a6c4749 100644
--- a/arch/arm64/crypto/aes-neon.S
+++ b/arch/arm64/crypto/aes-neon.S
@@ -15,6 +15,8 @@
#define AES_ENDPROC(func) ENDPROC(neon_ ## func)
xtsmask .req v7
+ cbciv .req v7
+ vctr .req v4
.macro xts_reload_mask, tmp
xts_load_mask \tmp
--
2.20.1
* Re: [PATCH 0/2] crypto: arm64/aes-ce - implement 5-way interleave for some modes
From: Herbert Xu @ 2019-07-03 14:27 UTC
To: Ard Biesheuvel; +Cc: ebiggers, linux-crypto, linux-arm-kernel, steve.capper
On Mon, Jun 24, 2019 at 07:38:29PM +0200, Ard Biesheuvel wrote:
> As it turns out, even a 4-way interleave is not sufficient to saturate
> the ThunderX2 pipeline with AES instructions, so this series implements
> a 5-way interleave for the modes that can be converted without running
> out of the registers needed to maintain chaining state across the
> encryption operation, i.e., ECB, CBC decryption and CTR.
>
> Ard Biesheuvel (2):
> crypto: arm64/aes-ce - add 5 way interleave routines
> crypto: arm64/aes-ce - implement 5 way interleave for ECB, CBC and CTR
>
> arch/arm64/crypto/aes-ce.S | 60 ++++++----
> arch/arm64/crypto/aes-modes.S | 118 +++++++++++++++-----
> arch/arm64/crypto/aes-neon.S | 48 +-------
> 3 files changed, 127 insertions(+), 99 deletions(-)
All applied. Thanks.
--
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt