* [PATCH for-stable-5.4] crypto: x86/aes-ni-xts - use direct calls to and 4-way stride
From: Ard Biesheuvel @ 2021-03-18 17:41 UTC
To: stable
Cc: linux-crypto, tmb, sashal, Ard Biesheuvel, Eric Biggers, Herbert Xu
From: Ard Biesheuvel <ardb@kernel.org>
Upstream commit 86ad60a65f29dd862a11c22bb4b5be28d6c5cef1
The XTS asm helper arrangement is a bit odd: the 8-way stride helper
consists of back-to-back calls to the 4-way core transforms, which
are called indirectly, based on a boolean that indicates whether we
are performing encryption or decryption.
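To illustrate (a standalone C sketch of the call pattern only, not the
kernel code itself; enc4/dec4 stand in for the asm core transforms
_aesni_enc4/_aesni_dec4):

    #include <stdbool.h>

    /* Stand-ins for the 4-way asm core transforms. */
    static void enc4(unsigned char *blk) { (void)blk; /* AES-NI 4-way encrypt */ }
    static void dec4(unsigned char *blk) { (void)blk; /* AES-NI 4-way decrypt */ }

    /* Old arrangement: pick the core transform through a function
     * pointer based on a bool, then call it twice back to back.
     * With retpolines, both calls through 'fn' are costly. */
    static void crypt8_old(unsigned char *blk, bool enc)
    {
            void (*fn)(unsigned char *) = enc ? enc4 : dec4;

            fn(blk);        /* blocks 0..3, indirect call */
            fn(blk + 64);   /* blocks 4..7, indirect call */
    }

    /* New arrangement: a dedicated entry point per direction, with a
     * direct call per 4-block (64-byte) loop iteration. */
    static void encrypt_new(unsigned char *blk, unsigned int len)
    {
            for (; len >= 64; blk += 64, len -= 64)
                    enc4(blk);  /* direct call */
    }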
Given how costly indirect calls are on x86, let's switch to direct
calls. And since the 8-way stride doesn't really add anything
substantial, use a 4-way stride instead, and make the asm core
routine deal with any multiple of 4 blocks. Since 512-byte sectors
or 4 KB blocks are the typical quantities XTS operates on, increase
the stride exported to the glue helper to 512 bytes as well.
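Concretely, the numbers work out as follows (derived from the
constants in the patch below):

    glue stride:  32 blocks * 16 bytes (AES_BLOCK_SIZE) = 512 bytes per call
    asm stride:    4 blocks * 16 bytes                  =  64 bytes per iteration

so each call into the asm routine runs 512 / 64 = 8 loop iterations,
each making a direct call to _aesni_enc4 or _aesni_dec4.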
As a result, the number of indirect calls is reduced from 3 per 64 bytes
of in/output to 1 per 512 bytes of in/output, which produces a 65% speedup
when operating on 1 KB blocks (measured on an Intel(R) Core(TM) i7-8650U CPU).
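Back-of-the-envelope, using the figures above for a 1 KB operation:

    old:  1024 bytes /  64 bytes * 3 = 48 indirect calls
    new:  1024 bytes / 512 bytes * 1 =  2 indirect calls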
Fixes: 9697fa39efd3f ("x86/retpoline/crypto: Convert crypto assembler indirect jumps")
Tested-by: Eric Biggers <ebiggers@google.com> # x86_64
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
[ardb: rebase onto stable/linux-5.4.y]
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
Please apply on top of backports of
9c1e8836edbb crypto: x86 - Regularize glue function prototypes
032d049ea0f4 crypto: aesni - Use TEST %reg,%reg instead of CMP $0,%reg
arch/x86/crypto/aesni-intel_asm.S | 115 ++++++++++++++++++-----------
arch/x86/crypto/aesni-intel_glue.c | 25 ++++---
2 files changed, 84 insertions(+), 56 deletions(-)
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index f10f044c887c..dd954d8db629 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -2726,25 +2726,18 @@ ENDPROC(aesni_ctr_enc)
pxor CTR, IV;
/*
- * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst,
- * const u8 *src, bool enc, le128 *iv)
+ * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
+ * const u8 *src, unsigned int len, le128 *iv)
*/
-ENTRY(aesni_xts_crypt8)
+ENTRY(aesni_xts_encrypt)
FRAME_BEGIN
- testb %cl, %cl
- movl $0, %ecx
- movl $240, %r10d
- leaq _aesni_enc4, %r11
- leaq _aesni_dec4, %rax
- cmovel %r10d, %ecx
- cmoveq %rax, %r11
movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
movups (IVP), IV
mov 480(KEYP), KLEN
- addq %rcx, KEYP
+.Lxts_enc_loop4:
movdqa IV, STATE1
movdqu 0x00(INP), INC
pxor INC, STATE1
@@ -2768,71 +2761,103 @@ ENTRY(aesni_xts_crypt8)
pxor INC, STATE4
movdqu IV, 0x30(OUTP)
- CALL_NOSPEC %r11
+ call _aesni_enc4
movdqu 0x00(OUTP), INC
pxor INC, STATE1
movdqu STATE1, 0x00(OUTP)
- _aesni_gf128mul_x_ble()
- movdqa IV, STATE1
- movdqu 0x40(INP), INC
- pxor INC, STATE1
- movdqu IV, 0x40(OUTP)
-
movdqu 0x10(OUTP), INC
pxor INC, STATE2
movdqu STATE2, 0x10(OUTP)
- _aesni_gf128mul_x_ble()
- movdqa IV, STATE2
- movdqu 0x50(INP), INC
- pxor INC, STATE2
- movdqu IV, 0x50(OUTP)
-
movdqu 0x20(OUTP), INC
pxor INC, STATE3
movdqu STATE3, 0x20(OUTP)
- _aesni_gf128mul_x_ble()
- movdqa IV, STATE3
- movdqu 0x60(INP), INC
- pxor INC, STATE3
- movdqu IV, 0x60(OUTP)
-
movdqu 0x30(OUTP), INC
pxor INC, STATE4
movdqu STATE4, 0x30(OUTP)
_aesni_gf128mul_x_ble()
- movdqa IV, STATE4
- movdqu 0x70(INP), INC
- pxor INC, STATE4
- movdqu IV, 0x70(OUTP)
- _aesni_gf128mul_x_ble()
+ add $64, INP
+ add $64, OUTP
+ sub $64, LEN
+ ja .Lxts_enc_loop4
+
movups IV, (IVP)
- CALL_NOSPEC %r11
+ FRAME_END
+ ret
+ENDPROC(aesni_xts_encrypt)
+
+/*
+ * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
+ * const u8 *src, unsigned int len, le128 *iv)
+ */
+ENTRY(aesni_xts_decrypt)
+ FRAME_BEGIN
+
+ movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
+ movups (IVP), IV
+
+ mov 480(KEYP), KLEN
+ add $240, KEYP
- movdqu 0x40(OUTP), INC
+.Lxts_dec_loop4:
+ movdqa IV, STATE1
+ movdqu 0x00(INP), INC
pxor INC, STATE1
- movdqu STATE1, 0x40(OUTP)
+ movdqu IV, 0x00(OUTP)
- movdqu 0x50(OUTP), INC
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE2
+ movdqu 0x10(INP), INC
+ pxor INC, STATE2
+ movdqu IV, 0x10(OUTP)
+
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE3
+ movdqu 0x20(INP), INC
+ pxor INC, STATE3
+ movdqu IV, 0x20(OUTP)
+
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE4
+ movdqu 0x30(INP), INC
+ pxor INC, STATE4
+ movdqu IV, 0x30(OUTP)
+
+ call _aesni_dec4
+
+ movdqu 0x00(OUTP), INC
+ pxor INC, STATE1
+ movdqu STATE1, 0x00(OUTP)
+
+ movdqu 0x10(OUTP), INC
pxor INC, STATE2
- movdqu STATE2, 0x50(OUTP)
+ movdqu STATE2, 0x10(OUTP)
- movdqu 0x60(OUTP), INC
+ movdqu 0x20(OUTP), INC
pxor INC, STATE3
- movdqu STATE3, 0x60(OUTP)
+ movdqu STATE3, 0x20(OUTP)
- movdqu 0x70(OUTP), INC
+ movdqu 0x30(OUTP), INC
pxor INC, STATE4
- movdqu STATE4, 0x70(OUTP)
+ movdqu STATE4, 0x30(OUTP)
+
+ _aesni_gf128mul_x_ble()
+
+ add $64, INP
+ add $64, OUTP
+ sub $64, LEN
+ ja .Lxts_dec_loop4
+
+ movups IV, (IVP)
FRAME_END
ret
-ENDPROC(aesni_xts_crypt8)
+ENDPROC(aesni_xts_decrypt)
#endif
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index f795310d3da7..18cfb76daa23 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -97,6 +97,12 @@ asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
#define AVX_GEN2_OPTSIZE 640
#define AVX_GEN4_OPTSIZE 4096
+asmlinkage void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+
+asmlinkage void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+
#ifdef CONFIG_X86_64
static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out,
@@ -104,9 +110,6 @@ static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out,
asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
-asmlinkage void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *out,
- const u8 *in, bool enc, le128 *iv);
-
/* asmlinkage void aesni_gcm_enc()
* void *ctx, AES Key schedule. Starts on a 16 byte boundary.
* struct gcm_context_data. May be uninitialized.
@@ -558,14 +561,14 @@ static void aesni_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_dec);
}
-static void aesni_xts_enc8(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+static void aesni_xts_enc32(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
{
- aesni_xts_crypt8(ctx, dst, src, true, iv);
+ aesni_xts_encrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv);
}
-static void aesni_xts_dec8(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+static void aesni_xts_dec32(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
{
- aesni_xts_crypt8(ctx, dst, src, false, iv);
+ aesni_xts_decrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv);
}
static const struct common_glue_ctx aesni_enc_xts = {
@@ -573,8 +576,8 @@ static const struct common_glue_ctx aesni_enc_xts = {
.fpu_blocks_limit = 1,
.funcs = { {
- .num_blocks = 8,
- .fn_u = { .xts = aesni_xts_enc8 }
+ .num_blocks = 32,
+ .fn_u = { .xts = aesni_xts_enc32 }
}, {
.num_blocks = 1,
.fn_u = { .xts = aesni_xts_enc }
@@ -586,8 +589,8 @@ static const struct common_glue_ctx aesni_dec_xts = {
.fpu_blocks_limit = 1,
.funcs = { {
- .num_blocks = 8,
- .fn_u = { .xts = aesni_xts_dec8 }
+ .num_blocks = 32,
+ .fn_u = { .xts = aesni_xts_dec32 }
}, {
.num_blocks = 1,
.fn_u = { .xts = aesni_xts_dec }
--
2.31.0.rc2.261.g7f71774620-goog
* Re: [PATCH for-stable-5.4] crypto: x86/aes-ni-xts - use direct calls to and 4-way stride
From: Greg KH @ 2021-03-19 10:32 UTC
To: Ard Biesheuvel
Cc: stable, linux-crypto, tmb, sashal, Ard Biesheuvel, Eric Biggers,
Herbert Xu
On Thu, Mar 18, 2021 at 05:41:51PM +0000, Ard Biesheuvel wrote:
> From: Ard Biesheuvel <ardb@kernel.org>
>
> Upstream commit 86ad60a65f29dd862a11c22bb4b5be28d6c5cef1
>
> The XTS asm helper arrangement is a bit odd: the 8-way stride helper
> consists of back-to-back calls to the 4-way core transforms, which
> are called indirectly, based on a boolean that indicates whether we
> are performing encryption or decryption.
>
> Given how costly indirect calls are on x86, let's switch to direct
> calls. And since the 8-way stride doesn't really add anything
> substantial, use a 4-way stride instead, and make the asm core
> routine deal with any multiple of 4 blocks. Since 512-byte sectors
> or 4 KB blocks are the typical quantities XTS operates on, increase
> the stride exported to the glue helper to 512 bytes as well.
>
> As a result, the number of indirect calls is reduced from 3 per 64 bytes
> of in/output to 1 per 512 bytes of in/output, which produces a 65% speedup
> when operating on 1 KB blocks (measured on an Intel(R) Core(TM) i7-8650U CPU).
>
> Fixes: 9697fa39efd3f ("x86/retpoline/crypto: Convert crypto assembler indirect jumps")
> Tested-by: Eric Biggers <ebiggers@google.com> # x86_64
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
> [ardb: rebase onto stable/linux-5.4.y]
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> ---
>
> Please apply on top of backports of
>
> 9c1e8836edbb crypto: x86 - Regularize glue function prototypes
> 032d049ea0f4 crypto: aesni - Use TEST %reg,%reg instead of CMP $0,%reg
Now queued up, thanks.
greg k-h