From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	stable@vger.kernel.org, Ard Biesheuvel <ardb@kernel.org>,
	Herbert Xu <herbert@gondor.apana.org.au>,
	Eric Biggers <ebiggers@google.com>
Subject: [PATCH 5.11 31/31] crypto: x86/aes-ni-xts - use direct calls to and 4-way stride
Date: Fri, 19 Mar 2021 13:19:25 +0100	[thread overview]
Message-ID: <20210319121748.211622571@linuxfoundation.org> (raw)
In-Reply-To: <20210319121747.203523570@linuxfoundation.org>

From: Ard Biesheuvel <ardb@kernel.org>

commit 86ad60a65f29dd862a11c22bb4b5be28d6c5cef1 upstream.

The XTS asm helper arrangement is a bit odd: the 8-way stride helper
consists of back-to-back calls to the 4-way core transforms, which
are called indirectly, based on a boolean that indicates whether we
are performing encryption or decryption.
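
As a rough C analogue of that arrangement (a sketch only; the names
below are invented, and the real helpers operate on four AES blocks at
a time with AES-NI), the old entry point picked its callee through a
function pointer, which under retpolines becomes a CALL_NOSPEC thunk,
while split encrypt/decrypt entry points can name the callee directly:

/* Sketch of the dispatch change, not the actual kernel code. */
typedef void (*crypt4_fn)(unsigned char *blocks);

void aes_enc4(unsigned char *blocks);	/* stand-ins for _aesni_enc4/_aesni_dec4 */
void aes_dec4(unsigned char *blocks);

/* Old shape: one entry point, callee selected at run time by a bool. */
void xts_crypt8_old(unsigned char *blocks, int enc)
{
	crypt4_fn fn = enc ? aes_enc4 : aes_dec4;

	fn(blocks);		/* indirect call (CALL_NOSPEC under retpoline) */
	fn(blocks + 64);	/* second half of the 8-block, 128-byte stride */
}

/* New shape: separate encrypt/decrypt entry points, direct calls only. */
void xts_encrypt_new(unsigned char *blocks)
{
	aes_enc4(blocks);	/* direct call, no retpoline thunk */
}

void xts_decrypt_new(unsigned char *blocks)
{
	aes_dec4(blocks);
}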

Indirect calls are costly on x86, so switch to direct calls. And since
the 8-way stride doesn't really add anything substantial, use a 4-way
stride instead, and make the asm core routine deal with any multiple of
4 blocks. Since 512-byte sectors or 4 KB blocks are the typical
quantities XTS operates on, increase the stride exported to the glue
helper to 512 bytes as well.
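
For concreteness (arithmetic derived from the sizes above): an AES
block is 16 bytes, so the 512-byte stride seen by the glue code is 32
blocks, which the asm loop consumes in 8 passes of 4 blocks (64 bytes)
each. A minimal compile-time restatement of that bookkeeping, with
made-up macro names:

#define AES_BLOCK_SIZE		16	/* bytes */
#define ASM_LOOP_BLOCKS		4	/* blocks per .Lxts_enc_loop4 pass */
#define GLUE_STRIDE_BYTES	512	/* stride exported to the glue helper */

_Static_assert(GLUE_STRIDE_BYTES / AES_BLOCK_SIZE == 32,
	       "512-byte glue stride == 32 AES blocks");
_Static_assert(GLUE_STRIDE_BYTES / (ASM_LOOP_BLOCKS * AES_BLOCK_SIZE) == 8,
	       "8 passes of the 4-block (64-byte) asm loop per glue call");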

As a result, the number of indirect calls is reduced from 3 per 64 bytes
of in/output to 1 per 512 bytes of in/output, which produces a 65% speedup
when operating on 1 KB blocks (measured on an Intel(R) Core(TM) i7-8650U
CPU).
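
Spelling that out for a 1 KB request (taking 1 KB as 1024 bytes):

  before: 3 indirect calls per  64 bytes -> 3 * (1024 / 64) = 48 calls
  after:  1 indirect call  per 512 bytes ->      1024 / 512 =  2 calls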

Fixes: 9697fa39efd3f ("x86/retpoline/crypto: Convert crypto assembler indirect jumps")
Tested-by: Eric Biggers <ebiggers@google.com> # x86_64
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/crypto/aesni-intel_asm.S  |  115 ++++++++++++++++++++++---------------
 arch/x86/crypto/aesni-intel_glue.c |   25 ++++----
 2 files changed, 84 insertions(+), 56 deletions(-)

--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -2715,25 +2715,18 @@ SYM_FUNC_END(aesni_ctr_enc)
 	pxor CTR, IV;
 
 /*
- * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst,
- *			 const u8 *src, bool enc, le128 *iv)
+ * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
+ *			  const u8 *src, unsigned int len, le128 *iv)
  */
-SYM_FUNC_START(aesni_xts_crypt8)
+SYM_FUNC_START(aesni_xts_encrypt)
 	FRAME_BEGIN
-	testb %cl, %cl
-	movl $0, %ecx
-	movl $240, %r10d
-	leaq _aesni_enc4, %r11
-	leaq _aesni_dec4, %rax
-	cmovel %r10d, %ecx
-	cmoveq %rax, %r11
 
 	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
 	movups (IVP), IV
 
 	mov 480(KEYP), KLEN
-	addq %rcx, KEYP
 
+.Lxts_enc_loop4:
 	movdqa IV, STATE1
 	movdqu 0x00(INP), INC
 	pxor INC, STATE1
@@ -2757,71 +2750,103 @@ SYM_FUNC_START(aesni_xts_crypt8)
 	pxor INC, STATE4
 	movdqu IV, 0x30(OUTP)
 
-	CALL_NOSPEC r11
+	call _aesni_enc4
 
 	movdqu 0x00(OUTP), INC
 	pxor INC, STATE1
 	movdqu STATE1, 0x00(OUTP)
 
-	_aesni_gf128mul_x_ble()
-	movdqa IV, STATE1
-	movdqu 0x40(INP), INC
-	pxor INC, STATE1
-	movdqu IV, 0x40(OUTP)
-
 	movdqu 0x10(OUTP), INC
 	pxor INC, STATE2
 	movdqu STATE2, 0x10(OUTP)
 
-	_aesni_gf128mul_x_ble()
-	movdqa IV, STATE2
-	movdqu 0x50(INP), INC
-	pxor INC, STATE2
-	movdqu IV, 0x50(OUTP)
-
 	movdqu 0x20(OUTP), INC
 	pxor INC, STATE3
 	movdqu STATE3, 0x20(OUTP)
 
-	_aesni_gf128mul_x_ble()
-	movdqa IV, STATE3
-	movdqu 0x60(INP), INC
-	pxor INC, STATE3
-	movdqu IV, 0x60(OUTP)
-
 	movdqu 0x30(OUTP), INC
 	pxor INC, STATE4
 	movdqu STATE4, 0x30(OUTP)
 
 	_aesni_gf128mul_x_ble()
-	movdqa IV, STATE4
-	movdqu 0x70(INP), INC
-	pxor INC, STATE4
-	movdqu IV, 0x70(OUTP)
 
-	_aesni_gf128mul_x_ble()
+	add $64, INP
+	add $64, OUTP
+	sub $64, LEN
+	ja .Lxts_enc_loop4
+
 	movups IV, (IVP)
 
-	CALL_NOSPEC r11
+	FRAME_END
+	ret
+SYM_FUNC_END(aesni_xts_encrypt)
+
+/*
+ * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
+ *			  const u8 *src, unsigned int len, le128 *iv)
+ */
+SYM_FUNC_START(aesni_xts_decrypt)
+	FRAME_BEGIN
+
+	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
+	movups (IVP), IV
+
+	mov 480(KEYP), KLEN
+	add $240, KEYP
+
+.Lxts_dec_loop4:
+	movdqa IV, STATE1
+	movdqu 0x00(INP), INC
+	pxor INC, STATE1
+	movdqu IV, 0x00(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE2
+	movdqu 0x10(INP), INC
+	pxor INC, STATE2
+	movdqu IV, 0x10(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE3
+	movdqu 0x20(INP), INC
+	pxor INC, STATE3
+	movdqu IV, 0x20(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE4
+	movdqu 0x30(INP), INC
+	pxor INC, STATE4
+	movdqu IV, 0x30(OUTP)
+
+	call _aesni_dec4
 
-	movdqu 0x40(OUTP), INC
+	movdqu 0x00(OUTP), INC
 	pxor INC, STATE1
-	movdqu STATE1, 0x40(OUTP)
+	movdqu STATE1, 0x00(OUTP)
 
-	movdqu 0x50(OUTP), INC
+	movdqu 0x10(OUTP), INC
 	pxor INC, STATE2
-	movdqu STATE2, 0x50(OUTP)
+	movdqu STATE2, 0x10(OUTP)
 
-	movdqu 0x60(OUTP), INC
+	movdqu 0x20(OUTP), INC
 	pxor INC, STATE3
-	movdqu STATE3, 0x60(OUTP)
+	movdqu STATE3, 0x20(OUTP)
 
-	movdqu 0x70(OUTP), INC
+	movdqu 0x30(OUTP), INC
 	pxor INC, STATE4
-	movdqu STATE4, 0x70(OUTP)
+	movdqu STATE4, 0x30(OUTP)
+
+	_aesni_gf128mul_x_ble()
+
+	add $64, INP
+	add $64, OUTP
+	sub $64, LEN
+	ja .Lxts_dec_loop4
+
+	movups IV, (IVP)
 
 	FRAME_END
 	ret
-SYM_FUNC_END(aesni_xts_crypt8)
+SYM_FUNC_END(aesni_xts_decrypt)
 
 #endif
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -97,6 +97,12 @@ asmlinkage void aesni_cbc_dec(struct cry
 #define AVX_GEN2_OPTSIZE 640
 #define AVX_GEN4_OPTSIZE 4096
 
+asmlinkage void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *out,
+				  const u8 *in, unsigned int len, u8 *iv);
+
+asmlinkage void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *out,
+				  const u8 *in, unsigned int len, u8 *iv);
+
 #ifdef CONFIG_X86_64
 
 static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out,
@@ -104,9 +110,6 @@ static void (*aesni_ctr_enc_tfm)(struct
 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
 
-asmlinkage void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *out,
-				 const u8 *in, bool enc, le128 *iv);
-
 /* asmlinkage void aesni_gcm_enc()
  * void *ctx,  AES Key schedule. Starts on a 16 byte boundary.
  * struct gcm_context_data.  May be uninitialized.
@@ -547,14 +550,14 @@ static void aesni_xts_dec(const void *ct
 	glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_dec);
 }
 
-static void aesni_xts_enc8(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+static void aesni_xts_enc32(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
 {
-	aesni_xts_crypt8(ctx, dst, src, true, iv);
+	aesni_xts_encrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv);
 }
 
-static void aesni_xts_dec8(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+static void aesni_xts_dec32(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
 {
-	aesni_xts_crypt8(ctx, dst, src, false, iv);
+	aesni_xts_decrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv);
 }
 
 static const struct common_glue_ctx aesni_enc_xts = {
@@ -562,8 +565,8 @@ static const struct common_glue_ctx aesn
 	.fpu_blocks_limit = 1,
 
 	.funcs = { {
-		.num_blocks = 8,
-		.fn_u = { .xts = aesni_xts_enc8 }
+		.num_blocks = 32,
+		.fn_u = { .xts = aesni_xts_enc32 }
 	}, {
 		.num_blocks = 1,
 		.fn_u = { .xts = aesni_xts_enc }
@@ -575,8 +578,8 @@ static const struct common_glue_ctx aesn
 	.fpu_blocks_limit = 1,
 
 	.funcs = { {
-		.num_blocks = 8,
-		.fn_u = { .xts = aesni_xts_dec8 }
+		.num_blocks = 32,
+		.fn_u = { .xts = aesni_xts_dec32 }
 	}, {
 		.num_blocks = 1,
 		.fn_u = { .xts = aesni_xts_dec }
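
To make the structure of the new asm loop above easier to follow, here
is a rough one-block-at-a-time C model of the same XTS flow (a sketch
assuming a little-endian host, as on x86; aes_encrypt_block is a
hypothetical stand-in for the 4-way _aesni_enc4, and the real code
keeps four blocks in flight in XMM registers):

#include <stdint.h>
#include <string.h>

/* C analogue of the asm's _aesni_gf128mul_x_ble(): multiply the tweak
 * by x in GF(2^128), treating the block as two little-endian 64-bit
 * words. */
static void xts_tweak_double(uint8_t t[16])
{
	uint64_t lo, hi, carry;

	memcpy(&lo, t, 8);
	memcpy(&hi, t + 8, 8);

	carry = hi >> 63;
	hi = (hi << 1) | (lo >> 63);
	lo = (lo << 1) ^ (carry ? 0x87 : 0);

	memcpy(t, &lo, 8);
	memcpy(t + 8, &hi, 8);
}

/* Hypothetical single-block AES primitive; declaration only here. */
void aes_encrypt_block(const void *key_schedule, uint8_t out[16],
		       const uint8_t in[16]);

/* Shape of aesni_xts_encrypt() after the patch: len is any multiple of
 * 64 bytes (4 blocks), the tweak is carried across iterations, and its
 * final value is written back through iv (as the asm does via IVP). */
static void xts_encrypt_model(const void *key_schedule, uint8_t *dst,
			      const uint8_t *src, unsigned int len,
			      uint8_t *iv)
{
	int i, j;

	while (len) {
		for (i = 0; i < 4; i++, src += 16, dst += 16) {
			uint8_t buf[16];

			for (j = 0; j < 16; j++)
				buf[j] = src[j] ^ iv[j];	   /* PP = P ^ T  */
			aes_encrypt_block(key_schedule, buf, buf); /* CC = E(PP)  */
			for (j = 0; j < 16; j++)
				dst[j] = buf[j] ^ iv[j];	   /* C = CC ^ T  */

			xts_tweak_double(iv);			   /* next tweak  */
		}
		len -= 64;
	}
}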


