From: Eric Biggers <ebiggers@kernel.org>
To: linux-crypto@vger.kernel.org
Cc: x86@kernel.org, linux-kernel@vger.kernel.org,
	"Chang S . Bae" <chang.seok.bae@intel.com>
Subject: [PATCH 1/3] crypto: x86/aes-xts - handle AES-128 and AES-192 more efficiently
Date: Fri, 12 Apr 2024 20:17:26 -0700
Message-ID: <20240413031728.159495-2-ebiggers@kernel.org>
In-Reply-To: <20240413031728.159495-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

Decrease the amount of code specific to the different AES variants by
"right-aligning" the sequence of round keys and, for AES-128 and AES-192,
simply skipping the irrelevant rounds at the beginning.

This shrinks the size of aes-xts-avx-x86_64.o by 13.3%, and it improves
the efficiency of AES-128 and AES-192.  The tradeoff is that for AES-256
some additional not-taken conditional jumps are now executed.  But these
are predicted well and are cheap on x86.

Note that the ARMv8 CE-based AES-XTS implementation uses a similar
strategy to handle the different AES variants.
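
To illustrate the layout (a conceptual C sketch only, not part of this
patch; xor_block(), aes_round(), and aes_last_round() are made-up helper
names):

	/*
	 * "Right-aligned" round key layout: rk[] has 15 slots, the last
	 * round key always sits in rk[14], and the shorter variants simply
	 * start further into the array (slots 1..first-1 stay unused for
	 * AES-128 and AES-192).  rk[0] holds the key for "round 0".
	 */
	static void aes_crypt_sketch(const u8 rk[15][16], int keylen_bytes,
				     u8 block[16])
	{
		int nrounds = 6 + keylen_bytes / 4;	/* 10, 12, or 14 */
		int first = 15 - nrounds;		/* 5, 3, or 1 */
		int i;

		xor_block(block, rk[0]);		/* "round 0": XOR only */
		for (i = first; i < 14; i++)
			aes_round(block, rk[i]);	/* AESENC / AESDEC */
		aes_last_round(block, rk[14]);		/* AESENCLAST / AESDECLAST */
	}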

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/x86/crypto/aes-xts-avx-x86_64.S | 178 ++++++++++++++-------------
 1 file changed, 92 insertions(+), 86 deletions(-)
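
For reference, an illustrative C equivalent of the pointer rebasing done
by the new _setup_round_keys (rebase_round_key_ptr is a made-up name, not
part of this patch):

	/*
	 * Equivalent of "lea OFFS-16(KEY, KEYLEN64, 4), KEY": advance KEY so
	 * that the last round key is always at 7*16(KEY).  Relative to the
	 * start of the selected (enc or dec) key schedule:
	 *
	 *   AES-128 (keylen 16): +3*16, round keys at -2*16(KEY)..7*16(KEY)
	 *   AES-192 (keylen 24): +5*16, round keys at -4*16(KEY)..7*16(KEY)
	 *   AES-256 (keylen 32): +7*16, round keys at -6*16(KEY)..7*16(KEY)
	 */
	static const u8 *rebase_round_key_ptr(const u8 *key, int keylen_bytes,
					      int offs /* 0 = enc, 240 = dec */)
	{
		return key + offs + 4 * keylen_bytes - 16;
	}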

diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
index 52f1997ed05a..f5e7ab739105 100644
--- a/arch/x86/crypto/aes-xts-avx-x86_64.S
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -80,18 +80,19 @@
 	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
 .text
 
 // Function parameters
 .set	KEY,		%rdi	// Initially points to crypto_aes_ctx, then is
-				// advanced to point directly to 7th round key
+				// advanced to point to 7th-from-last round key
 .set	SRC,		%rsi	// Pointer to next source data
 .set	DST,		%rdx	// Pointer to next destination data
 .set	LEN,		%rcx	// Remaining length in bytes
 .set	TWEAK,		%r8	// Pointer to next tweak
 
-// %r9d holds the AES key length in bytes.
+// %r9 holds the AES key length in bytes.
 .set	KEYLEN,		%r9d
+.set	KEYLEN64,	%r9
 
 // %rax and %r10-r11 are available as temporaries.
 
 .macro	_define_Vi	i
 .if VL == 16
@@ -163,16 +164,22 @@
 
 	// V14 holds the constant from .Lgf_poly, copied to all 128-bit lanes.
 	.set	GF_POLY_XMM,	%xmm14
 	.set	GF_POLY,	V14
 
-	// V15 holds the first AES round key, copied to all 128-bit lanes.
+	// V15 holds the key for AES "round 0", copied to all 128-bit lanes.
 	.set	KEY0_XMM,	%xmm15
 	.set	KEY0,		V15
 
 	// If 32 SIMD registers are available, then V16-V29 hold the remaining
 	// AES round keys, copied to all 128-bit lanes.
+	//
+	// AES-128, AES-192, and AES-256 use different numbers of round keys.
+	// To allow handling all three variants efficiently, we align the round
+	// keys to the *end* of this register range.  I.e., AES-128 uses
+	// KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14.
+	// (All also use KEY0 for the XOR-only "round" at the beginning.)
 .if USE_AVX10
 	.set	KEY1_XMM,	%xmm16
 	.set	KEY1,		V16
 	.set	KEY2_XMM,	%xmm17
 	.set	KEY2,		V17
@@ -338,19 +345,19 @@
 	.set NEXT_TWEAK, NEXT_TWEAK2
 .elseif \i == 15
 	.set PREV_TWEAK, NEXT_TWEAK2
 	.set NEXT_TWEAK, NEXT_TWEAK3
 .endif
-.if \i < 20 && \i % 5 == 0
+.if \i >= 0 && \i < 20 && \i % 5 == 0
 	vpshufd		$0x13, PREV_TWEAK, V5
-.elseif \i < 20 && \i % 5 == 1
+.elseif \i >= 0 && \i < 20 && \i % 5 == 1
 	vpaddq		PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK
-.elseif \i < 20 && \i % 5 == 2
+.elseif \i >= 0 && \i < 20 && \i % 5 == 2
 	vpsrad		$31, V5, V5
-.elseif \i < 20 && \i % 5 == 3
+.elseif \i >= 0 && \i < 20 && \i % 5 == 3
 	vpand		GF_POLY, V5, V5
-.elseif \i < 20 && \i % 5 == 4
+.elseif \i >= 0 && \i < 20 && \i % 5 == 4
 	vpxor		V5, NEXT_TWEAK, NEXT_TWEAK
 .elseif \i == 1000
 	vmovdqa		NEXT_TWEAK0, TWEAK0
 	vmovdqa		NEXT_TWEAK1, TWEAK1
 	vmovdqa		NEXT_TWEAK2, TWEAK2
@@ -362,25 +369,25 @@
 // (the same method _next_tweakvec uses for VL > 16).  This means multiplying
 // each tweak by x^(4*VL/16) independently.  Since 4*VL/16 is a multiple of 8
 // when VL > 16 (which it is here), the needed shift amounts are byte-aligned,
 // which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts.
 .macro	_tweak_step_pclmul	i
-.if \i == 2
+.if \i == 0
 	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
-.elseif \i == 4
+.elseif \i == 2
 	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1
-.elseif \i == 6
+.elseif \i == 4
 	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2
-.elseif \i == 8
+.elseif \i == 6
 	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3
-.elseif \i == 10
+.elseif \i == 8
 	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0
-.elseif \i == 12
+.elseif \i == 10
 	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1
-.elseif \i == 14
+.elseif \i == 12
 	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2
-.elseif \i == 16
+.elseif \i == 14
 	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3
 .elseif \i == 1000
 	vpslldq		$(4*VL/16) / 8, TWEAK0, TWEAK0
 	vpslldq		$(4*VL/16) / 8, TWEAK1, TWEAK1
 	vpslldq		$(4*VL/16) / 8, TWEAK2, TWEAK2
@@ -391,12 +398,12 @@
 	_vpxor		NEXT_TWEAK3, TWEAK3, TWEAK3
 .endif
 .endm
 
 // _tweak_step does one step of the computation of the next set of tweaks from
-// TWEAK[0-3].  To complete all steps, this must be invoked with \i values 0
-// through at least 19, then 1000 which signals the last step.
+// TWEAK[0-3].  To complete all steps, this is invoked with increasing values of
+// \i that include at least 0 through 19, then 1000 which signals the last step.
 //
 // This is used to interleave the computation of the next set of tweaks with the
 // AES en/decryptions, which increases performance in some cases.
 .macro	_tweak_step	i
 .if VL == 16
@@ -404,26 +411,60 @@
 .else
 	_tweak_step_pclmul	\i
 .endif
 .endm
 
-// Load the round keys: just the first one if !USE_AVX10, otherwise all of them.
-.macro	_load_round_keys
-	_vbroadcast128	-7*16(KEY), KEY0
+.macro	_setup_round_keys	enc
+
+	// Select either the encryption round keys or the decryption round keys.
+.if \enc
+	.set	OFFS, 0
+.else
+	.set	OFFS, 240
+.endif
+
+	// Load the round key for "round 0".
+	_vbroadcast128	OFFS(KEY), KEY0
+
+	// Increment KEY to make it so that 7*16(KEY) is the last round key.
+	// For AES-128, increment by 3*16, resulting in the 10 round keys (not
+	// counting the zero-th round key which was just loaded into KEY0) being
+	// -2*16(KEY) through 7*16(KEY).  For AES-192, increment by 5*16 and use
+	// 12 round keys -4*16(KEY) through 7*16(KEY).  For AES-256, increment
+	// by 7*16 and use 14 round keys -6*16(KEY) through 7*16(KEY).
+	//
+	// This rebasing provides two benefits.  First, it makes the offset to
+	// any round key be in the range [-96, 112], fitting in a signed byte.
+	// This shortens VEX-encoded instructions that access the later round
+	// keys which otherwise would need 4-byte offsets.  Second, it makes it
+	// easy to do AES-128 and AES-192 by skipping irrelevant rounds at the
+	// beginning.  Skipping rounds at the end doesn't work as well because
+	// the last round needs different instructions.
+	//
+	// An alternative approach would be to roll up all the round loops.  We
+	// don't do that because it isn't compatible with caching the round keys
+	// in registers which we do when possible (see below), and also because
+	// it seems unwise to rely *too* heavily on the CPU's branch predictor.
+	lea		OFFS-16(KEY, KEYLEN64, 4), KEY
+
+	// If all 32 SIMD registers are available, cache all the round keys.
 .if USE_AVX10
+	cmp		$24, KEYLEN
+	jl		.Laes128\@
+	je		.Laes192\@
 	_vbroadcast128	-6*16(KEY), KEY1
 	_vbroadcast128	-5*16(KEY), KEY2
+.Laes192\@:
 	_vbroadcast128	-4*16(KEY), KEY3
 	_vbroadcast128	-3*16(KEY), KEY4
+.Laes128\@:
 	_vbroadcast128	-2*16(KEY), KEY5
 	_vbroadcast128	-1*16(KEY), KEY6
 	_vbroadcast128	0*16(KEY), KEY7
 	_vbroadcast128	1*16(KEY), KEY8
 	_vbroadcast128	2*16(KEY), KEY9
 	_vbroadcast128	3*16(KEY), KEY10
-	// Note: if it's AES-128 or AES-192, the last several round keys won't
-	// be used.  We do the loads anyway to save a conditional jump.
 	_vbroadcast128	4*16(KEY), KEY11
 	_vbroadcast128	5*16(KEY), KEY12
 	_vbroadcast128	6*16(KEY), KEY13
 	_vbroadcast128	7*16(KEY), KEY14
 .endif
@@ -464,26 +505,26 @@
 .endif
 .endm
 
 // Do a single round of AES en/decryption on the blocks in registers V0-V3,
 // using the same key for all blocks.  The round key is loaded from the
-// appropriate register or memory location for round \i.  In addition, does step
-// \i of the computation of the next set of tweaks.  May clobber V4.
+// appropriate register or memory location for round \i.  In addition, does two
+// steps of the computation of the next set of tweaks.  May clobber V4.
 .macro	_vaes_4x	enc, last, i
 .if USE_AVX10
-	_tweak_step	(2*(\i-1))
+	_tweak_step	(2*(\i-5))
 	_vaes		\enc, \last, KEY\i, V0
 	_vaes		\enc, \last, KEY\i, V1
-	_tweak_step	(2*(\i-1) + 1)
+	_tweak_step	(2*(\i-5) + 1)
 	_vaes		\enc, \last, KEY\i, V2
 	_vaes		\enc, \last, KEY\i, V3
 .else
 	_vbroadcast128	(\i-7)*16(KEY), V4
-	_tweak_step	(2*(\i-1))
+	_tweak_step	(2*(\i-5))
 	_vaes		\enc, \last, V4, V0
 	_vaes		\enc, \last, V4, V1
-	_tweak_step	(2*(\i-1) + 1)
+	_tweak_step	(2*(\i-5) + 1)
 	_vaes		\enc, \last, V4, V2
 	_vaes		\enc, \last, V4, V3
 .endif
 .endm
 
@@ -491,78 +532,62 @@
 // then XOR with \tweak again) of the block(s) in \data.  To process a single
 // block, use xmm registers and set \xmm_suffix=_XMM.  To process a vector of
 // length VL, use V* registers and leave \xmm_suffix empty.  May clobber V4.
 .macro	_aes_crypt	enc, xmm_suffix, tweak, data
 	_xor3		KEY0\xmm_suffix, \tweak, \data
+	cmp		$24, KEYLEN
+	jl		.Laes128\@
+	je		.Laes192\@
 	_vaes_1x	\enc, 0, 1, \xmm_suffix, \data
 	_vaes_1x	\enc, 0, 2, \xmm_suffix, \data
+.Laes192\@:
 	_vaes_1x	\enc, 0, 3, \xmm_suffix, \data
 	_vaes_1x	\enc, 0, 4, \xmm_suffix, \data
+.Laes128\@:
 	_vaes_1x	\enc, 0, 5, \xmm_suffix, \data
 	_vaes_1x	\enc, 0, 6, \xmm_suffix, \data
 	_vaes_1x	\enc, 0, 7, \xmm_suffix, \data
 	_vaes_1x	\enc, 0, 8, \xmm_suffix, \data
 	_vaes_1x	\enc, 0, 9, \xmm_suffix, \data
-	cmp		$24, KEYLEN
-	jle		.Laes_128_or_192\@
 	_vaes_1x	\enc, 0, 10, \xmm_suffix, \data
 	_vaes_1x	\enc, 0, 11, \xmm_suffix, \data
 	_vaes_1x	\enc, 0, 12, \xmm_suffix, \data
 	_vaes_1x	\enc, 0, 13, \xmm_suffix, \data
 	_vaes_1x	\enc, 1, 14, \xmm_suffix, \data
-	jmp		.Laes_done\@
-.Laes_128_or_192\@:
-	je		.Laes_192\@
-	_vaes_1x	\enc, 1, 10, \xmm_suffix, \data
-	jmp		.Laes_done\@
-.Laes_192\@:
-	_vaes_1x	\enc, 0, 10, \xmm_suffix, \data
-	_vaes_1x	\enc, 0, 11, \xmm_suffix, \data
-	_vaes_1x	\enc, 1, 12, \xmm_suffix, \data
-.Laes_done\@:
 	_vpxor		\tweak, \data, \data
 .endm
 
 .macro	_aes_xts_crypt	enc
 	_define_aliases
 
 	// Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
 	movl		480(KEY), KEYLEN
 
-	// Advance KEY to point to the 7th encryption round key (if encrypting)
-	// or the 7th decryption round key (if decrypting).  This makes the
-	// offset to any round key be in the range [-112, 112], fitting in a
-	// signed byte.  This shortens VEX-encoded instructions that access the
-	// 8th and later round keys which otherwise would need 4-byte offsets.
-.if \enc
-	add		$7*16, KEY
-.else
-	add		$(15+7)*16, KEY
-
+.if !\enc
 	// When decrypting a message whose length isn't a multiple of the AES
 	// block length, exclude the last full block from the main loop by
 	// subtracting 16 from LEN.  This is needed because ciphertext stealing
 	// decryption uses the last two tweaks in reverse order.  We'll handle
 	// the last full block and the partial block specially at the end.
 	test		$15, LEN
 	jnz		.Lneed_cts_dec\@
 .Lxts_init\@:
 .endif
 
-	// Cache as many round keys as possible.
-	_load_round_keys
+	// Setup the pointer to the round keys and cache as many as possible.
+	_setup_round_keys	\enc
 
 	// Compute the first set of tweaks TWEAK[0-3].
 	_compute_first_set_of_tweaks
 
 	sub		$4*VL, LEN
 	jl		.Lhandle_remainder\@
 
 .Lmain_loop\@:
 	// This is the main loop, en/decrypting 4*VL bytes per iteration.
 
-	// XOR each source block with its tweak and the first round key.
+	// XOR each source block with its tweak and the zero-th round key.
 .if USE_AVX10
 	vmovdqu8	0*VL(SRC), V0
 	vmovdqu8	1*VL(SRC), V1
 	vmovdqu8	2*VL(SRC), V2
 	vmovdqu8	3*VL(SRC), V3
@@ -578,31 +603,31 @@
 	vpxor		TWEAK0, V0, V0
 	vpxor		TWEAK1, V1, V1
 	vpxor		TWEAK2, V2, V2
 	vpxor		TWEAK3, V3, V3
 .endif
+	cmp		$24, KEYLEN
+	jl		.Laes128\@
+	je		.Laes192\@
 	// Do all the AES rounds on the data blocks, interleaved with
 	// the computation of the next set of tweaks.
 	_vaes_4x	\enc, 0, 1
 	_vaes_4x	\enc, 0, 2
+.Laes192\@:
 	_vaes_4x	\enc, 0, 3
 	_vaes_4x	\enc, 0, 4
+.Laes128\@:
 	_vaes_4x	\enc, 0, 5
 	_vaes_4x	\enc, 0, 6
 	_vaes_4x	\enc, 0, 7
 	_vaes_4x	\enc, 0, 8
 	_vaes_4x	\enc, 0, 9
-	// Try to optimize for AES-256 by keeping the code for AES-128 and
-	// AES-192 out-of-line.
-	cmp		$24, KEYLEN
-	jle		.Lencrypt_4x_aes_128_or_192\@
 	_vaes_4x	\enc, 0, 10
 	_vaes_4x	\enc, 0, 11
 	_vaes_4x	\enc, 0, 12
 	_vaes_4x	\enc, 0, 13
 	_vaes_4x	\enc, 1, 14
-.Lencrypt_4x_done\@:
 
 	// XOR in the tweaks again.
 	_vpxor		TWEAK0, V0, V0
 	_vpxor		TWEAK1, V1, V1
 	_vpxor		TWEAK2, V2, V2
@@ -676,21 +701,10 @@
 	// process the last 16 + LEN bytes.  If LEN is zero, we're done.
 	test		LEN, LEN
 	jnz		.Lcts\@
 	jmp		.Ldone\@
 
-	// Out-of-line handling of AES-128 and AES-192
-.Lencrypt_4x_aes_128_or_192\@:
-	jz		.Lencrypt_4x_aes_192\@
-	_vaes_4x	\enc, 1, 10
-	jmp		.Lencrypt_4x_done\@
-.Lencrypt_4x_aes_192\@:
-	_vaes_4x	\enc, 0, 10
-	_vaes_4x	\enc, 0, 11
-	_vaes_4x	\enc, 1, 12
-	jmp		.Lencrypt_4x_done\@
-
 .if !\enc
 .Lneed_cts_dec\@:
 	sub		$16, LEN
 	jmp		.Lxts_init\@
 .endif
@@ -762,42 +776,34 @@
 
 // void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
 //			   u8 iv[AES_BLOCK_SIZE]);
 SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
 	vmovdqu		(%rsi), %xmm0
-	add		$7*16, %rdi
-	vpxor		-7*16(%rdi), %xmm0, %xmm0
+	vpxor		(%rdi), %xmm0, %xmm0
+	movl		480(%rdi), %eax		// AES key length
+	lea		-16(%rdi, %rax, 4), %rdi
+	cmp		$24, %eax
+	jl		.Lencrypt_iv_aes128
+	je		.Lencrypt_iv_aes192
 	vaesenc		-6*16(%rdi), %xmm0, %xmm0
 	vaesenc		-5*16(%rdi), %xmm0, %xmm0
+.Lencrypt_iv_aes192:
 	vaesenc		-4*16(%rdi), %xmm0, %xmm0
 	vaesenc		-3*16(%rdi), %xmm0, %xmm0
+.Lencrypt_iv_aes128:
 	vaesenc		-2*16(%rdi), %xmm0, %xmm0
 	vaesenc		-1*16(%rdi), %xmm0, %xmm0
 	vaesenc		0*16(%rdi), %xmm0, %xmm0
 	vaesenc		1*16(%rdi), %xmm0, %xmm0
 	vaesenc		2*16(%rdi), %xmm0, %xmm0
-	cmpl		$24, 480-(7*16)(%rdi)
-	jle		.Lencrypt_iv_aes_128_or_192
 	vaesenc		3*16(%rdi), %xmm0, %xmm0
 	vaesenc		4*16(%rdi), %xmm0, %xmm0
 	vaesenc		5*16(%rdi), %xmm0, %xmm0
 	vaesenc		6*16(%rdi), %xmm0, %xmm0
 	vaesenclast	7*16(%rdi), %xmm0, %xmm0
-.Lencrypt_iv_done:
 	vmovdqu		%xmm0, (%rsi)
 	RET
-
-	// Out-of-line handling of AES-128 and AES-192
-.Lencrypt_iv_aes_128_or_192:
-	jz		.Lencrypt_iv_aes_192
-	vaesenclast	3*16(%rdi), %xmm0, %xmm0
-	jmp		.Lencrypt_iv_done
-.Lencrypt_iv_aes_192:
-	vaesenc		3*16(%rdi), %xmm0, %xmm0
-	vaesenc		4*16(%rdi), %xmm0, %xmm0
-	vaesenclast	5*16(%rdi), %xmm0, %xmm0
-	jmp		.Lencrypt_iv_done
 SYM_FUNC_END(aes_xts_encrypt_iv)
 
 // Below are the actual AES-XTS encryption and decryption functions,
 // instantiated from the above macro.  They all have the following prototype:
 //
-- 
2.44.0

