[PATCH v2] x86-64: use 32-bit XOR to zero registers

* [PATCH v2] x86-64: use 32-bit XOR to zero registers
@ 2018-07-02 10:31 Jan Beulich
  2018-07-03  8:35 ` [tip:x86/asm] x86/asm/64: Use " tip-bot for Jan Beulich
  2018-07-04 20:29 ` [PATCH v2] x86-64: use " Pavel Machek
  0 siblings, 2 replies; 6+ messages in thread
From: Jan Beulich @ 2018-07-02 10:31 UTC (permalink / raw)
  To: mingo, tglx, hpa
  Cc: davem, herbert, rjw, Juergen Gross, pavel, linux-kernel, Alok Kataria

Some Intel CPUs don't recognize 64-bit XORs as zeroing idioms. Zeroing
idioms don't require execution bandwidth, as they are handled in the
frontend (through register renaming). Use 32-bit XORs instead: a write
to a 32-bit register zero-extends into the full 64-bit register, so the
result is identical.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Explain what zeroing idioms are/do in the description.
---
 arch/x86/crypto/aegis128-aesni-asm.S     |    2 +-
 arch/x86/crypto/aegis128l-aesni-asm.S    |    2 +-
 arch/x86/crypto/aegis256-aesni-asm.S     |    2 +-
 arch/x86/crypto/aesni-intel_asm.S        |    8 ++++----
 arch/x86/crypto/aesni-intel_avx-x86_64.S |    4 ++--
 arch/x86/crypto/morus1280-avx2-asm.S     |    2 +-
 arch/x86/crypto/morus1280-sse2-asm.S     |    2 +-
 arch/x86/crypto/morus640-sse2-asm.S      |    2 +-
 arch/x86/crypto/sha1_ssse3_asm.S         |    2 +-
 arch/x86/kernel/head_64.S                |    2 +-
 arch/x86/kernel/paravirt_patch_64.c      |    2 +-
 arch/x86/lib/memcpy_64.S                 |    2 +-
 arch/x86/power/hibernate_asm_64.S        |    2 +-
 13 files changed, 17 insertions(+), 17 deletions(-)

--- 4.18-rc3/arch/x86/crypto/aegis128-aesni-asm.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/crypto/aegis128-aesni-asm.S
@@ -75,7 +75,7 @@
  *   %r9
  */
 __load_partial:
-	xor %r9, %r9
+	xor %r9d, %r9d
 	pxor MSG, MSG
 
 	mov LEN, %r8
--- 4.18-rc3/arch/x86/crypto/aegis128l-aesni-asm.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/crypto/aegis128l-aesni-asm.S
@@ -66,7 +66,7 @@
  *   %r9
  */
 __load_partial:
-	xor %r9, %r9
+	xor %r9d, %r9d
 	pxor MSG0, MSG0
 	pxor MSG1, MSG1
 
--- 4.18-rc3/arch/x86/crypto/aegis256-aesni-asm.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/crypto/aegis256-aesni-asm.S
@@ -59,7 +59,7 @@
  *   %r9
  */
 __load_partial:
-	xor %r9, %r9
+	xor %r9d, %r9d
 	pxor MSG, MSG
 
 	mov LEN, %r8
--- 4.18-rc3/arch/x86/crypto/aesni-intel_asm.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/crypto/aesni-intel_asm.S
@@ -258,7 +258,7 @@ ALL_F:      .octa 0xffffffffffffffffffff
 .macro GCM_INIT Iv SUBKEY AAD AADLEN
 	mov \AADLEN, %r11
 	mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
-	xor %r11, %r11
+	xor %r11d, %r11d
 	mov %r11, InLen(%arg2) # ctx_data.in_length = 0
 	mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
 	mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
@@ -286,7 +286,7 @@ ALL_F:      .octa 0xffffffffffffffffffff
 	movdqu HashKey(%arg2), %xmm13
 	add %arg5, InLen(%arg2)
 
-	xor %r11, %r11 # initialise the data pointer offset as zero
+	xor %r11d, %r11d # initialise the data pointer offset as zero
 	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
 
 	sub %r11, %arg5		# sub partial block data used
@@ -702,7 +702,7 @@ _no_extra_mask_1_\@:
 
 	# GHASH computation for the last <16 Byte block
 	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
-	xor	%rax,%rax
+	xor	%eax, %eax
 
 	mov	%rax, PBlockLen(%arg2)
 	jmp	_dec_done_\@
@@ -737,7 +737,7 @@ _no_extra_mask_2_\@:
 
 	# GHASH computation for the last <16 Byte block
 	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
-	xor	%rax,%rax
+	xor	%eax, %eax
 
 	mov	%rax, PBlockLen(%arg2)
 	jmp	_encode_done_\@
--- 4.18-rc3/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -463,7 +463,7 @@ _get_AAD_rest_final\@:
 
 _get_AAD_done\@:
 	# initialize the data pointer offset as zero
-	xor     %r11, %r11
+	xor     %r11d, %r11d
 
 	# start AES for num_initial_blocks blocks
 	mov     arg5, %rax                     # rax = *Y0
@@ -1770,7 +1770,7 @@ _get_AAD_rest_final\@:
 
 _get_AAD_done\@:
 	# initialize the data pointer offset as zero
-	xor     %r11, %r11
+	xor     %r11d, %r11d
 
 	# start AES for num_initial_blocks blocks
 	mov     arg5, %rax                     # rax = *Y0
--- 4.18-rc3/arch/x86/crypto/morus1280-avx2-asm.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/crypto/morus1280-avx2-asm.S
@@ -113,7 +113,7 @@ ENDPROC(__morus1280_update_zero)
  *   %r9
  */
 __load_partial:
-	xor %r9, %r9
+	xor %r9d, %r9d
 	vpxor MSG, MSG, MSG
 
 	mov %rcx, %r8
--- 4.18-rc3/arch/x86/crypto/morus1280-sse2-asm.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/crypto/morus1280-sse2-asm.S
@@ -235,7 +235,7 @@ ENDPROC(__morus1280_update_zero)
  *   %r9
  */
 __load_partial:
-	xor %r9, %r9
+	xor %r9d, %r9d
 	pxor MSG_LO, MSG_LO
 	pxor MSG_HI, MSG_HI
 
--- 4.18-rc3/arch/x86/crypto/morus640-sse2-asm.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/crypto/morus640-sse2-asm.S
@@ -113,7 +113,7 @@ ENDPROC(__morus640_update_zero)
  *   %r9
  */
 __load_partial:
-	xor %r9, %r9
+	xor %r9d, %r9d
 	pxor MSG, MSG
 
 	mov %rcx, %r8
--- 4.18-rc3/arch/x86/crypto/sha1_ssse3_asm.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/crypto/sha1_ssse3_asm.S
@@ -96,7 +96,7 @@
 	# cleanup workspace
 	mov	$8, %ecx
 	mov	%rsp, %rdi
-	xor	%rax, %rax
+	xor	%eax, %eax
 	rep stosq
 
 	mov	%rbp, %rsp		# deallocate workspace
--- 4.18-rc3/arch/x86/kernel/head_64.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/kernel/head_64.S
@@ -235,7 +235,7 @@ ENTRY(secondary_startup_64)
 	 *		address given in m16:64.
 	 */
 	pushq	$.Lafter_lret	# put return address on stack for unwinder
-	xorq	%rbp, %rbp	# clear frame pointer
+	xorl	%ebp, %ebp	# clear frame pointer
 	movq	initial_code(%rip), %rax
 	pushq	$__KERNEL_CS	# set correct cs
 	pushq	%rax		# target address in negative space
--- 4.18-rc3/arch/x86/kernel/paravirt_patch_64.c
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/kernel/paravirt_patch_64.c
@@ -20,7 +20,7 @@ DEF_NATIVE(, mov64, "mov %rdi, %rax");
 
 #if defined(CONFIG_PARAVIRT_SPINLOCKS)
 DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)");
-DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %rax, %rax");
+DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %eax, %eax");
 #endif
 
 unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
--- 4.18-rc3/arch/x86/lib/memcpy_64.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/lib/memcpy_64.S
@@ -256,7 +256,7 @@ ENTRY(__memcpy_mcsafe)
 
 	/* Copy successful. Return zero */
 .L_done_memcpy_trap:
-	xorq %rax, %rax
+	xorl %eax, %eax
 	ret
 ENDPROC(__memcpy_mcsafe)
 EXPORT_SYMBOL_GPL(__memcpy_mcsafe)
--- 4.18-rc3/arch/x86/power/hibernate_asm_64.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/power/hibernate_asm_64.S
@@ -137,7 +137,7 @@ ENTRY(restore_registers)
 	/* Saved in save_processor_state. */
 	lgdt	saved_context_gdt_desc(%rax)
 
-	xorq	%rax, %rax
+	xorl	%eax, %eax
 
 	/* tell the hibernation core that we've just restored the memory */
 	movq	%rax, in_suspend(%rip)



^ permalink raw reply	[flat|nested] 6+ messages in thread