linux-kernel.vger.kernel.org archive mirror
* [PATCH v2] x86-64: use 32-bit XOR to zero registers
@ 2018-07-02 10:31 Jan Beulich
  2018-07-03  8:35 ` [tip:x86/asm] x86/asm/64: Use " tip-bot for Jan Beulich
  2018-07-04 20:29 ` [PATCH v2] x86-64: use " Pavel Machek
  0 siblings, 2 replies; 6+ messages in thread
From: Jan Beulich @ 2018-07-02 10:31 UTC (permalink / raw)
  To: mingo, tglx, hpa
  Cc: davem, herbert, rjw, Juergen Gross, pavel, linux-kernel, Alok Kataria

Some Intel CPUs don't recognize 64-bit XORs as zeroing idioms. Zeroing
idioms don't require execution bandwidth, as they're being taken care
of in the frontend (through register renaming). Use 32-bit XORs instead.
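
For illustration (not part of the patch), the two forms in a typical hunk
below assemble as follows with a GAS/AT&T-syntax assembler (byte sequences
shown in the comments):

	xor	%r9, %r9	# 4d 31 c9 - 64-bit form; some Intel CPUs don't
				#   treat this as a zeroing idiom
	xor	%r9d, %r9d	# 45 31 c9 - same length (a REX prefix is still
				#   needed for %r9d), but recognized as a
				#   zeroing idiom; the 32-bit write also clears
				#   bits 63:32 of %r9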

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Explain what zeroing idioms are/do in the description.
---
 arch/x86/crypto/aegis128-aesni-asm.S     |    2 +-
 arch/x86/crypto/aegis128l-aesni-asm.S    |    2 +-
 arch/x86/crypto/aegis256-aesni-asm.S     |    2 +-
 arch/x86/crypto/aesni-intel_asm.S        |    8 ++++----
 arch/x86/crypto/aesni-intel_avx-x86_64.S |    4 ++--
 arch/x86/crypto/morus1280-avx2-asm.S     |    2 +-
 arch/x86/crypto/morus1280-sse2-asm.S     |    2 +-
 arch/x86/crypto/morus640-sse2-asm.S      |    2 +-
 arch/x86/crypto/sha1_ssse3_asm.S         |    2 +-
 arch/x86/kernel/head_64.S                |    2 +-
 arch/x86/kernel/paravirt_patch_64.c      |    2 +-
 arch/x86/lib/memcpy_64.S                 |    2 +-
 arch/x86/power/hibernate_asm_64.S        |    2 +-
 13 files changed, 17 insertions(+), 17 deletions(-)

--- 4.18-rc3/arch/x86/crypto/aegis128-aesni-asm.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/crypto/aegis128-aesni-asm.S
@@ -75,7 +75,7 @@
  *   %r9
  */
 __load_partial:
-	xor %r9, %r9
+	xor %r9d, %r9d
 	pxor MSG, MSG
 
 	mov LEN, %r8
--- 4.18-rc3/arch/x86/crypto/aegis128l-aesni-asm.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/crypto/aegis128l-aesni-asm.S
@@ -66,7 +66,7 @@
  *   %r9
  */
 __load_partial:
-	xor %r9, %r9
+	xor %r9d, %r9d
 	pxor MSG0, MSG0
 	pxor MSG1, MSG1
 
--- 4.18-rc3/arch/x86/crypto/aegis256-aesni-asm.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/crypto/aegis256-aesni-asm.S
@@ -59,7 +59,7 @@
  *   %r9
  */
 __load_partial:
-	xor %r9, %r9
+	xor %r9d, %r9d
 	pxor MSG, MSG
 
 	mov LEN, %r8
--- 4.18-rc3/arch/x86/crypto/aesni-intel_asm.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/crypto/aesni-intel_asm.S
@@ -258,7 +258,7 @@ ALL_F:      .octa 0xffffffffffffffffffff
 .macro GCM_INIT Iv SUBKEY AAD AADLEN
 	mov \AADLEN, %r11
 	mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
-	xor %r11, %r11
+	xor %r11d, %r11d
 	mov %r11, InLen(%arg2) # ctx_data.in_length = 0
 	mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
 	mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
@@ -286,7 +286,7 @@ ALL_F:      .octa 0xffffffffffffffffffff
 	movdqu HashKey(%arg2), %xmm13
 	add %arg5, InLen(%arg2)
 
-	xor %r11, %r11 # initialise the data pointer offset as zero
+	xor %r11d, %r11d # initialise the data pointer offset as zero
 	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
 
 	sub %r11, %arg5		# sub partial block data used
@@ -702,7 +702,7 @@ _no_extra_mask_1_\@:
 
 	# GHASH computation for the last <16 Byte block
 	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
-	xor	%rax,%rax
+	xor	%eax, %eax
 
 	mov	%rax, PBlockLen(%arg2)
 	jmp	_dec_done_\@
@@ -737,7 +737,7 @@ _no_extra_mask_2_\@:
 
 	# GHASH computation for the last <16 Byte block
 	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
-	xor	%rax,%rax
+	xor	%eax, %eax
 
 	mov	%rax, PBlockLen(%arg2)
 	jmp	_encode_done_\@
--- 4.18-rc3/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -463,7 +463,7 @@ _get_AAD_rest_final\@:
 
 _get_AAD_done\@:
 	# initialize the data pointer offset as zero
-	xor     %r11, %r11
+	xor     %r11d, %r11d
 
 	# start AES for num_initial_blocks blocks
 	mov     arg5, %rax                     # rax = *Y0
@@ -1770,7 +1770,7 @@ _get_AAD_rest_final\@:
 
 _get_AAD_done\@:
 	# initialize the data pointer offset as zero
-	xor     %r11, %r11
+	xor     %r11d, %r11d
 
 	# start AES for num_initial_blocks blocks
 	mov     arg5, %rax                     # rax = *Y0
--- 4.18-rc3/arch/x86/crypto/morus1280-avx2-asm.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/crypto/morus1280-avx2-asm.S
@@ -113,7 +113,7 @@ ENDPROC(__morus1280_update_zero)
  *   %r9
  */
 __load_partial:
-	xor %r9, %r9
+	xor %r9d, %r9d
 	vpxor MSG, MSG, MSG
 
 	mov %rcx, %r8
--- 4.18-rc3/arch/x86/crypto/morus1280-sse2-asm.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/crypto/morus1280-sse2-asm.S
@@ -235,7 +235,7 @@ ENDPROC(__morus1280_update_zero)
  *   %r9
  */
 __load_partial:
-	xor %r9, %r9
+	xor %r9d, %r9d
 	pxor MSG_LO, MSG_LO
 	pxor MSG_HI, MSG_HI
 
--- 4.18-rc3/arch/x86/crypto/morus640-sse2-asm.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/crypto/morus640-sse2-asm.S
@@ -113,7 +113,7 @@ ENDPROC(__morus640_update_zero)
  *   %r9
  */
 __load_partial:
-	xor %r9, %r9
+	xor %r9d, %r9d
 	pxor MSG, MSG
 
 	mov %rcx, %r8
--- 4.18-rc3/arch/x86/crypto/sha1_ssse3_asm.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/crypto/sha1_ssse3_asm.S
@@ -96,7 +96,7 @@
 	# cleanup workspace
 	mov	$8, %ecx
 	mov	%rsp, %rdi
-	xor	%rax, %rax
+	xor	%eax, %eax
 	rep stosq
 
 	mov	%rbp, %rsp		# deallocate workspace
--- 4.18-rc3/arch/x86/kernel/head_64.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/kernel/head_64.S
@@ -235,7 +235,7 @@ ENTRY(secondary_startup_64)
 	 *		address given in m16:64.
 	 */
 	pushq	$.Lafter_lret	# put return address on stack for unwinder
-	xorq	%rbp, %rbp	# clear frame pointer
+	xorl	%ebp, %ebp	# clear frame pointer
 	movq	initial_code(%rip), %rax
 	pushq	$__KERNEL_CS	# set correct cs
 	pushq	%rax		# target address in negative space
--- 4.18-rc3/arch/x86/kernel/paravirt_patch_64.c
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/kernel/paravirt_patch_64.c
@@ -20,7 +20,7 @@ DEF_NATIVE(, mov64, "mov %rdi, %rax");
 
 #if defined(CONFIG_PARAVIRT_SPINLOCKS)
 DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)");
-DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %rax, %rax");
+DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %eax, %eax");
 #endif
 
 unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
--- 4.18-rc3/arch/x86/lib/memcpy_64.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/lib/memcpy_64.S
@@ -256,7 +256,7 @@ ENTRY(__memcpy_mcsafe)
 
 	/* Copy successful. Return zero */
 .L_done_memcpy_trap:
-	xorq %rax, %rax
+	xorl %eax, %eax
 	ret
 ENDPROC(__memcpy_mcsafe)
 EXPORT_SYMBOL_GPL(__memcpy_mcsafe)
--- 4.18-rc3/arch/x86/power/hibernate_asm_64.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/power/hibernate_asm_64.S
@@ -137,7 +137,7 @@ ENTRY(restore_registers)
 	/* Saved in save_processor_state. */
 	lgdt	saved_context_gdt_desc(%rax)
 
-	xorq	%rax, %rax
+	xorl	%eax, %eax
 
 	/* tell the hibernation core that we've just restored the memory */
 	movq	%rax, in_suspend(%rip)




* [tip:x86/asm] x86/asm/64: Use 32-bit XOR to zero registers
  2018-07-02 10:31 [PATCH v2] x86-64: use 32-bit XOR to zero registers Jan Beulich
@ 2018-07-03  8:35 ` tip-bot for Jan Beulich
  2018-07-04 20:29 ` [PATCH v2] x86-64: use " Pavel Machek
  1 sibling, 0 replies; 6+ messages in thread
From: tip-bot for Jan Beulich @ 2018-07-03  8:35 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: dvlasenk, hpa, torvalds, luto, jgross, brgerst, jpoimboe, bp,
	tglx, peterz, linux-kernel, jbeulich, JBeulich, mingo, akataria

Commit-ID:  a7bea8308933aaeea76dad7d42a6e51000417626
Gitweb:     https://git.kernel.org/tip/a7bea8308933aaeea76dad7d42a6e51000417626
Author:     Jan Beulich <JBeulich@suse.com>
AuthorDate: Mon, 2 Jul 2018 04:31:54 -0600
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Tue, 3 Jul 2018 09:59:29 +0200

x86/asm/64: Use 32-bit XOR to zero registers

Some Intel CPUs don't recognize 64-bit XORs as zeroing idioms. Zeroing
idioms don't require execution bandwidth, as they're being taken care
of in the frontend (through register renaming). Use 32-bit XORs instead.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: davem@davemloft.net
Cc: herbert@gondor.apana.org.au
Cc: pavel@ucw.cz
Cc: rjw@rjwysocki.net
Link: http://lkml.kernel.org/r/5B39FF1A02000078001CFB54@prv1-mh.provo.novell.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/crypto/aegis128-aesni-asm.S     | 2 +-
 arch/x86/crypto/aegis128l-aesni-asm.S    | 2 +-
 arch/x86/crypto/aegis256-aesni-asm.S     | 2 +-
 arch/x86/crypto/aesni-intel_asm.S        | 8 ++++----
 arch/x86/crypto/aesni-intel_avx-x86_64.S | 4 ++--
 arch/x86/crypto/morus1280-avx2-asm.S     | 2 +-
 arch/x86/crypto/morus1280-sse2-asm.S     | 2 +-
 arch/x86/crypto/morus640-sse2-asm.S      | 2 +-
 arch/x86/crypto/sha1_ssse3_asm.S         | 2 +-
 arch/x86/kernel/head_64.S                | 2 +-
 arch/x86/kernel/paravirt_patch_64.c      | 2 +-
 arch/x86/lib/memcpy_64.S                 | 2 +-
 arch/x86/power/hibernate_asm_64.S        | 2 +-
 13 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index 9254e0b6cc06..d5c5e2082ae7 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -75,7 +75,7 @@
  *   %r9
  */
 __load_partial:
-	xor %r9, %r9
+	xor %r9d, %r9d
 	pxor MSG, MSG
 
 	mov LEN, %r8
diff --git a/arch/x86/crypto/aegis128l-aesni-asm.S b/arch/x86/crypto/aegis128l-aesni-asm.S
index 9263c344f2c7..0fbdf5f00bda 100644
--- a/arch/x86/crypto/aegis128l-aesni-asm.S
+++ b/arch/x86/crypto/aegis128l-aesni-asm.S
@@ -66,7 +66,7 @@
  *   %r9
  */
 __load_partial:
-	xor %r9, %r9
+	xor %r9d, %r9d
 	pxor MSG0, MSG0
 	pxor MSG1, MSG1
 
diff --git a/arch/x86/crypto/aegis256-aesni-asm.S b/arch/x86/crypto/aegis256-aesni-asm.S
index 1d977d515bf9..a49f58e2a5dd 100644
--- a/arch/x86/crypto/aegis256-aesni-asm.S
+++ b/arch/x86/crypto/aegis256-aesni-asm.S
@@ -59,7 +59,7 @@
  *   %r9
  */
 __load_partial:
-	xor %r9, %r9
+	xor %r9d, %r9d
 	pxor MSG, MSG
 
 	mov LEN, %r8
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index e762ef417562..9bd139569b41 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -258,7 +258,7 @@ ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
 .macro GCM_INIT Iv SUBKEY AAD AADLEN
 	mov \AADLEN, %r11
 	mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
-	xor %r11, %r11
+	xor %r11d, %r11d
 	mov %r11, InLen(%arg2) # ctx_data.in_length = 0
 	mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
 	mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
@@ -286,7 +286,7 @@ ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
 	movdqu HashKey(%arg2), %xmm13
 	add %arg5, InLen(%arg2)
 
-	xor %r11, %r11 # initialise the data pointer offset as zero
+	xor %r11d, %r11d # initialise the data pointer offset as zero
 	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
 
 	sub %r11, %arg5		# sub partial block data used
@@ -702,7 +702,7 @@ _no_extra_mask_1_\@:
 
 	# GHASH computation for the last <16 Byte block
 	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
-	xor	%rax,%rax
+	xor	%eax, %eax
 
 	mov	%rax, PBlockLen(%arg2)
 	jmp	_dec_done_\@
@@ -737,7 +737,7 @@ _no_extra_mask_2_\@:
 
 	# GHASH computation for the last <16 Byte block
 	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
-	xor	%rax,%rax
+	xor	%eax, %eax
 
 	mov	%rax, PBlockLen(%arg2)
 	jmp	_encode_done_\@
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index faecb1518bf8..1985ea0b551b 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -463,7 +463,7 @@ _get_AAD_rest_final\@:
 
 _get_AAD_done\@:
 	# initialize the data pointer offset as zero
-	xor     %r11, %r11
+	xor     %r11d, %r11d
 
 	# start AES for num_initial_blocks blocks
 	mov     arg5, %rax                     # rax = *Y0
@@ -1770,7 +1770,7 @@ _get_AAD_rest_final\@:
 
 _get_AAD_done\@:
 	# initialize the data pointer offset as zero
-	xor     %r11, %r11
+	xor     %r11d, %r11d
 
 	# start AES for num_initial_blocks blocks
 	mov     arg5, %rax                     # rax = *Y0
diff --git a/arch/x86/crypto/morus1280-avx2-asm.S b/arch/x86/crypto/morus1280-avx2-asm.S
index 37d422e77931..c3f74913476c 100644
--- a/arch/x86/crypto/morus1280-avx2-asm.S
+++ b/arch/x86/crypto/morus1280-avx2-asm.S
@@ -113,7 +113,7 @@ ENDPROC(__morus1280_update_zero)
  *   %r9
  */
 __load_partial:
-	xor %r9, %r9
+	xor %r9d, %r9d
 	vpxor MSG, MSG, MSG
 
 	mov %rcx, %r8
diff --git a/arch/x86/crypto/morus1280-sse2-asm.S b/arch/x86/crypto/morus1280-sse2-asm.S
index 1fe637c7be9d..b3f4d103ba06 100644
--- a/arch/x86/crypto/morus1280-sse2-asm.S
+++ b/arch/x86/crypto/morus1280-sse2-asm.S
@@ -235,7 +235,7 @@ ENDPROC(__morus1280_update_zero)
  *   %r9
  */
 __load_partial:
-	xor %r9, %r9
+	xor %r9d, %r9d
 	pxor MSG_LO, MSG_LO
 	pxor MSG_HI, MSG_HI
 
diff --git a/arch/x86/crypto/morus640-sse2-asm.S b/arch/x86/crypto/morus640-sse2-asm.S
index 71c72a0a0862..d2958a47fccc 100644
--- a/arch/x86/crypto/morus640-sse2-asm.S
+++ b/arch/x86/crypto/morus640-sse2-asm.S
@@ -113,7 +113,7 @@ ENDPROC(__morus640_update_zero)
  *   %r9
  */
 __load_partial:
-	xor %r9, %r9
+	xor %r9d, %r9d
 	pxor MSG, MSG
 
 	mov %rcx, %r8
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index 6204bd53528c..613d0bfc3d84 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -96,7 +96,7 @@
 	# cleanup workspace
 	mov	$8, %ecx
 	mov	%rsp, %rdi
-	xor	%rax, %rax
+	xor	%eax, %eax
 	rep stosq
 
 	mov	%rbp, %rsp		# deallocate workspace
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 8344dd2f310a..15ebc2fc166e 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -235,7 +235,7 @@ ENTRY(secondary_startup_64)
 	 *		address given in m16:64.
 	 */
 	pushq	$.Lafter_lret	# put return address on stack for unwinder
-	xorq	%rbp, %rbp	# clear frame pointer
+	xorl	%ebp, %ebp	# clear frame pointer
 	movq	initial_code(%rip), %rax
 	pushq	$__KERNEL_CS	# set correct cs
 	pushq	%rax		# target address in negative space
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 9edadabf04f6..9cb98f7b07c9 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -20,7 +20,7 @@ DEF_NATIVE(, mov64, "mov %rdi, %rax");
 
 #if defined(CONFIG_PARAVIRT_SPINLOCKS)
 DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)");
-DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %rax, %rax");
+DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %eax, %eax");
 #endif
 
 unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 298ef1479240..3b24dc05251c 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -256,7 +256,7 @@ ENTRY(__memcpy_mcsafe)
 
 	/* Copy successful. Return zero */
 .L_done_memcpy_trap:
-	xorq %rax, %rax
+	xorl %eax, %eax
 	ret
 ENDPROC(__memcpy_mcsafe)
 EXPORT_SYMBOL_GPL(__memcpy_mcsafe)
diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S
index ce8da3a0412c..fd369a6e9ff8 100644
--- a/arch/x86/power/hibernate_asm_64.S
+++ b/arch/x86/power/hibernate_asm_64.S
@@ -137,7 +137,7 @@ ENTRY(restore_registers)
 	/* Saved in save_processor_state. */
 	lgdt	saved_context_gdt_desc(%rax)
 
-	xorq	%rax, %rax
+	xorl	%eax, %eax
 
 	/* tell the hibernation core that we've just restored the memory */
 	movq	%rax, in_suspend(%rip)


* Re: [PATCH v2] x86-64: use 32-bit XOR to zero registers
  2018-07-02 10:31 [PATCH v2] x86-64: use 32-bit XOR to zero registers Jan Beulich
  2018-07-03  8:35 ` [tip:x86/asm] x86/asm/64: Use " tip-bot for Jan Beulich
@ 2018-07-04 20:29 ` Pavel Machek
  2018-07-05  7:12   ` Ingo Molnar
  1 sibling, 1 reply; 6+ messages in thread
From: Pavel Machek @ 2018-07-04 20:29 UTC (permalink / raw)
  To: Jan Beulich
  Cc: mingo, tglx, hpa, davem, herbert, rjw, Juergen Gross,
	linux-kernel, Alok Kataria


On Mon 2018-07-02 04:31:54, Jan Beulich wrote:
> Some Intel CPUs don't recognize 64-bit XORs as zeroing idioms. Zeroing
> idioms don't require execution bandwidth, as they're being taken care
> of in the frontend (through register renaming). Use 32-bit XORs instead.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

> @@ -702,7 +702,7 @@ _no_extra_mask_1_\@:
>  
>  	# GHASH computation for the last <16 Byte block
>  	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
> -	xor	%rax,%rax
> +	xor	%eax, %eax
>  
>  	mov	%rax, PBlockLen(%arg2)
>  	jmp	_dec_done_\@

This is rather subtle... and looks like a bug. To zero a 64-bit
register, you zero its lower half, relying on implicit zeroing of the
upper half. Wow.
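
A minimal sketch of the behaviour being relied on (the store is the one
from the hunk quoted above; the initial value is arbitrary):

	mov	$-1, %rax		# %rax = 0xffffffffffffffff
	xor	%eax, %eax		# 32-bit write: bits 63:32 of %rax are
					#   cleared implicitly, so %rax == 0
	mov	%rax, PBlockLen(%arg2)	# stores a full 64-bit zero, as before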

Perhaps we should get comments in the code? Because the explicit code
is more readable...
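
E.g. something as small as (a hypothetical comment, not from the patch):

	xor	%r9d, %r9d	# zeroing idiom; clears all 64 bits of %r9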

									Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html


* Re: [PATCH v2] x86-64: use 32-bit XOR to zero registers
  2018-07-04 20:29 ` [PATCH v2] x86-64: use " Pavel Machek
@ 2018-07-05  7:12   ` Ingo Molnar
  2018-07-09  8:33     ` Pavel Machek
  0 siblings, 1 reply; 6+ messages in thread
From: Ingo Molnar @ 2018-07-05  7:12 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Jan Beulich, mingo, tglx, hpa, davem, herbert, rjw,
	Juergen Gross, linux-kernel, Alok Kataria


* Pavel Machek <pavel@ucw.cz> wrote:

> On Mon 2018-07-02 04:31:54, Jan Beulich wrote:
> > Some Intel CPUs don't recognize 64-bit XORs as zeroing idioms. Zeroing
> > idioms don't require execution bandwidth, as they're being taken care
> > of in the frontend (through register renaming). Use 32-bit XORs instead.
> > 
> > Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> > @@ -702,7 +702,7 @@ _no_extra_mask_1_\@:
> >  
> >  	# GHASH computation for the last <16 Byte block
> >  	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
> > -	xor	%rax,%rax
> > +	xor	%eax, %eax
> >  
> >  	mov	%rax, PBlockLen(%arg2)
> >  	jmp	_dec_done_\@
> 
> This is rather subtle... and looks like a bug. To zero a 64-bit
> register, you zero its lower half, relying on implicit zeroing of the
> upper half. Wow.
> 
> Perhaps we should get comments in the code? Because the explicit code
> is more readable...

The automatic zero-extension of 32-bit ops to the full 64-bit register is a basic, 
fundamental and well-known x86-64 idiom in use in literally hundreds of places in 
x86-64 assembly code.
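
For instance, compilers emit it for something as trivial as returning zero
(a sketch; the exact output depends on compiler and options):

	/* int ret_zero(void) { return 0; } typically compiles to: */
	ret_zero:
		xor	%eax, %eax	# zero-extends into the full %rax
		ret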

We sometimes document zero-extension on entry boundaries where we want to make it 
really clear what information gets (and what doesn't get) into the kernel, but 
generally it only needs documentation in the (very rare) cases where it's *not* 
done.

Also, why would it be a bug?

Thanks,

	Ingo


* Re: [PATCH v2] x86-64: use 32-bit XOR to zero registers
  2018-07-05  7:12   ` Ingo Molnar
@ 2018-07-09  8:33     ` Pavel Machek
  2018-07-09  8:47       ` Jan Beulich
  0 siblings, 1 reply; 6+ messages in thread
From: Pavel Machek @ 2018-07-09  8:33 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Jan Beulich, mingo, tglx, hpa, davem, herbert, rjw,
	Juergen Gross, linux-kernel, Alok Kataria


On Thu 2018-07-05 09:12:16, Ingo Molnar wrote:
> 
> * Pavel Machek <pavel@ucw.cz> wrote:
> 
> > On Mon 2018-07-02 04:31:54, Jan Beulich wrote:
> > > Some Intel CPUs don't recognize 64-bit XORs as zeroing idioms. Zeroing
> > > idioms don't require execution bandwidth, as they're being taken care
> > > of in the frontend (through register renaming). Use 32-bit XORs instead.
> > > 
> > > Signed-off-by: Jan Beulich <jbeulich@suse.com>
> > 
> > > @@ -702,7 +702,7 @@ _no_extra_mask_1_\@:
> > >  
> > >  	# GHASH computation for the last <16 Byte block
> > >  	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
> > > -	xor	%rax,%rax
> > > +	xor	%eax, %eax
> > >  
> > >  	mov	%rax, PBlockLen(%arg2)
> > >  	jmp	_dec_done_\@
> > 
> > This is rather subtle... and looks like a bug. To zero a 64-bit
> > register, you zero its lower half, relying on implicit zeroing of the
> > upper half. Wow.
> > 
> > Perhaps we should get comments in the code? Because the explicit code
> > is more readable...
> 
> The automatic zero-extension of 32-bit ops to the full 64-bit register is a basic, 
> fundamental and well-known x86-64 idiom in use in literally hundreds of places in 
> x86-64 assembly code.
> 
> We sometimes document zero-extension on entry boundaries where we want to make it 
> really clear what information gets (and what doesn't get) into the kernel, but 
> generally it only needs documentation in the (very rare) cases where it's *not* 
> done.
> 
> Also, why would it be a bug?

Not a bug, just looks like one :-). That's why I'd add a comment.

Anyway, normally the assembler is the one that chooses the instruction
encoding.

xor %rax, %rax is equivalent to xor %eax, %eax; the first variant is
slower on some CPUs, the second variant will take one extra byte due to
an operand size prefix IIRC... Should the assembler generate the right
variant according to the CPU type?
									Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html


* Re: [PATCH v2] x86-64: use 32-bit XOR to zero registers
  2018-07-09  8:33     ` Pavel Machek
@ 2018-07-09  8:47       ` Jan Beulich
  0 siblings, 0 replies; 6+ messages in thread
From: Jan Beulich @ 2018-07-09  8:47 UTC (permalink / raw)
  To: pavel
  Cc: davem, mingo, herbert, Ingo Molnar, tglx, rjw, Juergen Gross,
	linux-kernel, Alok Kataria, hpa

>>> On 09.07.18 at 10:33, <pavel@ucw.cz> wrote:
> Anyway, normally the assembler is the one that chooses the instruction
> encoding.

There are different possible views here, and I personally think that
while it is a compiler's job to choose optimal encodings, assemblers
shouldn't by default alter what the programmer (or compiler) has
specified, unless there are multiple encodings that are _entirely_
identical in effect, but which are _truly_ different encodings (and
presence/absence of instruction prefixes to me doesn't count as
"truly different"). This view rests in particular on the fact that
assembly programmers often rely on a certain encoding being chosen,
be it just for the sake of size, or even because they mean to
run-time patch the code.

> xor %rax, %rax is equivalent to xor %eax, %eax; the first variant is
> slower on some CPUs, the second variant will take one extra byte due to
> an operand size prefix IIRC...

The second variant is actually shorter by one byte.
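
For reference, the two encodings as a typical assembler emits them (the
64-bit form needs a REX.W prefix; no 0x66 operand-size prefix is involved):

	xor	%rax, %rax	# 48 31 c0 - 3 bytes
	xor	%eax, %eax	# 31 c0    - 2 bytes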

> Should the assembler generate the right
> variant according to the CPU type?

An "optimization" mode has recently been added to gas, but in general
as well as for the sake of older gas I don't think we should rely on this
new behavior (which is also off by default) except in cases where the
impact on the source code would be undesirable (as can e.g. be the
case when macro-izing things).

Jan



