* [PATCH v2] x86-64: use 32-bit XOR to zero registers
@ 2018-07-02 10:31 Jan Beulich
2018-07-03 8:35 ` [tip:x86/asm] x86/asm/64: Use " tip-bot for Jan Beulich
2018-07-04 20:29 ` [PATCH v2] x86-64: use " Pavel Machek
0 siblings, 2 replies; 6+ messages in thread
From: Jan Beulich @ 2018-07-02 10:31 UTC (permalink / raw)
To: mingo, tglx, hpa
Cc: davem, herbert, rjw, Juergen Gross, pavel, linux-kernel, Alok Kataria
Some Intel CPUs don't recognize 64-bit XORs as zeroing idioms. Zeroing
idioms don't require execution bandwidth, as they're being taken care
of in the frontend (through register renaming). Use 32-bit XORs instead.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Explain what zeroing idioms are/do in the description.
---
arch/x86/crypto/aegis128-aesni-asm.S | 2 +-
arch/x86/crypto/aegis128l-aesni-asm.S | 2 +-
arch/x86/crypto/aegis256-aesni-asm.S | 2 +-
arch/x86/crypto/aesni-intel_asm.S | 8 ++++----
arch/x86/crypto/aesni-intel_avx-x86_64.S | 4 ++--
arch/x86/crypto/morus1280-avx2-asm.S | 2 +-
arch/x86/crypto/morus1280-sse2-asm.S | 2 +-
arch/x86/crypto/morus640-sse2-asm.S | 2 +-
arch/x86/crypto/sha1_ssse3_asm.S | 2 +-
arch/x86/kernel/head_64.S | 2 +-
arch/x86/kernel/paravirt_patch_64.c | 2 +-
arch/x86/lib/memcpy_64.S | 2 +-
arch/x86/power/hibernate_asm_64.S | 2 +-
13 files changed, 17 insertions(+), 17 deletions(-)
--- 4.18-rc3/arch/x86/crypto/aegis128-aesni-asm.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/crypto/aegis128-aesni-asm.S
@@ -75,7 +75,7 @@
* %r9
*/
__load_partial:
- xor %r9, %r9
+ xor %r9d, %r9d
pxor MSG, MSG
mov LEN, %r8
--- 4.18-rc3/arch/x86/crypto/aegis128l-aesni-asm.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/crypto/aegis128l-aesni-asm.S
@@ -66,7 +66,7 @@
* %r9
*/
__load_partial:
- xor %r9, %r9
+ xor %r9d, %r9d
pxor MSG0, MSG0
pxor MSG1, MSG1
--- 4.18-rc3/arch/x86/crypto/aegis256-aesni-asm.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/crypto/aegis256-aesni-asm.S
@@ -59,7 +59,7 @@
* %r9
*/
__load_partial:
- xor %r9, %r9
+ xor %r9d, %r9d
pxor MSG, MSG
mov LEN, %r8
--- 4.18-rc3/arch/x86/crypto/aesni-intel_asm.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/crypto/aesni-intel_asm.S
@@ -258,7 +258,7 @@ ALL_F: .octa 0xffffffffffffffffffff
.macro GCM_INIT Iv SUBKEY AAD AADLEN
mov \AADLEN, %r11
mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
- xor %r11, %r11
+ xor %r11d, %r11d
mov %r11, InLen(%arg2) # ctx_data.in_length = 0
mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
@@ -286,7 +286,7 @@ ALL_F: .octa 0xffffffffffffffffffff
movdqu HashKey(%arg2), %xmm13
add %arg5, InLen(%arg2)
- xor %r11, %r11 # initialise the data pointer offset as zero
+ xor %r11d, %r11d # initialise the data pointer offset as zero
PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
sub %r11, %arg5 # sub partial block data used
@@ -702,7 +702,7 @@ _no_extra_mask_1_\@:
# GHASH computation for the last <16 Byte block
GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
- xor %rax,%rax
+ xor %eax, %eax
mov %rax, PBlockLen(%arg2)
jmp _dec_done_\@
@@ -737,7 +737,7 @@ _no_extra_mask_2_\@:
# GHASH computation for the last <16 Byte block
GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
- xor %rax,%rax
+ xor %eax, %eax
mov %rax, PBlockLen(%arg2)
jmp _encode_done_\@
--- 4.18-rc3/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -463,7 +463,7 @@ _get_AAD_rest_final\@:
_get_AAD_done\@:
# initialize the data pointer offset as zero
- xor %r11, %r11
+ xor %r11d, %r11d
# start AES for num_initial_blocks blocks
mov arg5, %rax # rax = *Y0
@@ -1770,7 +1770,7 @@ _get_AAD_rest_final\@:
_get_AAD_done\@:
# initialize the data pointer offset as zero
- xor %r11, %r11
+ xor %r11d, %r11d
# start AES for num_initial_blocks blocks
mov arg5, %rax # rax = *Y0
--- 4.18-rc3/arch/x86/crypto/morus1280-avx2-asm.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/crypto/morus1280-avx2-asm.S
@@ -113,7 +113,7 @@ ENDPROC(__morus1280_update_zero)
* %r9
*/
__load_partial:
- xor %r9, %r9
+ xor %r9d, %r9d
vpxor MSG, MSG, MSG
mov %rcx, %r8
--- 4.18-rc3/arch/x86/crypto/morus1280-sse2-asm.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/crypto/morus1280-sse2-asm.S
@@ -235,7 +235,7 @@ ENDPROC(__morus1280_update_zero)
* %r9
*/
__load_partial:
- xor %r9, %r9
+ xor %r9d, %r9d
pxor MSG_LO, MSG_LO
pxor MSG_HI, MSG_HI
--- 4.18-rc3/arch/x86/crypto/morus640-sse2-asm.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/crypto/morus640-sse2-asm.S
@@ -113,7 +113,7 @@ ENDPROC(__morus640_update_zero)
* %r9
*/
__load_partial:
- xor %r9, %r9
+ xor %r9d, %r9d
pxor MSG, MSG
mov %rcx, %r8
--- 4.18-rc3/arch/x86/crypto/sha1_ssse3_asm.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/crypto/sha1_ssse3_asm.S
@@ -96,7 +96,7 @@
# cleanup workspace
mov $8, %ecx
mov %rsp, %rdi
- xor %rax, %rax
+ xor %eax, %eax
rep stosq
mov %rbp, %rsp # deallocate workspace
--- 4.18-rc3/arch/x86/kernel/head_64.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/kernel/head_64.S
@@ -235,7 +235,7 @@ ENTRY(secondary_startup_64)
* address given in m16:64.
*/
pushq $.Lafter_lret # put return address on stack for unwinder
- xorq %rbp, %rbp # clear frame pointer
+ xorl %ebp, %ebp # clear frame pointer
movq initial_code(%rip), %rax
pushq $__KERNEL_CS # set correct cs
pushq %rax # target address in negative space
--- 4.18-rc3/arch/x86/kernel/paravirt_patch_64.c
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/kernel/paravirt_patch_64.c
@@ -20,7 +20,7 @@ DEF_NATIVE(, mov64, "mov %rdi, %rax");
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)");
-DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %rax, %rax");
+DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %eax, %eax");
#endif
unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
--- 4.18-rc3/arch/x86/lib/memcpy_64.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/lib/memcpy_64.S
@@ -256,7 +256,7 @@ ENTRY(__memcpy_mcsafe)
/* Copy successful. Return zero */
.L_done_memcpy_trap:
- xorq %rax, %rax
+ xorl %eax, %eax
ret
ENDPROC(__memcpy_mcsafe)
EXPORT_SYMBOL_GPL(__memcpy_mcsafe)
--- 4.18-rc3/arch/x86/power/hibernate_asm_64.S
+++ 4.18-rc3-x86_64-32bit-XOR/arch/x86/power/hibernate_asm_64.S
@@ -137,7 +137,7 @@ ENTRY(restore_registers)
/* Saved in save_processor_state. */
lgdt saved_context_gdt_desc(%rax)
- xorq %rax, %rax
+ xorl %eax, %eax
/* tell the hibernation core that we've just restored the memory */
movq %rax, in_suspend(%rip)
^ permalink raw reply [flat|nested] 6+ messages in thread
* [tip:x86/asm] x86/asm/64: Use 32-bit XOR to zero registers
2018-07-02 10:31 [PATCH v2] x86-64: use 32-bit XOR to zero registers Jan Beulich
@ 2018-07-03 8:35 ` tip-bot for Jan Beulich
2018-07-04 20:29 ` [PATCH v2] x86-64: use " Pavel Machek
1 sibling, 0 replies; 6+ messages in thread
From: tip-bot for Jan Beulich @ 2018-07-03 8:35 UTC (permalink / raw)
To: linux-tip-commits
Cc: dvlasenk, hpa, torvalds, luto, jgross, brgerst, jpoimboe, bp,
tglx, peterz, linux-kernel, jbeulich, JBeulich, mingo, akataria
Commit-ID: a7bea8308933aaeea76dad7d42a6e51000417626
Gitweb: https://git.kernel.org/tip/a7bea8308933aaeea76dad7d42a6e51000417626
Author: Jan Beulich <JBeulich@suse.com>
AuthorDate: Mon, 2 Jul 2018 04:31:54 -0600
Committer: Ingo Molnar <mingo@kernel.org>
CommitDate: Tue, 3 Jul 2018 09:59:29 +0200
x86/asm/64: Use 32-bit XOR to zero registers
Some Intel CPUs don't recognize 64-bit XORs as zeroing idioms. Zeroing
idioms don't require execution bandwidth, as they're being taken care
of in the frontend (through register renaming). Use 32-bit XORs instead.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: davem@davemloft.net
Cc: herbert@gondor.apana.org.au
Cc: pavel@ucw.cz
Cc: rjw@rjwysocki.net
Link: http://lkml.kernel.org/r/5B39FF1A02000078001CFB54@prv1-mh.provo.novell.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
arch/x86/crypto/aegis128-aesni-asm.S | 2 +-
arch/x86/crypto/aegis128l-aesni-asm.S | 2 +-
arch/x86/crypto/aegis256-aesni-asm.S | 2 +-
arch/x86/crypto/aesni-intel_asm.S | 8 ++++----
arch/x86/crypto/aesni-intel_avx-x86_64.S | 4 ++--
arch/x86/crypto/morus1280-avx2-asm.S | 2 +-
arch/x86/crypto/morus1280-sse2-asm.S | 2 +-
arch/x86/crypto/morus640-sse2-asm.S | 2 +-
arch/x86/crypto/sha1_ssse3_asm.S | 2 +-
arch/x86/kernel/head_64.S | 2 +-
arch/x86/kernel/paravirt_patch_64.c | 2 +-
arch/x86/lib/memcpy_64.S | 2 +-
arch/x86/power/hibernate_asm_64.S | 2 +-
13 files changed, 17 insertions(+), 17 deletions(-)
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index 9254e0b6cc06..d5c5e2082ae7 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -75,7 +75,7 @@
* %r9
*/
__load_partial:
- xor %r9, %r9
+ xor %r9d, %r9d
pxor MSG, MSG
mov LEN, %r8
diff --git a/arch/x86/crypto/aegis128l-aesni-asm.S b/arch/x86/crypto/aegis128l-aesni-asm.S
index 9263c344f2c7..0fbdf5f00bda 100644
--- a/arch/x86/crypto/aegis128l-aesni-asm.S
+++ b/arch/x86/crypto/aegis128l-aesni-asm.S
@@ -66,7 +66,7 @@
* %r9
*/
__load_partial:
- xor %r9, %r9
+ xor %r9d, %r9d
pxor MSG0, MSG0
pxor MSG1, MSG1
diff --git a/arch/x86/crypto/aegis256-aesni-asm.S b/arch/x86/crypto/aegis256-aesni-asm.S
index 1d977d515bf9..a49f58e2a5dd 100644
--- a/arch/x86/crypto/aegis256-aesni-asm.S
+++ b/arch/x86/crypto/aegis256-aesni-asm.S
@@ -59,7 +59,7 @@
* %r9
*/
__load_partial:
- xor %r9, %r9
+ xor %r9d, %r9d
pxor MSG, MSG
mov LEN, %r8
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index e762ef417562..9bd139569b41 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -258,7 +258,7 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
.macro GCM_INIT Iv SUBKEY AAD AADLEN
mov \AADLEN, %r11
mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
- xor %r11, %r11
+ xor %r11d, %r11d
mov %r11, InLen(%arg2) # ctx_data.in_length = 0
mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
@@ -286,7 +286,7 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
movdqu HashKey(%arg2), %xmm13
add %arg5, InLen(%arg2)
- xor %r11, %r11 # initialise the data pointer offset as zero
+ xor %r11d, %r11d # initialise the data pointer offset as zero
PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
sub %r11, %arg5 # sub partial block data used
@@ -702,7 +702,7 @@ _no_extra_mask_1_\@:
# GHASH computation for the last <16 Byte block
GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
- xor %rax,%rax
+ xor %eax, %eax
mov %rax, PBlockLen(%arg2)
jmp _dec_done_\@
@@ -737,7 +737,7 @@ _no_extra_mask_2_\@:
# GHASH computation for the last <16 Byte block
GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
- xor %rax,%rax
+ xor %eax, %eax
mov %rax, PBlockLen(%arg2)
jmp _encode_done_\@
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index faecb1518bf8..1985ea0b551b 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -463,7 +463,7 @@ _get_AAD_rest_final\@:
_get_AAD_done\@:
# initialize the data pointer offset as zero
- xor %r11, %r11
+ xor %r11d, %r11d
# start AES for num_initial_blocks blocks
mov arg5, %rax # rax = *Y0
@@ -1770,7 +1770,7 @@ _get_AAD_rest_final\@:
_get_AAD_done\@:
# initialize the data pointer offset as zero
- xor %r11, %r11
+ xor %r11d, %r11d
# start AES for num_initial_blocks blocks
mov arg5, %rax # rax = *Y0
diff --git a/arch/x86/crypto/morus1280-avx2-asm.S b/arch/x86/crypto/morus1280-avx2-asm.S
index 37d422e77931..c3f74913476c 100644
--- a/arch/x86/crypto/morus1280-avx2-asm.S
+++ b/arch/x86/crypto/morus1280-avx2-asm.S
@@ -113,7 +113,7 @@ ENDPROC(__morus1280_update_zero)
* %r9
*/
__load_partial:
- xor %r9, %r9
+ xor %r9d, %r9d
vpxor MSG, MSG, MSG
mov %rcx, %r8
diff --git a/arch/x86/crypto/morus1280-sse2-asm.S b/arch/x86/crypto/morus1280-sse2-asm.S
index 1fe637c7be9d..b3f4d103ba06 100644
--- a/arch/x86/crypto/morus1280-sse2-asm.S
+++ b/arch/x86/crypto/morus1280-sse2-asm.S
@@ -235,7 +235,7 @@ ENDPROC(__morus1280_update_zero)
* %r9
*/
__load_partial:
- xor %r9, %r9
+ xor %r9d, %r9d
pxor MSG_LO, MSG_LO
pxor MSG_HI, MSG_HI
diff --git a/arch/x86/crypto/morus640-sse2-asm.S b/arch/x86/crypto/morus640-sse2-asm.S
index 71c72a0a0862..d2958a47fccc 100644
--- a/arch/x86/crypto/morus640-sse2-asm.S
+++ b/arch/x86/crypto/morus640-sse2-asm.S
@@ -113,7 +113,7 @@ ENDPROC(__morus640_update_zero)
* %r9
*/
__load_partial:
- xor %r9, %r9
+ xor %r9d, %r9d
pxor MSG, MSG
mov %rcx, %r8
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index 6204bd53528c..613d0bfc3d84 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -96,7 +96,7 @@
# cleanup workspace
mov $8, %ecx
mov %rsp, %rdi
- xor %rax, %rax
+ xor %eax, %eax
rep stosq
mov %rbp, %rsp # deallocate workspace
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 8344dd2f310a..15ebc2fc166e 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -235,7 +235,7 @@ ENTRY(secondary_startup_64)
* address given in m16:64.
*/
pushq $.Lafter_lret # put return address on stack for unwinder
- xorq %rbp, %rbp # clear frame pointer
+ xorl %ebp, %ebp # clear frame pointer
movq initial_code(%rip), %rax
pushq $__KERNEL_CS # set correct cs
pushq %rax # target address in negative space
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 9edadabf04f6..9cb98f7b07c9 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -20,7 +20,7 @@ DEF_NATIVE(, mov64, "mov %rdi, %rax");
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)");
-DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %rax, %rax");
+DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %eax, %eax");
#endif
unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 298ef1479240..3b24dc05251c 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -256,7 +256,7 @@ ENTRY(__memcpy_mcsafe)
/* Copy successful. Return zero */
.L_done_memcpy_trap:
- xorq %rax, %rax
+ xorl %eax, %eax
ret
ENDPROC(__memcpy_mcsafe)
EXPORT_SYMBOL_GPL(__memcpy_mcsafe)
diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S
index ce8da3a0412c..fd369a6e9ff8 100644
--- a/arch/x86/power/hibernate_asm_64.S
+++ b/arch/x86/power/hibernate_asm_64.S
@@ -137,7 +137,7 @@ ENTRY(restore_registers)
/* Saved in save_processor_state. */
lgdt saved_context_gdt_desc(%rax)
- xorq %rax, %rax
+ xorl %eax, %eax
/* tell the hibernation core that we've just restored the memory */
movq %rax, in_suspend(%rip)
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [PATCH v2] x86-64: use 32-bit XOR to zero registers
2018-07-02 10:31 [PATCH v2] x86-64: use 32-bit XOR to zero registers Jan Beulich
2018-07-03 8:35 ` [tip:x86/asm] x86/asm/64: Use " tip-bot for Jan Beulich
@ 2018-07-04 20:29 ` Pavel Machek
2018-07-05 7:12 ` Ingo Molnar
1 sibling, 1 reply; 6+ messages in thread
From: Pavel Machek @ 2018-07-04 20:29 UTC (permalink / raw)
To: Jan Beulich
Cc: mingo, tglx, hpa, davem, herbert, rjw, Juergen Gross,
linux-kernel, Alok Kataria
[-- Attachment #1: Type: text/plain, Size: 1002 bytes --]
On Mon 2018-07-02 04:31:54, Jan Beulich wrote:
> Some Intel CPUs don't recognize 64-bit XORs as zeroing idioms. Zeroing
> idioms don't require execution bandwidth, as they're being taken care
> of in the frontend (through register renaming). Use 32-bit XORs instead.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> @@ -702,7 +702,7 @@ _no_extra_mask_1_\@:
>
> # GHASH computation for the last <16 Byte block
> GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
> - xor %rax,%rax
> + xor %eax, %eax
>
> mov %rax, PBlockLen(%arg2)
> jmp _dec_done_\@
This is rather subtle... and looks like a bug. To zero a 64-bit
register, you zero its lower half, relying on implicit zeroing of the
upper half. Wow.
Perhaps we should get comments in the code? Because the explicit code
is more readable...
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH v2] x86-64: use 32-bit XOR to zero registers
2018-07-04 20:29 ` [PATCH v2] x86-64: use " Pavel Machek
@ 2018-07-05 7:12 ` Ingo Molnar
2018-07-09 8:33 ` Pavel Machek
0 siblings, 1 reply; 6+ messages in thread
From: Ingo Molnar @ 2018-07-05 7:12 UTC (permalink / raw)
To: Pavel Machek
Cc: Jan Beulich, mingo, tglx, hpa, davem, herbert, rjw,
Juergen Gross, linux-kernel, Alok Kataria
* Pavel Machek <pavel@ucw.cz> wrote:
> On Mon 2018-07-02 04:31:54, Jan Beulich wrote:
> > Some Intel CPUs don't recognize 64-bit XORs as zeroing idioms. Zeroing
> > idioms don't require execution bandwidth, as they're being taken care
> > of in the frontend (through register renaming). Use 32-bit XORs instead.
> >
> > Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> > @@ -702,7 +702,7 @@ _no_extra_mask_1_\@:
> >
> > # GHASH computation for the last <16 Byte block
> > GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
> > - xor %rax,%rax
> > + xor %eax, %eax
> >
> > mov %rax, PBlockLen(%arg2)
> > jmp _dec_done_\@
>
> This is rather subtle... and looks like a bug. To zero 64-bit
> register, you zero its lower half, relying on implicit zeroing of the
> upper half. Wow.
>
> Perhaps we should get comments in the code? Because the explicit code
> is more readable...
The automatic zero-extension of 32-bit ops to the full 64-bit register is a basic,
fundamental and well-known x86-64 idiom in use in literally hundreds of places in
x86-64 assembly code.
We sometimes document zero-extension on entry boundaries where we want to make it
really clear what information gets (and what doesn't get) into the kernel, but
generally it only needs documentation in the (very rare) cases where it's *not*
done.
Also, why would it be a bug?
Thanks,
Ingo
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH v2] x86-64: use 32-bit XOR to zero registers
2018-07-05 7:12 ` Ingo Molnar
@ 2018-07-09 8:33 ` Pavel Machek
2018-07-09 8:47 ` Jan Beulich
0 siblings, 1 reply; 6+ messages in thread
From: Pavel Machek @ 2018-07-09 8:33 UTC (permalink / raw)
To: Ingo Molnar
Cc: Jan Beulich, mingo, tglx, hpa, davem, herbert, rjw,
Juergen Gross, linux-kernel, Alok Kataria
[-- Attachment #1: Type: text/plain, Size: 2075 bytes --]
On Thu 2018-07-05 09:12:16, Ingo Molnar wrote:
>
> * Pavel Machek <pavel@ucw.cz> wrote:
>
> > On Mon 2018-07-02 04:31:54, Jan Beulich wrote:
> > > Some Intel CPUs don't recognize 64-bit XORs as zeroing idioms. Zeroing
> > > idioms don't require execution bandwidth, as they're being taken care
> > > of in the frontend (through register renaming). Use 32-bit XORs instead.
> > >
> > > Signed-off-by: Jan Beulich <jbeulich@suse.com>
> >
> > > @@ -702,7 +702,7 @@ _no_extra_mask_1_\@:
> > >
> > > # GHASH computation for the last <16 Byte block
> > > GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
> > > - xor %rax,%rax
> > > + xor %eax, %eax
> > >
> > > mov %rax, PBlockLen(%arg2)
> > > jmp _dec_done_\@
> >
> > This is rather subtle... and looks like a bug. To zero 64-bit
> > register, you zero its lower half, relying on implicit zeroing of the
> > upper half. Wow.
> >
> > Perhaps we should get comments in the code? Because the explicit code
> > is more readable...
>
> The automatic zero-extension of 32-bit ops to the full 64-bit register is a basic,
> fundamental and well-known x86-64 idiom in use in literally hundreds of places in
> x86-64 assembly code.
>
> We sometimes document zero-extension on entry boundaries where we want to make it
> really clear what information gets (and what doesn't get) into the kernel, but
> generally it only needs documentation is the (very rare) cases where it's *not*
> done.
>
> Also, why would it be a bug?
Not a bug, just looks like one :-). That's why I'd add a comment.
Anyway, normally the assembler is the one that chooses the instruction
encoding.
xor %rax, %rax is equivalent to xor %eax, %eax; first variant is
slower on some CPUs, second variant will take one extra byte due to
operand size prefix IIRC... Should the assembler generate right
variant according to the CPU type?
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH v2] x86-64: use 32-bit XOR to zero registers
2018-07-09 8:33 ` Pavel Machek
@ 2018-07-09 8:47 ` Jan Beulich
0 siblings, 0 replies; 6+ messages in thread
From: Jan Beulich @ 2018-07-09 8:47 UTC (permalink / raw)
To: pavel
Cc: davem, mingo, herbert, Ingo Molnar, tglx, rjw, Juergen Gross,
linux-kernel, Alok Kataria, hpa
>>> On 09.07.18 at 10:33, <pavel@ucw.cz> wrote:
> Anyway, normally assembler is the one who chooses instruction
> encoding.
There are different possible views here, and I personally think that
while it is a compiler's job to choose optimal encodings, assemblers
shouldn't by default alter what the programmer (or compiler) has
specified, unless there are multiple encodings that are _entirely_
identical in effect, but which are _truly_ different encodings (and
presence/absence of instruction prefixes to me doesn't count as
"truly different"). This view is particularly on the basis that
assembly programmers often imply a certain encoding to be chosen,
be it just to imply size, or to go as far as meaning to run-time patch
code.
> xor %rax, %rax is equivalent to xor %eax, %eax; first variant is
> slower on some CPUs, second variant will take one extra byte due to
> operand size prefix IIRC...
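The second variant is actually shorter by one byte: the 32-bit form needs
no REX.W prefix (31 c0 for "xor %eax, %eax" vs. 48 31 c0 for
"xor %rax, %rax").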
> Should the assembler generate right
> variant according to the CPU type?
An "optimization" mode has recently been added to gas, but in general
as well as for the sake of older gas I don't think we should rely on this
new behavior (which is also off by default) except in cases where the
impact on the source code would be undesirable (as can e.g. be the
case when macro-izing things).
Jan
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2018-07-09 8:47 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-07-02 10:31 [PATCH v2] x86-64: use 32-bit XOR to zero registers Jan Beulich
2018-07-03 8:35 ` [tip:x86/asm] x86/asm/64: Use " tip-bot for Jan Beulich
2018-07-04 20:29 ` [PATCH v2] x86-64: use " Pavel Machek
2018-07-05 7:12 ` Ingo Molnar
2018-07-09 8:33 ` Pavel Machek
2018-07-09 8:47 ` Jan Beulich
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).