linux-crypto.vger.kernel.org archive mirror
* [PATCH] crypto/x86: Use XORL r32,32 in curve25519-x86_64.c
@ 2020-08-27 17:30 Uros Bizjak
  2020-09-01 15:39 ` Ard Biesheuvel
                   ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: Uros Bizjak @ 2020-08-27 17:30 UTC (permalink / raw)
  To: linux-crypto, x86; +Cc: Uros Bizjak, Herbert Xu, David S. Miller

x86_64 zero-extends 32-bit operations, so for 64-bit operands,
XORL r32,r32 is functionally equal to XORQ r64,r64, but avoids
a REX prefix byte when legacy registers are used.
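
For illustration, a minimal encoding sketch for a legacy register
(AT&T syntax, instruction bytes shown as comments):

  xorq %rcx, %rcx     # 48 31 c9   (3 bytes, REX.W prefix)
  xorl %ecx, %ecx     #    31 c9   (2 bytes)

Both clear all 64 bits of %rcx, since a 32-bit write zero-extends into
the upper half of the register.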

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
---
 arch/x86/crypto/curve25519-x86_64.c | 68 ++++++++++++++---------------
 1 file changed, 34 insertions(+), 34 deletions(-)

diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c
index 8acbb6584a37..a9edb6f8a0ba 100644
--- a/arch/x86/crypto/curve25519-x86_64.c
+++ b/arch/x86/crypto/curve25519-x86_64.c
@@ -45,11 +45,11 @@ static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
 
 	asm volatile(
 		/* Clear registers to propagate the carry bit */
-		"  xor %%r8, %%r8;"
-		"  xor %%r9, %%r9;"
-		"  xor %%r10, %%r10;"
-		"  xor %%r11, %%r11;"
-		"  xor %1, %1;"
+		"  xor %%r8d, %%r8d;"
+		"  xor %%r9d, %%r9d;"
+		"  xor %%r10d, %%r10d;"
+		"  xor %%r11d, %%r11d;"
+		"  xor %k1, %k1;"
 
 		/* Begin addition chain */
 		"  addq 0(%3), %0;"
@@ -93,7 +93,7 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
 		"  cmovc %0, %%rax;"
 
 		/* Step 2: Add carry*38 to the original sum */
-		"  xor %%rcx, %%rcx;"
+		"  xor %%ecx, %%ecx;"
 		"  add %%rax, %%r8;"
 		"  adcx %%rcx, %%r9;"
 		"  movq %%r9, 8(%1);"
@@ -165,28 +165,28 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
 
 		/* Compute src1[0] * src2 */
 		"  movq 0(%1), %%rdx;"
-		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
+		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  movq %%r8, 0(%0);"
 		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
 		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
 		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
 		                                   "  adox %%rdx, %%rax;"
 		/* Compute src1[1] * src2 */
 		"  movq 8(%1), %%rdx;"
-		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
+		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
 		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
 		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
 		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
 		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
 		/* Compute src1[2] * src2 */
 		"  movq 16(%1), %%rdx;"
-		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
+		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 16(%0), %%r8;"   "  movq %%r8, 16(%0);"
 		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
 		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
 		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
 		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
 		/* Compute src1[3] * src2 */
 		"  movq 24(%1), %%rdx;"
-		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
+		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 24(%0), %%r8;"   "  movq %%r8, 24(%0);"
 		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
 		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
 		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
@@ -200,7 +200,7 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
 		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		"  mov $38, %%rdx;"
 		"  mulxq 32(%1), %%r8, %%r13;"
-		"  xor %3, %3;"
+		"  xor %k3, %k3;"
 		"  adoxq 0(%1), %%r8;"
 		"  mulxq 40(%1), %%r9, %%rbx;"
 		"  adcx %%r13, %%r9;"
@@ -246,28 +246,28 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
 
 		/* Compute src1[0] * src2 */
 		"  movq 0(%1), %%rdx;"
-		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
+		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  movq %%r8, 0(%0);"
 		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
 		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
 		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
 		                                   "  adox %%rdx, %%rax;"
 		/* Compute src1[1] * src2 */
 		"  movq 8(%1), %%rdx;"
-		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
+		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
 		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
 		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
 		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
 		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
 		/* Compute src1[2] * src2 */
 		"  movq 16(%1), %%rdx;"
-		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
+		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 16(%0), %%r8;"   "  movq %%r8, 16(%0);"
 		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
 		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
 		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
 		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
 		/* Compute src1[3] * src2 */
 		"  movq 24(%1), %%rdx;"
-		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
+		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 24(%0), %%r8;"   "  movq %%r8, 24(%0);"
 		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
 		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
 		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
@@ -277,29 +277,29 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
 
 		/* Compute src1[0] * src2 */
 		"  movq 32(%1), %%rdx;"
-		"  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 64(%0);"
-		"  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 72(%0);"
+		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  movq %%r8, 64(%0);"
+		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  movq %%r10, 72(%0);"
 		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
 		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
 		                                   "  adox %%rdx, %%rax;"
 		/* Compute src1[1] * src2 */
 		"  movq 40(%1), %%rdx;"
-		"  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 72(%0), %%r8;"    "  movq %%r8, 72(%0);"
-		"  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 80(%0);"
+		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 72(%0), %%r8;"   "  movq %%r8, 72(%0);"
+		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 80(%0);"
 		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
 		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
 		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
 		/* Compute src1[2] * src2 */
 		"  movq 48(%1), %%rdx;"
-		"  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 80(%0), %%r8;"    "  movq %%r8, 80(%0);"
-		"  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 88(%0);"
+		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 80(%0), %%r8;"   "  movq %%r8, 80(%0);"
+		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 88(%0);"
 		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
 		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
 		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
 		/* Compute src1[3] * src2 */
 		"  movq 56(%1), %%rdx;"
-		"  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 88(%0), %%r8;"    "  movq %%r8, 88(%0);"
-		"  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 96(%0);"
+		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 88(%0), %%r8;"   "  movq %%r8, 88(%0);"
+		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 96(%0);"
 		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 104(%0);"    "  mov $0, %%r8;"
 		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 112(%0);"    "  mov $0, %%rax;"
 		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 120(%0);"
@@ -312,7 +312,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
 		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		"  mov $38, %%rdx;"
 		"  mulxq 32(%1), %%r8, %%r13;"
-		"  xor %3, %3;"
+		"  xor %k3, %k3;"
 		"  adoxq 0(%1), %%r8;"
 		"  mulxq 40(%1), %%r9, %%rbx;"
 		"  adcx %%r13, %%r9;"
@@ -345,7 +345,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
 		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		"  mov $38, %%rdx;"
 		"  mulxq 96(%1), %%r8, %%r13;"
-		"  xor %3, %3;"
+		"  xor %k3, %k3;"
 		"  adoxq 64(%1), %%r8;"
 		"  mulxq 104(%1), %%r9, %%rbx;"
 		"  adcx %%r13, %%r9;"
@@ -516,7 +516,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
 
 		/* Step 1: Compute all partial products */
 		"  movq 0(%1), %%rdx;"                                       /* f[0] */
-		"  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
+		"  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
 		"  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
 		"  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
 		"  movq 24(%1), %%rdx;"                                      /* f[3] */
@@ -526,7 +526,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
 		"  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
 
 		/* Step 2: Compute two parallel carry chains */
-		"  xor %%r15, %%r15;"
+		"  xor %%r15d, %%r15d;"
 		"  adox %%rax, %%r10;"
 		"  adcx %%r8, %%r8;"
 		"  adox %%rcx, %%r11;"
@@ -563,7 +563,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
 		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		"  mov $38, %%rdx;"
 		"  mulxq 32(%1), %%r8, %%r13;"
-		"  xor %%rcx, %%rcx;"
+		"  xor %%ecx, %%ecx;"
 		"  adoxq 0(%1), %%r8;"
 		"  mulxq 40(%1), %%r9, %%rbx;"
 		"  adcx %%r13, %%r9;"
@@ -607,7 +607,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 	asm volatile(
 		/* Step 1: Compute all partial products */
 		"  movq 0(%1), %%rdx;"                                       /* f[0] */
-		"  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
+		"  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
 		"  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
 		"  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
 		"  movq 24(%1), %%rdx;"                                      /* f[3] */
@@ -617,7 +617,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 		"  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
 
 		/* Step 2: Compute two parallel carry chains */
-		"  xor %%r15, %%r15;"
+		"  xor %%r15d, %%r15d;"
 		"  adox %%rax, %%r10;"
 		"  adcx %%r8, %%r8;"
 		"  adox %%rcx, %%r11;"
@@ -647,7 +647,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 
 		/* Step 1: Compute all partial products */
 		"  movq 32(%1), %%rdx;"                                       /* f[0] */
-		"  mulxq 40(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
+		"  mulxq 40(%1), %%r8, %%r14;"     "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
 		"  mulxq 48(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
 		"  mulxq 56(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
 		"  movq 56(%1), %%rdx;"                                      /* f[3] */
@@ -657,7 +657,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 		"  mulxq 48(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
 
 		/* Step 2: Compute two parallel carry chains */
-		"  xor %%r15, %%r15;"
+		"  xor %%r15d, %%r15d;"
 		"  adox %%rax, %%r10;"
 		"  adcx %%r8, %%r8;"
 		"  adox %%rcx, %%r11;"
@@ -692,7 +692,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		"  mov $38, %%rdx;"
 		"  mulxq 32(%1), %%r8, %%r13;"
-		"  xor %%rcx, %%rcx;"
+		"  xor %%ecx, %%ecx;"
 		"  adoxq 0(%1), %%r8;"
 		"  mulxq 40(%1), %%r9, %%rbx;"
 		"  adcx %%r13, %%r9;"
@@ -725,7 +725,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		"  mov $38, %%rdx;"
 		"  mulxq 96(%1), %%r8, %%r13;"
-		"  xor %%rcx, %%rcx;"
+		"  xor %%ecx, %%ecx;"
 		"  adoxq 64(%1), %%r8;"
 		"  mulxq 104(%1), %%r9, %%rbx;"
 		"  adcx %%r13, %%r9;"
-- 
2.26.2



* Re: [PATCH] crypto/x86: Use XORL r32,32 in curve25519-x86_64.c
  2020-08-27 17:30 [PATCH] crypto/x86: Use XORL r32,32 in curve25519-x86_64.c Uros Bizjak
@ 2020-09-01 15:39 ` Ard Biesheuvel
  2020-09-01 18:13   ` Jason A. Donenfeld
  2020-09-07 13:14 ` Jason A. Donenfeld
  2020-09-11  6:56 ` Herbert Xu
  2 siblings, 1 reply; 11+ messages in thread
From: Ard Biesheuvel @ 2020-09-01 15:39 UTC (permalink / raw)
  To: Uros Bizjak, Jason A. Donenfeld
  Cc: Linux Crypto Mailing List, X86 ML, Herbert Xu, David S. Miller

(+ Jason)

On Thu, 27 Aug 2020 at 20:31, Uros Bizjak <ubizjak@gmail.com> wrote:
>
> x86_64 zero-extends 32-bit operations, so for 64-bit operands,
> XORL r32,r32 is functionally equal to XORQ r64,r64, but avoids
> a REX prefix byte when legacy registers are used.
>
> Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> Cc: Herbert Xu <herbert@gondor.apana.org.au>
> Cc: "David S. Miller" <davem@davemloft.net>
> ---
>  arch/x86/crypto/curve25519-x86_64.c | 68 ++++++++++++++---------------
>  1 file changed, 34 insertions(+), 34 deletions(-)
>
> diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c
> index 8acbb6584a37..a9edb6f8a0ba 100644
> --- a/arch/x86/crypto/curve25519-x86_64.c
> +++ b/arch/x86/crypto/curve25519-x86_64.c
> @@ -45,11 +45,11 @@ static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
>
>         asm volatile(
>                 /* Clear registers to propagate the carry bit */
> -               "  xor %%r8, %%r8;"
> -               "  xor %%r9, %%r9;"
> -               "  xor %%r10, %%r10;"
> -               "  xor %%r11, %%r11;"
> -               "  xor %1, %1;"
> +               "  xor %%r8d, %%r8d;"
> +               "  xor %%r9d, %%r9d;"
> +               "  xor %%r10d, %%r10d;"
> +               "  xor %%r11d, %%r11d;"
> +               "  xor %k1, %k1;"
>
>                 /* Begin addition chain */
>                 "  addq 0(%3), %0;"
> @@ -93,7 +93,7 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
>                 "  cmovc %0, %%rax;"
>
>                 /* Step 2: Add carry*38 to the original sum */
> -               "  xor %%rcx, %%rcx;"
> +               "  xor %%ecx, %%ecx;"
>                 "  add %%rax, %%r8;"
>                 "  adcx %%rcx, %%r9;"
>                 "  movq %%r9, 8(%1);"
> @@ -165,28 +165,28 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>
>                 /* Compute src1[0] * src2 */
>                 "  movq 0(%1), %%rdx;"
> -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
> +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  movq %%r8, 0(%0);"
>                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
>                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
>                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
>                                                    "  adox %%rdx, %%rax;"
>                 /* Compute src1[1] * src2 */
>                 "  movq 8(%1), %%rdx;"
> -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
> +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
>                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
>                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
>                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
>                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
>                 /* Compute src1[2] * src2 */
>                 "  movq 16(%1), %%rdx;"
> -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
> +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 16(%0), %%r8;"   "  movq %%r8, 16(%0);"
>                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
>                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
>                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
>                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
>                 /* Compute src1[3] * src2 */
>                 "  movq 24(%1), %%rdx;"
> -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
> +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 24(%0), %%r8;"   "  movq %%r8, 24(%0);"
>                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
>                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
>                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
> @@ -200,7 +200,7 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>                 "  mov $38, %%rdx;"
>                 "  mulxq 32(%1), %%r8, %%r13;"
> -               "  xor %3, %3;"
> +               "  xor %k3, %k3;"
>                 "  adoxq 0(%1), %%r8;"
>                 "  mulxq 40(%1), %%r9, %%rbx;"
>                 "  adcx %%r13, %%r9;"
> @@ -246,28 +246,28 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>
>                 /* Compute src1[0] * src2 */
>                 "  movq 0(%1), %%rdx;"
> -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
> +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  movq %%r8, 0(%0);"
>                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
>                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
>                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
>                                                    "  adox %%rdx, %%rax;"
>                 /* Compute src1[1] * src2 */
>                 "  movq 8(%1), %%rdx;"
> -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
> +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
>                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
>                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
>                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
>                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
>                 /* Compute src1[2] * src2 */
>                 "  movq 16(%1), %%rdx;"
> -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
> +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 16(%0), %%r8;"   "  movq %%r8, 16(%0);"
>                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
>                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
>                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
>                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
>                 /* Compute src1[3] * src2 */
>                 "  movq 24(%1), %%rdx;"
> -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
> +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 24(%0), %%r8;"   "  movq %%r8, 24(%0);"
>                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
>                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
>                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
> @@ -277,29 +277,29 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>
>                 /* Compute src1[0] * src2 */
>                 "  movq 32(%1), %%rdx;"
> -               "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 64(%0);"
> -               "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 72(%0);"
> +               "  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  movq %%r8, 64(%0);"
> +               "  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  movq %%r10, 72(%0);"
>                 "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
>                 "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
>                                                    "  adox %%rdx, %%rax;"
>                 /* Compute src1[1] * src2 */
>                 "  movq 40(%1), %%rdx;"
> -               "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 72(%0), %%r8;"    "  movq %%r8, 72(%0);"
> -               "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 80(%0);"
> +               "  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 72(%0), %%r8;"   "  movq %%r8, 72(%0);"
> +               "  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 80(%0);"
>                 "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
>                 "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
>                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
>                 /* Compute src1[2] * src2 */
>                 "  movq 48(%1), %%rdx;"
> -               "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 80(%0), %%r8;"    "  movq %%r8, 80(%0);"
> -               "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 88(%0);"
> +               "  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 80(%0), %%r8;"   "  movq %%r8, 80(%0);"
> +               "  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 88(%0);"
>                 "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
>                 "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
>                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
>                 /* Compute src1[3] * src2 */
>                 "  movq 56(%1), %%rdx;"
> -               "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 88(%0), %%r8;"    "  movq %%r8, 88(%0);"
> -               "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 96(%0);"
> +               "  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 88(%0), %%r8;"   "  movq %%r8, 88(%0);"
> +               "  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 96(%0);"
>                 "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 104(%0);"    "  mov $0, %%r8;"
>                 "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 112(%0);"    "  mov $0, %%rax;"
>                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 120(%0);"
> @@ -312,7 +312,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>                 "  mov $38, %%rdx;"
>                 "  mulxq 32(%1), %%r8, %%r13;"
> -               "  xor %3, %3;"
> +               "  xor %k3, %k3;"
>                 "  adoxq 0(%1), %%r8;"
>                 "  mulxq 40(%1), %%r9, %%rbx;"
>                 "  adcx %%r13, %%r9;"
> @@ -345,7 +345,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>                 "  mov $38, %%rdx;"
>                 "  mulxq 96(%1), %%r8, %%r13;"
> -               "  xor %3, %3;"
> +               "  xor %k3, %k3;"
>                 "  adoxq 64(%1), %%r8;"
>                 "  mulxq 104(%1), %%r9, %%rbx;"
>                 "  adcx %%r13, %%r9;"
> @@ -516,7 +516,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
>
>                 /* Step 1: Compute all partial products */
>                 "  movq 0(%1), %%rdx;"                                       /* f[0] */
> -               "  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
> +               "  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
>                 "  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
>                 "  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
>                 "  movq 24(%1), %%rdx;"                                      /* f[3] */
> @@ -526,7 +526,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
>                 "  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
>
>                 /* Step 2: Compute two parallel carry chains */
> -               "  xor %%r15, %%r15;"
> +               "  xor %%r15d, %%r15d;"
>                 "  adox %%rax, %%r10;"
>                 "  adcx %%r8, %%r8;"
>                 "  adox %%rcx, %%r11;"
> @@ -563,7 +563,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
>                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>                 "  mov $38, %%rdx;"
>                 "  mulxq 32(%1), %%r8, %%r13;"
> -               "  xor %%rcx, %%rcx;"
> +               "  xor %%ecx, %%ecx;"
>                 "  adoxq 0(%1), %%r8;"
>                 "  mulxq 40(%1), %%r9, %%rbx;"
>                 "  adcx %%r13, %%r9;"
> @@ -607,7 +607,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>         asm volatile(
>                 /* Step 1: Compute all partial products */
>                 "  movq 0(%1), %%rdx;"                                       /* f[0] */
> -               "  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
> +               "  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
>                 "  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
>                 "  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
>                 "  movq 24(%1), %%rdx;"                                      /* f[3] */
> @@ -617,7 +617,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>                 "  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
>
>                 /* Step 2: Compute two parallel carry chains */
> -               "  xor %%r15, %%r15;"
> +               "  xor %%r15d, %%r15d;"
>                 "  adox %%rax, %%r10;"
>                 "  adcx %%r8, %%r8;"
>                 "  adox %%rcx, %%r11;"
> @@ -647,7 +647,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>
>                 /* Step 1: Compute all partial products */
>                 "  movq 32(%1), %%rdx;"                                       /* f[0] */
> -               "  mulxq 40(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
> +               "  mulxq 40(%1), %%r8, %%r14;"     "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
>                 "  mulxq 48(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
>                 "  mulxq 56(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
>                 "  movq 56(%1), %%rdx;"                                      /* f[3] */
> @@ -657,7 +657,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>                 "  mulxq 48(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
>
>                 /* Step 2: Compute two parallel carry chains */
> -               "  xor %%r15, %%r15;"
> +               "  xor %%r15d, %%r15d;"
>                 "  adox %%rax, %%r10;"
>                 "  adcx %%r8, %%r8;"
>                 "  adox %%rcx, %%r11;"
> @@ -692,7 +692,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>                 "  mov $38, %%rdx;"
>                 "  mulxq 32(%1), %%r8, %%r13;"
> -               "  xor %%rcx, %%rcx;"
> +               "  xor %%ecx, %%ecx;"
>                 "  adoxq 0(%1), %%r8;"
>                 "  mulxq 40(%1), %%r9, %%rbx;"
>                 "  adcx %%r13, %%r9;"
> @@ -725,7 +725,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>                 "  mov $38, %%rdx;"
>                 "  mulxq 96(%1), %%r8, %%r13;"
> -               "  xor %%rcx, %%rcx;"
> +               "  xor %%ecx, %%ecx;"
>                 "  adoxq 64(%1), %%r8;"
>                 "  mulxq 104(%1), %%r9, %%rbx;"
>                 "  adcx %%r13, %%r9;"
> --
> 2.26.2
>


* Re: [PATCH] crypto/x86: Use XORL r32,32 in curve25519-x86_64.c
  2020-09-01 15:39 ` Ard Biesheuvel
@ 2020-09-01 18:13   ` Jason A. Donenfeld
  2020-09-01 19:12     ` Jason A. Donenfeld
  0 siblings, 1 reply; 11+ messages in thread
From: Jason A. Donenfeld @ 2020-09-01 18:13 UTC (permalink / raw)
  To: Uros Bizjak, Karthik Bhargavan, Chris.Hawblitzel,
	Jonathan Protzenko, Aymeric Fromherz
  Cc: Linux Crypto Mailing List, X86 ML, Herbert Xu, David S. Miller,
	Ard Biesheuvel

Hi Uros,

Thanks for this patch. This seems correct to me, since those 32-bit
xors indeed clear the top bits anyway, and having smaller code seems
good. But first -- I'm wondering -- have you stuck this into Vale/Hacl
to see if it still checks out there? I'm CC'ing
Karthik/Aymeric/Chris/Jonathan, who might be interested in taking a
look at that. It seems like it might be easy to just special-case this
in Xor64 for the case where the operands are the same. Also, have you
seen any measurable differences when benching this? I can stick it
into kbench9000 to see if you haven't looked yet.

Jason

On Tue, Sep 1, 2020 at 5:46 PM Ard Biesheuvel <ardb@kernel.org> wrote:
>
> (+ Jason)
>
> On Thu, 27 Aug 2020 at 20:31, Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > x86_64 zero-extends 32-bit operations, so for 64-bit operands,
> > XORL r32,r32 is functionally equal to XORQ r64,r64, but avoids
> > a REX prefix byte when legacy registers are used.
> >
> > Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> > Cc: Herbert Xu <herbert@gondor.apana.org.au>
> > Cc: "David S. Miller" <davem@davemloft.net>
> > ---
> >  arch/x86/crypto/curve25519-x86_64.c | 68 ++++++++++++++---------------
> >  1 file changed, 34 insertions(+), 34 deletions(-)
> >
> > diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c
> > index 8acbb6584a37..a9edb6f8a0ba 100644
> > --- a/arch/x86/crypto/curve25519-x86_64.c
> > +++ b/arch/x86/crypto/curve25519-x86_64.c
> > @@ -45,11 +45,11 @@ static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
> >
> >         asm volatile(
> >                 /* Clear registers to propagate the carry bit */
> > -               "  xor %%r8, %%r8;"
> > -               "  xor %%r9, %%r9;"
> > -               "  xor %%r10, %%r10;"
> > -               "  xor %%r11, %%r11;"
> > -               "  xor %1, %1;"
> > +               "  xor %%r8d, %%r8d;"
> > +               "  xor %%r9d, %%r9d;"
> > +               "  xor %%r10d, %%r10d;"
> > +               "  xor %%r11d, %%r11d;"
> > +               "  xor %k1, %k1;"
> >
> >                 /* Begin addition chain */
> >                 "  addq 0(%3), %0;"
> > @@ -93,7 +93,7 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
> >                 "  cmovc %0, %%rax;"
> >
> >                 /* Step 2: Add carry*38 to the original sum */
> > -               "  xor %%rcx, %%rcx;"
> > +               "  xor %%ecx, %%ecx;"
> >                 "  add %%rax, %%r8;"
> >                 "  adcx %%rcx, %%r9;"
> >                 "  movq %%r9, 8(%1);"
> > @@ -165,28 +165,28 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
> >
> >                 /* Compute src1[0] * src2 */
> >                 "  movq 0(%1), %%rdx;"
> > -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
> > +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  movq %%r8, 0(%0);"
> >                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
> >                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
> >                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
> >                                                    "  adox %%rdx, %%rax;"
> >                 /* Compute src1[1] * src2 */
> >                 "  movq 8(%1), %%rdx;"
> > -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
> > +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
> >                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
> >                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
> >                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
> >                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
> >                 /* Compute src1[2] * src2 */
> >                 "  movq 16(%1), %%rdx;"
> > -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
> > +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 16(%0), %%r8;"   "  movq %%r8, 16(%0);"
> >                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
> >                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
> >                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
> >                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
> >                 /* Compute src1[3] * src2 */
> >                 "  movq 24(%1), %%rdx;"
> > -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
> > +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 24(%0), %%r8;"   "  movq %%r8, 24(%0);"
> >                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
> >                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
> >                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
> > @@ -200,7 +200,7 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
> >                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> >                 "  mov $38, %%rdx;"
> >                 "  mulxq 32(%1), %%r8, %%r13;"
> > -               "  xor %3, %3;"
> > +               "  xor %k3, %k3;"
> >                 "  adoxq 0(%1), %%r8;"
> >                 "  mulxq 40(%1), %%r9, %%rbx;"
> >                 "  adcx %%r13, %%r9;"
> > @@ -246,28 +246,28 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
> >
> >                 /* Compute src1[0] * src2 */
> >                 "  movq 0(%1), %%rdx;"
> > -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
> > +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  movq %%r8, 0(%0);"
> >                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
> >                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
> >                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
> >                                                    "  adox %%rdx, %%rax;"
> >                 /* Compute src1[1] * src2 */
> >                 "  movq 8(%1), %%rdx;"
> > -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
> > +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
> >                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
> >                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
> >                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
> >                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
> >                 /* Compute src1[2] * src2 */
> >                 "  movq 16(%1), %%rdx;"
> > -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
> > +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 16(%0), %%r8;"   "  movq %%r8, 16(%0);"
> >                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
> >                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
> >                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
> >                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
> >                 /* Compute src1[3] * src2 */
> >                 "  movq 24(%1), %%rdx;"
> > -               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
> > +               "  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 24(%0), %%r8;"   "  movq %%r8, 24(%0);"
> >                 "  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
> >                 "  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
> >                 "  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
> > @@ -277,29 +277,29 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
> >
> >                 /* Compute src1[0] * src2 */
> >                 "  movq 32(%1), %%rdx;"
> > -               "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 64(%0);"
> > -               "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 72(%0);"
> > +               "  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  movq %%r8, 64(%0);"
> > +               "  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  movq %%r10, 72(%0);"
> >                 "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
> >                 "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
> >                                                    "  adox %%rdx, %%rax;"
> >                 /* Compute src1[1] * src2 */
> >                 "  movq 40(%1), %%rdx;"
> > -               "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 72(%0), %%r8;"    "  movq %%r8, 72(%0);"
> > -               "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 80(%0);"
> > +               "  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 72(%0), %%r8;"   "  movq %%r8, 72(%0);"
> > +               "  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 80(%0);"
> >                 "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
> >                 "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
> >                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
> >                 /* Compute src1[2] * src2 */
> >                 "  movq 48(%1), %%rdx;"
> > -               "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 80(%0), %%r8;"    "  movq %%r8, 80(%0);"
> > -               "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 88(%0);"
> > +               "  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 80(%0), %%r8;"   "  movq %%r8, 80(%0);"
> > +               "  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 88(%0);"
> >                 "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
> >                 "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
> >                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
> >                 /* Compute src1[3] * src2 */
> >                 "  movq 56(%1), %%rdx;"
> > -               "  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 88(%0), %%r8;"    "  movq %%r8, 88(%0);"
> > -               "  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 96(%0);"
> > +               "  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 88(%0), %%r8;"   "  movq %%r8, 88(%0);"
> > +               "  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 96(%0);"
> >                 "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 104(%0);"    "  mov $0, %%r8;"
> >                 "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 112(%0);"    "  mov $0, %%rax;"
> >                                                    "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 120(%0);"
> > @@ -312,7 +312,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
> >                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> >                 "  mov $38, %%rdx;"
> >                 "  mulxq 32(%1), %%r8, %%r13;"
> > -               "  xor %3, %3;"
> > +               "  xor %k3, %k3;"
> >                 "  adoxq 0(%1), %%r8;"
> >                 "  mulxq 40(%1), %%r9, %%rbx;"
> >                 "  adcx %%r13, %%r9;"
> > @@ -345,7 +345,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
> >                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> >                 "  mov $38, %%rdx;"
> >                 "  mulxq 96(%1), %%r8, %%r13;"
> > -               "  xor %3, %3;"
> > +               "  xor %k3, %k3;"
> >                 "  adoxq 64(%1), %%r8;"
> >                 "  mulxq 104(%1), %%r9, %%rbx;"
> >                 "  adcx %%r13, %%r9;"
> > @@ -516,7 +516,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
> >
> >                 /* Step 1: Compute all partial products */
> >                 "  movq 0(%1), %%rdx;"                                       /* f[0] */
> > -               "  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
> > +               "  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
> >                 "  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
> >                 "  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
> >                 "  movq 24(%1), %%rdx;"                                      /* f[3] */
> > @@ -526,7 +526,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
> >                 "  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
> >
> >                 /* Step 2: Compute two parallel carry chains */
> > -               "  xor %%r15, %%r15;"
> > +               "  xor %%r15d, %%r15d;"
> >                 "  adox %%rax, %%r10;"
> >                 "  adcx %%r8, %%r8;"
> >                 "  adox %%rcx, %%r11;"
> > @@ -563,7 +563,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
> >                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> >                 "  mov $38, %%rdx;"
> >                 "  mulxq 32(%1), %%r8, %%r13;"
> > -               "  xor %%rcx, %%rcx;"
> > +               "  xor %%ecx, %%ecx;"
> >                 "  adoxq 0(%1), %%r8;"
> >                 "  mulxq 40(%1), %%r9, %%rbx;"
> >                 "  adcx %%r13, %%r9;"
> > @@ -607,7 +607,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> >         asm volatile(
> >                 /* Step 1: Compute all partial products */
> >                 "  movq 0(%1), %%rdx;"                                       /* f[0] */
> > -               "  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
> > +               "  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
> >                 "  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
> >                 "  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
> >                 "  movq 24(%1), %%rdx;"                                      /* f[3] */
> > @@ -617,7 +617,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> >                 "  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
> >
> >                 /* Step 2: Compute two parallel carry chains */
> > -               "  xor %%r15, %%r15;"
> > +               "  xor %%r15d, %%r15d;"
> >                 "  adox %%rax, %%r10;"
> >                 "  adcx %%r8, %%r8;"
> >                 "  adox %%rcx, %%r11;"
> > @@ -647,7 +647,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> >
> >                 /* Step 1: Compute all partial products */
> >                 "  movq 32(%1), %%rdx;"                                       /* f[0] */
> > -               "  mulxq 40(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
> > +               "  mulxq 40(%1), %%r8, %%r14;"     "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
> >                 "  mulxq 48(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
> >                 "  mulxq 56(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
> >                 "  movq 56(%1), %%rdx;"                                      /* f[3] */
> > @@ -657,7 +657,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> >                 "  mulxq 48(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
> >
> >                 /* Step 2: Compute two parallel carry chains */
> > -               "  xor %%r15, %%r15;"
> > +               "  xor %%r15d, %%r15d;"
> >                 "  adox %%rax, %%r10;"
> >                 "  adcx %%r8, %%r8;"
> >                 "  adox %%rcx, %%r11;"
> > @@ -692,7 +692,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> >                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> >                 "  mov $38, %%rdx;"
> >                 "  mulxq 32(%1), %%r8, %%r13;"
> > -               "  xor %%rcx, %%rcx;"
> > +               "  xor %%ecx, %%ecx;"
> >                 "  adoxq 0(%1), %%r8;"
> >                 "  mulxq 40(%1), %%r9, %%rbx;"
> >                 "  adcx %%r13, %%r9;"
> > @@ -725,7 +725,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> >                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> >                 "  mov $38, %%rdx;"
> >                 "  mulxq 96(%1), %%r8, %%r13;"
> > -               "  xor %%rcx, %%rcx;"
> > +               "  xor %%ecx, %%ecx;"
> >                 "  adoxq 64(%1), %%r8;"
> >                 "  mulxq 104(%1), %%r9, %%rbx;"
> >                 "  adcx %%r13, %%r9;"
> > --
> > 2.26.2
> >


* Re: [PATCH] crypto/x86: Use XORL r32,32 in curve25519-x86_64.c
  2020-09-01 18:13   ` Jason A. Donenfeld
@ 2020-09-01 19:12     ` Jason A. Donenfeld
  2020-09-02  5:50       ` Uros Bizjak
  0 siblings, 1 reply; 11+ messages in thread
From: Jason A. Donenfeld @ 2020-09-01 19:12 UTC (permalink / raw)
  To: Uros Bizjak, Karthik Bhargavan, Chris.Hawblitzel,
	Jonathan Protzenko, Aymeric Fromherz
  Cc: Linux Crypto Mailing List, X86 ML, Herbert Xu, David S. Miller,
	Ard Biesheuvel

On Tue, Sep 1, 2020 at 8:13 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> operands are the same. Also, have you seen any measurable differences
> when benching this? I can stick it into kbench9000 to see if you
> haven't looked yet.

On a Skylake server (Xeon Gold 5120), I'm unable to see any measurable
difference with this, at all, no matter how much I median or mean or
reduce noise by disabling interrupts.

One thing that sticks out is that all the replacements of r8-r15 by
their %r8d-r15d counterparts still have the REX prefix, as is
necessary to access those registers. The only ones worth changing,
then, are the legacy registers, and in total, this amounts to only
48 bytes of difference.
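
For instance, a rough encoding sketch of the two cases (AT&T syntax,
instruction bytes as comments; illustrative only):

  xorq %r8, %r8       # 4d 31 c0   (3 bytes)
  xorl %r8d, %r8d     # 45 31 c0   (still 3 bytes; REX is needed to name %r8d)
  xorq %rcx, %rcx     # 48 31 c9   (3 bytes)
  xorl %ecx, %ecx     #    31 c9   (2 bytes)

So the 32-bit form only drops a byte for the legacy registers; for
%r8-%r15 it merely changes the REX byte.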


* Re: [PATCH] crypto/x86: Use XORL r32,32 in curve25519-x86_64.c
  2020-09-01 19:12     ` Jason A. Donenfeld
@ 2020-09-02  5:50       ` Uros Bizjak
  2020-09-02  9:17         ` peterz
  0 siblings, 1 reply; 11+ messages in thread
From: Uros Bizjak @ 2020-09-02  5:50 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Karthik Bhargavan, Chris.Hawblitzel, Jonathan Protzenko,
	Aymeric Fromherz, Linux Crypto Mailing List, X86 ML, Herbert Xu,
	David S. Miller, Ard Biesheuvel

On Tue, Sep 1, 2020 at 9:12 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
>
> On Tue, Sep 1, 2020 at 8:13 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> > operands are the same. Also, have you seen any measurable differences
> > when benching this? I can stick it into kbench9000 to see if you
> > haven't looked yet.
>
> On a Skylake server (Xeon Gold 5120), I'm unable to see any measurable
> difference with this, at all, no matter how much I median or mean or
> reduce noise by disabling interrupts.
>
> One thing that sticks out is that all the replacements of r8-r15 by
> their %r8d-r15d counterparts still have the REX prefix, as is
> necessary to access those registers. The only ones worth changing,
> then, are the legacy registers, and in total, this amounts to only
> 48 bytes of difference.

The patch implements one of the x86 target-specific optimizations
performed by gcc. The optimization results in a code size saving of
one byte, where the REX prefix is omitted with legacy registers, but
otherwise it should have no measurable runtime effect. Since gcc
applies this optimization universally to all integer registers, I took
the same approach and made the same change for both legacy and REX
registers. As measured above, 48 bytes saved is a good result for such
a trivial optimization.
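
As a standalone illustration (not taken from the kernel sources), gcc
already does this when zeroing a 64-bit value in plain C; compiling the
function below with gcc -O2 on x86_64 emits the 2-byte xorl form rather
than a 3-byte xorq:

unsigned long zero_u64(void)
{
	/* gcc -O2 emits: xorl %eax, %eax ; ret */
	return 0;
}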

Uros.

* Re: [PATCH] crypto/x86: Use XORL r32,32 in curve25519-x86_64.c
  2020-09-02  5:50       ` Uros Bizjak
@ 2020-09-02  9:17         ` peterz
  2020-09-02 11:36           ` Uros Bizjak
  2020-09-02 14:58           ` Jason A. Donenfeld
  0 siblings, 2 replies; 11+ messages in thread
From: peterz @ 2020-09-02  9:17 UTC (permalink / raw)
  To: Uros Bizjak
  Cc: Jason A. Donenfeld, Karthik Bhargavan, Chris.Hawblitzel,
	Jonathan Protzenko, Aymeric Fromherz, Linux Crypto Mailing List,
	X86 ML, Herbert Xu, David S. Miller, Ard Biesheuvel

On Wed, Sep 02, 2020 at 07:50:36AM +0200, Uros Bizjak wrote:
> On Tue, Sep 1, 2020 at 9:12 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> >
> > On Tue, Sep 1, 2020 at 8:13 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> > > operands are the same. Also, have you seen any measurable differences
> > > when benching this? I can stick it into kbench9000 to see if you
> > > haven't looked yet.
> >
> > On a Skylake server (Xeon Gold 5120), I'm unable to see any measurable
> > difference with this, at all, no matter how much I median or mean or
> > reduce noise by disabling interrupts.
> >
> > One thing that sticks out is that all the replacements of r8-r15 by
> > their %r8d-r15d counterparts still have the REX prefix, as is
> > necessary to access those registers. The only ones worth changing,
> > then, are the legacy registers, and on a whole, this amounts to only
> > 48 bytes of difference.
> 
> The patch implements one of x86 target specific optimizations,
> performed by gcc. The optimization results in code size savings of one
> byte, where REX prefix is omitted with legacy registers, but otherwise
> should have no measurable runtime effect. Since gcc applies this
> optimization universally to all integer registers, I took the same
> approach and implemented the same change to legacy and REX registers.
> As measured above, 48 bytes saved is a good result for such a trivial
> optimization.

Could we instead implement this optimization in GAS? Then we can leave
the code as-is.

* Re: [PATCH] crypto/x86: Use XORL r32,32 in curve25519-x86_64.c
  2020-09-02  9:17         ` peterz
@ 2020-09-02 11:36           ` Uros Bizjak
  2020-09-02 15:00             ` Jason A. Donenfeld
  2020-09-02 14:58           ` Jason A. Donenfeld
  1 sibling, 1 reply; 11+ messages in thread
From: Uros Bizjak @ 2020-09-02 11:36 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Jason A. Donenfeld, Karthik Bhargavan, Chris.Hawblitzel,
	Jonathan Protzenko, Aymeric Fromherz, Linux Crypto Mailing List,
	X86 ML, Herbert Xu, David S. Miller, Ard Biesheuvel

On Wed, Sep 2, 2020 at 11:17 AM <peterz@infradead.org> wrote:
>
> On Wed, Sep 02, 2020 at 07:50:36AM +0200, Uros Bizjak wrote:
> > On Tue, Sep 1, 2020 at 9:12 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> > >
> > > On Tue, Sep 1, 2020 at 8:13 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> > > > operands are the same. Also, have you seen any measurable differences
> > > > when benching this? I can stick it into kbench9000 to see if you
> > > > haven't looked yet.
> > >
> > > On a Skylake server (Xeon Gold 5120), I'm unable to see any measurable
> > > difference with this, at all, no matter how much I median or mean or
> > > reduce noise by disabling interrupts.
> > >
> > > One thing that sticks out is that all the replacements of r8-r15 by
> > > their %r8d-r15d counterparts still have the REX prefix, as is
> > > necessary to access those registers. The only ones worth changing,
> > > then, are the legacy registers, and on a whole, this amounts to only
> > > 48 bytes of difference.
> >
> > The patch implements one of x86 target specific optimizations,
> > performed by gcc. The optimization results in code size savings of one
> > byte, where REX prefix is omitted with legacy registers, but otherwise
> > should have no measurable runtime effect. Since gcc applies this
> > optimization universally to all integer registers, I took the same
> > approach and implemented the same change to legacy and REX registers.
> > As measured above, 48 bytes saved is a good result for such a trivial
> > optimization.
>
> Could we instead implement this optimization in GAS ? Then we can leave
> the code as-is.

I don't think that e.g. "xorq %rax,%rax" should be universally
translated to "xorl %eax,%eax" by the assembler. Perhaps the author
expected exactly 3 bytes (to align the code or similar), and the
assembler would shrink the instruction to 2 bytes behind their back,
breaking that expectation.
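
A contrived sketch of that kind of expectation (my own example, not real
kernel code): the author counts on this sequence being exactly 8 bytes
and uses that size elsewhere; silently rewriting the xorq as a 2-byte
xorl would shrink it to 7:

static inline void fixed_size_slot(void)
{
	asm volatile(
		"xorq %%rax, %%rax;"	/* 48 31 c0       - counted as 3 bytes */
		"movl $1, %%ecx;"	/* b9 01 00 00 00 - counted as 5 bytes */
		::: "rax", "rcx");
}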

Uros.

* Re: [PATCH] crypto/x86: Use XORL r32,32 in curve25519-x86_64.c
  2020-09-02  9:17         ` peterz
  2020-09-02 11:36           ` Uros Bizjak
@ 2020-09-02 14:58           ` Jason A. Donenfeld
  1 sibling, 0 replies; 11+ messages in thread
From: Jason A. Donenfeld @ 2020-09-02 14:58 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Uros Bizjak, Karthik Bhargavan, Chris.Hawblitzel,
	Jonathan Protzenko, Aymeric Fromherz, Linux Crypto Mailing List,
	X86 ML, Herbert Xu, David S. Miller, Ard Biesheuvel

On Wed, Sep 2, 2020 at 11:44 AM <peterz@infradead.org> wrote:
>
> On Wed, Sep 02, 2020 at 07:50:36AM +0200, Uros Bizjak wrote:
> > On Tue, Sep 1, 2020 at 9:12 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> > >
> > > On Tue, Sep 1, 2020 at 8:13 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> > > > operands are the same. Also, have you seen any measurable differences
> > > > when benching this? I can stick it into kbench9000 to see if you
> > > > haven't looked yet.
> > >
> > > On a Skylake server (Xeon Gold 5120), I'm unable to see any measurable
> > > difference with this, at all, no matter how much I median or mean or
> > > reduce noise by disabling interrupts.
> > >
> > > One thing that sticks out is that all the replacements of r8-r15 by
> > > their %r8d-r15d counterparts still have the REX prefix, as is
> > > necessary to access those registers. The only ones worth changing,
> > > then, are the legacy registers, and on a whole, this amounts to only
> > > 48 bytes of difference.
> >
> > The patch implements one of x86 target specific optimizations,
> > performed by gcc. The optimization results in code size savings of one
> > byte, where REX prefix is omitted with legacy registers, but otherwise
> > should have no measurable runtime effect. Since gcc applies this
> > optimization universally to all integer registers, I took the same
> > approach and implemented the same change to legacy and REX registers.
> > As measured above, 48 bytes saved is a good result for such a trivial
> > optimization.
>
> Could we instead implement this optimization in GAS ? Then we can leave
> the code as-is.

I rather like that idea. Though I wonder if some would balk at it for
smelling a bit like the MIPS assembler with its optimization pass...

* Re: [PATCH] crypto/x86: Use XORL r32,32 in curve25519-x86_64.c
  2020-09-02 11:36           ` Uros Bizjak
@ 2020-09-02 15:00             ` Jason A. Donenfeld
  0 siblings, 0 replies; 11+ messages in thread
From: Jason A. Donenfeld @ 2020-09-02 15:00 UTC (permalink / raw)
  To: Uros Bizjak
  Cc: Peter Zijlstra, Karthik Bhargavan, Chris.Hawblitzel,
	Jonathan Protzenko, Aymeric Fromherz, Linux Crypto Mailing List,
	X86 ML, Herbert Xu, David S. Miller, Ard Biesheuvel

On Wed, Sep 2, 2020 at 1:42 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Wed, Sep 2, 2020 at 11:17 AM <peterz@infradead.org> wrote:
> >
> > On Wed, Sep 02, 2020 at 07:50:36AM +0200, Uros Bizjak wrote:
> > > On Tue, Sep 1, 2020 at 9:12 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> > > >
> > > > On Tue, Sep 1, 2020 at 8:13 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> > > > > operands are the same. Also, have you seen any measurable differences
> > > > > when benching this? I can stick it into kbench9000 to see if you
> > > > > haven't looked yet.
> > > >
> > > > On a Skylake server (Xeon Gold 5120), I'm unable to see any measurable
> > > > difference with this, at all, no matter how much I median or mean or
> > > > reduce noise by disabling interrupts.
> > > >
> > > > One thing that sticks out is that all the replacements of r8-r15 by
> > > > their %r8d-r15d counterparts still have the REX prefix, as is
> > > > necessary to access those registers. The only ones worth changing,
> > > > then, are the legacy registers, and on a whole, this amounts to only
> > > > 48 bytes of difference.
> > >
> > > The patch implements one of x86 target specific optimizations,
> > > performed by gcc. The optimization results in code size savings of one
> > > byte, where REX prefix is omitted with legacy registers, but otherwise
> > > should have no measurable runtime effect. Since gcc applies this
> > > optimization universally to all integer registers, I took the same
> > > approach and implemented the same change to legacy and REX registers.
> > > As measured above, 48 bytes saved is a good result for such a trivial
> > > optimization.
> >
> > Could we instead implement this optimization in GAS ? Then we can leave
> > the code as-is.
>
> I don't think that e.g. "xorq %rax,%rax" should universally be
> translated to "xorl %eax,%eax" in the assembler. Perhaps the author
> expected exactly 3 bytes (to align the code or similar), and the
> assembler would change the length to 2 bytes behind his back, breaking
> the expectations.

Are you sure that's something GAS actually guarantees now? It seems
like a lot of mnemonics already have ambiguous (non-unique) encodings,
and this wouldn't make things any different. Most authors use .byte or
.align when they care about the actual bytes, no?
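
A small sketch of what I mean (mine, not from any real code): when the
exact encoding matters, it tends to be spelled out explicitly instead of
being left to mnemonic selection:

static inline void forced_encoding(void)
{
	/* pin down the 3-byte xorq %rcx, %rcx form, whatever the assembler does */
	asm volatile(".byte 0x48, 0x31, 0xc9" ::: "rcx");
}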

* Re: [PATCH] crypto/x86: Use XORL r32,32 in curve25519-x86_64.c
  2020-08-27 17:30 [PATCH] crypto/x86: Use XORL r32,32 in curve25519-x86_64.c Uros Bizjak
  2020-09-01 15:39 ` Ard Biesheuvel
@ 2020-09-07 13:14 ` Jason A. Donenfeld
  2020-09-11  6:56 ` Herbert Xu
  2 siblings, 0 replies; 11+ messages in thread
From: Jason A. Donenfeld @ 2020-09-07 13:14 UTC (permalink / raw)
  To: Uros Bizjak, herbert
  Cc: linux-crypto, x86, Herbert Xu, David S. Miller, Chris Hawblitzel,
	Aymeric Fromherz, Karthik Bhargavan

Hi Uros, Herbert,

On Thu, Aug 27, 2020 at 07:30:58PM +0200, Uros Bizjak wrote:
> x86_64 zero extends 32bit operations, so for 64bit operands,
> XORL r32,r32 is functionally equal to XORL r64,r64, but avoids
> a REX prefix byte when legacy registers are used.
> 
> Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> Cc: Herbert Xu <herbert@gondor.apana.org.au>
> Cc: "David S. Miller" <davem@davemloft.net>
> ---
>  arch/x86/crypto/curve25519-x86_64.c | 68 ++++++++++++++---------------
>  1 file changed, 34 insertions(+), 34 deletions(-)
> 
> diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c
> index 8acbb6584a37..a9edb6f8a0ba 100644
> --- a/arch/x86/crypto/curve25519-x86_64.c
> +++ b/arch/x86/crypto/curve25519-x86_64.c
> @@ -45,11 +45,11 @@ static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
>  
>  	asm volatile(
>  		/* Clear registers to propagate the carry bit */
> -		"  xor %%r8, %%r8;"
> -		"  xor %%r9, %%r9;"
> -		"  xor %%r10, %%r10;"
> -		"  xor %%r11, %%r11;"
> -		"  xor %1, %1;"
> +		"  xor %%r8d, %%r8d;"
> +		"  xor %%r9d, %%r9d;"
> +		"  xor %%r10d, %%r10d;"
> +		"  xor %%r11d, %%r11d;"
> +		"  xor %k1, %k1;"
>  
>  		/* Begin addition chain */
>  		"  addq 0(%3), %0;"
> @@ -93,7 +93,7 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
>  		"  cmovc %0, %%rax;"
>  
>  		/* Step 2: Add carry*38 to the original sum */
> -		"  xor %%rcx, %%rcx;"
> +		"  xor %%ecx, %%ecx;"
>  		"  add %%rax, %%r8;"
>  		"  adcx %%rcx, %%r9;"
>  		"  movq %%r9, 8(%1);"
> @@ -165,28 +165,28 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>  
>  		/* Compute src1[0] * src2 */
>  		"  movq 0(%1), %%rdx;"
> -		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
> +		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  movq %%r8, 0(%0);"
>  		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
>  		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
>  		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
>  		                                   "  adox %%rdx, %%rax;"
>  		/* Compute src1[1] * src2 */
>  		"  movq 8(%1), %%rdx;"
> -		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
> +		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
>  		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
>  		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
>  		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
>  		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
>  		/* Compute src1[2] * src2 */
>  		"  movq 16(%1), %%rdx;"
> -		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
> +		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 16(%0), %%r8;"   "  movq %%r8, 16(%0);"
>  		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
>  		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
>  		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
>  		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
>  		/* Compute src1[3] * src2 */
>  		"  movq 24(%1), %%rdx;"
> -		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
> +		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 24(%0), %%r8;"   "  movq %%r8, 24(%0);"
>  		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
>  		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
>  		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
> @@ -200,7 +200,7 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>  		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>  		"  mov $38, %%rdx;"
>  		"  mulxq 32(%1), %%r8, %%r13;"
> -		"  xor %3, %3;"
> +		"  xor %k3, %k3;"
>  		"  adoxq 0(%1), %%r8;"
>  		"  mulxq 40(%1), %%r9, %%rbx;"
>  		"  adcx %%r13, %%r9;"
> @@ -246,28 +246,28 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>  
>  		/* Compute src1[0] * src2 */
>  		"  movq 0(%1), %%rdx;"
> -		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
> +		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  movq %%r8, 0(%0);"
>  		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
>  		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
>  		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
>  		                                   "  adox %%rdx, %%rax;"
>  		/* Compute src1[1] * src2 */
>  		"  movq 8(%1), %%rdx;"
> -		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
> +		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
>  		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
>  		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
>  		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
>  		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
>  		/* Compute src1[2] * src2 */
>  		"  movq 16(%1), %%rdx;"
> -		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
> +		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 16(%0), %%r8;"   "  movq %%r8, 16(%0);"
>  		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
>  		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
>  		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
>  		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
>  		/* Compute src1[3] * src2 */
>  		"  movq 24(%1), %%rdx;"
> -		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
> +		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 24(%0), %%r8;"   "  movq %%r8, 24(%0);"
>  		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
>  		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
>  		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
> @@ -277,29 +277,29 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>  
>  		/* Compute src1[0] * src2 */
>  		"  movq 32(%1), %%rdx;"
> -		"  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 64(%0);"
> -		"  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 72(%0);"
> +		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  movq %%r8, 64(%0);"
> +		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  movq %%r10, 72(%0);"
>  		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
>  		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
>  		                                   "  adox %%rdx, %%rax;"
>  		/* Compute src1[1] * src2 */
>  		"  movq 40(%1), %%rdx;"
> -		"  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 72(%0), %%r8;"    "  movq %%r8, 72(%0);"
> -		"  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 80(%0);"
> +		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 72(%0), %%r8;"   "  movq %%r8, 72(%0);"
> +		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 80(%0);"
>  		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
>  		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
>  		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
>  		/* Compute src1[2] * src2 */
>  		"  movq 48(%1), %%rdx;"
> -		"  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 80(%0), %%r8;"    "  movq %%r8, 80(%0);"
> -		"  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 88(%0);"
> +		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 80(%0), %%r8;"   "  movq %%r8, 80(%0);"
> +		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 88(%0);"
>  		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
>  		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
>  		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
>  		/* Compute src1[3] * src2 */
>  		"  movq 56(%1), %%rdx;"
> -		"  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 88(%0), %%r8;"    "  movq %%r8, 88(%0);"
> -		"  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 96(%0);"
> +		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 88(%0), %%r8;"   "  movq %%r8, 88(%0);"
> +		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 96(%0);"
>  		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 104(%0);"    "  mov $0, %%r8;"
>  		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 112(%0);"    "  mov $0, %%rax;"
>  		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 120(%0);"
> @@ -312,7 +312,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>  		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>  		"  mov $38, %%rdx;"
>  		"  mulxq 32(%1), %%r8, %%r13;"
> -		"  xor %3, %3;"
> +		"  xor %k3, %k3;"
>  		"  adoxq 0(%1), %%r8;"
>  		"  mulxq 40(%1), %%r9, %%rbx;"
>  		"  adcx %%r13, %%r9;"
> @@ -345,7 +345,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>  		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>  		"  mov $38, %%rdx;"
>  		"  mulxq 96(%1), %%r8, %%r13;"
> -		"  xor %3, %3;"
> +		"  xor %k3, %k3;"
>  		"  adoxq 64(%1), %%r8;"
>  		"  mulxq 104(%1), %%r9, %%rbx;"
>  		"  adcx %%r13, %%r9;"
> @@ -516,7 +516,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
>  
>  		/* Step 1: Compute all partial products */
>  		"  movq 0(%1), %%rdx;"                                       /* f[0] */
> -		"  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
> +		"  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
>  		"  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
>  		"  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
>  		"  movq 24(%1), %%rdx;"                                      /* f[3] */
> @@ -526,7 +526,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
>  		"  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
>  
>  		/* Step 2: Compute two parallel carry chains */
> -		"  xor %%r15, %%r15;"
> +		"  xor %%r15d, %%r15d;"
>  		"  adox %%rax, %%r10;"
>  		"  adcx %%r8, %%r8;"
>  		"  adox %%rcx, %%r11;"
> @@ -563,7 +563,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
>  		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>  		"  mov $38, %%rdx;"
>  		"  mulxq 32(%1), %%r8, %%r13;"
> -		"  xor %%rcx, %%rcx;"
> +		"  xor %%ecx, %%ecx;"
>  		"  adoxq 0(%1), %%r8;"
>  		"  mulxq 40(%1), %%r9, %%rbx;"
>  		"  adcx %%r13, %%r9;"
> @@ -607,7 +607,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>  	asm volatile(
>  		/* Step 1: Compute all partial products */
>  		"  movq 0(%1), %%rdx;"                                       /* f[0] */
> -		"  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
> +		"  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
>  		"  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
>  		"  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
>  		"  movq 24(%1), %%rdx;"                                      /* f[3] */
> @@ -617,7 +617,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>  		"  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
>  
>  		/* Step 2: Compute two parallel carry chains */
> -		"  xor %%r15, %%r15;"
> +		"  xor %%r15d, %%r15d;"
>  		"  adox %%rax, %%r10;"
>  		"  adcx %%r8, %%r8;"
>  		"  adox %%rcx, %%r11;"
> @@ -647,7 +647,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>  
>  		/* Step 1: Compute all partial products */
>  		"  movq 32(%1), %%rdx;"                                       /* f[0] */
> -		"  mulxq 40(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
> +		"  mulxq 40(%1), %%r8, %%r14;"     "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
>  		"  mulxq 48(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
>  		"  mulxq 56(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
>  		"  movq 56(%1), %%rdx;"                                      /* f[3] */
> @@ -657,7 +657,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>  		"  mulxq 48(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
>  
>  		/* Step 2: Compute two parallel carry chains */
> -		"  xor %%r15, %%r15;"
> +		"  xor %%r15d, %%r15d;"
>  		"  adox %%rax, %%r10;"
>  		"  adcx %%r8, %%r8;"
>  		"  adox %%rcx, %%r11;"
> @@ -692,7 +692,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>  		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>  		"  mov $38, %%rdx;"
>  		"  mulxq 32(%1), %%r8, %%r13;"
> -		"  xor %%rcx, %%rcx;"
> +		"  xor %%ecx, %%ecx;"
>  		"  adoxq 0(%1), %%r8;"
>  		"  mulxq 40(%1), %%r9, %%rbx;"
>  		"  adcx %%r13, %%r9;"
> @@ -725,7 +725,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>  		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>  		"  mov $38, %%rdx;"
>  		"  mulxq 96(%1), %%r8, %%r13;"
> -		"  xor %%rcx, %%rcx;"
> +		"  xor %%ecx, %%ecx;"
>  		"  adoxq 64(%1), %%r8;"
>  		"  mulxq 104(%1), %%r9, %%rbx;"
>  		"  adcx %%r13, %%r9;"
> -- 
> 2.26.2
> 

Looks like this is going into HACL in the end, so:

Acked-by: Jason A. Donenfeld <Jason@zx2c4.com>

for cryptodev-2.6.git, rather than crypto-2.6.git

Thanks,
Jason

* Re: [PATCH] crypto/x86: Use XORL r32,32 in curve25519-x86_64.c
  2020-08-27 17:30 [PATCH] crypto/x86: Use XORL r32,32 in curve25519-x86_64.c Uros Bizjak
  2020-09-01 15:39 ` Ard Biesheuvel
  2020-09-07 13:14 ` Jason A. Donenfeld
@ 2020-09-11  6:56 ` Herbert Xu
  2 siblings, 0 replies; 11+ messages in thread
From: Herbert Xu @ 2020-09-11  6:56 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: linux-crypto, x86, David S. Miller

On Thu, Aug 27, 2020 at 07:30:58PM +0200, Uros Bizjak wrote:
> x86_64 zero extends 32bit operations, so for 64bit operands,
> XORL r32,r32 is functionally equal to XORL r64,r64, but avoids
> a REX prefix byte when legacy registers are used.
> 
> Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
> Cc: Herbert Xu <herbert@gondor.apana.org.au>
> Cc: "David S. Miller" <davem@davemloft.net>
> ---
>  arch/x86/crypto/curve25519-x86_64.c | 68 ++++++++++++++---------------
>  1 file changed, 34 insertions(+), 34 deletions(-)

Patch applied.  Thanks.
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

Thread overview: 11+ messages
2020-08-27 17:30 [PATCH] crypto/x86: Use XORL r32,32 in curve25519-x86_64.c Uros Bizjak
2020-09-01 15:39 ` Ard Biesheuvel
2020-09-01 18:13   ` Jason A. Donenfeld
2020-09-01 19:12     ` Jason A. Donenfeld
2020-09-02  5:50       ` Uros Bizjak
2020-09-02  9:17         ` peterz
2020-09-02 11:36           ` Uros Bizjak
2020-09-02 15:00             ` Jason A. Donenfeld
2020-09-02 14:58           ` Jason A. Donenfeld
2020-09-07 13:14 ` Jason A. Donenfeld
2020-09-11  6:56 ` Herbert Xu
