* [PATCH] crypto: x86/curve25519 - use in/out register constraints more precisely
@ 2021-12-14 16:01 Jason A. Donenfeld
From: Jason A. Donenfeld @ 2021-12-14 16:01 UTC
  To: linux-crypto, Herbert Xu
  Cc: Jason A. Donenfeld, Mathias Krause, Aymeric Fromherz

Rather than passing all variables to the inline asm as modified ("+&r"
read-write operands), pass the ones that are only read as plain "r"
inputs. This helps old gcc versions when alternatives are additionally
used, lets gcc's codegen be a little bit more efficient, and syncs the
code up with the latest Vale/EverCrypt output.
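
As a purely illustrative sketch (a hypothetical helper, not code taken
from this file): a pointer that the asm only dereferences can be listed
as a plain "r" input covered by the "memory" clobber, whereas a register
that the asm itself overwrites (e.g. the "Line up pointers" moves below)
still has to remain a "+&r" read-write output:

	/* Hypothetical example, assuming the kernel's u64: zero two limbs
	 * through a pointer. The pointer register itself is never written,
	 * so it is an input-only "r" operand; the stores it performs are
	 * declared via the "memory" clobber. */
	static inline void zero2(u64 *out)
	{
		asm volatile(
			"  movq $0, 0(%0);"
			"  movq $0, 8(%0);"
			: /* no register outputs */
			: "r"(out)
			: "memory");
	}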

Reported-by: Mathias Krause <minipli@grsecurity.net>
Cc: Aymeric Fromherz <aymeric.fromherz@inria.fr>
Link: https://lore.kernel.org/wireguard/1554725710.1290070.1639240504281.JavaMail.zimbra@inria.fr/
Link: https://github.com/project-everest/hacl-star/pull/501
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 arch/x86/crypto/curve25519-x86_64.c | 767 ++++++++++++++++++----------
 1 file changed, 489 insertions(+), 278 deletions(-)

diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c
index 38caf61cd5b7..d55fa9e9b9e6 100644
--- a/arch/x86/crypto/curve25519-x86_64.c
+++ b/arch/x86/crypto/curve25519-x86_64.c
@@ -64,10 +64,9 @@ static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
 
 		/* Return the carry bit in a register */
 		"  adcx %%r11, %1;"
-	: "+&r" (f2), "=&r" (carry_r)
-	: "r" (out), "r" (f1)
-	: "%r8", "%r9", "%r10", "%r11", "memory", "cc"
-	);
+		: "+&r"(f2), "=&r"(carry_r)
+		: "r"(out), "r"(f1)
+		: "%r8", "%r9", "%r10", "%r11", "memory", "cc");
 
 	return carry_r;
 }
@@ -108,10 +107,9 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
 		"  cmovc %0, %%rax;"
 		"  add %%rax, %%r8;"
 		"  movq %%r8, 0(%1);"
-	: "+&r" (f2)
-	: "r" (out), "r" (f1)
-	: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
-	);
+		: "+&r"(f2)
+		: "r"(out), "r"(f1)
+		: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
 }
 
 /* Computes the field subtraction of two field elements */
@@ -151,10 +149,9 @@ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
 		"  movq %%r9, 8(%0);"
 		"  movq %%r10, 16(%0);"
 		"  movq %%r11, 24(%0);"
-	:
-	: "r" (out), "r" (f1), "r" (f2)
-	: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
-	);
+		:
+		: "r"(out), "r"(f1), "r"(f2)
+		: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
 }
 
 /* Computes a field multiplication: out <- f1 * f2
@@ -162,239 +159,400 @@ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
 static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
 {
 	asm volatile(
+
 		/* Compute the raw multiplication: tmp <- src1 * src2 */
 
 		/* Compute src1[0] * src2 */
-		"  movq 0(%1), %%rdx;"
-		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  movq %%r8, 0(%0);"
-		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
-		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
-		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
-		                                   "  adox %%rdx, %%rax;"
+		"  movq 0(%0), %%rdx;"
+		"  mulxq 0(%1), %%r8, %%r9;"
+		"  xor %%r10d, %%r10d;"
+		"  movq %%r8, 0(%2);"
+		"  mulxq 8(%1), %%r10, %%r11;"
+		"  adox %%r9, %%r10;"
+		"  movq %%r10, 8(%2);"
+		"  mulxq 16(%1), %%rbx, %%r13;"
+		"  adox %%r11, %%rbx;"
+		"  mulxq 24(%1), %%r14, %%rdx;"
+		"  adox %%r13, %%r14;"
+		"  mov $0, %%rax;"
+		"  adox %%rdx, %%rax;"
+
 		/* Compute src1[1] * src2 */
-		"  movq 8(%1), %%rdx;"
-		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
-		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
-		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
-		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
-		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
+		"  movq 8(%0), %%rdx;"
+		"  mulxq 0(%1), %%r8, %%r9;"
+		"  xor %%r10d, %%r10d;"
+		"  adcxq 8(%2), %%r8;"
+		"  movq %%r8, 8(%2);"
+		"  mulxq 8(%1), %%r10, %%r11;"
+		"  adox %%r9, %%r10;"
+		"  adcx %%rbx, %%r10;"
+		"  movq %%r10, 16(%2);"
+		"  mulxq 16(%1), %%rbx, %%r13;"
+		"  adox %%r11, %%rbx;"
+		"  adcx %%r14, %%rbx;"
+		"  mov $0, %%r8;"
+		"  mulxq 24(%1), %%r14, %%rdx;"
+		"  adox %%r13, %%r14;"
+		"  adcx %%rax, %%r14;"
+		"  mov $0, %%rax;"
+		"  adox %%rdx, %%rax;"
+		"  adcx %%r8, %%rax;"
+
 		/* Compute src1[2] * src2 */
-		"  movq 16(%1), %%rdx;"
-		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 16(%0), %%r8;"   "  movq %%r8, 16(%0);"
-		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
-		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
-		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
-		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
+		"  movq 16(%0), %%rdx;"
+		"  mulxq 0(%1), %%r8, %%r9;"
+		"  xor %%r10d, %%r10d;"
+		"  adcxq 16(%2), %%r8;"
+		"  movq %%r8, 16(%2);"
+		"  mulxq 8(%1), %%r10, %%r11;"
+		"  adox %%r9, %%r10;"
+		"  adcx %%rbx, %%r10;"
+		"  movq %%r10, 24(%2);"
+		"  mulxq 16(%1), %%rbx, %%r13;"
+		"  adox %%r11, %%rbx;"
+		"  adcx %%r14, %%rbx;"
+		"  mov $0, %%r8;"
+		"  mulxq 24(%1), %%r14, %%rdx;"
+		"  adox %%r13, %%r14;"
+		"  adcx %%rax, %%r14;"
+		"  mov $0, %%rax;"
+		"  adox %%rdx, %%rax;"
+		"  adcx %%r8, %%rax;"
+
 		/* Compute src1[3] * src2 */
-		"  movq 24(%1), %%rdx;"
-		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 24(%0), %%r8;"   "  movq %%r8, 24(%0);"
-		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
-		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
-		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
-		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 56(%0);"
+		"  movq 24(%0), %%rdx;"
+		"  mulxq 0(%1), %%r8, %%r9;"
+		"  xor %%r10d, %%r10d;"
+		"  adcxq 24(%2), %%r8;"
+		"  movq %%r8, 24(%2);"
+		"  mulxq 8(%1), %%r10, %%r11;"
+		"  adox %%r9, %%r10;"
+		"  adcx %%rbx, %%r10;"
+		"  movq %%r10, 32(%2);"
+		"  mulxq 16(%1), %%rbx, %%r13;"
+		"  adox %%r11, %%rbx;"
+		"  adcx %%r14, %%rbx;"
+		"  movq %%rbx, 40(%2);"
+		"  mov $0, %%r8;"
+		"  mulxq 24(%1), %%r14, %%rdx;"
+		"  adox %%r13, %%r14;"
+		"  adcx %%rax, %%r14;"
+		"  movq %%r14, 48(%2);"
+		"  mov $0, %%rax;"
+		"  adox %%rdx, %%rax;"
+		"  adcx %%r8, %%rax;"
+		"  movq %%rax, 56(%2);"
+
 		/* Line up pointers */
-		"  mov %0, %1;"
 		"  mov %2, %0;"
+		"  mov %3, %2;"
 
 		/* Wrap the result back into the field */
 
 		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		"  mov $38, %%rdx;"
-		"  mulxq 32(%1), %%r8, %%r13;"
-		"  xor %k3, %k3;"
-		"  adoxq 0(%1), %%r8;"
-		"  mulxq 40(%1), %%r9, %%rbx;"
+		"  mulxq 32(%0), %%r8, %%r13;"
+		"  xor %k1, %k1;"
+		"  adoxq 0(%0), %%r8;"
+		"  mulxq 40(%0), %%r9, %%rbx;"
 		"  adcx %%r13, %%r9;"
-		"  adoxq 8(%1), %%r9;"
-		"  mulxq 48(%1), %%r10, %%r13;"
+		"  adoxq 8(%0), %%r9;"
+		"  mulxq 48(%0), %%r10, %%r13;"
 		"  adcx %%rbx, %%r10;"
-		"  adoxq 16(%1), %%r10;"
-		"  mulxq 56(%1), %%r11, %%rax;"
+		"  adoxq 16(%0), %%r10;"
+		"  mulxq 56(%0), %%r11, %%rax;"
 		"  adcx %%r13, %%r11;"
-		"  adoxq 24(%1), %%r11;"
-		"  adcx %3, %%rax;"
-		"  adox %3, %%rax;"
+		"  adoxq 24(%0), %%r11;"
+		"  adcx %1, %%rax;"
+		"  adox %1, %%rax;"
 		"  imul %%rdx, %%rax;"
 
 		/* Step 2: Fold the carry back into dst */
 		"  add %%rax, %%r8;"
-		"  adcx %3, %%r9;"
-		"  movq %%r9, 8(%0);"
-		"  adcx %3, %%r10;"
-		"  movq %%r10, 16(%0);"
-		"  adcx %3, %%r11;"
-		"  movq %%r11, 24(%0);"
+		"  adcx %1, %%r9;"
+		"  movq %%r9, 8(%2);"
+		"  adcx %1, %%r10;"
+		"  movq %%r10, 16(%2);"
+		"  adcx %1, %%r11;"
+		"  movq %%r11, 24(%2);"
 
 		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 		"  mov $0, %%rax;"
 		"  cmovc %%rdx, %%rax;"
 		"  add %%rax, %%r8;"
-		"  movq %%r8, 0(%0);"
-	: "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
-	:
-	: "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc"
-	);
+		"  movq %%r8, 0(%2);"
+		: "+&r"(f1), "+&r"(f2), "+&r"(tmp)
+		: "r"(out)
+		: "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
+		  "%r14", "memory", "cc");
 }
 
 /* Computes two field multiplications:
- * out[0] <- f1[0] * f2[0]
- * out[1] <- f1[1] * f2[1]
- * Uses the 16-element buffer tmp for intermediate results. */
+ *   out[0] <- f1[0] * f2[0]
+ *   out[1] <- f1[1] * f2[1]
+ * Uses the 16-element buffer tmp for intermediate results: */
 static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
 {
 	asm volatile(
+
 		/* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */
 
 		/* Compute src1[0] * src2 */
-		"  movq 0(%1), %%rdx;"
-		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  movq %%r8, 0(%0);"
-		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
-		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
-		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
-		                                   "  adox %%rdx, %%rax;"
+		"  movq 0(%0), %%rdx;"
+		"  mulxq 0(%1), %%r8, %%r9;"
+		"  xor %%r10d, %%r10d;"
+		"  movq %%r8, 0(%2);"
+		"  mulxq 8(%1), %%r10, %%r11;"
+		"  adox %%r9, %%r10;"
+		"  movq %%r10, 8(%2);"
+		"  mulxq 16(%1), %%rbx, %%r13;"
+		"  adox %%r11, %%rbx;"
+		"  mulxq 24(%1), %%r14, %%rdx;"
+		"  adox %%r13, %%r14;"
+		"  mov $0, %%rax;"
+		"  adox %%rdx, %%rax;"
+
 		/* Compute src1[1] * src2 */
-		"  movq 8(%1), %%rdx;"
-		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
-		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
-		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
-		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
-		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
+		"  movq 8(%0), %%rdx;"
+		"  mulxq 0(%1), %%r8, %%r9;"
+		"  xor %%r10d, %%r10d;"
+		"  adcxq 8(%2), %%r8;"
+		"  movq %%r8, 8(%2);"
+		"  mulxq 8(%1), %%r10, %%r11;"
+		"  adox %%r9, %%r10;"
+		"  adcx %%rbx, %%r10;"
+		"  movq %%r10, 16(%2);"
+		"  mulxq 16(%1), %%rbx, %%r13;"
+		"  adox %%r11, %%rbx;"
+		"  adcx %%r14, %%rbx;"
+		"  mov $0, %%r8;"
+		"  mulxq 24(%1), %%r14, %%rdx;"
+		"  adox %%r13, %%r14;"
+		"  adcx %%rax, %%r14;"
+		"  mov $0, %%rax;"
+		"  adox %%rdx, %%rax;"
+		"  adcx %%r8, %%rax;"
+
 		/* Compute src1[2] * src2 */
-		"  movq 16(%1), %%rdx;"
-		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 16(%0), %%r8;"   "  movq %%r8, 16(%0);"
-		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
-		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
-		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
-		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
+		"  movq 16(%0), %%rdx;"
+		"  mulxq 0(%1), %%r8, %%r9;"
+		"  xor %%r10d, %%r10d;"
+		"  adcxq 16(%2), %%r8;"
+		"  movq %%r8, 16(%2);"
+		"  mulxq 8(%1), %%r10, %%r11;"
+		"  adox %%r9, %%r10;"
+		"  adcx %%rbx, %%r10;"
+		"  movq %%r10, 24(%2);"
+		"  mulxq 16(%1), %%rbx, %%r13;"
+		"  adox %%r11, %%rbx;"
+		"  adcx %%r14, %%rbx;"
+		"  mov $0, %%r8;"
+		"  mulxq 24(%1), %%r14, %%rdx;"
+		"  adox %%r13, %%r14;"
+		"  adcx %%rax, %%r14;"
+		"  mov $0, %%rax;"
+		"  adox %%rdx, %%rax;"
+		"  adcx %%r8, %%rax;"
+
 		/* Compute src1[3] * src2 */
-		"  movq 24(%1), %%rdx;"
-		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 24(%0), %%r8;"   "  movq %%r8, 24(%0);"
-		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
-		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
-		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
-		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 56(%0);"
+		"  movq 24(%0), %%rdx;"
+		"  mulxq 0(%1), %%r8, %%r9;"
+		"  xor %%r10d, %%r10d;"
+		"  adcxq 24(%2), %%r8;"
+		"  movq %%r8, 24(%2);"
+		"  mulxq 8(%1), %%r10, %%r11;"
+		"  adox %%r9, %%r10;"
+		"  adcx %%rbx, %%r10;"
+		"  movq %%r10, 32(%2);"
+		"  mulxq 16(%1), %%rbx, %%r13;"
+		"  adox %%r11, %%rbx;"
+		"  adcx %%r14, %%rbx;"
+		"  movq %%rbx, 40(%2);"
+		"  mov $0, %%r8;"
+		"  mulxq 24(%1), %%r14, %%rdx;"
+		"  adox %%r13, %%r14;"
+		"  adcx %%rax, %%r14;"
+		"  movq %%r14, 48(%2);"
+		"  mov $0, %%rax;"
+		"  adox %%rdx, %%rax;"
+		"  adcx %%r8, %%rax;"
+		"  movq %%rax, 56(%2);"
 
 		/* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */
 
 		/* Compute src1[0] * src2 */
-		"  movq 32(%1), %%rdx;"
-		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  movq %%r8, 64(%0);"
-		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  movq %%r10, 72(%0);"
-		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
-		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
-		                                   "  adox %%rdx, %%rax;"
+		"  movq 32(%0), %%rdx;"
+		"  mulxq 32(%1), %%r8, %%r9;"
+		"  xor %%r10d, %%r10d;"
+		"  movq %%r8, 64(%2);"
+		"  mulxq 40(%1), %%r10, %%r11;"
+		"  adox %%r9, %%r10;"
+		"  movq %%r10, 72(%2);"
+		"  mulxq 48(%1), %%rbx, %%r13;"
+		"  adox %%r11, %%rbx;"
+		"  mulxq 56(%1), %%r14, %%rdx;"
+		"  adox %%r13, %%r14;"
+		"  mov $0, %%rax;"
+		"  adox %%rdx, %%rax;"
+
 		/* Compute src1[1] * src2 */
-		"  movq 40(%1), %%rdx;"
-		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 72(%0), %%r8;"   "  movq %%r8, 72(%0);"
-		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 80(%0);"
-		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
-		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
-		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
+		"  movq 40(%0), %%rdx;"
+		"  mulxq 32(%1), %%r8, %%r9;"
+		"  xor %%r10d, %%r10d;"
+		"  adcxq 72(%2), %%r8;"
+		"  movq %%r8, 72(%2);"
+		"  mulxq 40(%1), %%r10, %%r11;"
+		"  adox %%r9, %%r10;"
+		"  adcx %%rbx, %%r10;"
+		"  movq %%r10, 80(%2);"
+		"  mulxq 48(%1), %%rbx, %%r13;"
+		"  adox %%r11, %%rbx;"
+		"  adcx %%r14, %%rbx;"
+		"  mov $0, %%r8;"
+		"  mulxq 56(%1), %%r14, %%rdx;"
+		"  adox %%r13, %%r14;"
+		"  adcx %%rax, %%r14;"
+		"  mov $0, %%rax;"
+		"  adox %%rdx, %%rax;"
+		"  adcx %%r8, %%rax;"
+
 		/* Compute src1[2] * src2 */
-		"  movq 48(%1), %%rdx;"
-		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 80(%0), %%r8;"   "  movq %%r8, 80(%0);"
-		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 88(%0);"
-		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
-		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
-		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
+		"  movq 48(%0), %%rdx;"
+		"  mulxq 32(%1), %%r8, %%r9;"
+		"  xor %%r10d, %%r10d;"
+		"  adcxq 80(%2), %%r8;"
+		"  movq %%r8, 80(%2);"
+		"  mulxq 40(%1), %%r10, %%r11;"
+		"  adox %%r9, %%r10;"
+		"  adcx %%rbx, %%r10;"
+		"  movq %%r10, 88(%2);"
+		"  mulxq 48(%1), %%rbx, %%r13;"
+		"  adox %%r11, %%rbx;"
+		"  adcx %%r14, %%rbx;"
+		"  mov $0, %%r8;"
+		"  mulxq 56(%1), %%r14, %%rdx;"
+		"  adox %%r13, %%r14;"
+		"  adcx %%rax, %%r14;"
+		"  mov $0, %%rax;"
+		"  adox %%rdx, %%rax;"
+		"  adcx %%r8, %%rax;"
+
 		/* Compute src1[3] * src2 */
-		"  movq 56(%1), %%rdx;"
-		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 88(%0), %%r8;"   "  movq %%r8, 88(%0);"
-		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 96(%0);"
-		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 104(%0);"    "  mov $0, %%r8;"
-		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 112(%0);"    "  mov $0, %%rax;"
-		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 120(%0);"
+		"  movq 56(%0), %%rdx;"
+		"  mulxq 32(%1), %%r8, %%r9;"
+		"  xor %%r10d, %%r10d;"
+		"  adcxq 88(%2), %%r8;"
+		"  movq %%r8, 88(%2);"
+		"  mulxq 40(%1), %%r10, %%r11;"
+		"  adox %%r9, %%r10;"
+		"  adcx %%rbx, %%r10;"
+		"  movq %%r10, 96(%2);"
+		"  mulxq 48(%1), %%rbx, %%r13;"
+		"  adox %%r11, %%rbx;"
+		"  adcx %%r14, %%rbx;"
+		"  movq %%rbx, 104(%2);"
+		"  mov $0, %%r8;"
+		"  mulxq 56(%1), %%r14, %%rdx;"
+		"  adox %%r13, %%r14;"
+		"  adcx %%rax, %%r14;"
+		"  movq %%r14, 112(%2);"
+		"  mov $0, %%rax;"
+		"  adox %%rdx, %%rax;"
+		"  adcx %%r8, %%rax;"
+		"  movq %%rax, 120(%2);"
+
 		/* Line up pointers */
-		"  mov %0, %1;"
 		"  mov %2, %0;"
+		"  mov %3, %2;"
 
 		/* Wrap the results back into the field */
 
 		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		"  mov $38, %%rdx;"
-		"  mulxq 32(%1), %%r8, %%r13;"
-		"  xor %k3, %k3;"
-		"  adoxq 0(%1), %%r8;"
-		"  mulxq 40(%1), %%r9, %%rbx;"
+		"  mulxq 32(%0), %%r8, %%r13;"
+		"  xor %k1, %k1;"
+		"  adoxq 0(%0), %%r8;"
+		"  mulxq 40(%0), %%r9, %%rbx;"
 		"  adcx %%r13, %%r9;"
-		"  adoxq 8(%1), %%r9;"
-		"  mulxq 48(%1), %%r10, %%r13;"
+		"  adoxq 8(%0), %%r9;"
+		"  mulxq 48(%0), %%r10, %%r13;"
 		"  adcx %%rbx, %%r10;"
-		"  adoxq 16(%1), %%r10;"
-		"  mulxq 56(%1), %%r11, %%rax;"
+		"  adoxq 16(%0), %%r10;"
+		"  mulxq 56(%0), %%r11, %%rax;"
 		"  adcx %%r13, %%r11;"
-		"  adoxq 24(%1), %%r11;"
-		"  adcx %3, %%rax;"
-		"  adox %3, %%rax;"
+		"  adoxq 24(%0), %%r11;"
+		"  adcx %1, %%rax;"
+		"  adox %1, %%rax;"
 		"  imul %%rdx, %%rax;"
 
 		/* Step 2: Fold the carry back into dst */
 		"  add %%rax, %%r8;"
-		"  adcx %3, %%r9;"
-		"  movq %%r9, 8(%0);"
-		"  adcx %3, %%r10;"
-		"  movq %%r10, 16(%0);"
-		"  adcx %3, %%r11;"
-		"  movq %%r11, 24(%0);"
+		"  adcx %1, %%r9;"
+		"  movq %%r9, 8(%2);"
+		"  adcx %1, %%r10;"
+		"  movq %%r10, 16(%2);"
+		"  adcx %1, %%r11;"
+		"  movq %%r11, 24(%2);"
 
 		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 		"  mov $0, %%rax;"
 		"  cmovc %%rdx, %%rax;"
 		"  add %%rax, %%r8;"
-		"  movq %%r8, 0(%0);"
+		"  movq %%r8, 0(%2);"
 
 		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		"  mov $38, %%rdx;"
-		"  mulxq 96(%1), %%r8, %%r13;"
-		"  xor %k3, %k3;"
-		"  adoxq 64(%1), %%r8;"
-		"  mulxq 104(%1), %%r9, %%rbx;"
+		"  mulxq 96(%0), %%r8, %%r13;"
+		"  xor %k1, %k1;"
+		"  adoxq 64(%0), %%r8;"
+		"  mulxq 104(%0), %%r9, %%rbx;"
 		"  adcx %%r13, %%r9;"
-		"  adoxq 72(%1), %%r9;"
-		"  mulxq 112(%1), %%r10, %%r13;"
+		"  adoxq 72(%0), %%r9;"
+		"  mulxq 112(%0), %%r10, %%r13;"
 		"  adcx %%rbx, %%r10;"
-		"  adoxq 80(%1), %%r10;"
-		"  mulxq 120(%1), %%r11, %%rax;"
+		"  adoxq 80(%0), %%r10;"
+		"  mulxq 120(%0), %%r11, %%rax;"
 		"  adcx %%r13, %%r11;"
-		"  adoxq 88(%1), %%r11;"
-		"  adcx %3, %%rax;"
-		"  adox %3, %%rax;"
+		"  adoxq 88(%0), %%r11;"
+		"  adcx %1, %%rax;"
+		"  adox %1, %%rax;"
 		"  imul %%rdx, %%rax;"
 
 		/* Step 2: Fold the carry back into dst */
 		"  add %%rax, %%r8;"
-		"  adcx %3, %%r9;"
-		"  movq %%r9, 40(%0);"
-		"  adcx %3, %%r10;"
-		"  movq %%r10, 48(%0);"
-		"  adcx %3, %%r11;"
-		"  movq %%r11, 56(%0);"
+		"  adcx %1, %%r9;"
+		"  movq %%r9, 40(%2);"
+		"  adcx %1, %%r10;"
+		"  movq %%r10, 48(%2);"
+		"  adcx %1, %%r11;"
+		"  movq %%r11, 56(%2);"
 
 		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 		"  mov $0, %%rax;"
 		"  cmovc %%rdx, %%rax;"
 		"  add %%rax, %%r8;"
-		"  movq %%r8, 32(%0);"
-	: "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
-	:
-	: "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc"
-	);
+		"  movq %%r8, 32(%2);"
+		: "+&r"(f1), "+&r"(f2), "+&r"(tmp)
+		: "r"(out)
+		: "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
+		  "%r14", "memory", "cc");
 }
 
-/* Computes the field multiplication of four-element f1 with value in f2 */
+/* Computes the field multiplication of four-element f1 with value in f2
+ * Requires f2 to be smaller than 2^17 */
 static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
 {
 	register u64 f2_r asm("rdx") = f2;
 
 	asm volatile(
 		/* Compute the raw multiplication of f1*f2 */
-		"  mulxq 0(%2), %%r8, %%rcx;"      /* f1[0]*f2 */
-		"  mulxq 8(%2), %%r9, %%rbx;"      /* f1[1]*f2 */
+		"  mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */
+		"  mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */
 		"  add %%rcx, %%r9;"
 		"  mov $0, %%rcx;"
-		"  mulxq 16(%2), %%r10, %%r13;"    /* f1[2]*f2 */
+		"  mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */
 		"  adcx %%rbx, %%r10;"
-		"  mulxq 24(%2), %%r11, %%rax;"    /* f1[3]*f2 */
+		"  mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */
 		"  adcx %%r13, %%r11;"
 		"  adcx %%rcx, %%rax;"
 
@@ -418,17 +576,17 @@ static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
 		"  cmovc %%rdx, %%rax;"
 		"  add %%rax, %%r8;"
 		"  movq %%r8, 0(%1);"
-	: "+&r" (f2_r)
-	: "r" (out), "r" (f1)
-	: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "memory", "cc"
-	);
+		: "+&r"(f2_r)
+		: "r"(out), "r"(f1)
+		: "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r13",
+		  "memory", "cc");
 }
 
 /* Computes p1 <- bit ? p2 : p1 in constant time */
 static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
 {
 	asm volatile(
-		/* Invert the polarity of bit to match cmov expectations */
+		/* Transfer bit into CF flag */
 		"  add $18446744073709551615, %0;"
 
 		/* cswap p1[0], p2[0] */
@@ -502,10 +660,9 @@ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
 		"  cmovc %%r10, %%r9;"
 		"  movq %%r8, 56(%1);"
 		"  movq %%r9, 56(%2);"
-	: "+&r" (bit)
-	: "r" (p1), "r" (p2)
-	: "%r8", "%r9", "%r10", "memory", "cc"
-	);
+		: "+&r"(bit)
+		: "r"(p1), "r"(p2)
+		: "%r8", "%r9", "%r10", "memory", "cc");
 }
 
 /* Computes the square of a field element: out <- f * f
@@ -516,15 +673,22 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
 		/* Compute the raw multiplication: tmp <- f * f */
 
 		/* Step 1: Compute all partial products */
-		"  movq 0(%1), %%rdx;"                                       /* f[0] */
-		"  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
-		"  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
-		"  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
-		"  movq 24(%1), %%rdx;"                                      /* f[3] */
-		"  mulxq 8(%1), %%r11, %%rbx;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
-		"  mulxq 16(%1), %%rax, %%r13;"    "  adcx %%rax, %%rbx;"    /* f[2]*f[3] */
-		"  movq 8(%1), %%rdx;"             "  adcx %%r15, %%r13;"    /* f1 */
-		"  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
+		"  movq 0(%0), %%rdx;" /* f[0] */
+		"  mulxq 8(%0), %%r8, %%r14;"
+		"  xor %%r15d, %%r15d;" /* f[1]*f[0] */
+		"  mulxq 16(%0), %%r9, %%r10;"
+		"  adcx %%r14, %%r9;" /* f[2]*f[0] */
+		"  mulxq 24(%0), %%rax, %%rcx;"
+		"  adcx %%rax, %%r10;" /* f[3]*f[0] */
+		"  movq 24(%0), %%rdx;" /* f[3] */
+		"  mulxq 8(%0), %%r11, %%rbx;"
+		"  adcx %%rcx, %%r11;" /* f[1]*f[3] */
+		"  mulxq 16(%0), %%rax, %%r13;"
+		"  adcx %%rax, %%rbx;" /* f[2]*f[3] */
+		"  movq 8(%0), %%rdx;"
+		"  adcx %%r15, %%r13;" /* f1 */
+		"  mulxq 16(%0), %%rax, %%rcx;"
+		"  mov $0, %%r14;" /* f[2]*f[1] */
 
 		/* Step 2: Compute two parallel carry chains */
 		"  xor %%r15d, %%r15d;"
@@ -542,39 +706,50 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
 		"  adcx %%r14, %%r14;"
 
 		/* Step 3: Compute intermediate squares */
-		"  movq 0(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[0]^2 */
-		                           "  movq %%rax, 0(%0);"
-		"  add %%rcx, %%r8;"       "  movq %%r8, 8(%0);"
-		"  movq 8(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[1]^2 */
-		"  adcx %%rax, %%r9;"      "  movq %%r9, 16(%0);"
-		"  adcx %%rcx, %%r10;"     "  movq %%r10, 24(%0);"
-		"  movq 16(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[2]^2 */
-		"  adcx %%rax, %%r11;"     "  movq %%r11, 32(%0);"
-		"  adcx %%rcx, %%rbx;"     "  movq %%rbx, 40(%0);"
-		"  movq 24(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[3]^2 */
-		"  adcx %%rax, %%r13;"     "  movq %%r13, 48(%0);"
-		"  adcx %%rcx, %%r14;"     "  movq %%r14, 56(%0);"
+		"  movq 0(%0), %%rdx;"
+		"  mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+		"  movq %%rax, 0(%1);"
+		"  add %%rcx, %%r8;"
+		"  movq %%r8, 8(%1);"
+		"  movq 8(%0), %%rdx;"
+		"  mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+		"  adcx %%rax, %%r9;"
+		"  movq %%r9, 16(%1);"
+		"  adcx %%rcx, %%r10;"
+		"  movq %%r10, 24(%1);"
+		"  movq 16(%0), %%rdx;"
+		"  mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+		"  adcx %%rax, %%r11;"
+		"  movq %%r11, 32(%1);"
+		"  adcx %%rcx, %%rbx;"
+		"  movq %%rbx, 40(%1);"
+		"  movq 24(%0), %%rdx;"
+		"  mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+		"  adcx %%rax, %%r13;"
+		"  movq %%r13, 48(%1);"
+		"  adcx %%rcx, %%r14;"
+		"  movq %%r14, 56(%1);"
 
 		/* Line up pointers */
-		"  mov %0, %1;"
-		"  mov %2, %0;"
+		"  mov %1, %0;"
+		"  mov %2, %1;"
 
 		/* Wrap the result back into the field */
 
 		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		"  mov $38, %%rdx;"
-		"  mulxq 32(%1), %%r8, %%r13;"
+		"  mulxq 32(%0), %%r8, %%r13;"
 		"  xor %%ecx, %%ecx;"
-		"  adoxq 0(%1), %%r8;"
-		"  mulxq 40(%1), %%r9, %%rbx;"
+		"  adoxq 0(%0), %%r8;"
+		"  mulxq 40(%0), %%r9, %%rbx;"
 		"  adcx %%r13, %%r9;"
-		"  adoxq 8(%1), %%r9;"
-		"  mulxq 48(%1), %%r10, %%r13;"
+		"  adoxq 8(%0), %%r9;"
+		"  mulxq 48(%0), %%r10, %%r13;"
 		"  adcx %%rbx, %%r10;"
-		"  adoxq 16(%1), %%r10;"
-		"  mulxq 56(%1), %%r11, %%rax;"
+		"  adoxq 16(%0), %%r10;"
+		"  mulxq 56(%0), %%r11, %%rax;"
 		"  adcx %%r13, %%r11;"
-		"  adoxq 24(%1), %%r11;"
+		"  adoxq 24(%0), %%r11;"
 		"  adcx %%rcx, %%rax;"
 		"  adox %%rcx, %%rax;"
 		"  imul %%rdx, %%rax;"
@@ -582,40 +757,47 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
 		/* Step 2: Fold the carry back into dst */
 		"  add %%rax, %%r8;"
 		"  adcx %%rcx, %%r9;"
-		"  movq %%r9, 8(%0);"
+		"  movq %%r9, 8(%1);"
 		"  adcx %%rcx, %%r10;"
-		"  movq %%r10, 16(%0);"
+		"  movq %%r10, 16(%1);"
 		"  adcx %%rcx, %%r11;"
-		"  movq %%r11, 24(%0);"
+		"  movq %%r11, 24(%1);"
 
 		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 		"  mov $0, %%rax;"
 		"  cmovc %%rdx, %%rax;"
 		"  add %%rax, %%r8;"
-		"  movq %%r8, 0(%0);"
-	: "+&r" (tmp), "+&r" (f), "+&r" (out)
-	:
-	: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
-	);
+		"  movq %%r8, 0(%1);"
+		: "+&r"(f), "+&r"(tmp)
+		: "r"(out)
+		: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
+		  "%r13", "%r14", "%r15", "memory", "cc");
 }
 
 /* Computes two field squarings:
- * out[0] <- f[0] * f[0]
- * out[1] <- f[1] * f[1]
+ *   out[0] <- f[0] * f[0]
+ *   out[1] <- f[1] * f[1]
  * Uses the 16-element buffer tmp for intermediate results */
 static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 {
 	asm volatile(
 		/* Step 1: Compute all partial products */
-		"  movq 0(%1), %%rdx;"                                       /* f[0] */
-		"  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
-		"  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
-		"  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
-		"  movq 24(%1), %%rdx;"                                      /* f[3] */
-		"  mulxq 8(%1), %%r11, %%rbx;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
-		"  mulxq 16(%1), %%rax, %%r13;"    "  adcx %%rax, %%rbx;"    /* f[2]*f[3] */
-		"  movq 8(%1), %%rdx;"             "  adcx %%r15, %%r13;"    /* f1 */
-		"  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
+		"  movq 0(%0), %%rdx;" /* f[0] */
+		"  mulxq 8(%0), %%r8, %%r14;"
+		"  xor %%r15d, %%r15d;" /* f[1]*f[0] */
+		"  mulxq 16(%0), %%r9, %%r10;"
+		"  adcx %%r14, %%r9;" /* f[2]*f[0] */
+		"  mulxq 24(%0), %%rax, %%rcx;"
+		"  adcx %%rax, %%r10;" /* f[3]*f[0] */
+		"  movq 24(%0), %%rdx;" /* f[3] */
+		"  mulxq 8(%0), %%r11, %%rbx;"
+		"  adcx %%rcx, %%r11;" /* f[1]*f[3] */
+		"  mulxq 16(%0), %%rax, %%r13;"
+		"  adcx %%rax, %%rbx;" /* f[2]*f[3] */
+		"  movq 8(%0), %%rdx;"
+		"  adcx %%r15, %%r13;" /* f1 */
+		"  mulxq 16(%0), %%rax, %%rcx;"
+		"  mov $0, %%r14;" /* f[2]*f[1] */
 
 		/* Step 2: Compute two parallel carry chains */
 		"  xor %%r15d, %%r15d;"
@@ -633,29 +815,47 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 		"  adcx %%r14, %%r14;"
 
 		/* Step 3: Compute intermediate squares */
-		"  movq 0(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[0]^2 */
-		                           "  movq %%rax, 0(%0);"
-		"  add %%rcx, %%r8;"       "  movq %%r8, 8(%0);"
-		"  movq 8(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[1]^2 */
-		"  adcx %%rax, %%r9;"      "  movq %%r9, 16(%0);"
-		"  adcx %%rcx, %%r10;"     "  movq %%r10, 24(%0);"
-		"  movq 16(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[2]^2 */
-		"  adcx %%rax, %%r11;"     "  movq %%r11, 32(%0);"
-		"  adcx %%rcx, %%rbx;"     "  movq %%rbx, 40(%0);"
-		"  movq 24(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[3]^2 */
-		"  adcx %%rax, %%r13;"     "  movq %%r13, 48(%0);"
-		"  adcx %%rcx, %%r14;"     "  movq %%r14, 56(%0);"
+		"  movq 0(%0), %%rdx;"
+		"  mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+		"  movq %%rax, 0(%1);"
+		"  add %%rcx, %%r8;"
+		"  movq %%r8, 8(%1);"
+		"  movq 8(%0), %%rdx;"
+		"  mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+		"  adcx %%rax, %%r9;"
+		"  movq %%r9, 16(%1);"
+		"  adcx %%rcx, %%r10;"
+		"  movq %%r10, 24(%1);"
+		"  movq 16(%0), %%rdx;"
+		"  mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+		"  adcx %%rax, %%r11;"
+		"  movq %%r11, 32(%1);"
+		"  adcx %%rcx, %%rbx;"
+		"  movq %%rbx, 40(%1);"
+		"  movq 24(%0), %%rdx;"
+		"  mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+		"  adcx %%rax, %%r13;"
+		"  movq %%r13, 48(%1);"
+		"  adcx %%rcx, %%r14;"
+		"  movq %%r14, 56(%1);"
 
 		/* Step 1: Compute all partial products */
-		"  movq 32(%1), %%rdx;"                                       /* f[0] */
-		"  mulxq 40(%1), %%r8, %%r14;"     "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
-		"  mulxq 48(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
-		"  mulxq 56(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
-		"  movq 56(%1), %%rdx;"                                      /* f[3] */
-		"  mulxq 40(%1), %%r11, %%rbx;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
-		"  mulxq 48(%1), %%rax, %%r13;"    "  adcx %%rax, %%rbx;"    /* f[2]*f[3] */
-		"  movq 40(%1), %%rdx;"             "  adcx %%r15, %%r13;"    /* f1 */
-		"  mulxq 48(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
+		"  movq 32(%0), %%rdx;" /* f[0] */
+		"  mulxq 40(%0), %%r8, %%r14;"
+		"  xor %%r15d, %%r15d;" /* f[1]*f[0] */
+		"  mulxq 48(%0), %%r9, %%r10;"
+		"  adcx %%r14, %%r9;" /* f[2]*f[0] */
+		"  mulxq 56(%0), %%rax, %%rcx;"
+		"  adcx %%rax, %%r10;" /* f[3]*f[0] */
+		"  movq 56(%0), %%rdx;" /* f[3] */
+		"  mulxq 40(%0), %%r11, %%rbx;"
+		"  adcx %%rcx, %%r11;" /* f[1]*f[3] */
+		"  mulxq 48(%0), %%rax, %%r13;"
+		"  adcx %%rax, %%rbx;" /* f[2]*f[3] */
+		"  movq 40(%0), %%rdx;"
+		"  adcx %%r15, %%r13;" /* f1 */
+		"  mulxq 48(%0), %%rax, %%rcx;"
+		"  mov $0, %%r14;" /* f[2]*f[1] */
 
 		/* Step 2: Compute two parallel carry chains */
 		"  xor %%r15d, %%r15d;"
@@ -673,37 +873,48 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 		"  adcx %%r14, %%r14;"
 
 		/* Step 3: Compute intermediate squares */
-		"  movq 32(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[0]^2 */
-		                           "  movq %%rax, 64(%0);"
-		"  add %%rcx, %%r8;"       "  movq %%r8, 72(%0);"
-		"  movq 40(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[1]^2 */
-		"  adcx %%rax, %%r9;"      "  movq %%r9, 80(%0);"
-		"  adcx %%rcx, %%r10;"     "  movq %%r10, 88(%0);"
-		"  movq 48(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[2]^2 */
-		"  adcx %%rax, %%r11;"     "  movq %%r11, 96(%0);"
-		"  adcx %%rcx, %%rbx;"     "  movq %%rbx, 104(%0);"
-		"  movq 56(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[3]^2 */
-		"  adcx %%rax, %%r13;"     "  movq %%r13, 112(%0);"
-		"  adcx %%rcx, %%r14;"     "  movq %%r14, 120(%0);"
+		"  movq 32(%0), %%rdx;"
+		"  mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+		"  movq %%rax, 64(%1);"
+		"  add %%rcx, %%r8;"
+		"  movq %%r8, 72(%1);"
+		"  movq 40(%0), %%rdx;"
+		"  mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+		"  adcx %%rax, %%r9;"
+		"  movq %%r9, 80(%1);"
+		"  adcx %%rcx, %%r10;"
+		"  movq %%r10, 88(%1);"
+		"  movq 48(%0), %%rdx;"
+		"  mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+		"  adcx %%rax, %%r11;"
+		"  movq %%r11, 96(%1);"
+		"  adcx %%rcx, %%rbx;"
+		"  movq %%rbx, 104(%1);"
+		"  movq 56(%0), %%rdx;"
+		"  mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+		"  adcx %%rax, %%r13;"
+		"  movq %%r13, 112(%1);"
+		"  adcx %%rcx, %%r14;"
+		"  movq %%r14, 120(%1);"
 
 		/* Line up pointers */
-		"  mov %0, %1;"
-		"  mov %2, %0;"
+		"  mov %1, %0;"
+		"  mov %2, %1;"
 
 		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		"  mov $38, %%rdx;"
-		"  mulxq 32(%1), %%r8, %%r13;"
+		"  mulxq 32(%0), %%r8, %%r13;"
 		"  xor %%ecx, %%ecx;"
-		"  adoxq 0(%1), %%r8;"
-		"  mulxq 40(%1), %%r9, %%rbx;"
+		"  adoxq 0(%0), %%r8;"
+		"  mulxq 40(%0), %%r9, %%rbx;"
 		"  adcx %%r13, %%r9;"
-		"  adoxq 8(%1), %%r9;"
-		"  mulxq 48(%1), %%r10, %%r13;"
+		"  adoxq 8(%0), %%r9;"
+		"  mulxq 48(%0), %%r10, %%r13;"
 		"  adcx %%rbx, %%r10;"
-		"  adoxq 16(%1), %%r10;"
-		"  mulxq 56(%1), %%r11, %%rax;"
+		"  adoxq 16(%0), %%r10;"
+		"  mulxq 56(%0), %%r11, %%rax;"
 		"  adcx %%r13, %%r11;"
-		"  adoxq 24(%1), %%r11;"
+		"  adoxq 24(%0), %%r11;"
 		"  adcx %%rcx, %%rax;"
 		"  adox %%rcx, %%rax;"
 		"  imul %%rdx, %%rax;"
@@ -711,32 +922,32 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 		/* Step 2: Fold the carry back into dst */
 		"  add %%rax, %%r8;"
 		"  adcx %%rcx, %%r9;"
-		"  movq %%r9, 8(%0);"
+		"  movq %%r9, 8(%1);"
 		"  adcx %%rcx, %%r10;"
-		"  movq %%r10, 16(%0);"
+		"  movq %%r10, 16(%1);"
 		"  adcx %%rcx, %%r11;"
-		"  movq %%r11, 24(%0);"
+		"  movq %%r11, 24(%1);"
 
 		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 		"  mov $0, %%rax;"
 		"  cmovc %%rdx, %%rax;"
 		"  add %%rax, %%r8;"
-		"  movq %%r8, 0(%0);"
+		"  movq %%r8, 0(%1);"
 
 		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		"  mov $38, %%rdx;"
-		"  mulxq 96(%1), %%r8, %%r13;"
+		"  mulxq 96(%0), %%r8, %%r13;"
 		"  xor %%ecx, %%ecx;"
-		"  adoxq 64(%1), %%r8;"
-		"  mulxq 104(%1), %%r9, %%rbx;"
+		"  adoxq 64(%0), %%r8;"
+		"  mulxq 104(%0), %%r9, %%rbx;"
 		"  adcx %%r13, %%r9;"
-		"  adoxq 72(%1), %%r9;"
-		"  mulxq 112(%1), %%r10, %%r13;"
+		"  adoxq 72(%0), %%r9;"
+		"  mulxq 112(%0), %%r10, %%r13;"
 		"  adcx %%rbx, %%r10;"
-		"  adoxq 80(%1), %%r10;"
-		"  mulxq 120(%1), %%r11, %%rax;"
+		"  adoxq 80(%0), %%r10;"
+		"  mulxq 120(%0), %%r11, %%rax;"
 		"  adcx %%r13, %%r11;"
-		"  adoxq 88(%1), %%r11;"
+		"  adoxq 88(%0), %%r11;"
 		"  adcx %%rcx, %%rax;"
 		"  adox %%rcx, %%rax;"
 		"  imul %%rdx, %%rax;"
@@ -744,21 +955,21 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 		/* Step 2: Fold the carry back into dst */
 		"  add %%rax, %%r8;"
 		"  adcx %%rcx, %%r9;"
-		"  movq %%r9, 40(%0);"
+		"  movq %%r9, 40(%1);"
 		"  adcx %%rcx, %%r10;"
-		"  movq %%r10, 48(%0);"
+		"  movq %%r10, 48(%1);"
 		"  adcx %%rcx, %%r11;"
-		"  movq %%r11, 56(%0);"
+		"  movq %%r11, 56(%1);"
 
 		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 		"  mov $0, %%rax;"
 		"  cmovc %%rdx, %%rax;"
 		"  add %%rax, %%r8;"
-		"  movq %%r8, 32(%0);"
-	: "+&r" (tmp), "+&r" (f), "+&r" (out)
-	:
-	: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
-	);
+		"  movq %%r8, 32(%1);"
+		: "+&r"(f), "+&r"(tmp)
+		: "r"(out)
+		: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
+		  "%r13", "%r14", "%r15", "memory", "cc");
 }
 
 static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2)
-- 
2.34.1



* Re: [PATCH] crypto: x86/curve25519 - use in/out register constraints more precisely
@ 2021-12-14 17:23 Ard Biesheuvel
From: Ard Biesheuvel @ 2021-12-14 17:23 UTC
  To: Jason A. Donenfeld
  Cc: Linux Crypto Mailing List, Herbert Xu, Mathias Krause, Aymeric Fromherz

On Tue, 14 Dec 2021 at 17:02, Jason A. Donenfeld <Jason@zx2c4.com> wrote:
>
> Rather than passing all variables to the inline asm as modified ("+&r"
> read-write operands), pass the ones that are only read as plain "r"
> inputs. This helps old gcc versions when alternatives are additionally
> used, lets gcc's codegen be a little bit more efficient, and syncs the
> code up with the latest Vale/EverCrypt output.
>
> Reported-by: Mathias Krause <minipli@grsecurity.net>
> Cc: Aymeric Fromherz <aymeric.fromherz@inria.fr>
> Link: https://lore.kernel.org/wireguard/1554725710.1290070.1639240504281.JavaMail.zimbra@inria.fr/
> Link: https://github.com/project-everest/hacl-star/pull/501
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>

Could we separate the whitespace-only changes from the code changes
please? Otherwise, this is really hard to review.


> +               "  adox %%rdx, %%rax;"
> +               "  adcx %%r8, %%rax;"
> +
>                 /* Compute src1[3] * src2 */
> -               "  movq 56(%1), %%rdx;"
> -               "  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 88(%0), %%r8;"   "  movq %%r8, 88(%0);"
> -               "  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 96(%0);"
> -               "  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 104(%0);"    "  mov $0, %%r8;"
> -               "  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 112(%0);"    "  mov $0, %%rax;"
> -                                                  "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 120(%0);"
> +               "  movq 56(%0), %%rdx;"
> +               "  mulxq 32(%1), %%r8, %%r9;"
> +               "  xor %%r10d, %%r10d;"
> +               "  adcxq 88(%2), %%r8;"
> +               "  movq %%r8, 88(%2);"
> +               "  mulxq 40(%1), %%r10, %%r11;"
> +               "  adox %%r9, %%r10;"
> +               "  adcx %%rbx, %%r10;"
> +               "  movq %%r10, 96(%2);"
> +               "  mulxq 48(%1), %%rbx, %%r13;"
> +               "  adox %%r11, %%rbx;"
> +               "  adcx %%r14, %%rbx;"
> +               "  movq %%rbx, 104(%2);"
> +               "  mov $0, %%r8;"
> +               "  mulxq 56(%1), %%r14, %%rdx;"
> +               "  adox %%r13, %%r14;"
> +               "  adcx %%rax, %%r14;"
> +               "  movq %%r14, 112(%2);"
> +               "  mov $0, %%rax;"
> +               "  adox %%rdx, %%rax;"
> +               "  adcx %%r8, %%rax;"
> +               "  movq %%rax, 120(%2);"
> +
>                 /* Line up pointers */
> -               "  mov %0, %1;"
>                 "  mov %2, %0;"
> +               "  mov %3, %2;"
>
>                 /* Wrap the results back into the field */
>
>                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>                 "  mov $38, %%rdx;"
> -               "  mulxq 32(%1), %%r8, %%r13;"
> -               "  xor %k3, %k3;"
> -               "  adoxq 0(%1), %%r8;"
> -               "  mulxq 40(%1), %%r9, %%rbx;"
> +               "  mulxq 32(%0), %%r8, %%r13;"
> +               "  xor %k1, %k1;"
> +               "  adoxq 0(%0), %%r8;"
> +               "  mulxq 40(%0), %%r9, %%rbx;"
>                 "  adcx %%r13, %%r9;"
> -               "  adoxq 8(%1), %%r9;"
> -               "  mulxq 48(%1), %%r10, %%r13;"
> +               "  adoxq 8(%0), %%r9;"
> +               "  mulxq 48(%0), %%r10, %%r13;"
>                 "  adcx %%rbx, %%r10;"
> -               "  adoxq 16(%1), %%r10;"
> -               "  mulxq 56(%1), %%r11, %%rax;"
> +               "  adoxq 16(%0), %%r10;"
> +               "  mulxq 56(%0), %%r11, %%rax;"
>                 "  adcx %%r13, %%r11;"
> -               "  adoxq 24(%1), %%r11;"
> -               "  adcx %3, %%rax;"
> -               "  adox %3, %%rax;"
> +               "  adoxq 24(%0), %%r11;"
> +               "  adcx %1, %%rax;"
> +               "  adox %1, %%rax;"
>                 "  imul %%rdx, %%rax;"
>
>                 /* Step 2: Fold the carry back into dst */
>                 "  add %%rax, %%r8;"
> -               "  adcx %3, %%r9;"
> -               "  movq %%r9, 8(%0);"
> -               "  adcx %3, %%r10;"
> -               "  movq %%r10, 16(%0);"
> -               "  adcx %3, %%r11;"
> -               "  movq %%r11, 24(%0);"
> +               "  adcx %1, %%r9;"
> +               "  movq %%r9, 8(%2);"
> +               "  adcx %1, %%r10;"
> +               "  movq %%r10, 16(%2);"
> +               "  adcx %1, %%r11;"
> +               "  movq %%r11, 24(%2);"
>
>                 /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
>                 "  mov $0, %%rax;"
>                 "  cmovc %%rdx, %%rax;"
>                 "  add %%rax, %%r8;"
> -               "  movq %%r8, 0(%0);"
> +               "  movq %%r8, 0(%2);"
>
>                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>                 "  mov $38, %%rdx;"
> -               "  mulxq 96(%1), %%r8, %%r13;"
> -               "  xor %k3, %k3;"
> -               "  adoxq 64(%1), %%r8;"
> -               "  mulxq 104(%1), %%r9, %%rbx;"
> +               "  mulxq 96(%0), %%r8, %%r13;"
> +               "  xor %k1, %k1;"
> +               "  adoxq 64(%0), %%r8;"
> +               "  mulxq 104(%0), %%r9, %%rbx;"
>                 "  adcx %%r13, %%r9;"
> -               "  adoxq 72(%1), %%r9;"
> -               "  mulxq 112(%1), %%r10, %%r13;"
> +               "  adoxq 72(%0), %%r9;"
> +               "  mulxq 112(%0), %%r10, %%r13;"
>                 "  adcx %%rbx, %%r10;"
> -               "  adoxq 80(%1), %%r10;"
> -               "  mulxq 120(%1), %%r11, %%rax;"
> +               "  adoxq 80(%0), %%r10;"
> +               "  mulxq 120(%0), %%r11, %%rax;"
>                 "  adcx %%r13, %%r11;"
> -               "  adoxq 88(%1), %%r11;"
> -               "  adcx %3, %%rax;"
> -               "  adox %3, %%rax;"
> +               "  adoxq 88(%0), %%r11;"
> +               "  adcx %1, %%rax;"
> +               "  adox %1, %%rax;"
>                 "  imul %%rdx, %%rax;"
>
>                 /* Step 2: Fold the carry back into dst */
>                 "  add %%rax, %%r8;"
> -               "  adcx %3, %%r9;"
> -               "  movq %%r9, 40(%0);"
> -               "  adcx %3, %%r10;"
> -               "  movq %%r10, 48(%0);"
> -               "  adcx %3, %%r11;"
> -               "  movq %%r11, 56(%0);"
> +               "  adcx %1, %%r9;"
> +               "  movq %%r9, 40(%2);"
> +               "  adcx %1, %%r10;"
> +               "  movq %%r10, 48(%2);"
> +               "  adcx %1, %%r11;"
> +               "  movq %%r11, 56(%2);"
>
>                 /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
>                 "  mov $0, %%rax;"
>                 "  cmovc %%rdx, %%rax;"
>                 "  add %%rax, %%r8;"
> -               "  movq %%r8, 32(%0);"
> -       : "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
> -       :
> -       : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc"
> -       );
> +               "  movq %%r8, 32(%2);"
> +               : "+&r"(f1), "+&r"(f2), "+&r"(tmp)
> +               : "r"(out)
> +               : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
> +                 "%r14", "memory", "cc");
>  }
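
A brief aside on the wrap-back steps above (the identity is implicit in the
code, not stated in the patch): with p = 2^255 - 19 we have 2*p = 2^256 - 38,
i.e. 2^256 == 38 (mod p).  A 512-bit intermediate split as
tmp = tmp_hi * 2^256 + tmp_lo therefore reduces as

    tmp == tmp_hi * 38 + tmp_lo  (mod p),

which is exactly what the "mov $38, %%rdx" / "mulxq ..." sequences accumulate
before the final carry bit is folded back in.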
>
> -/* Computes the field multiplication of four-element f1 with value in f2 */
> +/* Computes the field multiplication of four-element f1 with value in f2
> + * Requires f2 to be smaller than 2^17 */
>  static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
>  {
>         register u64 f2_r asm("rdx") = f2;
>
>         asm volatile(
>                 /* Compute the raw multiplication of f1*f2 */
> -               "  mulxq 0(%2), %%r8, %%rcx;"      /* f1[0]*f2 */
> -               "  mulxq 8(%2), %%r9, %%rbx;"      /* f1[1]*f2 */
> +               "  mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */
> +               "  mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */
>                 "  add %%rcx, %%r9;"
>                 "  mov $0, %%rcx;"
> -               "  mulxq 16(%2), %%r10, %%r13;"    /* f1[2]*f2 */
> +               "  mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */
>                 "  adcx %%rbx, %%r10;"
> -               "  mulxq 24(%2), %%r11, %%rax;"    /* f1[3]*f2 */
> +               "  mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */
>                 "  adcx %%r13, %%r11;"
>                 "  adcx %%rcx, %%rax;"
>
> @@ -418,17 +576,17 @@ static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
>                 "  cmovc %%rdx, %%rax;"
>                 "  add %%rax, %%r8;"
>                 "  movq %%r8, 0(%1);"
> -       : "+&r" (f2_r)
> -       : "r" (out), "r" (f1)
> -       : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "memory", "cc"
> -       );
> +               : "+&r"(f2_r)
> +               : "r"(out), "r"(f1)
> +               : "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r13",
> +                 "memory", "cc");
>  }
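
A small note on the tightened comment above ("Requires f2 to be smaller than
2^17"): the only caller of fmul_scalar() in this file passes the curve
constant 121665, and 121665 < 2^17 = 131072, so the new precondition is met
in practice (this observation is mine, not part of the patch).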
>
>  /* Computes p1 <- bit ? p2 : p1 in constant time */
>  static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
>  {
>         asm volatile(
> -               /* Invert the polarity of bit to match cmov expectations */
> +               /* Transfer bit into CF flag */
>                 "  add $18446744073709551615, %0;"
>
>                 /* cswap p1[0], p2[0] */
> @@ -502,10 +660,9 @@ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
>                 "  cmovc %%r10, %%r9;"
>                 "  movq %%r8, 56(%1);"
>                 "  movq %%r9, 56(%2);"
> -       : "+&r" (bit)
> -       : "r" (p1), "r" (p2)
> -       : "%r8", "%r9", "%r10", "memory", "cc"
> -       );
> +               : "+&r"(bit)
> +               : "r"(p1), "r"(p2)
> +               : "%r8", "%r9", "%r10", "memory", "cc");
>  }
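
On the reworded comment above ("Transfer bit into CF flag"): for a 0/1 bit,
adding 2^64 - 1 sets the carry flag exactly when bit is 1, and the cmovc
instructions then swap each limb pair without a branch.  A rough C-level
sketch of the same branch-free idea, using an all-ones mask instead of the
carry flag (illustrative only, not the kernel's implementation):

	/* Illustrative constant-time swap of a single limb pair; the asm
	 * above does the same for all eight limbs via adc/cmovc. */
	typedef unsigned long long u64;	/* stand-in for the kernel's u64 */

	static inline void cswap_limb(u64 bit, u64 *a, u64 *b)
	{
		u64 mask = 0 - bit;		/* 1 -> all ones, 0 -> zero */
		u64 diff = mask & (*a ^ *b);	/* xor-difference or zero   */

		*a ^= diff;			/* swapped iff bit == 1 */
		*b ^= diff;
	}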
>
>  /* Computes the square of a field element: out <- f * f
> @@ -516,15 +673,22 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
>                 /* Compute the raw multiplication: tmp <- f * f */
>
>                 /* Step 1: Compute all partial products */
> -               "  movq 0(%1), %%rdx;"                                       /* f[0] */
> -               "  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
> -               "  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
> -               "  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
> -               "  movq 24(%1), %%rdx;"                                      /* f[3] */
> -               "  mulxq 8(%1), %%r11, %%rbx;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
> -               "  mulxq 16(%1), %%rax, %%r13;"    "  adcx %%rax, %%rbx;"    /* f[2]*f[3] */
> -               "  movq 8(%1), %%rdx;"             "  adcx %%r15, %%r13;"    /* f1 */
> -               "  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
> +               "  movq 0(%0), %%rdx;" /* f[0] */
> +               "  mulxq 8(%0), %%r8, %%r14;"
> +               "  xor %%r15d, %%r15d;" /* f[1]*f[0] */
> +               "  mulxq 16(%0), %%r9, %%r10;"
> +               "  adcx %%r14, %%r9;" /* f[2]*f[0] */
> +               "  mulxq 24(%0), %%rax, %%rcx;"
> +               "  adcx %%rax, %%r10;" /* f[3]*f[0] */
> +               "  movq 24(%0), %%rdx;" /* f[3] */
> +               "  mulxq 8(%0), %%r11, %%rbx;"
> +               "  adcx %%rcx, %%r11;" /* f[1]*f[3] */
> +               "  mulxq 16(%0), %%rax, %%r13;"
> +               "  adcx %%rax, %%rbx;" /* f[2]*f[3] */
> +               "  movq 8(%0), %%rdx;"
> +               "  adcx %%r15, %%r13;" /* f1 */
> +               "  mulxq 16(%0), %%rax, %%rcx;"
> +               "  mov $0, %%r14;" /* f[2]*f[1] */
>
>                 /* Step 2: Compute two parallel carry chains */
>                 "  xor %%r15d, %%r15d;"
> @@ -542,39 +706,50 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
>                 "  adcx %%r14, %%r14;"
>
>                 /* Step 3: Compute intermediate squares */
> -               "  movq 0(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[0]^2 */
> -                                          "  movq %%rax, 0(%0);"
> -               "  add %%rcx, %%r8;"       "  movq %%r8, 8(%0);"
> -               "  movq 8(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[1]^2 */
> -               "  adcx %%rax, %%r9;"      "  movq %%r9, 16(%0);"
> -               "  adcx %%rcx, %%r10;"     "  movq %%r10, 24(%0);"
> -               "  movq 16(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[2]^2 */
> -               "  adcx %%rax, %%r11;"     "  movq %%r11, 32(%0);"
> -               "  adcx %%rcx, %%rbx;"     "  movq %%rbx, 40(%0);"
> -               "  movq 24(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[3]^2 */
> -               "  adcx %%rax, %%r13;"     "  movq %%r13, 48(%0);"
> -               "  adcx %%rcx, %%r14;"     "  movq %%r14, 56(%0);"
> +               "  movq 0(%0), %%rdx;"
> +               "  mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
> +               "  movq %%rax, 0(%1);"
> +               "  add %%rcx, %%r8;"
> +               "  movq %%r8, 8(%1);"
> +               "  movq 8(%0), %%rdx;"
> +               "  mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
> +               "  adcx %%rax, %%r9;"
> +               "  movq %%r9, 16(%1);"
> +               "  adcx %%rcx, %%r10;"
> +               "  movq %%r10, 24(%1);"
> +               "  movq 16(%0), %%rdx;"
> +               "  mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
> +               "  adcx %%rax, %%r11;"
> +               "  movq %%r11, 32(%1);"
> +               "  adcx %%rcx, %%rbx;"
> +               "  movq %%rbx, 40(%1);"
> +               "  movq 24(%0), %%rdx;"
> +               "  mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
> +               "  adcx %%rax, %%r13;"
> +               "  movq %%r13, 48(%1);"
> +               "  adcx %%rcx, %%r14;"
> +               "  movq %%r14, 56(%1);"
>
>                 /* Line up pointers */
> -               "  mov %0, %1;"
> -               "  mov %2, %0;"
> +               "  mov %1, %0;"
> +               "  mov %2, %1;"
>
>                 /* Wrap the result back into the field */
>
>                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>                 "  mov $38, %%rdx;"
> -               "  mulxq 32(%1), %%r8, %%r13;"
> +               "  mulxq 32(%0), %%r8, %%r13;"
>                 "  xor %%ecx, %%ecx;"
> -               "  adoxq 0(%1), %%r8;"
> -               "  mulxq 40(%1), %%r9, %%rbx;"
> +               "  adoxq 0(%0), %%r8;"
> +               "  mulxq 40(%0), %%r9, %%rbx;"
>                 "  adcx %%r13, %%r9;"
> -               "  adoxq 8(%1), %%r9;"
> -               "  mulxq 48(%1), %%r10, %%r13;"
> +               "  adoxq 8(%0), %%r9;"
> +               "  mulxq 48(%0), %%r10, %%r13;"
>                 "  adcx %%rbx, %%r10;"
> -               "  adoxq 16(%1), %%r10;"
> -               "  mulxq 56(%1), %%r11, %%rax;"
> +               "  adoxq 16(%0), %%r10;"
> +               "  mulxq 56(%0), %%r11, %%rax;"
>                 "  adcx %%r13, %%r11;"
> -               "  adoxq 24(%1), %%r11;"
> +               "  adoxq 24(%0), %%r11;"
>                 "  adcx %%rcx, %%rax;"
>                 "  adox %%rcx, %%rax;"
>                 "  imul %%rdx, %%rax;"
> @@ -582,40 +757,47 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
>                 /* Step 2: Fold the carry back into dst */
>                 "  add %%rax, %%r8;"
>                 "  adcx %%rcx, %%r9;"
> -               "  movq %%r9, 8(%0);"
> +               "  movq %%r9, 8(%1);"
>                 "  adcx %%rcx, %%r10;"
> -               "  movq %%r10, 16(%0);"
> +               "  movq %%r10, 16(%1);"
>                 "  adcx %%rcx, %%r11;"
> -               "  movq %%r11, 24(%0);"
> +               "  movq %%r11, 24(%1);"
>
>                 /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
>                 "  mov $0, %%rax;"
>                 "  cmovc %%rdx, %%rax;"
>                 "  add %%rax, %%r8;"
> -               "  movq %%r8, 0(%0);"
> -       : "+&r" (tmp), "+&r" (f), "+&r" (out)
> -       :
> -       : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
> -       );
> +               "  movq %%r8, 0(%1);"
> +               : "+&r"(f), "+&r"(tmp)
> +               : "r"(out)
> +               : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
> +                 "%r13", "%r14", "%r15", "memory", "cc");
>  }
>
>  /* Computes two field squarings:
> - * out[0] <- f[0] * f[0]
> - * out[1] <- f[1] * f[1]
> + *   out[0] <- f[0] * f[0]
> + *   out[1] <- f[1] * f[1]
>   * Uses the 16-element buffer tmp for intermediate results */
>  static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>  {
>         asm volatile(
>                 /* Step 1: Compute all partial products */
> -               "  movq 0(%1), %%rdx;"                                       /* f[0] */
> -               "  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
> -               "  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
> -               "  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
> -               "  movq 24(%1), %%rdx;"                                      /* f[3] */
> -               "  mulxq 8(%1), %%r11, %%rbx;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
> -               "  mulxq 16(%1), %%rax, %%r13;"    "  adcx %%rax, %%rbx;"    /* f[2]*f[3] */
> -               "  movq 8(%1), %%rdx;"             "  adcx %%r15, %%r13;"    /* f1 */
> -               "  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
> +               "  movq 0(%0), %%rdx;" /* f[0] */
> +               "  mulxq 8(%0), %%r8, %%r14;"
> +               "  xor %%r15d, %%r15d;" /* f[1]*f[0] */
> +               "  mulxq 16(%0), %%r9, %%r10;"
> +               "  adcx %%r14, %%r9;" /* f[2]*f[0] */
> +               "  mulxq 24(%0), %%rax, %%rcx;"
> +               "  adcx %%rax, %%r10;" /* f[3]*f[0] */
> +               "  movq 24(%0), %%rdx;" /* f[3] */
> +               "  mulxq 8(%0), %%r11, %%rbx;"
> +               "  adcx %%rcx, %%r11;" /* f[1]*f[3] */
> +               "  mulxq 16(%0), %%rax, %%r13;"
> +               "  adcx %%rax, %%rbx;" /* f[2]*f[3] */
> +               "  movq 8(%0), %%rdx;"
> +               "  adcx %%r15, %%r13;" /* f1 */
> +               "  mulxq 16(%0), %%rax, %%rcx;"
> +               "  mov $0, %%r14;" /* f[2]*f[1] */
>
>                 /* Step 2: Compute two parallel carry chains */
>                 "  xor %%r15d, %%r15d;"
> @@ -633,29 +815,47 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>                 "  adcx %%r14, %%r14;"
>
>                 /* Step 3: Compute intermediate squares */
> -               "  movq 0(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[0]^2 */
> -                                          "  movq %%rax, 0(%0);"
> -               "  add %%rcx, %%r8;"       "  movq %%r8, 8(%0);"
> -               "  movq 8(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[1]^2 */
> -               "  adcx %%rax, %%r9;"      "  movq %%r9, 16(%0);"
> -               "  adcx %%rcx, %%r10;"     "  movq %%r10, 24(%0);"
> -               "  movq 16(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[2]^2 */
> -               "  adcx %%rax, %%r11;"     "  movq %%r11, 32(%0);"
> -               "  adcx %%rcx, %%rbx;"     "  movq %%rbx, 40(%0);"
> -               "  movq 24(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[3]^2 */
> -               "  adcx %%rax, %%r13;"     "  movq %%r13, 48(%0);"
> -               "  adcx %%rcx, %%r14;"     "  movq %%r14, 56(%0);"
> +               "  movq 0(%0), %%rdx;"
> +               "  mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
> +               "  movq %%rax, 0(%1);"
> +               "  add %%rcx, %%r8;"
> +               "  movq %%r8, 8(%1);"
> +               "  movq 8(%0), %%rdx;"
> +               "  mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
> +               "  adcx %%rax, %%r9;"
> +               "  movq %%r9, 16(%1);"
> +               "  adcx %%rcx, %%r10;"
> +               "  movq %%r10, 24(%1);"
> +               "  movq 16(%0), %%rdx;"
> +               "  mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
> +               "  adcx %%rax, %%r11;"
> +               "  movq %%r11, 32(%1);"
> +               "  adcx %%rcx, %%rbx;"
> +               "  movq %%rbx, 40(%1);"
> +               "  movq 24(%0), %%rdx;"
> +               "  mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
> +               "  adcx %%rax, %%r13;"
> +               "  movq %%r13, 48(%1);"
> +               "  adcx %%rcx, %%r14;"
> +               "  movq %%r14, 56(%1);"
>
>                 /* Step 1: Compute all partial products */
> -               "  movq 32(%1), %%rdx;"                                       /* f[0] */
> -               "  mulxq 40(%1), %%r8, %%r14;"     "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
> -               "  mulxq 48(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
> -               "  mulxq 56(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
> -               "  movq 56(%1), %%rdx;"                                      /* f[3] */
> -               "  mulxq 40(%1), %%r11, %%rbx;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
> -               "  mulxq 48(%1), %%rax, %%r13;"    "  adcx %%rax, %%rbx;"    /* f[2]*f[3] */
> -               "  movq 40(%1), %%rdx;"             "  adcx %%r15, %%r13;"    /* f1 */
> -               "  mulxq 48(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
> +               "  movq 32(%0), %%rdx;" /* f[0] */
> +               "  mulxq 40(%0), %%r8, %%r14;"
> +               "  xor %%r15d, %%r15d;" /* f[1]*f[0] */
> +               "  mulxq 48(%0), %%r9, %%r10;"
> +               "  adcx %%r14, %%r9;" /* f[2]*f[0] */
> +               "  mulxq 56(%0), %%rax, %%rcx;"
> +               "  adcx %%rax, %%r10;" /* f[3]*f[0] */
> +               "  movq 56(%0), %%rdx;" /* f[3] */
> +               "  mulxq 40(%0), %%r11, %%rbx;"
> +               "  adcx %%rcx, %%r11;" /* f[1]*f[3] */
> +               "  mulxq 48(%0), %%rax, %%r13;"
> +               "  adcx %%rax, %%rbx;" /* f[2]*f[3] */
> +               "  movq 40(%0), %%rdx;"
> +               "  adcx %%r15, %%r13;" /* f1 */
> +               "  mulxq 48(%0), %%rax, %%rcx;"
> +               "  mov $0, %%r14;" /* f[2]*f[1] */
>
>                 /* Step 2: Compute two parallel carry chains */
>                 "  xor %%r15d, %%r15d;"
> @@ -673,37 +873,48 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>                 "  adcx %%r14, %%r14;"
>
>                 /* Step 3: Compute intermediate squares */
> -               "  movq 32(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[0]^2 */
> -                                          "  movq %%rax, 64(%0);"
> -               "  add %%rcx, %%r8;"       "  movq %%r8, 72(%0);"
> -               "  movq 40(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[1]^2 */
> -               "  adcx %%rax, %%r9;"      "  movq %%r9, 80(%0);"
> -               "  adcx %%rcx, %%r10;"     "  movq %%r10, 88(%0);"
> -               "  movq 48(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[2]^2 */
> -               "  adcx %%rax, %%r11;"     "  movq %%r11, 96(%0);"
> -               "  adcx %%rcx, %%rbx;"     "  movq %%rbx, 104(%0);"
> -               "  movq 56(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[3]^2 */
> -               "  adcx %%rax, %%r13;"     "  movq %%r13, 112(%0);"
> -               "  adcx %%rcx, %%r14;"     "  movq %%r14, 120(%0);"
> +               "  movq 32(%0), %%rdx;"
> +               "  mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
> +               "  movq %%rax, 64(%1);"
> +               "  add %%rcx, %%r8;"
> +               "  movq %%r8, 72(%1);"
> +               "  movq 40(%0), %%rdx;"
> +               "  mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
> +               "  adcx %%rax, %%r9;"
> +               "  movq %%r9, 80(%1);"
> +               "  adcx %%rcx, %%r10;"
> +               "  movq %%r10, 88(%1);"
> +               "  movq 48(%0), %%rdx;"
> +               "  mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
> +               "  adcx %%rax, %%r11;"
> +               "  movq %%r11, 96(%1);"
> +               "  adcx %%rcx, %%rbx;"
> +               "  movq %%rbx, 104(%1);"
> +               "  movq 56(%0), %%rdx;"
> +               "  mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
> +               "  adcx %%rax, %%r13;"
> +               "  movq %%r13, 112(%1);"
> +               "  adcx %%rcx, %%r14;"
> +               "  movq %%r14, 120(%1);"
>
>                 /* Line up pointers */
> -               "  mov %0, %1;"
> -               "  mov %2, %0;"
> +               "  mov %1, %0;"
> +               "  mov %2, %1;"
>
>                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>                 "  mov $38, %%rdx;"
> -               "  mulxq 32(%1), %%r8, %%r13;"
> +               "  mulxq 32(%0), %%r8, %%r13;"
>                 "  xor %%ecx, %%ecx;"
> -               "  adoxq 0(%1), %%r8;"
> -               "  mulxq 40(%1), %%r9, %%rbx;"
> +               "  adoxq 0(%0), %%r8;"
> +               "  mulxq 40(%0), %%r9, %%rbx;"
>                 "  adcx %%r13, %%r9;"
> -               "  adoxq 8(%1), %%r9;"
> -               "  mulxq 48(%1), %%r10, %%r13;"
> +               "  adoxq 8(%0), %%r9;"
> +               "  mulxq 48(%0), %%r10, %%r13;"
>                 "  adcx %%rbx, %%r10;"
> -               "  adoxq 16(%1), %%r10;"
> -               "  mulxq 56(%1), %%r11, %%rax;"
> +               "  adoxq 16(%0), %%r10;"
> +               "  mulxq 56(%0), %%r11, %%rax;"
>                 "  adcx %%r13, %%r11;"
> -               "  adoxq 24(%1), %%r11;"
> +               "  adoxq 24(%0), %%r11;"
>                 "  adcx %%rcx, %%rax;"
>                 "  adox %%rcx, %%rax;"
>                 "  imul %%rdx, %%rax;"
> @@ -711,32 +922,32 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>                 /* Step 2: Fold the carry back into dst */
>                 "  add %%rax, %%r8;"
>                 "  adcx %%rcx, %%r9;"
> -               "  movq %%r9, 8(%0);"
> +               "  movq %%r9, 8(%1);"
>                 "  adcx %%rcx, %%r10;"
> -               "  movq %%r10, 16(%0);"
> +               "  movq %%r10, 16(%1);"
>                 "  adcx %%rcx, %%r11;"
> -               "  movq %%r11, 24(%0);"
> +               "  movq %%r11, 24(%1);"
>
>                 /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
>                 "  mov $0, %%rax;"
>                 "  cmovc %%rdx, %%rax;"
>                 "  add %%rax, %%r8;"
> -               "  movq %%r8, 0(%0);"
> +               "  movq %%r8, 0(%1);"
>
>                 /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>                 "  mov $38, %%rdx;"
> -               "  mulxq 96(%1), %%r8, %%r13;"
> +               "  mulxq 96(%0), %%r8, %%r13;"
>                 "  xor %%ecx, %%ecx;"
> -               "  adoxq 64(%1), %%r8;"
> -               "  mulxq 104(%1), %%r9, %%rbx;"
> +               "  adoxq 64(%0), %%r8;"
> +               "  mulxq 104(%0), %%r9, %%rbx;"
>                 "  adcx %%r13, %%r9;"
> -               "  adoxq 72(%1), %%r9;"
> -               "  mulxq 112(%1), %%r10, %%r13;"
> +               "  adoxq 72(%0), %%r9;"
> +               "  mulxq 112(%0), %%r10, %%r13;"
>                 "  adcx %%rbx, %%r10;"
> -               "  adoxq 80(%1), %%r10;"
> -               "  mulxq 120(%1), %%r11, %%rax;"
> +               "  adoxq 80(%0), %%r10;"
> +               "  mulxq 120(%0), %%r11, %%rax;"
>                 "  adcx %%r13, %%r11;"
> -               "  adoxq 88(%1), %%r11;"
> +               "  adoxq 88(%0), %%r11;"
>                 "  adcx %%rcx, %%rax;"
>                 "  adox %%rcx, %%rax;"
>                 "  imul %%rdx, %%rax;"
> @@ -744,21 +955,21 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>                 /* Step 2: Fold the carry back into dst */
>                 "  add %%rax, %%r8;"
>                 "  adcx %%rcx, %%r9;"
> -               "  movq %%r9, 40(%0);"
> +               "  movq %%r9, 40(%1);"
>                 "  adcx %%rcx, %%r10;"
> -               "  movq %%r10, 48(%0);"
> +               "  movq %%r10, 48(%1);"
>                 "  adcx %%rcx, %%r11;"
> -               "  movq %%r11, 56(%0);"
> +               "  movq %%r11, 56(%1);"
>
>                 /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
>                 "  mov $0, %%rax;"
>                 "  cmovc %%rdx, %%rax;"
>                 "  add %%rax, %%r8;"
> -               "  movq %%r8, 32(%0);"
> -       : "+&r" (tmp), "+&r" (f), "+&r" (out)
> -       :
> -       : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
> -       );
> +               "  movq %%r8, 32(%1);"
> +               : "+&r"(f), "+&r"(tmp)
> +               : "r"(out)
> +               : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
> +                 "%r13", "%r14", "%r15", "memory", "cc");
>  }
>
>  static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2)
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] crypto: x86/curve25519 - use in/out register constraints more precisely
  2021-12-14 16:01 [PATCH] crypto: x86/curve25519 - use in/out register constraints more precisely Jason A. Donenfeld
  2021-12-14 17:23 ` Ard Biesheuvel
@ 2021-12-14 18:05 ` Mathias Krause
  2021-12-24  3:25 ` Herbert Xu
  2 siblings, 0 replies; 6+ messages in thread
From: Mathias Krause @ 2021-12-14 18:05 UTC (permalink / raw)
  To: Jason A. Donenfeld, linux-crypto, Herbert Xu; +Cc: Aymeric Fromherz

On 14.12.21 at 17:01, Jason A. Donenfeld wrote:
> Rather than passing all variables as modified, pass ones that are only
> read into that parameter. This helps with old gcc versions when
> alternatives are additionally used, and lets gcc's codegen be a little
> bit more efficient. This also syncs up with the latest Vale/EverCrypt
> output.
> 
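
For readers less used to gcc's extended asm, a minimal sketch of the
constraint style the patch moves to (a made-up helper, not code from the
patch): a pointer that the asm only dereferences, and whose register is never
overwritten, can be listed as a plain "r" input together with a "memory"
clobber, instead of being declared as a "+&r" read-write output.  Pointers
that the asm itself clobbers (e.g. by the "Line up pointers" moves) have to
stay in the output list as "+&r".

	/* Hypothetical example, illustrative only. */
	typedef unsigned long long u64;	/* stand-in for the kernel's u64 */

	static inline void copy_limb(u64 *out, const u64 *in)
	{
		asm volatile(
			/* Reads *in and writes *out, but never modifies the
			 * pointer registers themselves, so both are plain
			 * "r" inputs; the "memory" clobber covers the store. */
			"  movq 0(%1), %%rax;"
			"  movq %%rax, 0(%0);"
			:
			: "r"(out), "r"(in)
			: "%rax", "memory");
	}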
> Reported-by: Mathias Krause <minipli@grsecurity.net>
> Cc: Aymeric Fromherz <aymeric.fromherz@inria.fr>
> Link: https://lore.kernel.org/wireguard/1554725710.1290070.1639240504281.JavaMail.zimbra@inria.fr/
> Link: https://github.com/project-everest/hacl-star/pull/501
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
> ---
>  arch/x86/crypto/curve25519-x86_64.c | 767 ++++++++++++++++++----------
>  1 file changed, 489 insertions(+), 278 deletions(-)
> 
> diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c
> index 38caf61cd5b7..d55fa9e9b9e6 100644
> --- a/arch/x86/crypto/curve25519-x86_64.c
> +++ b/arch/x86/crypto/curve25519-x86_64.c
> @@ -64,10 +64,9 @@ static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
>  
>  		/* Return the carry bit in a register */
>  		"  adcx %%r11, %1;"
> -	: "+&r" (f2), "=&r" (carry_r)
> -	: "r" (out), "r" (f1)
> -	: "%r8", "%r9", "%r10", "%r11", "memory", "cc"
> -	);
> +		: "+&r"(f2), "=&r"(carry_r)
> +		: "r"(out), "r"(f1)
> +		: "%r8", "%r9", "%r10", "%r11", "memory", "cc");
>  
>  	return carry_r;
>  }
> @@ -108,10 +107,9 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
>  		"  cmovc %0, %%rax;"
>  		"  add %%rax, %%r8;"
>  		"  movq %%r8, 0(%1);"
> -	: "+&r" (f2)
> -	: "r" (out), "r" (f1)
> -	: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
> -	);
> +		: "+&r"(f2)
> +		: "r"(out), "r"(f1)
> +		: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
>  }
>  
>  /* Computes the field subtraction of two field elements */
> @@ -151,10 +149,9 @@ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
>  		"  movq %%r9, 8(%0);"
>  		"  movq %%r10, 16(%0);"
>  		"  movq %%r11, 24(%0);"
> -	:
> -	: "r" (out), "r" (f1), "r" (f2)
> -	: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
> -	);
> +		:
> +		: "r"(out), "r"(f1), "r"(f2)
> +		: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
>  }
>  
>  /* Computes a field multiplication: out <- f1 * f2
> @@ -162,239 +159,400 @@ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
>  static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>  {
>  	asm volatile(
> +
>  		/* Compute the raw multiplication: tmp <- src1 * src2 */
>  
>  		/* Compute src1[0] * src2 */
> -		"  movq 0(%1), %%rdx;"
> -		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  movq %%r8, 0(%0);"
> -		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
> -		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
> -		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
> -		                                   "  adox %%rdx, %%rax;"
> +		"  movq 0(%0), %%rdx;"
> +		"  mulxq 0(%1), %%r8, %%r9;"
> +		"  xor %%r10d, %%r10d;"
> +		"  movq %%r8, 0(%2);"
> +		"  mulxq 8(%1), %%r10, %%r11;"
> +		"  adox %%r9, %%r10;"
> +		"  movq %%r10, 8(%2);"
> +		"  mulxq 16(%1), %%rbx, %%r13;"
> +		"  adox %%r11, %%rbx;"
> +		"  mulxq 24(%1), %%r14, %%rdx;"
> +		"  adox %%r13, %%r14;"
> +		"  mov $0, %%rax;"
> +		"  adox %%rdx, %%rax;"
> +
>  		/* Compute src1[1] * src2 */
> -		"  movq 8(%1), %%rdx;"
> -		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
> -		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
> -		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
> -		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
> -		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
> +		"  movq 8(%0), %%rdx;"
> +		"  mulxq 0(%1), %%r8, %%r9;"
> +		"  xor %%r10d, %%r10d;"
> +		"  adcxq 8(%2), %%r8;"
> +		"  movq %%r8, 8(%2);"
> +		"  mulxq 8(%1), %%r10, %%r11;"
> +		"  adox %%r9, %%r10;"
> +		"  adcx %%rbx, %%r10;"
> +		"  movq %%r10, 16(%2);"
> +		"  mulxq 16(%1), %%rbx, %%r13;"
> +		"  adox %%r11, %%rbx;"
> +		"  adcx %%r14, %%rbx;"
> +		"  mov $0, %%r8;"
> +		"  mulxq 24(%1), %%r14, %%rdx;"
> +		"  adox %%r13, %%r14;"
> +		"  adcx %%rax, %%r14;"
> +		"  mov $0, %%rax;"
> +		"  adox %%rdx, %%rax;"
> +		"  adcx %%r8, %%rax;"
> +
>  		/* Compute src1[2] * src2 */
> -		"  movq 16(%1), %%rdx;"
> -		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 16(%0), %%r8;"   "  movq %%r8, 16(%0);"
> -		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
> -		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
> -		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
> -		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
> +		"  movq 16(%0), %%rdx;"
> +		"  mulxq 0(%1), %%r8, %%r9;"
> +		"  xor %%r10d, %%r10d;"
> +		"  adcxq 16(%2), %%r8;"
> +		"  movq %%r8, 16(%2);"
> +		"  mulxq 8(%1), %%r10, %%r11;"
> +		"  adox %%r9, %%r10;"
> +		"  adcx %%rbx, %%r10;"
> +		"  movq %%r10, 24(%2);"
> +		"  mulxq 16(%1), %%rbx, %%r13;"
> +		"  adox %%r11, %%rbx;"
> +		"  adcx %%r14, %%rbx;"
> +		"  mov $0, %%r8;"
> +		"  mulxq 24(%1), %%r14, %%rdx;"
> +		"  adox %%r13, %%r14;"
> +		"  adcx %%rax, %%r14;"
> +		"  mov $0, %%rax;"
> +		"  adox %%rdx, %%rax;"
> +		"  adcx %%r8, %%rax;"
> +
>  		/* Compute src1[3] * src2 */
> -		"  movq 24(%1), %%rdx;"
> -		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 24(%0), %%r8;"   "  movq %%r8, 24(%0);"
> -		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
> -		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
> -		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
> -		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 56(%0);"
> +		"  movq 24(%0), %%rdx;"
> +		"  mulxq 0(%1), %%r8, %%r9;"
> +		"  xor %%r10d, %%r10d;"
> +		"  adcxq 24(%2), %%r8;"
> +		"  movq %%r8, 24(%2);"
> +		"  mulxq 8(%1), %%r10, %%r11;"
> +		"  adox %%r9, %%r10;"
> +		"  adcx %%rbx, %%r10;"
> +		"  movq %%r10, 32(%2);"
> +		"  mulxq 16(%1), %%rbx, %%r13;"
> +		"  adox %%r11, %%rbx;"
> +		"  adcx %%r14, %%rbx;"
> +		"  movq %%rbx, 40(%2);"
> +		"  mov $0, %%r8;"
> +		"  mulxq 24(%1), %%r14, %%rdx;"
> +		"  adox %%r13, %%r14;"
> +		"  adcx %%rax, %%r14;"
> +		"  movq %%r14, 48(%2);"
> +		"  mov $0, %%rax;"
> +		"  adox %%rdx, %%rax;"
> +		"  adcx %%r8, %%rax;"
> +		"  movq %%rax, 56(%2);"
> +
>  		/* Line up pointers */
> -		"  mov %0, %1;"
>  		"  mov %2, %0;"
> +		"  mov %3, %2;"
>  
>  		/* Wrap the result back into the field */
>  
>  		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>  		"  mov $38, %%rdx;"
> -		"  mulxq 32(%1), %%r8, %%r13;"
> -		"  xor %k3, %k3;"
> -		"  adoxq 0(%1), %%r8;"
> -		"  mulxq 40(%1), %%r9, %%rbx;"
> +		"  mulxq 32(%0), %%r8, %%r13;"
> +		"  xor %k1, %k1;"
> +		"  adoxq 0(%0), %%r8;"
> +		"  mulxq 40(%0), %%r9, %%rbx;"
>  		"  adcx %%r13, %%r9;"
> -		"  adoxq 8(%1), %%r9;"
> -		"  mulxq 48(%1), %%r10, %%r13;"
> +		"  adoxq 8(%0), %%r9;"
> +		"  mulxq 48(%0), %%r10, %%r13;"
>  		"  adcx %%rbx, %%r10;"
> -		"  adoxq 16(%1), %%r10;"
> -		"  mulxq 56(%1), %%r11, %%rax;"
> +		"  adoxq 16(%0), %%r10;"
> +		"  mulxq 56(%0), %%r11, %%rax;"
>  		"  adcx %%r13, %%r11;"
> -		"  adoxq 24(%1), %%r11;"
> -		"  adcx %3, %%rax;"
> -		"  adox %3, %%rax;"
> +		"  adoxq 24(%0), %%r11;"
> +		"  adcx %1, %%rax;"
> +		"  adox %1, %%rax;"
>  		"  imul %%rdx, %%rax;"
>  
>  		/* Step 2: Fold the carry back into dst */
>  		"  add %%rax, %%r8;"
> -		"  adcx %3, %%r9;"
> -		"  movq %%r9, 8(%0);"
> -		"  adcx %3, %%r10;"
> -		"  movq %%r10, 16(%0);"
> -		"  adcx %3, %%r11;"
> -		"  movq %%r11, 24(%0);"
> +		"  adcx %1, %%r9;"
> +		"  movq %%r9, 8(%2);"
> +		"  adcx %1, %%r10;"
> +		"  movq %%r10, 16(%2);"
> +		"  adcx %1, %%r11;"
> +		"  movq %%r11, 24(%2);"
>  
>  		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
>  		"  mov $0, %%rax;"
>  		"  cmovc %%rdx, %%rax;"
>  		"  add %%rax, %%r8;"
> -		"  movq %%r8, 0(%0);"
> -	: "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
> -	:
> -	: "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc"
> -	);
> +		"  movq %%r8, 0(%2);"
> +		: "+&r"(f1), "+&r"(f2), "+&r"(tmp)
> +		: "r"(out)
> +		: "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
> +		  "%r14", "memory", "cc");
>  }
>  
>  /* Computes two field multiplications:
> - * out[0] <- f1[0] * f2[0]
> - * out[1] <- f1[1] * f2[1]
> - * Uses the 16-element buffer tmp for intermediate results. */
> + *   out[0] <- f1[0] * f2[0]
> + *   out[1] <- f1[1] * f2[1]
> + * Uses the 16-element buffer tmp for intermediate results: */
>  static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>  {
>  	asm volatile(
> +
>  		/* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */
>  
>  		/* Compute src1[0] * src2 */
> -		"  movq 0(%1), %%rdx;"
> -		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  movq %%r8, 0(%0);"
> -		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
> -		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
> -		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
> -		                                   "  adox %%rdx, %%rax;"
> +		"  movq 0(%0), %%rdx;"
> +		"  mulxq 0(%1), %%r8, %%r9;"
> +		"  xor %%r10d, %%r10d;"
> +		"  movq %%r8, 0(%2);"
> +		"  mulxq 8(%1), %%r10, %%r11;"
> +		"  adox %%r9, %%r10;"
> +		"  movq %%r10, 8(%2);"
> +		"  mulxq 16(%1), %%rbx, %%r13;"
> +		"  adox %%r11, %%rbx;"
> +		"  mulxq 24(%1), %%r14, %%rdx;"
> +		"  adox %%r13, %%r14;"
> +		"  mov $0, %%rax;"
> +		"  adox %%rdx, %%rax;"
> +
>  		/* Compute src1[1] * src2 */
> -		"  movq 8(%1), %%rdx;"
> -		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
> -		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
> -		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
> -		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
> -		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
> +		"  movq 8(%0), %%rdx;"
> +		"  mulxq 0(%1), %%r8, %%r9;"
> +		"  xor %%r10d, %%r10d;"
> +		"  adcxq 8(%2), %%r8;"
> +		"  movq %%r8, 8(%2);"
> +		"  mulxq 8(%1), %%r10, %%r11;"
> +		"  adox %%r9, %%r10;"
> +		"  adcx %%rbx, %%r10;"
> +		"  movq %%r10, 16(%2);"
> +		"  mulxq 16(%1), %%rbx, %%r13;"
> +		"  adox %%r11, %%rbx;"
> +		"  adcx %%r14, %%rbx;"
> +		"  mov $0, %%r8;"
> +		"  mulxq 24(%1), %%r14, %%rdx;"
> +		"  adox %%r13, %%r14;"
> +		"  adcx %%rax, %%r14;"
> +		"  mov $0, %%rax;"
> +		"  adox %%rdx, %%rax;"
> +		"  adcx %%r8, %%rax;"
> +
>  		/* Compute src1[2] * src2 */
> -		"  movq 16(%1), %%rdx;"
> -		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 16(%0), %%r8;"   "  movq %%r8, 16(%0);"
> -		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
> -		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
> -		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
> -		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
> +		"  movq 16(%0), %%rdx;"
> +		"  mulxq 0(%1), %%r8, %%r9;"
> +		"  xor %%r10d, %%r10d;"
> +		"  adcxq 16(%2), %%r8;"
> +		"  movq %%r8, 16(%2);"
> +		"  mulxq 8(%1), %%r10, %%r11;"
> +		"  adox %%r9, %%r10;"
> +		"  adcx %%rbx, %%r10;"
> +		"  movq %%r10, 24(%2);"
> +		"  mulxq 16(%1), %%rbx, %%r13;"
> +		"  adox %%r11, %%rbx;"
> +		"  adcx %%r14, %%rbx;"
> +		"  mov $0, %%r8;"
> +		"  mulxq 24(%1), %%r14, %%rdx;"
> +		"  adox %%r13, %%r14;"
> +		"  adcx %%rax, %%r14;"
> +		"  mov $0, %%rax;"
> +		"  adox %%rdx, %%rax;"
> +		"  adcx %%r8, %%rax;"
> +
>  		/* Compute src1[3] * src2 */
> -		"  movq 24(%1), %%rdx;"
> -		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 24(%0), %%r8;"   "  movq %%r8, 24(%0);"
> -		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
> -		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
> -		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
> -		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 56(%0);"
> +		"  movq 24(%0), %%rdx;"
> +		"  mulxq 0(%1), %%r8, %%r9;"
> +		"  xor %%r10d, %%r10d;"
> +		"  adcxq 24(%2), %%r8;"
> +		"  movq %%r8, 24(%2);"
> +		"  mulxq 8(%1), %%r10, %%r11;"
> +		"  adox %%r9, %%r10;"
> +		"  adcx %%rbx, %%r10;"
> +		"  movq %%r10, 32(%2);"
> +		"  mulxq 16(%1), %%rbx, %%r13;"
> +		"  adox %%r11, %%rbx;"
> +		"  adcx %%r14, %%rbx;"
> +		"  movq %%rbx, 40(%2);"
> +		"  mov $0, %%r8;"
> +		"  mulxq 24(%1), %%r14, %%rdx;"
> +		"  adox %%r13, %%r14;"
> +		"  adcx %%rax, %%r14;"
> +		"  movq %%r14, 48(%2);"
> +		"  mov $0, %%rax;"
> +		"  adox %%rdx, %%rax;"
> +		"  adcx %%r8, %%rax;"
> +		"  movq %%rax, 56(%2);"
>  
>  		/* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */
>  
>  		/* Compute src1[0] * src2 */
> -		"  movq 32(%1), %%rdx;"
> -		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  movq %%r8, 64(%0);"
> -		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  movq %%r10, 72(%0);"
> -		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
> -		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
> -		                                   "  adox %%rdx, %%rax;"
> +		"  movq 32(%0), %%rdx;"
> +		"  mulxq 32(%1), %%r8, %%r9;"
> +		"  xor %%r10d, %%r10d;"
> +		"  movq %%r8, 64(%2);"
> +		"  mulxq 40(%1), %%r10, %%r11;"
> +		"  adox %%r9, %%r10;"
> +		"  movq %%r10, 72(%2);"
> +		"  mulxq 48(%1), %%rbx, %%r13;"
> +		"  adox %%r11, %%rbx;"
> +		"  mulxq 56(%1), %%r14, %%rdx;"
> +		"  adox %%r13, %%r14;"
> +		"  mov $0, %%rax;"
> +		"  adox %%rdx, %%rax;"
> +
>  		/* Compute src1[1] * src2 */
> -		"  movq 40(%1), %%rdx;"
> -		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 72(%0), %%r8;"   "  movq %%r8, 72(%0);"
> -		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 80(%0);"
> -		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
> -		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
> -		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
> +		"  movq 40(%0), %%rdx;"
> +		"  mulxq 32(%1), %%r8, %%r9;"
> +		"  xor %%r10d, %%r10d;"
> +		"  adcxq 72(%2), %%r8;"
> +		"  movq %%r8, 72(%2);"
> +		"  mulxq 40(%1), %%r10, %%r11;"
> +		"  adox %%r9, %%r10;"
> +		"  adcx %%rbx, %%r10;"
> +		"  movq %%r10, 80(%2);"
> +		"  mulxq 48(%1), %%rbx, %%r13;"
> +		"  adox %%r11, %%rbx;"
> +		"  adcx %%r14, %%rbx;"
> +		"  mov $0, %%r8;"
> +		"  mulxq 56(%1), %%r14, %%rdx;"
> +		"  adox %%r13, %%r14;"
> +		"  adcx %%rax, %%r14;"
> +		"  mov $0, %%rax;"
> +		"  adox %%rdx, %%rax;"
> +		"  adcx %%r8, %%rax;"
> +
>  		/* Compute src1[2] * src2 */
> -		"  movq 48(%1), %%rdx;"
> -		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 80(%0), %%r8;"   "  movq %%r8, 80(%0);"
> -		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 88(%0);"
> -		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
> -		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
> -		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
> +		"  movq 48(%0), %%rdx;"
> +		"  mulxq 32(%1), %%r8, %%r9;"
> +		"  xor %%r10d, %%r10d;"
> +		"  adcxq 80(%2), %%r8;"
> +		"  movq %%r8, 80(%2);"
> +		"  mulxq 40(%1), %%r10, %%r11;"
> +		"  adox %%r9, %%r10;"
> +		"  adcx %%rbx, %%r10;"
> +		"  movq %%r10, 88(%2);"
> +		"  mulxq 48(%1), %%rbx, %%r13;"
> +		"  adox %%r11, %%rbx;"
> +		"  adcx %%r14, %%rbx;"
> +		"  mov $0, %%r8;"
> +		"  mulxq 56(%1), %%r14, %%rdx;"
> +		"  adox %%r13, %%r14;"
> +		"  adcx %%rax, %%r14;"
> +		"  mov $0, %%rax;"
> +		"  adox %%rdx, %%rax;"
> +		"  adcx %%r8, %%rax;"
> +
>  		/* Compute src1[3] * src2 */
> -		"  movq 56(%1), %%rdx;"
> -		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 88(%0), %%r8;"   "  movq %%r8, 88(%0);"
> -		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 96(%0);"
> -		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 104(%0);"    "  mov $0, %%r8;"
> -		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 112(%0);"    "  mov $0, %%rax;"
> -		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 120(%0);"
> +		"  movq 56(%0), %%rdx;"
> +		"  mulxq 32(%1), %%r8, %%r9;"
> +		"  xor %%r10d, %%r10d;"
> +		"  adcxq 88(%2), %%r8;"
> +		"  movq %%r8, 88(%2);"
> +		"  mulxq 40(%1), %%r10, %%r11;"
> +		"  adox %%r9, %%r10;"
> +		"  adcx %%rbx, %%r10;"
> +		"  movq %%r10, 96(%2);"
> +		"  mulxq 48(%1), %%rbx, %%r13;"
> +		"  adox %%r11, %%rbx;"
> +		"  adcx %%r14, %%rbx;"
> +		"  movq %%rbx, 104(%2);"
> +		"  mov $0, %%r8;"
> +		"  mulxq 56(%1), %%r14, %%rdx;"
> +		"  adox %%r13, %%r14;"
> +		"  adcx %%rax, %%r14;"
> +		"  movq %%r14, 112(%2);"
> +		"  mov $0, %%rax;"
> +		"  adox %%rdx, %%rax;"
> +		"  adcx %%r8, %%rax;"
> +		"  movq %%rax, 120(%2);"
> +
>  		/* Line up pointers */
> -		"  mov %0, %1;"
>  		"  mov %2, %0;"
> +		"  mov %3, %2;"
>  
>  		/* Wrap the results back into the field */
>  
>  		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>  		"  mov $38, %%rdx;"
> -		"  mulxq 32(%1), %%r8, %%r13;"
> -		"  xor %k3, %k3;"
> -		"  adoxq 0(%1), %%r8;"
> -		"  mulxq 40(%1), %%r9, %%rbx;"
> +		"  mulxq 32(%0), %%r8, %%r13;"
> +		"  xor %k1, %k1;"
> +		"  adoxq 0(%0), %%r8;"
> +		"  mulxq 40(%0), %%r9, %%rbx;"
>  		"  adcx %%r13, %%r9;"
> -		"  adoxq 8(%1), %%r9;"
> -		"  mulxq 48(%1), %%r10, %%r13;"
> +		"  adoxq 8(%0), %%r9;"
> +		"  mulxq 48(%0), %%r10, %%r13;"
>  		"  adcx %%rbx, %%r10;"
> -		"  adoxq 16(%1), %%r10;"
> -		"  mulxq 56(%1), %%r11, %%rax;"
> +		"  adoxq 16(%0), %%r10;"
> +		"  mulxq 56(%0), %%r11, %%rax;"
>  		"  adcx %%r13, %%r11;"
> -		"  adoxq 24(%1), %%r11;"
> -		"  adcx %3, %%rax;"
> -		"  adox %3, %%rax;"
> +		"  adoxq 24(%0), %%r11;"
> +		"  adcx %1, %%rax;"
> +		"  adox %1, %%rax;"
>  		"  imul %%rdx, %%rax;"
>  
>  		/* Step 2: Fold the carry back into dst */
>  		"  add %%rax, %%r8;"
> -		"  adcx %3, %%r9;"
> -		"  movq %%r9, 8(%0);"
> -		"  adcx %3, %%r10;"
> -		"  movq %%r10, 16(%0);"
> -		"  adcx %3, %%r11;"
> -		"  movq %%r11, 24(%0);"
> +		"  adcx %1, %%r9;"
> +		"  movq %%r9, 8(%2);"
> +		"  adcx %1, %%r10;"
> +		"  movq %%r10, 16(%2);"
> +		"  adcx %1, %%r11;"
> +		"  movq %%r11, 24(%2);"
>  
>  		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
>  		"  mov $0, %%rax;"
>  		"  cmovc %%rdx, %%rax;"
>  		"  add %%rax, %%r8;"
> -		"  movq %%r8, 0(%0);"
> +		"  movq %%r8, 0(%2);"
>  
>  		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>  		"  mov $38, %%rdx;"
> -		"  mulxq 96(%1), %%r8, %%r13;"
> -		"  xor %k3, %k3;"
> -		"  adoxq 64(%1), %%r8;"
> -		"  mulxq 104(%1), %%r9, %%rbx;"
> +		"  mulxq 96(%0), %%r8, %%r13;"
> +		"  xor %k1, %k1;"
> +		"  adoxq 64(%0), %%r8;"
> +		"  mulxq 104(%0), %%r9, %%rbx;"
>  		"  adcx %%r13, %%r9;"
> -		"  adoxq 72(%1), %%r9;"
> -		"  mulxq 112(%1), %%r10, %%r13;"
> +		"  adoxq 72(%0), %%r9;"
> +		"  mulxq 112(%0), %%r10, %%r13;"
>  		"  adcx %%rbx, %%r10;"
> -		"  adoxq 80(%1), %%r10;"
> -		"  mulxq 120(%1), %%r11, %%rax;"
> +		"  adoxq 80(%0), %%r10;"
> +		"  mulxq 120(%0), %%r11, %%rax;"
>  		"  adcx %%r13, %%r11;"
> -		"  adoxq 88(%1), %%r11;"
> -		"  adcx %3, %%rax;"
> -		"  adox %3, %%rax;"
> +		"  adoxq 88(%0), %%r11;"
> +		"  adcx %1, %%rax;"
> +		"  adox %1, %%rax;"
>  		"  imul %%rdx, %%rax;"
>  
>  		/* Step 2: Fold the carry back into dst */
>  		"  add %%rax, %%r8;"
> -		"  adcx %3, %%r9;"
> -		"  movq %%r9, 40(%0);"
> -		"  adcx %3, %%r10;"
> -		"  movq %%r10, 48(%0);"
> -		"  adcx %3, %%r11;"
> -		"  movq %%r11, 56(%0);"
> +		"  adcx %1, %%r9;"
> +		"  movq %%r9, 40(%2);"
> +		"  adcx %1, %%r10;"
> +		"  movq %%r10, 48(%2);"
> +		"  adcx %1, %%r11;"
> +		"  movq %%r11, 56(%2);"
>  
>  		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
>  		"  mov $0, %%rax;"
>  		"  cmovc %%rdx, %%rax;"
>  		"  add %%rax, %%r8;"
> -		"  movq %%r8, 32(%0);"
> -	: "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
> -	:
> -	: "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc"
> -	);
> +		"  movq %%r8, 32(%2);"
> +		: "+&r"(f1), "+&r"(f2), "+&r"(tmp)
> +		: "r"(out)
> +		: "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
> +		  "%r14", "memory", "cc");
>  }
>  
> -/* Computes the field multiplication of four-element f1 with value in f2 */
> +/* Computes the field multiplication of four-element f1 with value in f2
> + * Requires f2 to be smaller than 2^17 */
>  static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
>  {
>  	register u64 f2_r asm("rdx") = f2;
>  
>  	asm volatile(
>  		/* Compute the raw multiplication of f1*f2 */
> -		"  mulxq 0(%2), %%r8, %%rcx;"      /* f1[0]*f2 */
> -		"  mulxq 8(%2), %%r9, %%rbx;"      /* f1[1]*f2 */
> +		"  mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */
> +		"  mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */
>  		"  add %%rcx, %%r9;"
>  		"  mov $0, %%rcx;"
> -		"  mulxq 16(%2), %%r10, %%r13;"    /* f1[2]*f2 */
> +		"  mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */
>  		"  adcx %%rbx, %%r10;"
> -		"  mulxq 24(%2), %%r11, %%rax;"    /* f1[3]*f2 */
> +		"  mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */
>  		"  adcx %%r13, %%r11;"
>  		"  adcx %%rcx, %%rax;"
>  
> @@ -418,17 +576,17 @@ static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
>  		"  cmovc %%rdx, %%rax;"
>  		"  add %%rax, %%r8;"
>  		"  movq %%r8, 0(%1);"
> -	: "+&r" (f2_r)
> -	: "r" (out), "r" (f1)
> -	: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "memory", "cc"
> -	);
> +		: "+&r"(f2_r)
> +		: "r"(out), "r"(f1)
> +		: "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r13",
> +		  "memory", "cc");
>  }
>  
>  /* Computes p1 <- bit ? p2 : p1 in constant time */
>  static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
>  {
>  	asm volatile(
> -		/* Invert the polarity of bit to match cmov expectations */
> +		/* Transfer bit into CF flag */
>  		"  add $18446744073709551615, %0;"
>  
>  		/* cswap p1[0], p2[0] */
> @@ -502,10 +660,9 @@ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
>  		"  cmovc %%r10, %%r9;"
>  		"  movq %%r8, 56(%1);"
>  		"  movq %%r9, 56(%2);"
> -	: "+&r" (bit)
> -	: "r" (p1), "r" (p2)
> -	: "%r8", "%r9", "%r10", "memory", "cc"
> -	);
> +		: "+&r"(bit)
> +		: "r"(p1), "r"(p2)
> +		: "%r8", "%r9", "%r10", "memory", "cc");
>  }
>  
>  /* Computes the square of a field element: out <- f * f
> @@ -516,15 +673,22 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
>  		/* Compute the raw multiplication: tmp <- f * f */
>  
>  		/* Step 1: Compute all partial products */
> -		"  movq 0(%1), %%rdx;"                                       /* f[0] */
> -		"  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
> -		"  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
> -		"  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
> -		"  movq 24(%1), %%rdx;"                                      /* f[3] */
> -		"  mulxq 8(%1), %%r11, %%rbx;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
> -		"  mulxq 16(%1), %%rax, %%r13;"    "  adcx %%rax, %%rbx;"    /* f[2]*f[3] */
> -		"  movq 8(%1), %%rdx;"             "  adcx %%r15, %%r13;"    /* f1 */
> -		"  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
> +		"  movq 0(%0), %%rdx;" /* f[0] */
> +		"  mulxq 8(%0), %%r8, %%r14;"
> +		"  xor %%r15d, %%r15d;" /* f[1]*f[0] */
> +		"  mulxq 16(%0), %%r9, %%r10;"
> +		"  adcx %%r14, %%r9;" /* f[2]*f[0] */
> +		"  mulxq 24(%0), %%rax, %%rcx;"
> +		"  adcx %%rax, %%r10;" /* f[3]*f[0] */
> +		"  movq 24(%0), %%rdx;" /* f[3] */
> +		"  mulxq 8(%0), %%r11, %%rbx;"
> +		"  adcx %%rcx, %%r11;" /* f[1]*f[3] */
> +		"  mulxq 16(%0), %%rax, %%r13;"
> +		"  adcx %%rax, %%rbx;" /* f[2]*f[3] */
> +		"  movq 8(%0), %%rdx;"
> +		"  adcx %%r15, %%r13;" /* f1 */
> +		"  mulxq 16(%0), %%rax, %%rcx;"
> +		"  mov $0, %%r14;" /* f[2]*f[1] */
>  
>  		/* Step 2: Compute two parallel carry chains */
>  		"  xor %%r15d, %%r15d;"
> @@ -542,39 +706,50 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
>  		"  adcx %%r14, %%r14;"
>  
>  		/* Step 3: Compute intermediate squares */
> -		"  movq 0(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[0]^2 */
> -		                           "  movq %%rax, 0(%0);"
> -		"  add %%rcx, %%r8;"       "  movq %%r8, 8(%0);"
> -		"  movq 8(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[1]^2 */
> -		"  adcx %%rax, %%r9;"      "  movq %%r9, 16(%0);"
> -		"  adcx %%rcx, %%r10;"     "  movq %%r10, 24(%0);"
> -		"  movq 16(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[2]^2 */
> -		"  adcx %%rax, %%r11;"     "  movq %%r11, 32(%0);"
> -		"  adcx %%rcx, %%rbx;"     "  movq %%rbx, 40(%0);"
> -		"  movq 24(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[3]^2 */
> -		"  adcx %%rax, %%r13;"     "  movq %%r13, 48(%0);"
> -		"  adcx %%rcx, %%r14;"     "  movq %%r14, 56(%0);"
> +		"  movq 0(%0), %%rdx;"
> +		"  mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
> +		"  movq %%rax, 0(%1);"
> +		"  add %%rcx, %%r8;"
> +		"  movq %%r8, 8(%1);"
> +		"  movq 8(%0), %%rdx;"
> +		"  mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
> +		"  adcx %%rax, %%r9;"
> +		"  movq %%r9, 16(%1);"
> +		"  adcx %%rcx, %%r10;"
> +		"  movq %%r10, 24(%1);"
> +		"  movq 16(%0), %%rdx;"
> +		"  mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
> +		"  adcx %%rax, %%r11;"
> +		"  movq %%r11, 32(%1);"
> +		"  adcx %%rcx, %%rbx;"
> +		"  movq %%rbx, 40(%1);"
> +		"  movq 24(%0), %%rdx;"
> +		"  mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
> +		"  adcx %%rax, %%r13;"
> +		"  movq %%r13, 48(%1);"
> +		"  adcx %%rcx, %%r14;"
> +		"  movq %%r14, 56(%1);"
>  
>  		/* Line up pointers */
> -		"  mov %0, %1;"
> -		"  mov %2, %0;"
> +		"  mov %1, %0;"
> +		"  mov %2, %1;"
>  
>  		/* Wrap the result back into the field */
>  
>  		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>  		"  mov $38, %%rdx;"
> -		"  mulxq 32(%1), %%r8, %%r13;"
> +		"  mulxq 32(%0), %%r8, %%r13;"
>  		"  xor %%ecx, %%ecx;"
> -		"  adoxq 0(%1), %%r8;"
> -		"  mulxq 40(%1), %%r9, %%rbx;"
> +		"  adoxq 0(%0), %%r8;"
> +		"  mulxq 40(%0), %%r9, %%rbx;"
>  		"  adcx %%r13, %%r9;"
> -		"  adoxq 8(%1), %%r9;"
> -		"  mulxq 48(%1), %%r10, %%r13;"
> +		"  adoxq 8(%0), %%r9;"
> +		"  mulxq 48(%0), %%r10, %%r13;"
>  		"  adcx %%rbx, %%r10;"
> -		"  adoxq 16(%1), %%r10;"
> -		"  mulxq 56(%1), %%r11, %%rax;"
> +		"  adoxq 16(%0), %%r10;"
> +		"  mulxq 56(%0), %%r11, %%rax;"
>  		"  adcx %%r13, %%r11;"
> -		"  adoxq 24(%1), %%r11;"
> +		"  adoxq 24(%0), %%r11;"
>  		"  adcx %%rcx, %%rax;"
>  		"  adox %%rcx, %%rax;"
>  		"  imul %%rdx, %%rax;"
> @@ -582,40 +757,47 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
>  		/* Step 2: Fold the carry back into dst */
>  		"  add %%rax, %%r8;"
>  		"  adcx %%rcx, %%r9;"
> -		"  movq %%r9, 8(%0);"
> +		"  movq %%r9, 8(%1);"
>  		"  adcx %%rcx, %%r10;"
> -		"  movq %%r10, 16(%0);"
> +		"  movq %%r10, 16(%1);"
>  		"  adcx %%rcx, %%r11;"
> -		"  movq %%r11, 24(%0);"
> +		"  movq %%r11, 24(%1);"
>  
>  		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
>  		"  mov $0, %%rax;"
>  		"  cmovc %%rdx, %%rax;"
>  		"  add %%rax, %%r8;"
> -		"  movq %%r8, 0(%0);"
> -	: "+&r" (tmp), "+&r" (f), "+&r" (out)
> -	:
> -	: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
> -	);
> +		"  movq %%r8, 0(%1);"
> +		: "+&r"(f), "+&r"(tmp)
> +		: "r"(out)
> +		: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
> +		  "%r13", "%r14", "%r15", "memory", "cc");
>  }
>  
>  /* Computes two field squarings:
> - * out[0] <- f[0] * f[0]
> - * out[1] <- f[1] * f[1]
> + *   out[0] <- f[0] * f[0]
> + *   out[1] <- f[1] * f[1]
>   * Uses the 16-element buffer tmp for intermediate results */
>  static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>  {
>  	asm volatile(
>  		/* Step 1: Compute all partial products */
> -		"  movq 0(%1), %%rdx;"                                       /* f[0] */
> -		"  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
> -		"  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
> -		"  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
> -		"  movq 24(%1), %%rdx;"                                      /* f[3] */
> -		"  mulxq 8(%1), %%r11, %%rbx;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
> -		"  mulxq 16(%1), %%rax, %%r13;"    "  adcx %%rax, %%rbx;"    /* f[2]*f[3] */
> -		"  movq 8(%1), %%rdx;"             "  adcx %%r15, %%r13;"    /* f1 */
> -		"  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
> +		"  movq 0(%0), %%rdx;" /* f[0] */
> +		"  mulxq 8(%0), %%r8, %%r14;"
> +		"  xor %%r15d, %%r15d;" /* f[1]*f[0] */
> +		"  mulxq 16(%0), %%r9, %%r10;"
> +		"  adcx %%r14, %%r9;" /* f[2]*f[0] */
> +		"  mulxq 24(%0), %%rax, %%rcx;"
> +		"  adcx %%rax, %%r10;" /* f[3]*f[0] */
> +		"  movq 24(%0), %%rdx;" /* f[3] */
> +		"  mulxq 8(%0), %%r11, %%rbx;"
> +		"  adcx %%rcx, %%r11;" /* f[1]*f[3] */
> +		"  mulxq 16(%0), %%rax, %%r13;"
> +		"  adcx %%rax, %%rbx;" /* f[2]*f[3] */
> +		"  movq 8(%0), %%rdx;"
> +		"  adcx %%r15, %%r13;" /* f1 */
> +		"  mulxq 16(%0), %%rax, %%rcx;"
> +		"  mov $0, %%r14;" /* f[2]*f[1] */
>  
>  		/* Step 2: Compute two parallel carry chains */
>  		"  xor %%r15d, %%r15d;"
> @@ -633,29 +815,47 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>  		"  adcx %%r14, %%r14;"
>  
>  		/* Step 3: Compute intermediate squares */
> -		"  movq 0(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[0]^2 */
> -		                           "  movq %%rax, 0(%0);"
> -		"  add %%rcx, %%r8;"       "  movq %%r8, 8(%0);"
> -		"  movq 8(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[1]^2 */
> -		"  adcx %%rax, %%r9;"      "  movq %%r9, 16(%0);"
> -		"  adcx %%rcx, %%r10;"     "  movq %%r10, 24(%0);"
> -		"  movq 16(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[2]^2 */
> -		"  adcx %%rax, %%r11;"     "  movq %%r11, 32(%0);"
> -		"  adcx %%rcx, %%rbx;"     "  movq %%rbx, 40(%0);"
> -		"  movq 24(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[3]^2 */
> -		"  adcx %%rax, %%r13;"     "  movq %%r13, 48(%0);"
> -		"  adcx %%rcx, %%r14;"     "  movq %%r14, 56(%0);"
> +		"  movq 0(%0), %%rdx;"
> +		"  mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
> +		"  movq %%rax, 0(%1);"
> +		"  add %%rcx, %%r8;"
> +		"  movq %%r8, 8(%1);"
> +		"  movq 8(%0), %%rdx;"
> +		"  mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
> +		"  adcx %%rax, %%r9;"
> +		"  movq %%r9, 16(%1);"
> +		"  adcx %%rcx, %%r10;"
> +		"  movq %%r10, 24(%1);"
> +		"  movq 16(%0), %%rdx;"
> +		"  mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
> +		"  adcx %%rax, %%r11;"
> +		"  movq %%r11, 32(%1);"
> +		"  adcx %%rcx, %%rbx;"
> +		"  movq %%rbx, 40(%1);"
> +		"  movq 24(%0), %%rdx;"
> +		"  mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
> +		"  adcx %%rax, %%r13;"
> +		"  movq %%r13, 48(%1);"
> +		"  adcx %%rcx, %%r14;"
> +		"  movq %%r14, 56(%1);"
>  
>  		/* Step 1: Compute all partial products */
> -		"  movq 32(%1), %%rdx;"                                       /* f[0] */
> -		"  mulxq 40(%1), %%r8, %%r14;"     "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
> -		"  mulxq 48(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
> -		"  mulxq 56(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
> -		"  movq 56(%1), %%rdx;"                                      /* f[3] */
> -		"  mulxq 40(%1), %%r11, %%rbx;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
> -		"  mulxq 48(%1), %%rax, %%r13;"    "  adcx %%rax, %%rbx;"    /* f[2]*f[3] */
> -		"  movq 40(%1), %%rdx;"             "  adcx %%r15, %%r13;"    /* f1 */
> -		"  mulxq 48(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
> +		"  movq 32(%0), %%rdx;" /* f[0] */
> +		"  mulxq 40(%0), %%r8, %%r14;"
> +		"  xor %%r15d, %%r15d;" /* f[1]*f[0] */
> +		"  mulxq 48(%0), %%r9, %%r10;"
> +		"  adcx %%r14, %%r9;" /* f[2]*f[0] */
> +		"  mulxq 56(%0), %%rax, %%rcx;"
> +		"  adcx %%rax, %%r10;" /* f[3]*f[0] */
> +		"  movq 56(%0), %%rdx;" /* f[3] */
> +		"  mulxq 40(%0), %%r11, %%rbx;"
> +		"  adcx %%rcx, %%r11;" /* f[1]*f[3] */
> +		"  mulxq 48(%0), %%rax, %%r13;"
> +		"  adcx %%rax, %%rbx;" /* f[2]*f[3] */
> +		"  movq 40(%0), %%rdx;"
> +		"  adcx %%r15, %%r13;" /* f1 */
> +		"  mulxq 48(%0), %%rax, %%rcx;"
> +		"  mov $0, %%r14;" /* f[2]*f[1] */
>  
>  		/* Step 2: Compute two parallel carry chains */
>  		"  xor %%r15d, %%r15d;"
> @@ -673,37 +873,48 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>  		"  adcx %%r14, %%r14;"
>  
>  		/* Step 3: Compute intermediate squares */
> -		"  movq 32(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[0]^2 */
> -		                           "  movq %%rax, 64(%0);"
> -		"  add %%rcx, %%r8;"       "  movq %%r8, 72(%0);"
> -		"  movq 40(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[1]^2 */
> -		"  adcx %%rax, %%r9;"      "  movq %%r9, 80(%0);"
> -		"  adcx %%rcx, %%r10;"     "  movq %%r10, 88(%0);"
> -		"  movq 48(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[2]^2 */
> -		"  adcx %%rax, %%r11;"     "  movq %%r11, 96(%0);"
> -		"  adcx %%rcx, %%rbx;"     "  movq %%rbx, 104(%0);"
> -		"  movq 56(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[3]^2 */
> -		"  adcx %%rax, %%r13;"     "  movq %%r13, 112(%0);"
> -		"  adcx %%rcx, %%r14;"     "  movq %%r14, 120(%0);"
> +		"  movq 32(%0), %%rdx;"
> +		"  mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
> +		"  movq %%rax, 64(%1);"
> +		"  add %%rcx, %%r8;"
> +		"  movq %%r8, 72(%1);"
> +		"  movq 40(%0), %%rdx;"
> +		"  mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
> +		"  adcx %%rax, %%r9;"
> +		"  movq %%r9, 80(%1);"
> +		"  adcx %%rcx, %%r10;"
> +		"  movq %%r10, 88(%1);"
> +		"  movq 48(%0), %%rdx;"
> +		"  mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
> +		"  adcx %%rax, %%r11;"
> +		"  movq %%r11, 96(%1);"
> +		"  adcx %%rcx, %%rbx;"
> +		"  movq %%rbx, 104(%1);"
> +		"  movq 56(%0), %%rdx;"
> +		"  mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
> +		"  adcx %%rax, %%r13;"
> +		"  movq %%r13, 112(%1);"
> +		"  adcx %%rcx, %%r14;"
> +		"  movq %%r14, 120(%1);"
>  
>  		/* Line up pointers */
> -		"  mov %0, %1;"
> -		"  mov %2, %0;"
> +		"  mov %1, %0;"
> +		"  mov %2, %1;"
>  
>  		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>  		"  mov $38, %%rdx;"
> -		"  mulxq 32(%1), %%r8, %%r13;"
> +		"  mulxq 32(%0), %%r8, %%r13;"
>  		"  xor %%ecx, %%ecx;"
> -		"  adoxq 0(%1), %%r8;"
> -		"  mulxq 40(%1), %%r9, %%rbx;"
> +		"  adoxq 0(%0), %%r8;"
> +		"  mulxq 40(%0), %%r9, %%rbx;"
>  		"  adcx %%r13, %%r9;"
> -		"  adoxq 8(%1), %%r9;"
> -		"  mulxq 48(%1), %%r10, %%r13;"
> +		"  adoxq 8(%0), %%r9;"
> +		"  mulxq 48(%0), %%r10, %%r13;"
>  		"  adcx %%rbx, %%r10;"
> -		"  adoxq 16(%1), %%r10;"
> -		"  mulxq 56(%1), %%r11, %%rax;"
> +		"  adoxq 16(%0), %%r10;"
> +		"  mulxq 56(%0), %%r11, %%rax;"
>  		"  adcx %%r13, %%r11;"
> -		"  adoxq 24(%1), %%r11;"
> +		"  adoxq 24(%0), %%r11;"
>  		"  adcx %%rcx, %%rax;"
>  		"  adox %%rcx, %%rax;"
>  		"  imul %%rdx, %%rax;"
> @@ -711,32 +922,32 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>  		/* Step 2: Fold the carry back into dst */
>  		"  add %%rax, %%r8;"
>  		"  adcx %%rcx, %%r9;"
> -		"  movq %%r9, 8(%0);"
> +		"  movq %%r9, 8(%1);"
>  		"  adcx %%rcx, %%r10;"
> -		"  movq %%r10, 16(%0);"
> +		"  movq %%r10, 16(%1);"
>  		"  adcx %%rcx, %%r11;"
> -		"  movq %%r11, 24(%0);"
> +		"  movq %%r11, 24(%1);"
>  
>  		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
>  		"  mov $0, %%rax;"
>  		"  cmovc %%rdx, %%rax;"
>  		"  add %%rax, %%r8;"
> -		"  movq %%r8, 0(%0);"
> +		"  movq %%r8, 0(%1);"
>  
>  		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
>  		"  mov $38, %%rdx;"
> -		"  mulxq 96(%1), %%r8, %%r13;"
> +		"  mulxq 96(%0), %%r8, %%r13;"
>  		"  xor %%ecx, %%ecx;"
> -		"  adoxq 64(%1), %%r8;"
> -		"  mulxq 104(%1), %%r9, %%rbx;"
> +		"  adoxq 64(%0), %%r8;"
> +		"  mulxq 104(%0), %%r9, %%rbx;"
>  		"  adcx %%r13, %%r9;"
> -		"  adoxq 72(%1), %%r9;"
> -		"  mulxq 112(%1), %%r10, %%r13;"
> +		"  adoxq 72(%0), %%r9;"
> +		"  mulxq 112(%0), %%r10, %%r13;"
>  		"  adcx %%rbx, %%r10;"
> -		"  adoxq 80(%1), %%r10;"
> -		"  mulxq 120(%1), %%r11, %%rax;"
> +		"  adoxq 80(%0), %%r10;"
> +		"  mulxq 120(%0), %%r11, %%rax;"
>  		"  adcx %%r13, %%r11;"
> -		"  adoxq 88(%1), %%r11;"
> +		"  adoxq 88(%0), %%r11;"
>  		"  adcx %%rcx, %%rax;"
>  		"  adox %%rcx, %%rax;"
>  		"  imul %%rdx, %%rax;"
> @@ -744,21 +955,21 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>  		/* Step 2: Fold the carry back into dst */
>  		"  add %%rax, %%r8;"
>  		"  adcx %%rcx, %%r9;"
> -		"  movq %%r9, 40(%0);"
> +		"  movq %%r9, 40(%1);"
>  		"  adcx %%rcx, %%r10;"
> -		"  movq %%r10, 48(%0);"
> +		"  movq %%r10, 48(%1);"
>  		"  adcx %%rcx, %%r11;"
> -		"  movq %%r11, 56(%0);"
> +		"  movq %%r11, 56(%1);"
>  
>  		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
>  		"  mov $0, %%rax;"
>  		"  cmovc %%rdx, %%rax;"
>  		"  add %%rax, %%r8;"
> -		"  movq %%r8, 32(%0);"
> -	: "+&r" (tmp), "+&r" (f), "+&r" (out)
> -	:
> -	: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
> -	);
> +		"  movq %%r8, 32(%1);"
> +		: "+&r"(f), "+&r"(tmp)
> +		: "r"(out)
> +		: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
> +		  "%r13", "%r14", "%r15", "memory", "cc");
>  }
>  
>  static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2)
> 

This is basically the same as what you already have in
wireguard-linux-compat, which I double-checked before, hence:

Reviewed-by: Mathias Krause <minipli@grsecurity.net>

Though, the same comments apply here as well: using "rm"(out) instead
of "r"(out) would give the compiler more room to play with during
register allocation. But I see that it can lead to less efficient code
on the latest Intel hardware (while older hardware, apparently, gains a
speedup).
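
To illustrate what I mean -- this is just a toy sketch with a made-up
store_zero() helper, not code from the patch; the template loads the
pointer into a scratch register itself, so it works whether the
compiler picks a register or a memory operand for %0:

	typedef unsigned long long u64;

	static inline void store_zero(u64 *out)
	{
		asm volatile(
			/* works whether %0 ends up in a register or a stack slot */
			"  movq %0, %%rax;"
			"  movq $0, 0(%%rax);" /* *out = 0 */
			:
			: "rm"(out) /* "r"(out) would force a register operand */
			: "%rax", "memory", "cc");
	}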

Mathias

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] crypto: x86/curve25519 - use in/out register constraints more precisely
  2021-12-14 17:23 ` Ard Biesheuvel
@ 2021-12-14 18:07   ` Mathias Krause
  2021-12-20 15:25   ` Jason A. Donenfeld
  1 sibling, 0 replies; 6+ messages in thread
From: Mathias Krause @ 2021-12-14 18:07 UTC (permalink / raw)
  To: Ard Biesheuvel, Jason A. Donenfeld
  Cc: Linux Crypto Mailing List, Herbert Xu, Aymeric Fromherz

Am 14.12.21 um 18:23 schrieb Ard Biesheuvel:
> On Tue, 14 Dec 2021 at 17:02, Jason A. Donenfeld <Jason@zx2c4.com> wrote:
>>
>> Rather than passing all variables as modified, pass ones that are only
>> read into that parameter. This helps with old gcc versions when
>> alternatives are additionally used, and lets gcc's codegen be a little
>> bit more efficient. This also syncs up with the latest Vale/EverCrypt
>> output.
>>
>> Reported-by: Mathias Krause <minipli@grsecurity.net>
>> Cc: Aymeric Fromherz <aymeric.fromherz@inria.fr>
>> Link: https://lore.kernel.org/wireguard/1554725710.1290070.1639240504281.JavaMail.zimbra@inria.fr/
>> Link: https://github.com/project-everest/hacl-star/pull/501
>> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
> 
> Could we separate the whitespace-only changes from the code changes
> please? Otherwise, this is really hard to review.
> 

Applying the patch and using `git show --color-words -w` helped me
lower the noise to a sensible level.

Mathias

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] crypto: x86/curve25519 - use in/out register constraints more precisely
  2021-12-14 17:23 ` Ard Biesheuvel
  2021-12-14 18:07   ` Mathias Krause
@ 2021-12-20 15:25   ` Jason A. Donenfeld
  1 sibling, 0 replies; 6+ messages in thread
From: Jason A. Donenfeld @ 2021-12-20 15:25 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Linux Crypto Mailing List, Herbert Xu, Mathias Krause, Aymeric Fromherz

Hi Ard,

On Tue, Dec 14, 2021 at 6:23 PM Ard Biesheuvel <ardb@kernel.org> wrote:
> Could we separate the whitespace-only changes from the code changes
> please? Otherwise, this is really hard to review.

Grr, not so easily, unfortunately. It was really a mistake that the
original one wasn't clang-formatted for some level of determinism like
this one is. As Mathias mentioned, --color-words -w makes it a lot
clearer what's going on. But actually, even with whitespace unchanged,
there's a *lot* of change in this patch because the register
allocation changed, so lots of %1 became %0 and suchlike.
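
For a concrete instance, here are the before/after constraint lists of
fsqr() from the patch above (the operand-numbering comments are mine):

	/* before: */
	: "+&r" (tmp), "+&r" (f), "+&r" (out)  /* %0 = tmp, %1 = f, %2 = out */
	:

	/* after: */
	: "+&r"(f), "+&r"(tmp)                 /* %0 = f, %1 = tmp */
	: "r"(out)                             /* %2 = out, input-only */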

It's probably easier to just look at
https://github.com/project-everest/hacl-star/commit/a9f1c5fa440b7e95e3d853671ee65b31ee5c473b
to see what's changed.

Alternatively, if you don't want to review this, that's fine -- just
say so -- and Herbert can go with Mathias' Reviewed-by, which might
make sense anyway because he spent a lot of time with this code.

Jason

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] crypto: x86/curve25519 - use in/out register constraints more precisely
  2021-12-14 16:01 [PATCH] crypto: x86/curve25519 - use in/out register constraints more precisely Jason A. Donenfeld
  2021-12-14 17:23 ` Ard Biesheuvel
  2021-12-14 18:05 ` Mathias Krause
@ 2021-12-24  3:25 ` Herbert Xu
  2 siblings, 0 replies; 6+ messages in thread
From: Herbert Xu @ 2021-12-24  3:25 UTC (permalink / raw)
  To: Jason A. Donenfeld; +Cc: linux-crypto, Mathias Krause, Aymeric Fromherz

On Tue, Dec 14, 2021 at 05:01:46PM +0100, Jason A. Donenfeld wrote:
> Rather than passing all variables as modified, pass ones that are only
> read into that parameter. This helps with old gcc versions when
> alternatives are additionally used, and lets gcc's codegen be a little
> bit more efficient. This also syncs up with the latest Vale/EverCrypt
> output.
> 
> Reported-by: Mathias Krause <minipli@grsecurity.net>
> Cc: Aymeric Fromherz <aymeric.fromherz@inria.fr>
> Link: https://lore.kernel.org/wireguard/1554725710.1290070.1639240504281.JavaMail.zimbra@inria.fr/
> Link: https://github.com/project-everest/hacl-star/pull/501
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
> ---
>  arch/x86/crypto/curve25519-x86_64.c | 767 ++++++++++++++++++----------
>  1 file changed, 489 insertions(+), 278 deletions(-)

Patch applied.  Thanks.
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 6+ messages in thread

