Openwrt/target/linux/generic/backport-5.4/080-wireguard-0055-crypto-x86-curve25519-leave-r12-as-spare-register.patch
Jason A. Donenfeld 3888fa7880 kernel: 5.4: import wireguard backport
Rather than using the clunky, old, slower wireguard-linux-compat out of
tree module, this commit does a patch-by-patch backport of upstream's
wireguard to 5.4. This specific backport is in widespread use, being
part of SUSE's enterprise kernel, Oracle's enterprise kernel, Google's
Android kernel, Gentoo's distro kernel, and probably more I've forgotten
about. It's definately the "more proper" way of adding wireguard to a
kernel than the ugly compat.h hell of the wireguard-linux-compat repo.
And most importantly for OpenWRT, it allows using the same module
configuration code for 5.10 as for 5.4, with no need for bifurcation.

These patches are from the backport tree which is maintained in the
open here: https://git.zx2c4.com/wireguard-linux/log/?h=backport-5.4.y
I'll be sending PRs to update this as needed.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
2021-02-26 20:41:01 +01:00

377 lines
20 KiB
Diff

From 481c5ed9ac2acec32d93847636707bda02208ec8 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Sun, 1 Mar 2020 16:06:56 +0800
Subject: [PATCH 055/124] crypto: x86/curve25519 - leave r12 as spare register
commit dc7fc3a53ae158263196b1892b672aedf67796c5 upstream.
This updates to the newer register selection proved by HACL*, which
leads to a more compact instruction encoding, and saves around 100
cycles.
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/x86/crypto/curve25519-x86_64.c | 110 ++++++++++++++--------------
1 file changed, 55 insertions(+), 55 deletions(-)
--- a/arch/x86/crypto/curve25519-x86_64.c
+++ b/arch/x86/crypto/curve25519-x86_64.c
@@ -167,28 +167,28 @@ static inline void fmul(u64 *out, const
" movq 0(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
- " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;"
+ " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;"
/* Compute src1[1] * src2 */
" movq 8(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 16(%0);"
- " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
+ " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
+ " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[2] * src2 */
" movq 16(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 24(%0);"
- " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
+ " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
+ " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[3] * src2 */
" movq 24(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 32(%0);"
- " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " movq %%r12, 40(%0);" " mov $0, %%r8;"
+ " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
+ " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%0);"
/* Line up pointers */
@@ -202,11 +202,11 @@ static inline void fmul(u64 *out, const
" mulxq 32(%1), %%r8, %%r13;"
" xor %3, %3;"
" adoxq 0(%1), %%r8;"
- " mulxq 40(%1), %%r9, %%r12;"
+ " mulxq 40(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
" adoxq 8(%1), %%r9;"
" mulxq 48(%1), %%r10, %%r13;"
- " adcx %%r12, %%r10;"
+ " adcx %%rbx, %%r10;"
" adoxq 16(%1), %%r10;"
" mulxq 56(%1), %%r11, %%rax;"
" adcx %%r13, %%r11;"
@@ -231,7 +231,7 @@ static inline void fmul(u64 *out, const
" movq %%r8, 0(%0);"
: "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
:
- : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "memory", "cc"
+ : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc"
);
}
@@ -248,28 +248,28 @@ static inline void fmul2(u64 *out, const
" movq 0(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
- " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;"
+ " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;"
/* Compute src1[1] * src2 */
" movq 8(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 16(%0);"
- " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
+ " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
+ " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[2] * src2 */
" movq 16(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 24(%0);"
- " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
+ " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
+ " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[3] * src2 */
" movq 24(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 32(%0);"
- " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " movq %%r12, 40(%0);" " mov $0, %%r8;"
+ " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
+ " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%0);"
@@ -279,28 +279,28 @@ static inline void fmul2(u64 *out, const
" movq 32(%1), %%rdx;"
" mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 64(%0);"
" mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);"
- " mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;"
+ " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;"
/* Compute src1[1] * src2 */
" movq 40(%1), %%rdx;"
" mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 80(%0);"
- " mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
+ " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%0);"
+ " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[2] * src2 */
" movq 48(%1), %%rdx;"
" mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 88(%0);"
- " mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
+ " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%0);"
+ " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[3] * src2 */
" movq 56(%1), %%rdx;"
" mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 96(%0);"
- " mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " movq %%r12, 104(%0);" " mov $0, %%r8;"
+ " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%0);"
+ " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 104(%0);" " mov $0, %%r8;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 112(%0);" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 120(%0);"
/* Line up pointers */
@@ -314,11 +314,11 @@ static inline void fmul2(u64 *out, const
" mulxq 32(%1), %%r8, %%r13;"
" xor %3, %3;"
" adoxq 0(%1), %%r8;"
- " mulxq 40(%1), %%r9, %%r12;"
+ " mulxq 40(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
" adoxq 8(%1), %%r9;"
" mulxq 48(%1), %%r10, %%r13;"
- " adcx %%r12, %%r10;"
+ " adcx %%rbx, %%r10;"
" adoxq 16(%1), %%r10;"
" mulxq 56(%1), %%r11, %%rax;"
" adcx %%r13, %%r11;"
@@ -347,11 +347,11 @@ static inline void fmul2(u64 *out, const
" mulxq 96(%1), %%r8, %%r13;"
" xor %3, %3;"
" adoxq 64(%1), %%r8;"
- " mulxq 104(%1), %%r9, %%r12;"
+ " mulxq 104(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
" adoxq 72(%1), %%r9;"
" mulxq 112(%1), %%r10, %%r13;"
- " adcx %%r12, %%r10;"
+ " adcx %%rbx, %%r10;"
" adoxq 80(%1), %%r10;"
" mulxq 120(%1), %%r11, %%rax;"
" adcx %%r13, %%r11;"
@@ -376,7 +376,7 @@ static inline void fmul2(u64 *out, const
" movq %%r8, 32(%0);"
: "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
:
- : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "memory", "cc"
+ : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc"
);
}
@@ -388,11 +388,11 @@ static inline void fmul_scalar(u64 *out,
asm volatile(
/* Compute the raw multiplication of f1*f2 */
" mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */
- " mulxq 8(%2), %%r9, %%r12;" /* f1[1]*f2 */
+ " mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */
" add %%rcx, %%r9;"
" mov $0, %%rcx;"
" mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */
- " adcx %%r12, %%r10;"
+ " adcx %%rbx, %%r10;"
" mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */
" adcx %%r13, %%r11;"
" adcx %%rcx, %%rax;"
@@ -419,7 +419,7 @@ static inline void fmul_scalar(u64 *out,
" movq %%r8, 0(%1);"
: "+&r" (f2_r)
: "r" (out), "r" (f1)
- : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "memory", "cc"
+ : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "memory", "cc"
);
}
@@ -520,8 +520,8 @@ static inline void fsqr(u64 *out, const
" mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
" mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
" movq 24(%1), %%rdx;" /* f[3] */
- " mulxq 8(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
- " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" /* f[2]*f[3] */
+ " mulxq 8(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */
" movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */
" mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
@@ -531,12 +531,12 @@ static inline void fsqr(u64 *out, const
" adcx %%r8, %%r8;"
" adox %%rcx, %%r11;"
" adcx %%r9, %%r9;"
- " adox %%r15, %%r12;"
+ " adox %%r15, %%rbx;"
" adcx %%r10, %%r10;"
" adox %%r15, %%r13;"
" adcx %%r11, %%r11;"
" adox %%r15, %%r14;"
- " adcx %%r12, %%r12;"
+ " adcx %%rbx, %%rbx;"
" adcx %%r13, %%r13;"
" adcx %%r14, %%r14;"
@@ -549,7 +549,7 @@ static inline void fsqr(u64 *out, const
" adcx %%rcx, %%r10;" " movq %%r10, 24(%0);"
" movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
" adcx %%rax, %%r11;" " movq %%r11, 32(%0);"
- " adcx %%rcx, %%r12;" " movq %%r12, 40(%0);"
+ " adcx %%rcx, %%rbx;" " movq %%rbx, 40(%0);"
" movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
" adcx %%rax, %%r13;" " movq %%r13, 48(%0);"
" adcx %%rcx, %%r14;" " movq %%r14, 56(%0);"
@@ -565,11 +565,11 @@ static inline void fsqr(u64 *out, const
" mulxq 32(%1), %%r8, %%r13;"
" xor %%rcx, %%rcx;"
" adoxq 0(%1), %%r8;"
- " mulxq 40(%1), %%r9, %%r12;"
+ " mulxq 40(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
" adoxq 8(%1), %%r9;"
" mulxq 48(%1), %%r10, %%r13;"
- " adcx %%r12, %%r10;"
+ " adcx %%rbx, %%r10;"
" adoxq 16(%1), %%r10;"
" mulxq 56(%1), %%r11, %%rax;"
" adcx %%r13, %%r11;"
@@ -594,7 +594,7 @@ static inline void fsqr(u64 *out, const
" movq %%r8, 0(%0);"
: "+&r" (tmp), "+&r" (f), "+&r" (out)
:
- : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc"
+ : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
);
}
@@ -611,8 +611,8 @@ static inline void fsqr2(u64 *out, const
" mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
" mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
" movq 24(%1), %%rdx;" /* f[3] */
- " mulxq 8(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
- " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" /* f[2]*f[3] */
+ " mulxq 8(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */
" movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */
" mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
@@ -622,12 +622,12 @@ static inline void fsqr2(u64 *out, const
" adcx %%r8, %%r8;"
" adox %%rcx, %%r11;"
" adcx %%r9, %%r9;"
- " adox %%r15, %%r12;"
+ " adox %%r15, %%rbx;"
" adcx %%r10, %%r10;"
" adox %%r15, %%r13;"
" adcx %%r11, %%r11;"
" adox %%r15, %%r14;"
- " adcx %%r12, %%r12;"
+ " adcx %%rbx, %%rbx;"
" adcx %%r13, %%r13;"
" adcx %%r14, %%r14;"
@@ -640,7 +640,7 @@ static inline void fsqr2(u64 *out, const
" adcx %%rcx, %%r10;" " movq %%r10, 24(%0);"
" movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
" adcx %%rax, %%r11;" " movq %%r11, 32(%0);"
- " adcx %%rcx, %%r12;" " movq %%r12, 40(%0);"
+ " adcx %%rcx, %%rbx;" " movq %%rbx, 40(%0);"
" movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
" adcx %%rax, %%r13;" " movq %%r13, 48(%0);"
" adcx %%rcx, %%r14;" " movq %%r14, 56(%0);"
@@ -651,8 +651,8 @@ static inline void fsqr2(u64 *out, const
" mulxq 48(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
" mulxq 56(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
" movq 56(%1), %%rdx;" /* f[3] */
- " mulxq 40(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
- " mulxq 48(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" /* f[2]*f[3] */
+ " mulxq 40(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 48(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */
" movq 40(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */
" mulxq 48(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
@@ -662,12 +662,12 @@ static inline void fsqr2(u64 *out, const
" adcx %%r8, %%r8;"
" adox %%rcx, %%r11;"
" adcx %%r9, %%r9;"
- " adox %%r15, %%r12;"
+ " adox %%r15, %%rbx;"
" adcx %%r10, %%r10;"
" adox %%r15, %%r13;"
" adcx %%r11, %%r11;"
" adox %%r15, %%r14;"
- " adcx %%r12, %%r12;"
+ " adcx %%rbx, %%rbx;"
" adcx %%r13, %%r13;"
" adcx %%r14, %%r14;"
@@ -680,7 +680,7 @@ static inline void fsqr2(u64 *out, const
" adcx %%rcx, %%r10;" " movq %%r10, 88(%0);"
" movq 48(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
" adcx %%rax, %%r11;" " movq %%r11, 96(%0);"
- " adcx %%rcx, %%r12;" " movq %%r12, 104(%0);"
+ " adcx %%rcx, %%rbx;" " movq %%rbx, 104(%0);"
" movq 56(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
" adcx %%rax, %%r13;" " movq %%r13, 112(%0);"
" adcx %%rcx, %%r14;" " movq %%r14, 120(%0);"
@@ -694,11 +694,11 @@ static inline void fsqr2(u64 *out, const
" mulxq 32(%1), %%r8, %%r13;"
" xor %%rcx, %%rcx;"
" adoxq 0(%1), %%r8;"
- " mulxq 40(%1), %%r9, %%r12;"
+ " mulxq 40(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
" adoxq 8(%1), %%r9;"
" mulxq 48(%1), %%r10, %%r13;"
- " adcx %%r12, %%r10;"
+ " adcx %%rbx, %%r10;"
" adoxq 16(%1), %%r10;"
" mulxq 56(%1), %%r11, %%rax;"
" adcx %%r13, %%r11;"
@@ -727,11 +727,11 @@ static inline void fsqr2(u64 *out, const
" mulxq 96(%1), %%r8, %%r13;"
" xor %%rcx, %%rcx;"
" adoxq 64(%1), %%r8;"
- " mulxq 104(%1), %%r9, %%r12;"
+ " mulxq 104(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
" adoxq 72(%1), %%r9;"
" mulxq 112(%1), %%r10, %%r13;"
- " adcx %%r12, %%r10;"
+ " adcx %%rbx, %%r10;"
" adoxq 80(%1), %%r10;"
" mulxq 120(%1), %%r11, %%rax;"
" adcx %%r13, %%r11;"
@@ -756,7 +756,7 @@ static inline void fsqr2(u64 *out, const
" movq %%r8, 32(%0);"
: "+&r" (tmp), "+&r" (f), "+&r" (out)
:
- : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc"
+ : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
);
}