From: jsing Date: Mon, 23 Jan 2023 18:22:15 +0000 (+0000) Subject: Bring in various s2n-bignum functions for amd64. X-Git-Url: http://artulab.com/gitweb/?a=commitdiff_plain;h=e0bd33c5a6f2c53574cbb600361bc7fd887da0a9;p=openbsd Bring in various s2n-bignum functions for amd64. This brings in bignum_add(), bignum_cmadd(), bignum_cmul(), bignum_mul() and bignum_sub(), along with bignum_{mul,sqr}_4_8_alt() and bignum_{mul,sqr}_8_16_alt(). Discussed with tb@ --- diff --git a/lib/libcrypto/bn/arch/amd64/bignum_add.S b/lib/libcrypto/bn/arch/amd64/bignum_add.S new file mode 100644 index 00000000000..33663916bbb --- /dev/null +++ b/lib/libcrypto/bn/arch/amd64/bignum_add.S @@ -0,0 +1,153 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Add, z := x + y +// Inputs x[m], y[n]; outputs function return (carry-out) and z[p] +// +// extern uint64_t bignum_add +// (uint64_t p, uint64_t *z, +// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Does the z := x + y operation, truncating modulo p words in general and +// returning a top carry (0 or 1) in the p'th place, only adding the input +// words below p (as well as m and n respectively) to get the sum and carry. +// +// Standard x86-64 ABI: RDI = p, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y, returns RAX +// Microsoft x64 ABI: RCX = p, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + .intel_syntax noprefix + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add) + .text + +#define p rdi +#define z rsi +#define m rdx +#define x rcx +#define n r8 +#define y r9 +#define i r10 +#define a rax + +#define ashort eax + + + +S2N_BN_SYMBOL(bignum_add): + +#if WINDOWS_ABI + push rdi + push rsi + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, [rsp+56] + mov r9, [rsp+64] +#endif + +// Zero the main index counter for both branches + + xor i, i + +// First clamp the two input sizes m := min(p,m) and n := min(p,n) since +// we'll never need words past the p'th. Can now assume m <= p and n <= p. 
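+// (For example, with p = 4, m = 8 and n = 2 the low four words of x + y are
+// written to z, the carry out of that 4-word addition is returned, and the
+// words of x above the fourth are never read.)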
+// Then compare the modified m and n and branch accordingly + + cmp p, m + cmovc m, p + cmp p, n + cmovc n, p + cmp m, n + jc ylonger + +// The case where x is longer or of the same size (p >= m >= n) + + sub p, m + sub m, n + inc m + test n, n + jz xtest +xmainloop: + mov a, [x+8*i] + adc a, [y+8*i] + mov [z+8*i],a + inc i + dec n + jnz xmainloop + jmp xtest +xtoploop: + mov a, [x+8*i] + adc a, 0 + mov [z+8*i],a + inc i +xtest: + dec m + jnz xtoploop + mov ashort, 0 + adc a, 0 + test p, p + jnz tails +#if WINDOWS_ABI + pop rsi + pop rdi +#endif + ret + +// The case where y is longer (p >= n > m) + +ylonger: + + sub p, n + sub n, m + test m, m + jz ytoploop +ymainloop: + mov a, [x+8*i] + adc a, [y+8*i] + mov [z+8*i],a + inc i + dec m + jnz ymainloop +ytoploop: + mov a, [y+8*i] + adc a, 0 + mov [z+8*i],a + inc i + dec n + jnz ytoploop + mov ashort, 0 + adc a, 0 + test p, p + jnz tails +#if WINDOWS_ABI + pop rsi + pop rdi +#endif + ret + +// Adding a non-trivial tail, when p > max(m,n) + +tails: + mov [z+8*i],a + xor a, a + jmp tail +tailloop: + mov [z+8*i],a +tail: + inc i + dec p + jnz tailloop +#if WINDOWS_ABI + pop rsi + pop rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S b/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S new file mode 100644 index 00000000000..33f9be2fa04 --- /dev/null +++ b/lib/libcrypto/bn/arch/amd64/bignum_cmadd.S @@ -0,0 +1,143 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Multiply-add with single-word multiplier, z := z + c * y +// Inputs c, y[n]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_cmadd +// (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); +// +// Does the "z := z + c * y" operation where y is n digits, result z is p. +// Truncates the result in general. +// +// The return value is a high/carry word that is meaningful when p = n + 1, or +// more generally when n <= p and the result fits in p + 1 digits. In these +// cases it gives the top digit of the (p + 1)-digit result. +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = c, RCX = n, R8 = y, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = c, R9 = n, [RSP+40] = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + .intel_syntax noprefix + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmadd) + .text + +#define p rdi +#define z rsi +#define c r9 +#define n rcx +#define x r8 + +#define i r10 +#define h r11 + +#define r rbx + +#define hshort r11d +#define ishort r10d + + + +S2N_BN_SYMBOL(bignum_cmadd): + +#if WINDOWS_ABI + push rdi + push rsi + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, [rsp+56] +#endif + +// Seems hard to avoid one more register + + push rbx + +// First clamp the input size n := min(p,n) since we can never need to read +// past the p'th term of the input to generate p-digit output. 
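+// (Words of z from the clamped n upward are only adjusted by the final high
+// word and by carry propagation in the tail below.)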
+// Subtract p := p - min(n,p) so it holds the size of the extra tail needed + + cmp p, n + cmovc n, p + sub p, n + +// Initialize high part h = 0; if n = 0 do nothing but return that zero + + xor h, h + test n, n + jz end + +// Move c into a safer register as multiplies overwrite rdx + + mov c, rdx + +// Initialization of the loop: 2^64 * CF + [h,z_0'] = z_0 + c * x_0 + + mov rax, [x] + mul c + add [z], rax + mov h, rdx + mov ishort, 1 + dec n + jz hightail + +// Main loop, where we always have CF + previous high part h to add in + +loop: + adc h, [z+8*i] + sbb r, r + mov rax, [x+8*i] + mul c + sub rdx, r + add rax, h + mov [z+8*i], rax + mov h, rdx + inc i + dec n + jnz loop + +hightail: + adc h, 0 + +// Propagate the carry all the way to the end with h as extra carry word + +tail: + test p, p + jz end + + add [z+8*i], h + mov hshort, 0 + inc i + dec p + jz highend + +tloop: + adc [z+8*i], h + inc i + dec p + jnz tloop + +highend: + + adc h, 0 + +// Return the high/carry word + +end: + mov rax, h + + pop rbx +#if WINDOWS_ABI + pop rsi + pop rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/lib/libcrypto/bn/arch/amd64/bignum_cmul.S b/lib/libcrypto/bn/arch/amd64/bignum_cmul.S new file mode 100644 index 00000000000..6d184e3f393 --- /dev/null +++ b/lib/libcrypto/bn/arch/amd64/bignum_cmul.S @@ -0,0 +1,126 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Multiply by a single word, z := c * y +// Inputs c, y[n]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_cmul +// (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); +// +// Does the "z := c * y" operation where y is n digits, result z is p. +// Truncates the result in general unless p >= n + 1. +// +// The return value is a high/carry word that is meaningful when p >= n as +// giving the high part of the result. Since this is always zero if p > n, +// it is mainly of interest in the special case p = n, i.e. where the source +// and destination have the same nominal size, when it gives the extra word +// of the full result. +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = c, RCX = n, R8 = y, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = c, R9 = n, [RSP+40] = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + .intel_syntax noprefix + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul) + .text + +#define p rdi +#define z rsi +#define c r9 +#define n rcx +#define x r8 + +#define i r10 +#define h r11 + + + +S2N_BN_SYMBOL(bignum_cmul): + +#if WINDOWS_ABI + push rdi + push rsi + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, [rsp+56] +#endif + +// First clamp the input size n := min(p,n) since we can never need to read +// past the p'th term of the input to generate p-digit output. Now we can +// assume that n <= p + + cmp p, n + cmovc n, p + +// Initialize current input/output pointer offset i and high part h. 
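+// (A single offset i walks x and z in lockstep through the multiply loop.)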
+// But then if n = 0 skip the multiplication and go to the tail part + + xor h, h + xor i, i + test n, n + jz tail + +// Move c into a safer register as multiplies overwrite rdx + + mov c, rdx + +// Initialization of the loop: [h,l] = c * x_0 + + mov rax, [x] + mul c + mov [z], rax + mov h, rdx + inc i + cmp i, n + jz tail + +// Main loop doing the multiplications + +loop: + mov rax, [x+8*i] + mul c + add rax, h + adc rdx, 0 + mov [z+8*i], rax + mov h, rdx + inc i + cmp i, n + jc loop + +// Add a tail when the destination is longer + +tail: + cmp i, p + jnc end + mov [z+8*i], h + xor h, h + inc i + cmp i, p + jnc end + +tloop: + mov [z+8*i], h + inc i + cmp i, p + jc tloop + +// Return the high/carry word + +end: + mov rax, h + +#if WINDOWS_ABI + pop rsi + pop rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/lib/libcrypto/bn/arch/amd64/bignum_mul.S b/lib/libcrypto/bn/arch/amd64/bignum_mul.S new file mode 100644 index 00000000000..20c3f702022 --- /dev/null +++ b/lib/libcrypto/bn/arch/amd64/bignum_mul.S @@ -0,0 +1,155 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[m], y[n]; output z[k] +// +// extern void bignum_mul +// (uint64_t k, uint64_t *z, +// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Does the "z := x * y" operation where x is m digits, y is n, result z is k. +// Truncates the result in general unless k >= m + n +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + .intel_syntax noprefix + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul) + .text + +// These are actually right + +#define p rdi +#define z rsi +#define n r8 + +// These are not + +#define c r15 +#define h r14 +#define l r13 +#define x r12 +#define y r11 +#define i rbx +#define k r10 +#define m rbp + +// These are always local scratch since multiplier result is in these + +#define a rax +#define d rdx + + + +S2N_BN_SYMBOL(bignum_mul): + +#if WINDOWS_ABI + push rdi + push rsi + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, [rsp+56] + mov r9, [rsp+64] +#endif + +// We use too many registers, and also we need rax:rdx for multiplications + + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + mov m, rdx + +// If the result size is zero, do nothing +// Note that even if either or both inputs has size zero, we can't +// just give up because we at least need to zero the output array +// If we did a multiply-add variant, however, then we could + + test p, p + jz end + +// Set initial 2-part sum to zero (we zero c inside the body) + + xor h,h + xor l,l + +// Otherwise do outer loop k = 0 ... 
k = p - 1 + + xor k, k + +outerloop: + +// Zero our carry term first; we eventually want it and a zero is useful now +// Set a = max 0 (k + 1 - n), i = min (k + 1) m +// This defines the range a <= j < i for the inner summation +// Note that since k < p < 2^64 we can assume k + 1 doesn't overflow +// And since we want to increment it anyway, we might as well do it now + + xor c, c // c = 0 + inc k // k = k + 1 + + mov a, k // a = k + 1 + sub a, n // a = k + 1 - n + cmovc a, c // a = max 0 (k + 1 - n) + + mov i, m // i = m + cmp k, m // CF <=> k + 1 < m + cmovc i, k // i = min (k + 1) m + +// Turn i into a loop count, and skip things if it's <= 0 +// Otherwise set up initial pointers x -> x0[a] and y -> y0[k - a] +// and then launch into the main inner loop, postdecrementing i + + mov d, k + sub d, i + sub i, a + jbe innerend + lea x,[rcx+8*a] + lea y,[r9+8*d-8] + +innerloop: + mov rax, [y+8*i] + mul QWORD PTR [x] + add x, 8 + add l, rax + adc h, rdx + adc c, 0 + dec i + jnz innerloop + +innerend: + + mov [z], l + mov l, h + mov h, c + add z, 8 + + cmp k, p + jc outerloop + +end: + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx +#if WINDOWS_ABI + pop rsi + pop rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S b/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S new file mode 100644 index 00000000000..e70ed116da6 --- /dev/null +++ b/lib/libcrypto/bn/arch/amd64/bignum_mul_4_8_alt.S @@ -0,0 +1,145 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[4], y[4]; output z[8] +// +// extern void bignum_mul_4_8_alt +// (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + .intel_syntax noprefix + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_4_8_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_4_8_alt) + .text + +// These are actually right + +#define z rdi +#define x rsi + +// This is moved from rdx to free it for muls + +#define y rcx + +// Other variables used as a rotating 3-word window to add terms to + +#define t0 r8 +#define t1 r9 +#define t2 r10 + +// Macro for the key "multiply and add to (c,h,l)" step + +#define combadd(c,h,l,numa,numb) \ + mov rax, numa; \ + mul QWORD PTR numb; \ + add l, rax; \ + adc h, rdx; \ + adc c, 0 + +// A minutely shorter form for when c = 0 initially + +#define combadz(c,h,l,numa,numb) \ + mov rax, numa; \ + mul QWORD PTR numb; \ + add l, rax; \ + adc h, rdx; \ + adc c, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa,numb) \ + mov rax, numa; \ + mul QWORD PTR numb; \ + add l, rax; \ + adc h, rdx + +S2N_BN_SYMBOL(bignum_mul_4_8_alt): + +#if WINDOWS_ABI + push rdi + push rsi + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 +#endif + +// Copy y into a safe register to start with + + mov y, rdx + +// Result term 0 + + mov rax, [x] + mul QWORD PTR [y] + + mov [z], rax + mov t0, rdx + xor t1, t1 + +// Result term 1 + + xor t2, t2 + combads(t1,t0,[x],[y+8]) + combadz(t2,t1,t0,[x+8],[y]) + mov [z+8], t0 + +// Result term 2 + + xor t0, t0 + combadz(t0,t2,t1,[x],[y+16]) + 
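+// (the remaining two products with index sum 2 are accumulated into the
+// same rotating window)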
combadd(t0,t2,t1,[x+8],[y+8]) + combadd(t0,t2,t1,[x+16],[y]) + mov [z+16], t1 + +// Result term 3 + + xor t1, t1 + combadz(t1,t0,t2,[x],[y+24]) + combadd(t1,t0,t2,[x+8],[y+16]) + combadd(t1,t0,t2,[x+16],[y+8]) + combadd(t1,t0,t2,[x+24],[y]) + mov [z+24], t2 + +// Result term 4 + + xor t2, t2 + combadz(t2,t1,t0,[x+8],[y+24]) + combadd(t2,t1,t0,[x+16],[y+16]) + combadd(t2,t1,t0,[x+24],[y+8]) + mov [z+32], t0 + +// Result term 5 + + xor t0, t0 + combadz(t0,t2,t1,[x+16],[y+24]) + combadd(t0,t2,t1,[x+24],[y+16]) + mov [z+40], t1 + +// Result term 6 + + xor t1, t1 + combads(t0,t2,[x+24],[y+24]) + mov [z+48], t2 + +// Result term 7 + + mov [z+56], t0 + +// Return + +#if WINDOWS_ABI + pop rsi + pop rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S b/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S new file mode 100644 index 00000000000..43c6c486672 --- /dev/null +++ b/lib/libcrypto/bn/arch/amd64/bignum_mul_8_16_alt.S @@ -0,0 +1,232 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[8], y[8]; output z[16] +// +// extern void bignum_mul_8_16_alt +// (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + .intel_syntax noprefix + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16_alt) + .text + +// These are actually right + +#define z rdi +#define x rsi + +// This is moved from rdx to free it for muls + +#define y rcx + +// Other variables used as a rotating 3-word window to add terms to + +#define t0 r8 +#define t1 r9 +#define t2 r10 + +// Macro for the key "multiply and add to (c,h,l)" step + +#define combadd(c,h,l,numa,numb) \ + mov rax, numa; \ + mul QWORD PTR numb; \ + add l, rax; \ + adc h, rdx; \ + adc c, 0 + +// A minutely shorter form for when c = 0 initially + +#define combadz(c,h,l,numa,numb) \ + mov rax, numa; \ + mul QWORD PTR numb; \ + add l, rax; \ + adc h, rdx; \ + adc c, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa,numb) \ + mov rax, numa; \ + mul QWORD PTR numb; \ + add l, rax; \ + adc h, rdx + +S2N_BN_SYMBOL(bignum_mul_8_16_alt): + +#if WINDOWS_ABI + push rdi + push rsi + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 +#endif + +// Copy y into a safe register to start with + + mov y, rdx + +// Result term 0 + + mov rax, [x] + mul QWORD PTR [y] + + mov [z], rax + mov t0, rdx + xor t1, t1 + +// Result term 1 + + xor t2, t2 + combads(t1,t0,[x],[y+8]) + combadz(t2,t1,t0,[x+8],[y]) + mov [z+8], t0 + +// Result term 2 + + xor t0, t0 + combadz(t0,t2,t1,[x],[y+16]) + combadd(t0,t2,t1,[x+8],[y+8]) + combadd(t0,t2,t1,[x+16],[y]) + mov [z+16], t1 + +// Result term 3 + + xor t1, t1 + combadz(t1,t0,t2,[x],[y+24]) + combadd(t1,t0,t2,[x+8],[y+16]) + combadd(t1,t0,t2,[x+16],[y+8]) + combadd(t1,t0,t2,[x+24],[y]) + mov [z+24], t2 + +// Result term 4 + + xor t2, t2 + combadz(t2,t1,t0,[x],[y+32]) + combadd(t2,t1,t0,[x+8],[y+24]) + combadd(t2,t1,t0,[x+16],[y+16]) + combadd(t2,t1,t0,[x+24],[y+8]) + combadd(t2,t1,t0,[x+32],[y]) + mov [z+32], t0 + +// Result term 5 + + xor t0, t0 + 
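+// (t0 is the fresh top word of the rotating window; the six products with
+// index sum 5 follow)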
combadz(t0,t2,t1,[x],[y+40]) + combadd(t0,t2,t1,[x+8],[y+32]) + combadd(t0,t2,t1,[x+16],[y+24]) + combadd(t0,t2,t1,[x+24],[y+16]) + combadd(t0,t2,t1,[x+32],[y+8]) + combadd(t0,t2,t1,[x+40],[y]) + mov [z+40], t1 + +// Result term 6 + + xor t1, t1 + combadz(t1,t0,t2,[x],[y+48]) + combadd(t1,t0,t2,[x+8],[y+40]) + combadd(t1,t0,t2,[x+16],[y+32]) + combadd(t1,t0,t2,[x+24],[y+24]) + combadd(t1,t0,t2,[x+32],[y+16]) + combadd(t1,t0,t2,[x+40],[y+8]) + combadd(t1,t0,t2,[x+48],[y]) + mov [z+48], t2 + +// Result term 7 + + xor t2, t2 + combadz(t2,t1,t0,[x],[y+56]) + combadd(t2,t1,t0,[x+8],[y+48]) + combadd(t2,t1,t0,[x+16],[y+40]) + combadd(t2,t1,t0,[x+24],[y+32]) + combadd(t2,t1,t0,[x+32],[y+24]) + combadd(t2,t1,t0,[x+40],[y+16]) + combadd(t2,t1,t0,[x+48],[y+8]) + combadd(t2,t1,t0,[x+56],[y]) + mov [z+56], t0 + +// Result term 8 + + xor t0, t0 + combadz(t0,t2,t1,[x+8],[y+56]) + combadd(t0,t2,t1,[x+16],[y+48]) + combadd(t0,t2,t1,[x+24],[y+40]) + combadd(t0,t2,t1,[x+32],[y+32]) + combadd(t0,t2,t1,[x+40],[y+24]) + combadd(t0,t2,t1,[x+48],[y+16]) + combadd(t0,t2,t1,[x+56],[y+8]) + mov [z+64], t1 + +// Result term 9 + + xor t1, t1 + combadz(t1,t0,t2,[x+16],[y+56]) + combadd(t1,t0,t2,[x+24],[y+48]) + combadd(t1,t0,t2,[x+32],[y+40]) + combadd(t1,t0,t2,[x+40],[y+32]) + combadd(t1,t0,t2,[x+48],[y+24]) + combadd(t1,t0,t2,[x+56],[y+16]) + mov [z+72], t2 + +// Result term 10 + + xor t2, t2 + combadz(t2,t1,t0,[x+24],[y+56]) + combadd(t2,t1,t0,[x+32],[y+48]) + combadd(t2,t1,t0,[x+40],[y+40]) + combadd(t2,t1,t0,[x+48],[y+32]) + combadd(t2,t1,t0,[x+56],[y+24]) + mov [z+80], t0 + +// Result term 11 + + xor t0, t0 + combadz(t0,t2,t1,[x+32],[y+56]) + combadd(t0,t2,t1,[x+40],[y+48]) + combadd(t0,t2,t1,[x+48],[y+40]) + combadd(t0,t2,t1,[x+56],[y+32]) + mov [z+88], t1 + +// Result term 12 + + xor t1, t1 + combadz(t1,t0,t2,[x+40],[y+56]) + combadd(t1,t0,t2,[x+48],[y+48]) + combadd(t1,t0,t2,[x+56],[y+40]) + mov [z+96], t2 + +// Result term 13 + + xor t2, t2 + combadz(t2,t1,t0,[x+48],[y+56]) + combadd(t2,t1,t0,[x+56],[y+48]) + mov [z+104], t0 + +// Result term 14 + + combads(t2,t1,[x+56],[y+56]) + mov [z+112], t1 + +// Result term 11 + + mov [z+120], t2 + +// Return + +#if WINDOWS_ABI + pop rsi + pop rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S b/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S new file mode 100644 index 00000000000..db483a57233 --- /dev/null +++ b/lib/libcrypto/bn/arch/amd64/bignum_sqr_4_8_alt.S @@ -0,0 +1,133 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[4]; output z[8] +// +// extern void bignum_sqr_4_8_alt +// (uint64_t z[static 8], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + .intel_syntax noprefix + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_4_8_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_4_8_alt) + .text + +// Input arguments + +#define z rdi +#define x rsi + +// Other variables used as a rotating 3-word window to add terms to + +#define t0 rcx +#define t1 r8 +#define t2 r9 + +// Macro for the key "multiply and add to (c,h,l)" step, for square term + +#define combadd1(c,h,l,numa) \ + mov rax, numa; \ + mul rax; \ + add l, rax; \ + adc h, rdx; \ + adc c, 0 + +// A short form where we don't expect a top carry + +#define combads(h,l,numa) \ + mov rax, numa; \ + mul rax; \ + add l, rax; \ + adc h, rdx + +// A version doubling before adding, for non-square terms + +#define combadd2(c,h,l,numa,numb) \ + mov rax, numa; \ + mul QWORD PTR numb; \ + add rax, rax; \ + adc rdx, rdx; \ + adc c, 0; \ + add l, rax; \ + adc h, rdx; \ + adc c, 0 + +S2N_BN_SYMBOL(bignum_sqr_4_8_alt): + +#if WINDOWS_ABI + push rdi + push rsi + mov rdi, rcx + mov rsi, rdx +#endif + +// Result term 0 + + mov rax, [x] + mul rax + + mov [z], rax + mov t0, rdx + xor t1, t1 + +// Result term 1 + + xor t2, t2 + combadd2(t2,t1,t0,[x],[x+8]) + mov [z+8], t0 + +// Result term 2 + + xor t0, t0 + combadd1(t0,t2,t1,[x+8]) + combadd2(t0,t2,t1,[x],[x+16]) + mov [z+16], t1 + +// Result term 3 + + xor t1, t1 + combadd2(t1,t0,t2,[x],[x+24]) + combadd2(t1,t0,t2,[x+8],[x+16]) + mov [z+24], t2 + +// Result term 4 + + xor t2, t2 + combadd2(t2,t1,t0,[x+8],[x+24]) + combadd1(t2,t1,t0,[x+16]) + mov [z+32], t0 + +// Result term 5 + + xor t0, t0 + combadd2(t0,t2,t1,[x+16],[x+24]) + mov [z+40], t1 + +// Result term 6 + + xor t1, t1 + combads(t0,t2,[x+24]) + mov [z+48], t2 + +// Result term 7 + + mov [z+56], t0 + +// Return + +#if WINDOWS_ABI + pop rsi + pop rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S b/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S new file mode 100644 index 00000000000..dcf3c92e5b8 --- /dev/null +++ b/lib/libcrypto/bn/arch/amd64/bignum_sqr_8_16_alt.S @@ -0,0 +1,230 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[8]; output z[16] +// +// extern void bignum_sqr_8_16_alt (uint64_t z[static 16], uint64_t x[static 8]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + .intel_syntax noprefix + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16_alt) + .text + +// Input arguments + +#define z rdi +#define x rsi + +// Other variables used as a rotating 3-word window to add terms to + +#define t0 r8 +#define t1 r9 +#define t2 r10 + +// Additional temporaries for local windows to share doublings + +#define u0 rcx +#define u1 r11 + +// Macro for the key "multiply and add to (c,h,l)" step + +#define combadd(c,h,l,numa,numb) \ + mov rax, numa; \ + mul QWORD PTR numb; \ + add l, rax; \ + adc h, rdx; \ + adc c, 0 + +// Set up initial window (c,h,l) = numa * numb + +#define combaddz(c,h,l,numa,numb) \ + mov rax, numa; \ + mul QWORD PTR numb; \ + xor c, c; \ + mov l, rax; \ + mov h, rdx + +// Doubling step (c,h,l) = 2 * (c,hh,ll) + (0,h,l) + +#define doubladd(c,h,l,hh,ll) \ + add ll, ll; \ + adc hh, hh; \ + adc c, c; \ + add l, ll; \ + adc h, hh; \ + adc c, 0 + +// Square term incorporation (c,h,l) += numba^2 + +#define combadd1(c,h,l,numa) \ + mov rax, numa; \ + mul rax; \ + add l, rax; \ + adc h, rdx; \ + adc c, 0 + +// A short form where we don't expect a top carry + +#define combads(h,l,numa) \ + mov rax, numa; \ + mul rax; \ + add l, rax; \ + adc h, rdx + +// A version doubling directly before adding, for single non-square terms + +#define combadd2(c,h,l,numa,numb) \ + mov rax, numa; \ + mul QWORD PTR numb; \ + add rax, rax; \ + adc rdx, rdx; \ + adc c, 0; \ + add l, rax; \ + adc h, rdx; \ + adc c, 0 + +S2N_BN_SYMBOL(bignum_sqr_8_16_alt): + +#if WINDOWS_ABI + push rdi + push rsi + mov rdi, rcx + mov rsi, rdx +#endif + +// Result term 0 + + mov rax, [x] + mul rax + + mov [z], rax + mov t0, rdx + xor t1, t1 + +// Result term 1 + + xor t2, t2 + combadd2(t2,t1,t0,[x],[x+8]) + mov [z+8], t0 + +// Result term 2 + + xor t0, t0 + combadd1(t0,t2,t1,[x+8]) + combadd2(t0,t2,t1,[x],[x+16]) + mov [z+16], t1 + +// Result term 3 + + combaddz(t1,u1,u0,[x],[x+24]) + combadd(t1,u1,u0,[x+8],[x+16]) + doubladd(t1,t0,t2,u1,u0) + mov [z+24], t2 + +// Result term 4 + + combaddz(t2,u1,u0,[x],[x+32]) + combadd(t2,u1,u0,[x+8],[x+24]) + doubladd(t2,t1,t0,u1,u0) + combadd1(t2,t1,t0,[x+16]) + mov [z+32], t0 + +// Result term 5 + + combaddz(t0,u1,u0,[x],[x+40]) + combadd(t0,u1,u0,[x+8],[x+32]) + combadd(t0,u1,u0,[x+16],[x+24]) + doubladd(t0,t2,t1,u1,u0) + mov [z+40], t1 + +// Result term 6 + + combaddz(t1,u1,u0,[x],[x+48]) + combadd(t1,u1,u0,[x+8],[x+40]) + combadd(t1,u1,u0,[x+16],[x+32]) + doubladd(t1,t0,t2,u1,u0) + combadd1(t1,t0,t2,[x+24]) + mov [z+48], t2 + +// Result term 7 + + combaddz(t2,u1,u0,[x],[x+56]) + combadd(t2,u1,u0,[x+8],[x+48]) + combadd(t2,u1,u0,[x+16],[x+40]) + combadd(t2,u1,u0,[x+24],[x+32]) + doubladd(t2,t1,t0,u1,u0) + mov [z+56], t0 + +// Result term 8 + + combaddz(t0,u1,u0,[x+8],[x+56]) + combadd(t0,u1,u0,[x+16],[x+48]) + combadd(t0,u1,u0,[x+24],[x+40]) + doubladd(t0,t2,t1,u1,u0) + combadd1(t0,t2,t1,[x+32]) + mov [z+64], t1 + +// Result term 9 + + combaddz(t1,u1,u0,[x+16],[x+56]) + combadd(t1,u1,u0,[x+24],[x+48]) + combadd(t1,u1,u0,[x+32],[x+40]) + 
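+// (doubladd folds twice this partial sum into the rotating window; term 9
+// is odd, so it has no x_i^2 contribution)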
doubladd(t1,t0,t2,u1,u0) + mov [z+72], t2 + +// Result term 10 + + combaddz(t2,u1,u0,[x+24],[x+56]) + combadd(t2,u1,u0,[x+32],[x+48]) + doubladd(t2,t1,t0,u1,u0) + combadd1(t2,t1,t0,[x+40]) + mov [z+80], t0 + +// Result term 11 + + combaddz(t0,u1,u0,[x+32],[x+56]) + combadd(t0,u1,u0,[x+40],[x+48]) + doubladd(t0,t2,t1,u1,u0) + mov [z+88], t1 + +// Result term 12 + + xor t1, t1 + combadd2(t1,t0,t2,[x+40],[x+56]) + combadd1(t1,t0,t2,[x+48]) + mov [z+96], t2 + +// Result term 13 + + xor t2, t2 + combadd2(t2,t1,t0,[x+48],[x+56]) + mov [z+104], t0 + +// Result term 14 + + combads(t2,t1,[x+56]) + mov [z+112], t1 + +// Result term 15 + + mov [z+120], t2 + +// Return + +#if WINDOWS_ABI + pop rsi + pop rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/lib/libcrypto/bn/arch/amd64/bignum_sub.S b/lib/libcrypto/bn/arch/amd64/bignum_sub.S new file mode 100644 index 00000000000..42eb6a82ef5 --- /dev/null +++ b/lib/libcrypto/bn/arch/amd64/bignum_sub.S @@ -0,0 +1,141 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Subtract, z := x - y +// Inputs x[m], y[n]; outputs function return (carry-out) and z[p] +// +// extern uint64_t bignum_sub +// (uint64_t p, uint64_t *z, +// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Does the z := x - y operation, truncating modulo p words in general and +// returning a top borrow (0 or 1) in the p'th place, only subtracting input +// words below p (as well as m and n respectively) to get the diff and borrow. +// +// Standard x86-64 ABI: RDI = p, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y, returns RAX +// Microsoft x64 ABI: RCX = p, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + .intel_syntax noprefix + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sub) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sub) + .text + +#define p rdi +#define z rsi +#define m rdx +#define x rcx +#define n r8 +#define y r9 +#define i r10 +#define a rax + +#define ashort eax + + + +S2N_BN_SYMBOL(bignum_sub): + +#if WINDOWS_ABI + push rdi + push rsi + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, [rsp+56] + mov r9, [rsp+64] +#endif + +// Zero the main index counter for both branches + + xor i, i + +// First clamp the two input sizes m := min(p,m) and n := min(p,n) since +// we'll never need words past the p'th. Can now assume m <= p and n <= p. 
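+// (As in bignum_add, the clamps guarantee that no word of x, y or z at
+// index p or above is ever read or written.)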
+// Then compare the modified m and n and branch accordingly + + cmp p, m + cmovc m, p + cmp p, n + cmovc n, p + cmp m, n + jc ylonger + +// The case where x is longer or of the same size (p >= m >= n) + + sub p, m + sub m, n + inc m + test n, n + jz xtest +xmainloop: + mov a, [x+8*i] + sbb a, [y+8*i] + mov [z+8*i],a + inc i + dec n + jnz xmainloop + jmp xtest +xtoploop: + mov a, [x+8*i] + sbb a, 0 + mov [z+8*i],a + inc i +xtest: + dec m + jnz xtoploop + sbb a, a + test p, p + jz tailskip +tailloop: + mov [z+8*i],a + inc i + dec p + jnz tailloop +tailskip: + neg a +#if WINDOWS_ABI + pop rsi + pop rdi +#endif + ret + +// The case where y is longer (p >= n > m) + +ylonger: + + sub p, n + sub n, m + test m, m + jz ytoploop +ymainloop: + mov a, [x+8*i] + sbb a, [y+8*i] + mov [z+8*i],a + inc i + dec m + jnz ymainloop +ytoploop: + mov ashort, 0 + sbb a, [y+8*i] + mov [z+8*i],a + inc i + dec n + jnz ytoploop + sbb a, a + test p, p + jnz tailloop + neg a +#if WINDOWS_ABI + pop rsi + pop rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif
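For illustration, a minimal sketch of calling these entry points from C, using only the prototypes quoted in the comments above; the operand sizes, values and variable names are invented for the example, and the assembly objects are assumed to be linked in:

#include <stdint.h>
#include <stdio.h>

/* Prototypes as documented in the .S files above. */
extern uint64_t bignum_add(uint64_t p, uint64_t *z, uint64_t m, uint64_t *x,
    uint64_t n, uint64_t *y);
extern void bignum_mul_4_8_alt(uint64_t z[static 8], uint64_t x[static 4],
    uint64_t y[static 4]);

int
main(void)
{
	uint64_t a[4] = { ~0ULL, ~0ULL, ~0ULL, ~0ULL };	/* 2^256 - 1 */
	uint64_t b[4] = { 1, 0, 0, 0 };			/* 1 */
	uint64_t sum[4], prod[8], carry;

	/* sum := (a + b) mod 2^256; carry is the bit out of the top word. */
	carry = bignum_add(4, sum, 4, a, 4, b);
	printf("carry = %llu\n", (unsigned long long)carry);

	/* prod := a * b as a full 8-word (512-bit) product. */
	bignum_mul_4_8_alt(prod, a, b);
	printf("prod[0] = %llx\n", (unsigned long long)prod[0]);

	return 0;
}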