--- /dev/null
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC
+
+// ----------------------------------------------------------------------------
+// Add, z := x + y
+// Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
+//
+// extern uint64_t bignum_add
+// (uint64_t p, uint64_t *z,
+// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+//
+// Does the z := x + y operation, truncating modulo p words in general and
+// returning a top carry (0 or 1) in the p'th place, only adding the input
+// words below p (as well as m and n respectively) to get the sum and carry.
+//
+// Standard x86-64 ABI: RDI = p, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y, returns RAX
+// Microsoft x64 ABI: RCX = p, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y, returns RAX
+// ----------------------------------------------------------------------------
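+
+// For reference, a minimal C model of the semantics described above
+// (illustrative only, not assembled as part of this file; the name
+// bignum_add_model is hypothetical and unsigned __int128 support from the
+// compiler, e.g. GCC/Clang on x86_64, is assumed):
+//
+//     #include <stdint.h>
+//
+//     uint64_t bignum_add_model(uint64_t p, uint64_t *z,
+//                               uint64_t m, const uint64_t *x,
+//                               uint64_t n, const uint64_t *y)
+//     {
+//         uint64_t carry = 0;
+//         for (uint64_t i = 0; i < p; i++) {
+//             uint64_t a = (i < m) ? x[i] : 0;   // words at or above p, m, n
+//             uint64_t b = (i < n) ? y[i] : 0;   // are never read
+//             unsigned __int128 t = (unsigned __int128) a + b + carry;
+//             z[i] = (uint64_t) t;
+//             carry = (uint64_t) (t >> 64);
+//         }
+//         return carry;                          // the word in the p'th place
+//     }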
+
+#include "_internal_s2n_bignum.h"
+
+ .intel_syntax noprefix
+ S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add)
+ S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add)
+ .text
+
+#define p rdi
+#define z rsi
+#define m rdx
+#define x rcx
+#define n r8
+#define y r9
+#define i r10
+#define a rax
+
+#define ashort eax
+
+
+
+S2N_BN_SYMBOL(bignum_add):
+
+#if WINDOWS_ABI
+ push rdi
+ push rsi
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rsp+56]
+ mov r9, [rsp+64]
+#endif
+
+// Zero the main index counter for both branches
+
+ xor i, i
+
+// First clamp the two input sizes m := min(p,m) and n := min(p,n) since
+// we'll never need words past the p'th. Can now assume m <= p and n <= p.
+// Then compare the modified m and n and branch accordingly
+
+ cmp p, m
+ cmovc m, p
+ cmp p, n
+ cmovc n, p
+ cmp m, n
+ jc ylonger
+
+// The case where x is longer or of the same size (p >= m >= n)
+
+ sub p, m
+ sub m, n
+ inc m
+ test n, n
+ jz xtest
+xmainloop:
+ mov a, [x+8*i]
+ adc a, [y+8*i]
+ mov [z+8*i],a
+ inc i
+ dec n
+ jnz xmainloop
+ jmp xtest
+xtoploop:
+ mov a, [x+8*i]
+ adc a, 0
+ mov [z+8*i],a
+ inc i
+xtest:
+ dec m
+ jnz xtoploop
+ mov ashort, 0
+ adc a, 0
+ test p, p
+ jnz tails
+#if WINDOWS_ABI
+ pop rsi
+ pop rdi
+#endif
+ ret
+
+// The case where y is longer (p >= n > m)
+
+ylonger:
+
+ sub p, n
+ sub n, m
+ test m, m
+ jz ytoploop
+ymainloop:
+ mov a, [x+8*i]
+ adc a, [y+8*i]
+ mov [z+8*i],a
+ inc i
+ dec m
+ jnz ymainloop
+ytoploop:
+ mov a, [y+8*i]
+ adc a, 0
+ mov [z+8*i],a
+ inc i
+ dec n
+ jnz ytoploop
+ mov ashort, 0
+ adc a, 0
+ test p, p
+ jnz tails
+#if WINDOWS_ABI
+ pop rsi
+ pop rdi
+#endif
+ ret
+
+// Adding a non-trivial tail, when p > max(m,n)
+
+tails:
+ mov [z+8*i],a
+ xor a, a
+ jmp tail
+tailloop:
+ mov [z+8*i],a
+tail:
+ inc i
+ dec p
+ jnz tailloop
+#if WINDOWS_ABI
+ pop rsi
+ pop rdi
+#endif
+ ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
--- /dev/null
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC
+
+// ----------------------------------------------------------------------------
+// Multiply-add with single-word multiplier, z := z + c * y
+// Inputs c, y[n]; outputs function return (carry-out) and z[k]
+//
+// extern uint64_t bignum_cmadd
+// (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
+//
+// Does the "z := z + c * y" operation where y is n digits, result z is k.
+// Truncates the result in general.
+//
+// The return value is a high/carry word that is meaningful when k = n + 1, or
+// more generally when n <= k and the result fits in k + 1 digits. In these
+// cases it gives the top digit of the (k + 1)-digit result.
+//
+// Standard x86-64 ABI: RDI = k, RSI = z, RDX = c, RCX = n, R8 = y, returns RAX
+// Microsoft x64 ABI: RCX = k, RDX = z, R8 = c, R9 = n, [RSP+40] = y, returns RAX
+// ----------------------------------------------------------------------------
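+
+// For reference, one plausible C model of the semantics above (illustrative
+// only, not part of the build; bignum_cmadd_model is a hypothetical name and
+// unsigned __int128 support is assumed):
+//
+//     #include <stdint.h>
+//
+//     uint64_t bignum_cmadd_model(uint64_t k, uint64_t *z, uint64_t c,
+//                                 uint64_t n, const uint64_t *y)
+//     {
+//         uint64_t h = 0;                      // running high/carry word
+//         if (n > k) n = k;                    // clamp, as the code below does
+//         for (uint64_t i = 0; i < n; i++) {
+//             unsigned __int128 t = (unsigned __int128) c * y[i] + z[i] + h;
+//             z[i] = (uint64_t) t;
+//             h = (uint64_t) (t >> 64);
+//         }
+//         for (uint64_t i = n; i < k; i++) {   // propagate into the tail
+//             unsigned __int128 t = (unsigned __int128) z[i] + h;
+//             z[i] = (uint64_t) t;
+//             h = (uint64_t) (t >> 64);
+//         }
+//         return h;
+//     }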
+
+#include "_internal_s2n_bignum.h"
+
+ .intel_syntax noprefix
+ S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmadd)
+ S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmadd)
+ .text
+
+#define p rdi
+#define z rsi
+#define c r9
+#define n rcx
+#define x r8
+
+#define i r10
+#define h r11
+
+#define r rbx
+
+#define hshort r11d
+#define ishort r10d
+
+
+
+S2N_BN_SYMBOL(bignum_cmadd):
+
+#if WINDOWS_ABI
+ push rdi
+ push rsi
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rsp+56]
+#endif
+
+// Seems hard to avoid one more register
+
+ push rbx
+
+// First clamp the input size n := min(p,n) since we can never need to read
+// past the p'th term of the input to generate p-digit output.
+// Subtract p := p - min(n,p) so it holds the size of the extra tail needed
+
+ cmp p, n
+ cmovc n, p
+ sub p, n
+
+// Initialize high part h = 0; if n = 0 do nothing but return that zero
+
+ xor h, h
+ test n, n
+ jz end
+
+// Move c into a safer register as multiplies overwrite rdx
+
+ mov c, rdx
+
+// Initialization of the loop: 2^64 * CF + [h,z_0'] = z_0 + c * x_0
+
+ mov rax, [x]
+ mul c
+ add [z], rax
+ mov h, rdx
+ mov ishort, 1
+ dec n
+ jz hightail
+
+// Main loop, where we always have CF + previous high part h to add in
+
+loop:
+ adc h, [z+8*i]
+ sbb r, r
+ mov rax, [x+8*i]
+ mul c
+ sub rdx, r
+ add rax, h
+ mov [z+8*i], rax
+ mov h, rdx
+ inc i
+ dec n
+ jnz loop
+
+hightail:
+ adc h, 0
+
+// Propagate the carry all the way to the end with h as extra carry word
+
+tail:
+ test p, p
+ jz end
+
+ add [z+8*i], h
+ mov hshort, 0
+ inc i
+ dec p
+ jz highend
+
+tloop:
+ adc [z+8*i], h
+ inc i
+ dec p
+ jnz tloop
+
+highend:
+
+ adc h, 0
+
+// Return the high/carry word
+
+end:
+ mov rax, h
+
+ pop rbx
+#if WINDOWS_ABI
+ pop rsi
+ pop rdi
+#endif
+ ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
--- /dev/null
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC
+
+// ----------------------------------------------------------------------------
+// Multiply by a single word, z := c * y
+// Inputs c, y[n]; outputs function return (carry-out) and z[k]
+//
+// extern uint64_t bignum_cmul
+// (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y);
+//
+// Does the "z := c * y" operation where y is n digits, result z is k.
+// Truncates the result in general unless k >= n + 1.
+//
+// The return value is a high/carry word that is meaningful when k >= n as
+// giving the high part of the result. Since this is always zero if k > n,
+// it is mainly of interest in the special case k = n, i.e. where the source
+// and destination have the same nominal size, when it gives the extra word
+// of the full result.
+//
+// Standard x86-64 ABI: RDI = k, RSI = z, RDX = c, RCX = n, R8 = y, returns RAX
+// Microsoft x64 ABI: RCX = k, RDX = z, R8 = c, R9 = n, [RSP+40] = y, returns RAX
+// ----------------------------------------------------------------------------
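+
+// A sketch of the semantics in C (illustrative only; bignum_cmul_model is a
+// hypothetical name and unsigned __int128 support is assumed):
+//
+//     #include <stdint.h>
+//
+//     uint64_t bignum_cmul_model(uint64_t k, uint64_t *z, uint64_t c,
+//                                uint64_t n, const uint64_t *y)
+//     {
+//         uint64_t h = 0;
+//         if (n > k) n = k;                    // clamp, as the code below does
+//         for (uint64_t i = 0; i < n; i++) {
+//             unsigned __int128 t = (unsigned __int128) c * y[i] + h;
+//             z[i] = (uint64_t) t;
+//             h = (uint64_t) (t >> 64);
+//         }
+//         for (uint64_t i = n; i < k; i++) {   // high word, then zero padding
+//             z[i] = h;
+//             h = 0;
+//         }
+//         return h;
+//     }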
+
+#include "_internal_s2n_bignum.h"
+
+ .intel_syntax noprefix
+ S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul)
+ S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul)
+ .text
+
+#define p rdi
+#define z rsi
+#define c r9
+#define n rcx
+#define x r8
+
+#define i r10
+#define h r11
+
+
+
+S2N_BN_SYMBOL(bignum_cmul):
+
+#if WINDOWS_ABI
+ push rdi
+ push rsi
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rsp+56]
+#endif
+
+// First clamp the input size n := min(p,n) since we can never need to read
+// past the p'th term of the input to generate p-digit output. Now we can
+// assume that n <= p
+
+ cmp p, n
+ cmovc n, p
+
+// Initialize current input/output pointer offset i and high part h.
+// But then if n = 0 skip the multiplication and go to the tail part
+
+ xor h, h
+ xor i, i
+ test n, n
+ jz tail
+
+// Move c into a safer register as multiplies overwrite rdx
+
+ mov c, rdx
+
+// Initialization of the loop: [h,l] = c * x_0
+
+ mov rax, [x]
+ mul c
+ mov [z], rax
+ mov h, rdx
+ inc i
+ cmp i, n
+ jz tail
+
+// Main loop doing the multiplications
+
+loop:
+ mov rax, [x+8*i]
+ mul c
+ add rax, h
+ adc rdx, 0
+ mov [z+8*i], rax
+ mov h, rdx
+ inc i
+ cmp i, n
+ jc loop
+
+// Add a tail when the destination is longer
+
+tail:
+ cmp i, p
+ jnc end
+ mov [z+8*i], h
+ xor h, h
+ inc i
+ cmp i, p
+ jnc end
+
+tloop:
+ mov [z+8*i], h
+ inc i
+ cmp i, p
+ jc tloop
+
+// Return the high/carry word
+
+end:
+ mov rax, h
+
+#if WINDOWS_ABI
+ pop rsi
+ pop rdi
+#endif
+ ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
--- /dev/null
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC
+
+// ----------------------------------------------------------------------------
+// Multiply z := x * y
+// Inputs x[m], y[n]; output z[k]
+//
+// extern void bignum_mul
+// (uint64_t k, uint64_t *z,
+// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+//
+// Does the "z := x * y" operation where x is m digits, y is n, result z is k.
+// Truncates the result in general unless k >= m + n
+//
+// Standard x86-64 ABI: RDI = k, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y
+// Microsoft x64 ABI: RCX = k, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y
+// ----------------------------------------------------------------------------
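+
+// The code below computes the product column by column ("product scanning"),
+// keeping a 3-word window (c,h,l) for the current column sum. A rough C model
+// of that strategy (illustrative only; bignum_mul_model is a hypothetical
+// name and unsigned __int128 support is assumed):
+//
+//     #include <stdint.h>
+//
+//     void bignum_mul_model(uint64_t k, uint64_t *z,
+//                           uint64_t m, const uint64_t *x,
+//                           uint64_t n, const uint64_t *y)
+//     {
+//         uint64_t l = 0, h = 0, c = 0;                 // 3-word column window
+//         for (uint64_t r = 0; r < k; r++) {
+//             uint64_t jmin = (r + 1 > n) ? r + 1 - n : 0;  // max(0, r+1-n)
+//             uint64_t jmax = (r + 1 < m) ? r + 1 : m;      // min(r+1, m)
+//             for (uint64_t j = jmin; j < jmax; j++) {
+//                 unsigned __int128 p = (unsigned __int128) x[j] * y[r - j];
+//                 unsigned __int128 t = (unsigned __int128) l + (uint64_t) p;
+//                 l = (uint64_t) t;
+//                 t = (unsigned __int128) h + (uint64_t) (p >> 64)
+//                                           + (uint64_t) (t >> 64);
+//                 h = (uint64_t) t;
+//                 c += (uint64_t) (t >> 64);
+//             }
+//             z[r] = l;                                 // emit one digit and
+//             l = h; h = c; c = 0;                      // slide the window up
+//         }
+//     }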
+
+#include "_internal_s2n_bignum.h"
+
+ .intel_syntax noprefix
+ S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul)
+ S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul)
+ .text
+
+// These are actually right
+
+#define p rdi
+#define z rsi
+#define n r8
+
+// These are not; they hold moved arguments (x, y, m) and local working variables
+
+#define c r15
+#define h r14
+#define l r13
+#define x r12
+#define y r11
+#define i rbx
+#define k r10
+#define m rbp
+
+// These are always local scratch since multiplier result is in these
+
+#define a rax
+#define d rdx
+
+
+
+S2N_BN_SYMBOL(bignum_mul):
+
+#if WINDOWS_ABI
+ push rdi
+ push rsi
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rsp+56]
+ mov r9, [rsp+64]
+#endif
+
+// We use too many registers, and also we need rax:rdx for multiplications
+
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ mov m, rdx
+
+// If the result size is zero, do nothing
+// Note that even if either or both inputs has size zero, we can't
+// just give up because we at least need to zero the output array
+// If we did a multiply-add variant, however, then we could
+
+ test p, p
+ jz end
+
+// Set initial 2-part sum to zero (we zero c inside the body)
+
+ xor h,h
+ xor l,l
+
+// Otherwise do outer loop k = 0 ... k = p - 1
+
+ xor k, k
+
+outerloop:
+
+// Zero our carry term first; we eventually want it and a zero is useful now
+// Set a = max 0 (k + 1 - n), i = min (k + 1) m
+// This defines the range a <= j < i for the inner summation
+// Note that since k < p < 2^64 we can assume k + 1 doesn't overflow
+// And since we want to increment it anyway, we might as well do it now
+
+ xor c, c // c = 0
+ inc k // k = k + 1
+
+ mov a, k // a = k + 1
+ sub a, n // a = k + 1 - n
+ cmovc a, c // a = max 0 (k + 1 - n)
+
+ mov i, m // i = m
+ cmp k, m // CF <=> k + 1 < m
+ cmovc i, k // i = min (k + 1) m
+
+// Turn i into a loop count, and skip things if it's <= 0
+// Otherwise set up initial pointers x -> x0[a] and y -> y0[k - a]
+// and then launch into the main inner loop, postdecrementing i
+
+ mov d, k
+ sub d, i
+ sub i, a
+ jbe innerend
+ lea x,[rcx+8*a]
+ lea y,[r9+8*d-8]
+
+innerloop:
+ mov rax, [y+8*i]
+ mul QWORD PTR [x]
+ add x, 8
+ add l, rax
+ adc h, rdx
+ adc c, 0
+ dec i
+ jnz innerloop
+
+innerend:
+
+ mov [z], l
+ mov l, h
+ mov h, c
+ add z, 8
+
+ cmp k, p
+ jc outerloop
+
+end:
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbp
+ pop rbx
+#if WINDOWS_ABI
+ pop rsi
+ pop rdi
+#endif
+ ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
--- /dev/null
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC
+
+// ----------------------------------------------------------------------------
+// Multiply z := x * y
+// Inputs x[4], y[4]; output z[8]
+//
+// extern void bignum_mul_4_8_alt
+// (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
+// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y
+// ----------------------------------------------------------------------------
+
+#include "_internal_s2n_bignum.h"
+
+ .intel_syntax noprefix
+ S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_4_8_alt)
+ S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_4_8_alt)
+ .text
+
+// These are actually right
+
+#define z rdi
+#define x rsi
+
+// This is moved from rdx to free it for muls
+
+#define y rcx
+
+// Other variables used as a rotating 3-word window to add terms to
+
+#define t0 r8
+#define t1 r9
+#define t2 r10
+
+// Macro for the key "multiply and add to (c,h,l)" step
+
+#define combadd(c,h,l,numa,numb) \
+ mov rax, numa; \
+ mul QWORD PTR numb; \
+ add l, rax; \
+ adc h, rdx; \
+ adc c, 0
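+
+// Roughly, combadd adds the 128-bit product numa * numb into the 3-word
+// window (c,h,l). A C sketch of that step (illustrative only; combadd_model
+// is a hypothetical name and unsigned __int128 support is assumed):
+//
+//     #include <stdint.h>
+//
+//     void combadd_model(uint64_t *c, uint64_t *h, uint64_t *l,
+//                        uint64_t a, uint64_t b)
+//     {
+//         unsigned __int128 p = (unsigned __int128) a * b;            // mul
+//         unsigned __int128 t = (unsigned __int128) *l + (uint64_t) p;
+//         *l = (uint64_t) t;                                          // add l, rax
+//         t = (unsigned __int128) *h + (uint64_t) (p >> 64) + (uint64_t) (t >> 64);
+//         *h = (uint64_t) t;                                          // adc h, rdx
+//         *c += (uint64_t) (t >> 64);                                 // adc c, 0
+//     }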
+
+// A minutely shorter form for when c = 0 initially
+
+#define combadz(c,h,l,numa,numb) \
+ mov rax, numa; \
+ mul QWORD PTR numb; \
+ add l, rax; \
+ adc h, rdx; \
+ adc c, c
+
+// A short form where we don't expect a top carry
+
+#define combads(h,l,numa,numb) \
+ mov rax, numa; \
+ mul QWORD PTR numb; \
+ add l, rax; \
+ adc h, rdx
+
+S2N_BN_SYMBOL(bignum_mul_4_8_alt):
+
+#if WINDOWS_ABI
+ push rdi
+ push rsi
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+#endif
+
+// Copy y into a safe register to start with
+
+ mov y, rdx
+
+// Result term 0
+
+ mov rax, [x]
+ mul QWORD PTR [y]
+
+ mov [z], rax
+ mov t0, rdx
+ xor t1, t1
+
+// Result term 1
+
+ xor t2, t2
+ combads(t1,t0,[x],[y+8])
+ combadz(t2,t1,t0,[x+8],[y])
+ mov [z+8], t0
+
+// Result term 2
+
+ xor t0, t0
+ combadz(t0,t2,t1,[x],[y+16])
+ combadd(t0,t2,t1,[x+8],[y+8])
+ combadd(t0,t2,t1,[x+16],[y])
+ mov [z+16], t1
+
+// Result term 3
+
+ xor t1, t1
+ combadz(t1,t0,t2,[x],[y+24])
+ combadd(t1,t0,t2,[x+8],[y+16])
+ combadd(t1,t0,t2,[x+16],[y+8])
+ combadd(t1,t0,t2,[x+24],[y])
+ mov [z+24], t2
+
+// Result term 4
+
+ xor t2, t2
+ combadz(t2,t1,t0,[x+8],[y+24])
+ combadd(t2,t1,t0,[x+16],[y+16])
+ combadd(t2,t1,t0,[x+24],[y+8])
+ mov [z+32], t0
+
+// Result term 5
+
+ xor t0, t0
+ combadz(t0,t2,t1,[x+16],[y+24])
+ combadd(t0,t2,t1,[x+24],[y+16])
+ mov [z+40], t1
+
+// Result term 6
+
+ xor t1, t1
+ combads(t0,t2,[x+24],[y+24])
+ mov [z+48], t2
+
+// Result term 7
+
+ mov [z+56], t0
+
+// Return
+
+#if WINDOWS_ABI
+ pop rsi
+ pop rdi
+#endif
+ ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
--- /dev/null
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC
+
+// ----------------------------------------------------------------------------
+// Multiply z := x * y
+// Inputs x[8], y[8]; output z[16]
+//
+// extern void bignum_mul_8_16_alt
+// (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
+// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y
+// ----------------------------------------------------------------------------
+
+#include "_internal_s2n_bignum.h"
+
+ .intel_syntax noprefix
+ S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16_alt)
+ S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16_alt)
+ .text
+
+// These are actually right
+
+#define z rdi
+#define x rsi
+
+// This is moved from rdx to free it for muls
+
+#define y rcx
+
+// Other variables used as a rotating 3-word window to add terms to
+
+#define t0 r8
+#define t1 r9
+#define t2 r10
+
+// Macro for the key "multiply and add to (c,h,l)" step
+
+#define combadd(c,h,l,numa,numb) \
+ mov rax, numa; \
+ mul QWORD PTR numb; \
+ add l, rax; \
+ adc h, rdx; \
+ adc c, 0
+
+// A minutely shorter form for when c = 0 initially
+
+#define combadz(c,h,l,numa,numb) \
+ mov rax, numa; \
+ mul QWORD PTR numb; \
+ add l, rax; \
+ adc h, rdx; \
+ adc c, c
+
+// A short form where we don't expect a top carry
+
+#define combads(h,l,numa,numb) \
+ mov rax, numa; \
+ mul QWORD PTR numb; \
+ add l, rax; \
+ adc h, rdx
+
+S2N_BN_SYMBOL(bignum_mul_8_16_alt):
+
+#if WINDOWS_ABI
+ push rdi
+ push rsi
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+#endif
+
+// Copy y into a safe register to start with
+
+ mov y, rdx
+
+// Result term 0
+
+ mov rax, [x]
+ mul QWORD PTR [y]
+
+ mov [z], rax
+ mov t0, rdx
+ xor t1, t1
+
+// Result term 1
+
+ xor t2, t2
+ combads(t1,t0,[x],[y+8])
+ combadz(t2,t1,t0,[x+8],[y])
+ mov [z+8], t0
+
+// Result term 2
+
+ xor t0, t0
+ combadz(t0,t2,t1,[x],[y+16])
+ combadd(t0,t2,t1,[x+8],[y+8])
+ combadd(t0,t2,t1,[x+16],[y])
+ mov [z+16], t1
+
+// Result term 3
+
+ xor t1, t1
+ combadz(t1,t0,t2,[x],[y+24])
+ combadd(t1,t0,t2,[x+8],[y+16])
+ combadd(t1,t0,t2,[x+16],[y+8])
+ combadd(t1,t0,t2,[x+24],[y])
+ mov [z+24], t2
+
+// Result term 4
+
+ xor t2, t2
+ combadz(t2,t1,t0,[x],[y+32])
+ combadd(t2,t1,t0,[x+8],[y+24])
+ combadd(t2,t1,t0,[x+16],[y+16])
+ combadd(t2,t1,t0,[x+24],[y+8])
+ combadd(t2,t1,t0,[x+32],[y])
+ mov [z+32], t0
+
+// Result term 5
+
+ xor t0, t0
+ combadz(t0,t2,t1,[x],[y+40])
+ combadd(t0,t2,t1,[x+8],[y+32])
+ combadd(t0,t2,t1,[x+16],[y+24])
+ combadd(t0,t2,t1,[x+24],[y+16])
+ combadd(t0,t2,t1,[x+32],[y+8])
+ combadd(t0,t2,t1,[x+40],[y])
+ mov [z+40], t1
+
+// Result term 6
+
+ xor t1, t1
+ combadz(t1,t0,t2,[x],[y+48])
+ combadd(t1,t0,t2,[x+8],[y+40])
+ combadd(t1,t0,t2,[x+16],[y+32])
+ combadd(t1,t0,t2,[x+24],[y+24])
+ combadd(t1,t0,t2,[x+32],[y+16])
+ combadd(t1,t0,t2,[x+40],[y+8])
+ combadd(t1,t0,t2,[x+48],[y])
+ mov [z+48], t2
+
+// Result term 7
+
+ xor t2, t2
+ combadz(t2,t1,t0,[x],[y+56])
+ combadd(t2,t1,t0,[x+8],[y+48])
+ combadd(t2,t1,t0,[x+16],[y+40])
+ combadd(t2,t1,t0,[x+24],[y+32])
+ combadd(t2,t1,t0,[x+32],[y+24])
+ combadd(t2,t1,t0,[x+40],[y+16])
+ combadd(t2,t1,t0,[x+48],[y+8])
+ combadd(t2,t1,t0,[x+56],[y])
+ mov [z+56], t0
+
+// Result term 8
+
+ xor t0, t0
+ combadz(t0,t2,t1,[x+8],[y+56])
+ combadd(t0,t2,t1,[x+16],[y+48])
+ combadd(t0,t2,t1,[x+24],[y+40])
+ combadd(t0,t2,t1,[x+32],[y+32])
+ combadd(t0,t2,t1,[x+40],[y+24])
+ combadd(t0,t2,t1,[x+48],[y+16])
+ combadd(t0,t2,t1,[x+56],[y+8])
+ mov [z+64], t1
+
+// Result term 9
+
+ xor t1, t1
+ combadz(t1,t0,t2,[x+16],[y+56])
+ combadd(t1,t0,t2,[x+24],[y+48])
+ combadd(t1,t0,t2,[x+32],[y+40])
+ combadd(t1,t0,t2,[x+40],[y+32])
+ combadd(t1,t0,t2,[x+48],[y+24])
+ combadd(t1,t0,t2,[x+56],[y+16])
+ mov [z+72], t2
+
+// Result term 10
+
+ xor t2, t2
+ combadz(t2,t1,t0,[x+24],[y+56])
+ combadd(t2,t1,t0,[x+32],[y+48])
+ combadd(t2,t1,t0,[x+40],[y+40])
+ combadd(t2,t1,t0,[x+48],[y+32])
+ combadd(t2,t1,t0,[x+56],[y+24])
+ mov [z+80], t0
+
+// Result term 11
+
+ xor t0, t0
+ combadz(t0,t2,t1,[x+32],[y+56])
+ combadd(t0,t2,t1,[x+40],[y+48])
+ combadd(t0,t2,t1,[x+48],[y+40])
+ combadd(t0,t2,t1,[x+56],[y+32])
+ mov [z+88], t1
+
+// Result term 12
+
+ xor t1, t1
+ combadz(t1,t0,t2,[x+40],[y+56])
+ combadd(t1,t0,t2,[x+48],[y+48])
+ combadd(t1,t0,t2,[x+56],[y+40])
+ mov [z+96], t2
+
+// Result term 13
+
+ xor t2, t2
+ combadz(t2,t1,t0,[x+48],[y+56])
+ combadd(t2,t1,t0,[x+56],[y+48])
+ mov [z+104], t0
+
+// Result term 14
+
+ combads(t2,t1,[x+56],[y+56])
+ mov [z+112], t1
+
+// Result term 15
+
+ mov [z+120], t2
+
+// Return
+
+#if WINDOWS_ABI
+ pop rsi
+ pop rdi
+#endif
+ ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
--- /dev/null
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC
+
+// ----------------------------------------------------------------------------
+// Square, z := x^2
+// Input x[4]; output z[8]
+//
+// extern void bignum_sqr_4_8_alt
+// (uint64_t z[static 8], uint64_t x[static 4]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x
+// Microsoft x64 ABI: RCX = z, RDX = x
+// ----------------------------------------------------------------------------
+
+#include "_internal_s2n_bignum.h"
+
+ .intel_syntax noprefix
+ S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_4_8_alt)
+ S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_4_8_alt)
+ .text
+
+// Input arguments
+
+#define z rdi
+#define x rsi
+
+// Other variables used as a rotating 3-word window to add terms to
+
+#define t0 rcx
+#define t1 r8
+#define t2 r9
+
+// Macro for the key "multiply and add to (c,h,l)" step, for square term
+
+#define combadd1(c,h,l,numa) \
+ mov rax, numa; \
+ mul rax; \
+ add l, rax; \
+ adc h, rdx; \
+ adc c, 0
+
+// A short form where we don't expect a top carry
+
+#define combads(h,l,numa) \
+ mov rax, numa; \
+ mul rax; \
+ add l, rax; \
+ adc h, rdx
+
+// A version doubling before adding, for non-square terms
+
+#define combadd2(c,h,l,numa,numb) \
+ mov rax, numa; \
+ mul QWORD PTR numb; \
+ add rax, rax; \
+ adc rdx, rdx; \
+ adc c, 0; \
+ add l, rax; \
+ adc h, rdx; \
+ adc c, 0
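+
+// Since a squaring has each cross term x_i * x_j (i != j) twice, combadd2
+// doubles the 128-bit product before folding it into the window (c,h,l),
+// with the bit shifted out of the top going straight into c. A C sketch
+// (illustrative only; combadd2_model is a hypothetical name and unsigned
+// __int128 support is assumed):
+//
+//     #include <stdint.h>
+//
+//     void combadd2_model(uint64_t *c, uint64_t *h, uint64_t *l,
+//                         uint64_t a, uint64_t b)
+//     {
+//         unsigned __int128 p = (unsigned __int128) a * b;
+//         *c += (uint64_t) (p >> 127);          // top bit of 2 * a * b
+//         p <<= 1;                              // low 128 bits of 2 * a * b
+//         unsigned __int128 t = (unsigned __int128) *l + (uint64_t) p;
+//         *l = (uint64_t) t;
+//         t = (unsigned __int128) *h + (uint64_t) (p >> 64) + (uint64_t) (t >> 64);
+//         *h = (uint64_t) t;
+//         *c += (uint64_t) (t >> 64);
+//     }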
+
+S2N_BN_SYMBOL(bignum_sqr_4_8_alt):
+
+#if WINDOWS_ABI
+ push rdi
+ push rsi
+ mov rdi, rcx
+ mov rsi, rdx
+#endif
+
+// Result term 0
+
+ mov rax, [x]
+ mul rax
+
+ mov [z], rax
+ mov t0, rdx
+ xor t1, t1
+
+// Result term 1
+
+ xor t2, t2
+ combadd2(t2,t1,t0,[x],[x+8])
+ mov [z+8], t0
+
+// Result term 2
+
+ xor t0, t0
+ combadd1(t0,t2,t1,[x+8])
+ combadd2(t0,t2,t1,[x],[x+16])
+ mov [z+16], t1
+
+// Result term 3
+
+ xor t1, t1
+ combadd2(t1,t0,t2,[x],[x+24])
+ combadd2(t1,t0,t2,[x+8],[x+16])
+ mov [z+24], t2
+
+// Result term 4
+
+ xor t2, t2
+ combadd2(t2,t1,t0,[x+8],[x+24])
+ combadd1(t2,t1,t0,[x+16])
+ mov [z+32], t0
+
+// Result term 5
+
+ xor t0, t0
+ combadd2(t0,t2,t1,[x+16],[x+24])
+ mov [z+40], t1
+
+// Result term 6
+
+ xor t1, t1
+ combads(t0,t2,[x+24])
+ mov [z+48], t2
+
+// Result term 7
+
+ mov [z+56], t0
+
+// Return
+
+#if WINDOWS_ABI
+ pop rsi
+ pop rdi
+#endif
+ ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
--- /dev/null
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC
+
+// ----------------------------------------------------------------------------
+// Square, z := x^2
+// Input x[8]; output z[16]
+//
+// extern void bignum_sqr_8_16_alt (uint64_t z[static 16], uint64_t x[static 8]);
+//
+// Standard x86-64 ABI: RDI = z, RSI = x
+// Microsoft x64 ABI: RCX = z, RDX = x
+// ----------------------------------------------------------------------------
+
+#include "_internal_s2n_bignum.h"
+
+ .intel_syntax noprefix
+ S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16_alt)
+ S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16_alt)
+ .text
+
+// Input arguments
+
+#define z rdi
+#define x rsi
+
+// Other variables used as a rotating 3-word window to add terms to
+
+#define t0 r8
+#define t1 r9
+#define t2 r10
+
+// Additional temporaries for local windows to share doublings
+
+#define u0 rcx
+#define u1 r11
+
+// Macro for the key "multiply and add to (c,h,l)" step
+
+#define combadd(c,h,l,numa,numb) \
+ mov rax, numa; \
+ mul QWORD PTR numb; \
+ add l, rax; \
+ adc h, rdx; \
+ adc c, 0
+
+// Set up initial window (c,h,l) = numa * numb
+
+#define combaddz(c,h,l,numa,numb) \
+ mov rax, numa; \
+ mul QWORD PTR numb; \
+ xor c, c; \
+ mov l, rax; \
+ mov h, rdx
+
+// Doubling step (c,h,l) = 2 * (c,hh,ll) + (0,h,l)
+
+#define doubladd(c,h,l,hh,ll) \
+ add ll, ll; \
+ adc hh, hh; \
+ adc c, c; \
+ add l, ll; \
+ adc h, hh; \
+ adc c, 0
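+
+// In other words, the undoubled cross products are first collected in the
+// local window (c,hh,ll); doubladd then doubles that whole 3-word value once
+// and folds it into the main window (c,h,l). A C sketch (illustrative only;
+// doubladd_model is a hypothetical name, the real macro also clobbers its
+// hh and ll registers, and unsigned __int128 support is assumed):
+//
+//     #include <stdint.h>
+//
+//     void doubladd_model(uint64_t *c, uint64_t *h, uint64_t *l,
+//                         uint64_t hh, uint64_t ll)
+//     {
+//         uint64_t carry1 = ll >> 63;            // (c,hh,ll) := 2 * (c,hh,ll)
+//         uint64_t carry2 = hh >> 63;
+//         ll <<= 1;
+//         hh = (hh << 1) | carry1;
+//         *c = (*c << 1) | carry2;               // c's top bit is clear here
+//         unsigned __int128 t = (unsigned __int128) *l + ll;
+//         *l = (uint64_t) t;
+//         t = (unsigned __int128) *h + hh + (uint64_t) (t >> 64);
+//         *h = (uint64_t) t;
+//         *c += (uint64_t) (t >> 64);
+//     }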
+
+// Square term incorporation (c,h,l) += numa^2
+
+#define combadd1(c,h,l,numa) \
+ mov rax, numa; \
+ mul rax; \
+ add l, rax; \
+ adc h, rdx; \
+ adc c, 0
+
+// A short form where we don't expect a top carry
+
+#define combads(h,l,numa) \
+ mov rax, numa; \
+ mul rax; \
+ add l, rax; \
+ adc h, rdx
+
+// A version doubling directly before adding, for single non-square terms
+
+#define combadd2(c,h,l,numa,numb) \
+ mov rax, numa; \
+ mul QWORD PTR numb; \
+ add rax, rax; \
+ adc rdx, rdx; \
+ adc c, 0; \
+ add l, rax; \
+ adc h, rdx; \
+ adc c, 0
+
+S2N_BN_SYMBOL(bignum_sqr_8_16_alt):
+
+#if WINDOWS_ABI
+ push rdi
+ push rsi
+ mov rdi, rcx
+ mov rsi, rdx
+#endif
+
+// Result term 0
+
+ mov rax, [x]
+ mul rax
+
+ mov [z], rax
+ mov t0, rdx
+ xor t1, t1
+
+// Result term 1
+
+ xor t2, t2
+ combadd2(t2,t1,t0,[x],[x+8])
+ mov [z+8], t0
+
+// Result term 2
+
+ xor t0, t0
+ combadd1(t0,t2,t1,[x+8])
+ combadd2(t0,t2,t1,[x],[x+16])
+ mov [z+16], t1
+
+// Result term 3
+
+ combaddz(t1,u1,u0,[x],[x+24])
+ combadd(t1,u1,u0,[x+8],[x+16])
+ doubladd(t1,t0,t2,u1,u0)
+ mov [z+24], t2
+
+// Result term 4
+
+ combaddz(t2,u1,u0,[x],[x+32])
+ combadd(t2,u1,u0,[x+8],[x+24])
+ doubladd(t2,t1,t0,u1,u0)
+ combadd1(t2,t1,t0,[x+16])
+ mov [z+32], t0
+
+// Result term 5
+
+ combaddz(t0,u1,u0,[x],[x+40])
+ combadd(t0,u1,u0,[x+8],[x+32])
+ combadd(t0,u1,u0,[x+16],[x+24])
+ doubladd(t0,t2,t1,u1,u0)
+ mov [z+40], t1
+
+// Result term 6
+
+ combaddz(t1,u1,u0,[x],[x+48])
+ combadd(t1,u1,u0,[x+8],[x+40])
+ combadd(t1,u1,u0,[x+16],[x+32])
+ doubladd(t1,t0,t2,u1,u0)
+ combadd1(t1,t0,t2,[x+24])
+ mov [z+48], t2
+
+// Result term 7
+
+ combaddz(t2,u1,u0,[x],[x+56])
+ combadd(t2,u1,u0,[x+8],[x+48])
+ combadd(t2,u1,u0,[x+16],[x+40])
+ combadd(t2,u1,u0,[x+24],[x+32])
+ doubladd(t2,t1,t0,u1,u0)
+ mov [z+56], t0
+
+// Result term 8
+
+ combaddz(t0,u1,u0,[x+8],[x+56])
+ combadd(t0,u1,u0,[x+16],[x+48])
+ combadd(t0,u1,u0,[x+24],[x+40])
+ doubladd(t0,t2,t1,u1,u0)
+ combadd1(t0,t2,t1,[x+32])
+ mov [z+64], t1
+
+// Result term 9
+
+ combaddz(t1,u1,u0,[x+16],[x+56])
+ combadd(t1,u1,u0,[x+24],[x+48])
+ combadd(t1,u1,u0,[x+32],[x+40])
+ doubladd(t1,t0,t2,u1,u0)
+ mov [z+72], t2
+
+// Result term 10
+
+ combaddz(t2,u1,u0,[x+24],[x+56])
+ combadd(t2,u1,u0,[x+32],[x+48])
+ doubladd(t2,t1,t0,u1,u0)
+ combadd1(t2,t1,t0,[x+40])
+ mov [z+80], t0
+
+// Result term 11
+
+ combaddz(t0,u1,u0,[x+32],[x+56])
+ combadd(t0,u1,u0,[x+40],[x+48])
+ doubladd(t0,t2,t1,u1,u0)
+ mov [z+88], t1
+
+// Result term 12
+
+ xor t1, t1
+ combadd2(t1,t0,t2,[x+40],[x+56])
+ combadd1(t1,t0,t2,[x+48])
+ mov [z+96], t2
+
+// Result term 13
+
+ xor t2, t2
+ combadd2(t2,t1,t0,[x+48],[x+56])
+ mov [z+104], t0
+
+// Result term 14
+
+ combads(t2,t1,[x+56])
+ mov [z+112], t1
+
+// Result term 15
+
+ mov [z+120], t2
+
+// Return
+
+#if WINDOWS_ABI
+ pop rsi
+ pop rdi
+#endif
+ ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
--- /dev/null
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC
+
+// ----------------------------------------------------------------------------
+// Subtract, z := x - y
+// Inputs x[m], y[n]; outputs function return (carry-out) and z[p]
+//
+// extern uint64_t bignum_sub
+// (uint64_t p, uint64_t *z,
+// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y);
+//
+// Does the z := x - y operation, truncating modulo p words in general and
+// returning a top borrow (0 or 1) in the p'th place, only subtracting input
+// words below p (as well as m and n respectively) to get the diff and borrow.
+//
+// Standard x86-64 ABI: RDI = p, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y, returns RAX
+// Microsoft x64 ABI: RCX = p, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y, returns RAX
+// ----------------------------------------------------------------------------
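+
+// For reference, a minimal C model of the semantics described above, the
+// exact mirror image of the one sketched for bignum_add (illustrative only;
+// bignum_sub_model is a hypothetical name):
+//
+//     #include <stdint.h>
+//
+//     uint64_t bignum_sub_model(uint64_t p, uint64_t *z,
+//                               uint64_t m, const uint64_t *x,
+//                               uint64_t n, const uint64_t *y)
+//     {
+//         uint64_t borrow = 0;
+//         for (uint64_t i = 0; i < p; i++) {
+//             uint64_t a = (i < m) ? x[i] : 0;   // words at or above p, m, n
+//             uint64_t b = (i < n) ? y[i] : 0;   // are never read
+//             z[i] = a - b - borrow;
+//             borrow = (a < b) || (a == b && borrow);
+//         }
+//         return borrow;                         // the borrow out of the top
+//     }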
+
+#include "_internal_s2n_bignum.h"
+
+ .intel_syntax noprefix
+ S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sub)
+ S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sub)
+ .text
+
+#define p rdi
+#define z rsi
+#define m rdx
+#define x rcx
+#define n r8
+#define y r9
+#define i r10
+#define a rax
+
+#define ashort eax
+
+
+
+S2N_BN_SYMBOL(bignum_sub):
+
+#if WINDOWS_ABI
+ push rdi
+ push rsi
+ mov rdi, rcx
+ mov rsi, rdx
+ mov rdx, r8
+ mov rcx, r9
+ mov r8, [rsp+56]
+ mov r9, [rsp+64]
+#endif
+
+// Zero the main index counter for both branches
+
+ xor i, i
+
+// First clamp the two input sizes m := min(p,m) and n := min(p,n) since
+// we'll never need words past the p'th. Can now assume m <= p and n <= p.
+// Then compare the modified m and n and branch accordingly
+
+ cmp p, m
+ cmovc m, p
+ cmp p, n
+ cmovc n, p
+ cmp m, n
+ jc ylonger
+
+// The case where x is longer or of the same size (p >= m >= n)
+
+ sub p, m
+ sub m, n
+ inc m
+ test n, n
+ jz xtest
+xmainloop:
+ mov a, [x+8*i]
+ sbb a, [y+8*i]
+ mov [z+8*i],a
+ inc i
+ dec n
+ jnz xmainloop
+ jmp xtest
+xtoploop:
+ mov a, [x+8*i]
+ sbb a, 0
+ mov [z+8*i],a
+ inc i
+xtest:
+ dec m
+ jnz xtoploop
+ sbb a, a
+ test p, p
+ jz tailskip
+tailloop:
+ mov [z+8*i],a
+ inc i
+ dec p
+ jnz tailloop
+tailskip:
+ neg a
+#if WINDOWS_ABI
+ pop rsi
+ pop rdi
+#endif
+ ret
+
+// The case where y is longer (p >= n > m)
+
+ylonger:
+
+ sub p, n
+ sub n, m
+ test m, m
+ jz ytoploop
+ymainloop:
+ mov a, [x+8*i]
+ sbb a, [y+8*i]
+ mov [z+8*i],a
+ inc i
+ dec m
+ jnz ymainloop
+ytoploop:
+ mov ashort, 0
+ sbb a, [y+8*i]
+ mov [z+8*i],a
+ inc i
+ dec n
+ jnz ytoploop
+ sbb a, a
+ test p, p
+ jnz tailloop
+ neg a
+#if WINDOWS_ABI
+ pop rsi
+ pop rdi
+#endif
+ ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif