From 52bac4660cf211986941af2555ddc5c4679501ea Mon Sep 17 00:00:00 2001
From: jsing <jsing@openbsd.org>
Date: Sat, 21 Jan 2023 16:29:52 +0000
Subject: [PATCH] Bring in s2n-bignum's bignum_sqr() for amd64.

ok tb@
---
 lib/libcrypto/bn/arch/amd64/bignum_sqr.S | 185 +++++++++++++++++++++++
 1 file changed, 185 insertions(+)
 create mode 100644 lib/libcrypto/bn/arch/amd64/bignum_sqr.S

diff --git a/lib/libcrypto/bn/arch/amd64/bignum_sqr.S b/lib/libcrypto/bn/arch/amd64/bignum_sqr.S
new file mode 100644
index 00000000000..97cbebea7d0
--- /dev/null
+++ b/lib/libcrypto/bn/arch/amd64/bignum_sqr.S
@@ -0,0 +1,185 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR ISC
+
+// ----------------------------------------------------------------------------
+// Square z := x^2
+// Input x[n]; output z[k]
+//
+//    extern void bignum_sqr
+//     (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x);
+//
+// Does the "z := x^2" operation where x is n digits and result z is k.
+// Truncates the result in general unless k >= 2 * n
+//
+// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n, RCX = x
+// Microsoft x64 ABI:   RCX = k, RDX = z, R8 = n, R9 = x
+// ----------------------------------------------------------------------------
+
+#include "_internal_s2n_bignum.h"
+
+        .intel_syntax noprefix
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr)
+        .text
+
+// First three are where arguments come in, but n is moved.
+
+#define p rdi
+#define z rsi
+#define x rcx
+#define n r8
+
+// These are always local scratch since multiplier result is in these
+
+#define a rax
+#define d rdx
+
+// Other variables
+
+#define i rbx
+#define ll rbp
+#define hh r9
+#define k r10
+#define y r11
+#define htop r12
+#define l r13
+#define h r14
+#define c r15
+
+// Short versions
+
+#define llshort ebp
+
+S2N_BN_SYMBOL(bignum_sqr):
+
+#if WINDOWS_ABI
+        push    rdi
+        push    rsi
+        mov     rdi, rcx
+        mov     rsi, rdx
+        mov     rdx, r8
+        mov     rcx, r9
+#endif
+
+// We use too many registers, and also we need rax:rdx for multiplications
+
+        push    rbx
+        push    rbp
+        push    r12
+        push    r13
+        push    r14
+        push    r15
+        mov     n, rdx
+
+// If p = 0 the result is trivial and nothing needs doing
+
+        test    p, p
+        jz      end
+
+// initialize (hh,ll) = 0
+
+        xor     llshort, llshort
+        xor     hh, hh
+
+// Iterate outer loop from k = 0 ... k = p - 1 producing result digits
+
+        xor     k, k
+
+outerloop:
+
+// First let bot = MAX 0 (k + 1 - n) and top = MIN (k + 1) n
+// We want to accumulate all x[i] * x[k - i] for bot <= i < top
+// For the optimization of squaring we avoid duplication and do
+// 2 * x[i] * x[k - i] for i < htop, where htop = MIN ((k+1)/2) n
+// Initialize i = bot; in fact just compute bot as i directly.
+
+        xor     c, c
+        lea     i, [k+1]
+        mov     htop, i
+        shr     htop, 1
+        sub     i, n
+        cmovc   i, c
+        cmp     htop, n
+        cmovnc  htop, n
+
+// Initialize the three-part local sum (c,h,l); c was already done above
+
+        xor     l, l
+        xor     h, h
+
+// If htop <= bot then main doubled part of the sum is empty
+
+        cmp     i, htop
+        jnc     nosumming
+
+// Use a moving pointer for [y] = x[k-i] for the cofactor
+
+        mov     a, k
+        sub     a, i
+        lea     y, [x+8*a]
+
+// Do the main part of the sum x[i] * x[k - i] for 2 * i < k
+
+innerloop:
+        mov     a, [x+8*i]
+        mul     QWORD PTR [y]
+        add     l, a
+        adc     h, d
+        adc     c, 0
+        sub     y, 8
+        inc     i
+        cmp     i, htop
+        jc      innerloop
+
+// Now double it
+
+        add     l, l
+        adc     h, h
+        adc     c, c
+
+// If k is even (which means 2 * i = k) and i < n add the extra x[i]^2 term
+
+nosumming:
+        test    k, 1
+        jnz     innerend
+        cmp     i, n
+        jnc     innerend
+
+        mov     a, [x+8*i]
+        mul     a
+        add     l, a
+        adc     h, d
+        adc     c, 0
+
+// Now add the local sum into the global sum, store and shift
+
+innerend:
+        add     l, ll
+        mov     [z+8*k], l
+        adc     h, hh
+        mov     ll, h
+        adc     c, 0
+        mov     hh, c
+
+        inc     k
+        cmp     k, p
+        jc      outerloop
+
+// Restore registers and return
+
+end:
+        pop     r15
+        pop     r14
+        pop     r13
+        pop     r12
+        pop     rbp
+        pop     rbx
+#if WINDOWS_ABI
+        pop    rsi
+        pop    rdi
+#endif
+        ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
-- 
2.20.1