From: jsing
Date: Sun, 28 May 2023 17:42:30 +0000 (+0000)
Subject: Provide optimised bn_mulw_{addw,addw_addw,addtw}() for aarch64.
X-Git-Url: http://artulab.com/gitweb/?a=commitdiff_plain;h=83af2fa26020e9a31ed4ee7f51afeed058ae2575;p=openbsd

Provide optimised bn_mulw_{addw,addw_addw,addtw}() for aarch64.

This results in bn_mul_comba4() and bn_mul_comba8() requiring ~30% less
instructions than they did previously.
---

diff --git a/lib/libcrypto/bn/arch/aarch64/bn_arch.h b/lib/libcrypto/bn/arch/aarch64/bn_arch.h
index 1b9358e710a..708083aaf2d 100644
--- a/lib/libcrypto/bn/arch/aarch64/bn_arch.h
+++ b/lib/libcrypto/bn/arch/aarch64/bn_arch.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_arch.h,v 1.8 2023/05/28 17:22:04 jsing Exp $ */
+/* $OpenBSD: bn_arch.h,v 1.9 2023/05/28 17:42:30 jsing Exp $ */
 /*
  * Copyright (c) 2023 Joel Sing
  *
@@ -81,6 +81,73 @@ bn_mulw(BN_ULONG a, BN_ULONG b, BN_ULONG *out_r1, BN_ULONG *out_r0)
 	*out_r0 = r0;
 }
 
+#define HAVE_BN_MULW_ADDW
+/* bn_mulw_addw: (r1:r0) = a * b + c, returned as a double word result. */
+static inline void
+bn_mulw_addw(BN_ULONG a, BN_ULONG b, BN_ULONG c, BN_ULONG *out_r1,
+    BN_ULONG *out_r0)
+{
+	BN_ULONG r1, r0;
+
+	__asm__ (
+	    "umulh %[r1], %[a], %[b] \n"	/* r1 = high word of a * b */
+	    "mul %[r0], %[a], %[b] \n"	/* r0 = low word of a * b */
+	    "adds %[r0], %[r0], %[c] \n"	/* r0 += c, setting the carry flag */
+	    "adc %[r1], %[r1], xzr \n"	/* r1 += carry (xzr reads as zero) */
+	    : [r1]"=&r"(r1), [r0]"=&r"(r0)	/* '&': written before inputs are dead */
+	    : [a]"r"(a), [b]"r"(b), [c]"r"(c)
+	    : "cc");	/* adds modifies the condition flags */
+
+	*out_r1 = r1;	/* high word */
+	*out_r0 = r0;	/* low word */
+}
+
+#define HAVE_BN_MULW_ADDW_ADDW
+/* bn_mulw_addw_addw: (r1:r0) = a * b + c + d, as a double word result. */
+static inline void
+bn_mulw_addw_addw(BN_ULONG a, BN_ULONG b, BN_ULONG c, BN_ULONG d,
+    BN_ULONG *out_r1, BN_ULONG *out_r0)
+{
+	BN_ULONG r1, r0;
+
+	__asm__ (
+	    "umulh %[r1], %[a], %[b] \n"	/* r1 = high word of a * b */
+	    "mul %[r0], %[a], %[b] \n"	/* r0 = low word of a * b */
+	    "adds %[r0], %[r0], %[c] \n"	/* r0 += c, setting the carry flag */
+	    "adc %[r1], %[r1], xzr \n"	/* r1 += carry from adding c */
+	    "adds %[r0], %[r0], %[d] \n"	/* r0 += d, setting the carry flag */
+	    "adc %[r1], %[r1], xzr \n"	/* r1 += carry from adding d */
+	    : [r1]"=&r"(r1), [r0]"=&r"(r0)	/* '&': written before inputs are dead */
+	    : [a]"r"(a), [b]"r"(b), [c]"r"(c), [d]"r"(d)
+	    : "cc");	/* adds modifies the condition flags */
+
+	*out_r1 = r1;	/* high word */
+	*out_r0 = r0;	/* low word */
+}
+
+#define HAVE_BN_MULW_ADDTW
+/* bn_mulw_addtw: (r2:r1:r0) = a * b + (c2:c1:c0), a triple word result. */
+static inline void
+bn_mulw_addtw(BN_ULONG a, BN_ULONG b, BN_ULONG c2, BN_ULONG c1, BN_ULONG c0,
+    BN_ULONG *out_r2, BN_ULONG *out_r1, BN_ULONG *out_r0)
+{
+	BN_ULONG r2, r1, r0;
+
+	__asm__ (
+	    "umulh %[r1], %[a], %[b] \n"	/* r1 = high word of a * b */
+	    "mul %[r0], %[a], %[b] \n"	/* r0 = low word of a * b */
+	    "adds %[r0], %[r0], %[c0] \n"	/* r0 += c0, setting the carry flag */
+	    "adcs %[r1], %[r1], %[c1] \n"	/* r1 += c1 + carry, setting carry */
+	    "adc %[r2], xzr, %[c2] \n"	/* r2 = c2 + carry; carry out is dropped */
+	    : [r2]"=&r"(r2), [r1]"=&r"(r1), [r0]"=&r"(r0)	/* '&': early clobber */
+	    : [a]"r"(a), [b]"r"(b), [c2]"r"(c2), [c1]"r"(c1), [c0]"r"(c0)
+	    : "cc");	/* adds/adcs modify the condition flags */
+
+	*out_r2 = r2;	/* most significant word */
+	*out_r1 = r1;	/* middle word */
+	*out_r0 = r0;	/* least significant word */
+}
+
 #define HAVE_BN_SUBW
 
 static inline void